# Ceneo Scraper

# Załadowanie bibliotek

In [6]:
import os
import json
import requests
from bs4 import BeautifulSoup

# Wysłanie zapytania do serwera


## Funkcja pomocnicza do ekstrakcji danych ze znaczników HTML

In [7]:
def extract(ancestor, selector = None, attribute = None, return_list = False):
    """Read text or an attribute value from a parsed HTML element.

    Parameters
    ----------
    ancestor
        Element to read from. It must provide ``select``, ``select_one``,
        ``text`` and ``[]`` attribute lookup, as used below.
    selector : str, optional
        CSS selector evaluated relative to ``ancestor``. When falsy, the
        value is read from ``ancestor`` itself.
    attribute : str, optional
        Attribute name to read. When falsy, the stripped text is returned
        instead.
    return_list : bool, optional
        When true (and ``selector`` is given) every match is used and a list
        is returned; otherwise only the first match is used.

    Returns
    -------
    str, list of str, or None
        The stripped text / attribute value(s). ``None`` is returned when the
        selected tag or the requested attribute does not exist; in list mode
        tags that lack the attribute are skipped, so the list may be empty.
        An attribute read directly from ``ancestor`` is returned unstripped.
    """
    if not selector:
        if attribute:
            try:
                return ancestor[attribute]
            except KeyError:
                # The element exists but does not carry this attribute.
                return None
        return ancestor.text.strip()

    if return_list:
        tags = ancestor.select(selector)
        if not attribute:
            return [tag.text.strip() for tag in tags]
        values = []
        for tag in tags:
            try:
                values.append(tag[attribute].strip())
            except KeyError:
                # Skip matches without the attribute instead of aborting
                # the extraction of the whole list.
                continue
        return values

    if attribute:
        try:
            return ancestor.select_one(selector)[attribute].strip()
        except (TypeError, KeyError):
            # TypeError: nothing matched (select_one returned None).
            # KeyError: the matched tag has no such attribute.
            return None
    try:
        return ancestor.select_one(selector).text.strip()
    except AttributeError:
        # Nothing matched the selector.
        return None

## Ekstrakcja składowej pojedynczych opinii
| Składowa | Selektor | Zmienna |
|----------|--------|---------|
| id opinii | `div.js_product-review["data-entry-id"]` | opinion_id |
| autor | `span.user-post__author-name` | author |
| rekomendacja | `span.user-post__author-recomendation` | recommendations |
| gwiazdki | `span.user-post__score-count` | stars |
| treść | `div.user-post__text` | content |
| lista zalet | `div.review-feature__title--positives ~ div.review-feature__item` | pros |
| lista wad | `div.review-feature__title--negatives ~ div.review-feature__item` | cons |
| dla ilu przydatna | `button.vote-yes > span` | helpful |
| dla ilu nieprzydatna | `button.vote-no > span` | unhelpful |
| data wystawienia | `span.user-post__published > time:nth-child(1)["datetime"]` | publish_date |
| data zakupu | `span.user-post__published > time:nth-child(2)["datetime"]` | purchase_date |


In [8]:
# Maps every field of a single opinion to the positional arguments that are
# passed to extract() after the opinion element, in the order
# (selector, attribute, return_list). Omitted trailing items fall back to
# extract()'s defaults.
selectors = {
    # Attribute read from the opinion element itself, hence no selector.
    "opinion_id": (None, "data-entry-id"),

    # Single text values.
    "author": ("span.user-post__author-name",),
    "recommendations": ("span.user-post__author-recomendation",),
    "stars": ("span.user-post__score-count",),
    "content": ("div.user-post__text",),

    # Lists: every item that follows the positives / negatives title.
    "pros": (
        "div.review-feature__title--positives ~ div.review-feature__item",
        None,
        True,
    ),
    "cons": (
        "div.review-feature__title--negatives ~ div.review-feature__item",
        None,
        True,
    ),

    # Vote counters.
    "helpful": ("button.vote-yes > span",),
    "unhelpful": ("button.vote-no > span",),

    # Dates are taken from the "datetime" attribute of the <time> tags.
    "publish_date": ("span.user-post__published > time:nth-child(1)", "datetime"),
    "purchase_date": ("span.user-post__published > time:nth-child(2)", "datetime"),
}

In [9]:
# Code of the product whose opinions are scraped. Change the value below to
# scrape another product (another code tried earlier: "138331381"), or read
# it interactively with input("Podaj kod produktu: ").
product_id = "39562616"

# Address of the first page of opinions for that product.
url = "https://www.ceneo.pl/" + product_id + "#tab=reviews"

## Pobranie wszystkich opinii o produkcie 

In [10]:
# Collects one dict per opinion, across all pages of the product's reviews.
all_opinions = []

# Walk the pagination on a separate variable so that `url` keeps its starting
# value: overwriting it with None made a second run of this cell do nothing
# and silently produce an empty list.
page_url = url

while page_url:
    # Without a timeout requests may wait for the server indefinitely.
    response = requests.get(page_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page, which would
    # end the loop early and leave a silently truncated result.
    response.raise_for_status()
    page_dom = BeautifulSoup(response.text, "html.parser")

    opinions = page_dom.select("div.js_product-review")
    for opinion in opinions:
        # Each selectors entry holds the remaining positional arguments
        # of extract() for that field.
        single_opinion = {
            key: extract(opinion, *value)
            for key, value in selectors.items()
        }
        all_opinions.append(single_opinion)

    # The last page has no "next" link; a link without href is treated the
    # same way instead of raising KeyError.
    next_link = page_dom.select_one("a.pagination__next")
    next_href = next_link.get("href") if next_link is not None else None
    page_url = "https://www.ceneo.pl" + next_href.strip() if next_href else None

    # Progress log: address of the next page, or None after the last one.
    print(page_url)

https://www.ceneo.pl/39562616/opinie-2
https://www.ceneo.pl/39562616/opinie-3
https://www.ceneo.pl/39562616/opinie-4
https://www.ceneo.pl/39562616/opinie-5
https://www.ceneo.pl/39562616/opinie-6
https://www.ceneo.pl/39562616/opinie-7
https://www.ceneo.pl/39562616/opinie-8
https://www.ceneo.pl/39562616/opinie-9
https://www.ceneo.pl/39562616/opinie-10
https://www.ceneo.pl/39562616/opinie-11
https://www.ceneo.pl/39562616/opinie-12
https://www.ceneo.pl/39562616/opinie-13
https://www.ceneo.pl/39562616/opinie-14
None


## Zapis do JSON

In [11]:
# Create the output directory when it is missing. exist_ok=True replaces the
# separate existence check, so there is no window between checking and
# creating in which the directory could appear and make makedirs fail.
os.makedirs("opinions", exist_ok=True)

# One JSON file per product; ensure_ascii=False keeps non-ASCII characters
# readable in the file instead of writing \uXXXX escapes.
with open(f"opinions/{product_id}.json", "w", encoding="UTF-8") as jf:
    json.dump(all_opinions, jf, indent=4, ensure_ascii=False)
