# Ceneo Scraper

## Extract  components for single opinion

|Component|Selector|Variable|
|---------|--------|--------|
|opinion ID |  ["data-entry-id"]  |opinion_id |
|opinion’s author |  span.user-post__author-name | author|
|author’s recommendation | span.user-post__author-recomendation > em |recommendations|
|score expressed in number of stars | span.user-post__score-count |score|
|opinion’s content | div.user-post__text |content|
|list of product advantages | div.review-feature__title--positives ~ div.review.-feature__items |pros|
|list of product disadvantages |div.review-feature__title--negatives ~ div.review.-feature__items |cons|
|how many users think that opinion was helpful | button.vote-yes > span |helpful|
|how many users think that opinion was unhelpful |button.vote-no > span | unhelpful|
|publishing date | span.user-post__published > time:nth-child(1)["datetime"]  |publish_date|
|purchase date | span.user-post__published > time:nth-child(2)["datetime"] |purchase_date|


# imports

In [77]:
import os
import json
import requests
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator

## Definition of extracting content function


In [78]:
def extract_content(ancestor,selector,attribute=None,return_list=False):
            if selector:
                if return_list:
                    if attribute:
                        return [tag[attribute].strip() for tag in ancestor.select(selector)]
                    return [tag.text.strip() for tag in ancestor.select(selector)]
                if attribute:
                    try:
                        return ancestor.select_one(selector)[attribute].strip()
                    except TypeError:
                        return None
                try:
                    return ancestor.select_one(selector).text.strip()
                except AttributeError:
                    return None
            if attribute:
                  return ancestor[attribute]
            return ancestor[attribute]


## Opinion Structure

In [79]:
selectors={
"opinion_id":(None,'data-entry-id'),
"author":("span.user-post__author-name",),
"recommendation":("span.user-post__author-recommendation > em",),
"stars" :("span.user-post__score-count",),
"content" :("div.user-post__text",),
"pros" :("div.review-feature__title--positives ~ div.review-feature__item",None,True),
"cons" :("div.review-feature__title--negatives ~ div.review-feature__item",None,True),
"helpful" :("button.vote-yes > span",),
"unhelpful" :("button.vote-no > span",),
"publish_date": ("span.user-post__published > time:nth-child(1)","datetime"),
"purchase_date":("span.user-post__published > time:nth-child(2)","datetime"),
}

## Transformation function


In [80]:
def score(score:str) -> float:
    s = score.split("/")
    return float(s[0].replace(",","."))/float(s[1])

def translate(text, lang_from = "pl", lang_to ="en"):
    if isinstance(text,list):
        return [GoogleTranslator(source=lang_from, target=lang_to).translate(t) for t in text]
    return GoogleTranslator(source=lang_from, target=lang_to).translate(text)


## Dictionary with transformations

In [81]:
transformations = {
    "recommendation": lambda r: True if r == "Polecam" else False if r == "nie polecam" else None,
    "stars": score,
    "helpful": int,
    "unhelpful": int,
    "content": translate,
    "pros":translate,
    "cons": translate,
}

# URl addres for the first page with opinions about product

In [82]:
product_id=input("Enter product code")
url=f"https://www.ceneo.pl/{product_id}#tab-reviews"
response = requests.get(url)
response.status_code


200

## Convert plain text HTML code into DOM structure

In [83]:
page_dom = BeautifulSoup(response.text,"html.parser")
opinions =page_dom.select("div.js_product-review")
all_opinions = []


## Exctract all opinions


In [84]:
all_opinions = []
while(url):
    response = requests.get(url)
    response.status_code
    page_dom = BeautifulSoup(response.text,"html.parser")
    opinions =page_dom.select("div.js_product-review")

    for opinion in opinions:
        single_opinion={
            key: extract_content(opinion, *value)
                for key,value in selectors.items()
            
        }
        for key,value in transformations.items():
            single_opinion[key] = value(single_opinion[key])
        all_opinions.append(single_opinion)
    try:
        url = "https://www.ceneo.pl"+extract_content(page_dom, "a.pagination__next", "href")
    except TypeError:
        url = None

    

ChunkedEncodingError: Response ended prematurely

## Extract all components of single opinion

In [None]:
if not os.path.exists("opinions"):
    os.mkdir("opinions")

with open(f"opinions/{product_id}.json","w",encoding="UTF-8") as jf:
    json.dump(all_opinions, jf, indent=4, ensure_ascii = False)

In [None]:
len(all_opinions)

0