### Imports

In [16]:
import json
import os
import requests
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator

### Utils

In [17]:
def extract(ancestor, selector=None, attribute=None, multiple=False):
    """Pull text or an attribute value out of a parsed HTML element.

    Args:
        ancestor: Element to search in; it must support ``select``,
            ``select_one`` and item access for attributes.
        selector: CSS selector of the descendant(s) to read. When falsy, the
            attribute is read from ``ancestor`` itself.
        attribute: Name of the attribute to read. When falsy (and a selector
            is given), the stripped text content is returned instead.
        multiple: When true (and a selector is given), return a list with one
            entry per matching descendant instead of a single value.

    Returns:
        A stripped string, a list of stripped strings (``multiple=True``), or
        ``None`` when the requested single tag or attribute does not exist.
    """
    if not selector:
        # No selector: the value lives directly on the ancestor element.
        try:
            return ancestor[attribute].strip()
        except (TypeError, KeyError):
            return None

    if multiple:
        tags = ancestor.select(selector)
        if not attribute:
            return [tag.get_text().strip() for tag in tags]
        values = []
        for tag in tags:
            # Tags lacking the attribute are skipped rather than aborting the
            # whole extraction with a KeyError.
            try:
                values.append(tag[attribute].strip())
            except KeyError:
                continue
        return values

    if attribute:
        try:
            return ancestor.select_one(selector)[attribute].strip()
        except (TypeError, KeyError):
            # TypeError: no tag matched (None is not subscriptable).
            # KeyError: the tag matched but does not carry the attribute.
            return None

    try:
        return ancestor.select_one(selector).get_text().strip()
    except AttributeError:
        # No tag matched the selector.
        return None

In [18]:
def translate(text, source="pl", target="en"):
    """Translate ``text`` from ``source`` to ``target`` with GoogleTranslator.

    Args:
        text: Text to translate.
        source: Source language code passed to ``GoogleTranslator``.
        target: Target language code passed to ``GoogleTranslator``.

    Returns:
        The translated text. Input that is not a string (for example ``None``
        for a missing field) or that is empty/whitespace-only is returned
        unchanged without calling the translation service.
    """
    # There is nothing to translate in these cases, and the translator
    # rejects non-string payloads, so short-circuit before the remote call.
    if not isinstance(text, str) or not text.strip():
        return text
    return GoogleTranslator(source=source, target=target).translate(text=text)

In [19]:
# Maps each output field to the positional arguments that follow the element
# in a call to extract(): (selector[, attribute[, multiple]]).
selectors = {
    # Read straight from the opinion element itself (no selector).
    "opinion_id": (None, "data-entry-id"),
    # Text content of a single descendant.
    "author": ("span.user-post__author-name",),
    "recommendation": ("span.user-post__author-recomendation > em",),
    "stars": ("span.user-post__score-count",),
    "content_pl": ("div.user-post__text",),
    # Text content of every matching descendant (returned as a list).
    "pros_pl": ("div.review-feature__item--positive", None, True),
    "cons_pl": ("div.review-feature__item--negative", None, True),
    # Attribute value of a single descendant.
    "vote_yes": ("button.vote-yes", "data-total-vote"),
    "vote_no": ("button.vote-no", "data-total-vote"),
    "published": ("span.user-post__published > time:nth-child(1)", "datetime"),
    "purchased": ("span.user-post__published > time:nth-child(2)", "datetime"),
}

### Extraction of opinions

In [20]:
# Parse cookie.json into `headers` (presumably the HTTP request headers for
# the scraper - confirm against the file's contents). The encoding is pinned
# to UTF-8 so decoding does not depend on the platform's default locale.
with open("./cookie.json", "r", encoding="utf-8") as jsonFile:
    headers = json.load(jsonFile)

In [None]:
# Scrape every review page of the chosen product, following the pagination
# links until there is no next page.
productID = input("Enter product id").strip()
next_page = f"https://www.ceneo.pl/{productID}#tab=reviews_scroll"
allOpinions = []
while next_page:
    # Send the headers loaded from cookie.json; the timeout keeps a stalled
    # connection from blocking the run indefinitely.
    response = requests.get(next_page, headers=headers, timeout=30)
    if response.status_code != 200:
        # next_page would never change on a failed request, so retrying here
        # would loop forever. Stop and keep what was collected so far.
        print(f"Request for {next_page} failed with status {response.status_code}")
        break

    page_dom = BeautifulSoup(response.text, 'html.parser')
    opinions = page_dom.select("div.js_product-review:not(.user-post--highlight)")
    for opinion in opinions:
        singleOpinion = {
            key: extract(opinion, *value)
            for key, value in selectors.items()
        }

        # Translate only what is actually present; a missing field stays None.
        content_pl = singleOpinion['content_pl']
        singleOpinion["content_en"] = translate(content_pl) if content_pl else content_pl
        singleOpinion["pros_en"] = [translate(pros) for pros in singleOpinion['pros_pl']]
        singleOpinion["cons_en"] = [translate(cons) for cons in singleOpinion['cons_pl']]

        # "Polecam" -> True, "Nie polecam" -> False, anything else -> None.
        singleOpinion['recommendation'] = {
            "Polecam": True,
            "Nie polecam": False,
        }.get(singleOpinion['recommendation'])

        # Score text looks like "4,5/5": keep the part before the slash and
        # switch the decimal comma to a dot before converting.
        stars = singleOpinion['stars']
        singleOpinion['stars'] = (
            float(stars.split("/")[0].replace(",", ".")) if stars else None
        )

        # Vote counters arrive as attribute strings; a missing one stays None.
        for vote_key in ("vote_yes", "vote_no"):
            votes = singleOpinion[vote_key]
            singleOpinion[vote_key] = int(votes) if votes else None

        allOpinions.append(singleOpinion)

    try:
        next_page = "https://www.ceneo.pl" + page_dom.select_one("a.pagination__next")["href"]
    except (TypeError, KeyError):
        # TypeError: no "next" link on the page; KeyError: link without href.
        next_page = None

### Export opinions to json file

In [22]:
# Create the output directory if needed; exist_ok avoids the race between a
# separate existence check and the directory creation.
os.makedirs("./opinions", exist_ok=True)

# Write all collected opinions to <productID>.json, keeping non-ASCII
# characters readable instead of escaping them.
with open(f"./opinions/{productID}.json", "w", encoding="UTF-8") as jsonFile:
    json.dump(allOpinions, jsonFile, ensure_ascii=False, indent=4)