# Ceneo Scraper


## Components of single opinion
|Component|Selector|Variable|
|---------|--------|--------|
|Opinion ID|page_dom['data-entry-id']|opinion_id|
|opinion's author|span.user-post__author-name|author|
|author's recommendation|span.user-post__author-recomendation > em|recommendation|
|score expressed in number of stars|span.user-post__score-count|score|
|opinion’s content|div.user-post__text|content|
| list of product advantages|div.review-feature__title--positives ~ div.review-feature__item|pros|
| list of product disadvantages|div.review-feature__title--negatives ~ div.review-feature__item|cons|
| how many users think that opinion was helpful|button.vote-yes > span|helpful|
| how many users think that opinion was unhelpful|button.vote-no > span|unhelpful|
| publishing date|span.user-post__published > time:nth-child(1)|publish_date|
| purchase date|span.user-post__published >time:nth-child(2)|purchase_date|



# Structure of single opinion

In [None]:
# Maps every opinion field to the positional arguments that follow the
# ancestor element in a call to extract(): [selector, attribute, return_list].
# Entries shorter than three items rely on extract()'s default arguments.
selectors = {
    # Read straight from an attribute of the opinion element (no selector).
    "opinion_id": [None, "data-entry-id"],
    # Single text values.
    "author": ["span.user-post__author-name"],
    "recommendation": ["span.user-post__author-recomendation>em"],
    "score": ["span.user-post__score-count"],
    "content": ["div.user-post__text"],
    # Multi-valued fields: every matching element is collected into a list.
    "pros": [
        "div.review-feature__title--positives ~ div.review-feature__item",
        None,
        True,
    ],
    "cons": [
        "div.review-feature__title--negatives ~ div.review-feature__item",
        None,
        True,
    ],
    # Vote counters.
    "helpful": ["button.vote-yes >span"],
    "unhelpful": ["button.vote-no >span"],
    # Dates are taken from the "datetime" attribute of the matched elements.
    "publish_date": [
        "span.user-post__published > time:nth-child(1)",
        "datetime",
    ],
    "purchase_date": [
        "span.user-post__published >time:nth-child(2)",
        "datetime",
    ],
}


## Loading Libraries

In [None]:
import json
from deep_translator import GoogleTranslator
import os
import requests
from bs4 import BeautifulSoup

## Function that extracts HTML

In [None]:
def extract(ancestor, selector, attribute = None , return_list = False):
    """Pull a text value or an attribute value out of a parsed HTML element.

    Parameters:
        ancestor: element to search in; it must provide ``select`` and
            ``select_one`` and support ``element[attribute]`` lookups.
        selector: CSS selector of the descendant(s) to read, or None to read
            ``attribute`` directly from ``ancestor``.
        attribute: name of the attribute to return; when omitted, the
            stripped text of the matched element is returned instead.
        return_list: when true, return one value for every element matching
            ``selector`` instead of a single value.

    Returns:
        A list of values when ``return_list`` is true; otherwise a single
        value, or None when the element or the attribute is missing (or when
        neither ``selector`` nor ``attribute`` is given).

    Raises:
        KeyError: in list mode only, when a matched element lacks
            ``attribute``.
    """
    if return_list:
        matches = ancestor.select(selector)
        if attribute:
            return [match[attribute] for match in matches]
        return [match.get_text().strip() for match in matches]
    if selector:
        if attribute:
            try:
                return ancestor.select_one(selector)[attribute]
            except (TypeError, KeyError):
                # TypeError: no element matched (select_one returned None).
                # KeyError: the element matched but has no such attribute.
                return None
        try:
            return ancestor.select_one(selector).get_text().strip()
        except AttributeError:
            # No element matched, so there is no text to read.
            return None
    if attribute:
        try:
            return ancestor[attribute]
        except KeyError:
            # Same "missing means None" contract as the selector branch.
            return None
    return None

## TRANSFORMATION FUNCTIONS

In [None]:
def rate(score):
    """Convert a "value/maximum" score string into a fraction.

    Both parts may use a decimal comma, e.g. "4,5/5" gives 0.9.

    Returns None when ``score`` is None or empty, so an opinion without a
    score does not abort processing.
    Raises IndexError when the string contains no "/" and ValueError when a
    part is not a number.
    """
    if not score:
        return None
    parts = score.split("/")
    value = float(parts[0].replace(",", "."))
    maximum = float(parts[1].replace(",", "."))
    return value / maximum
def reccomend(reccomendation):
    """Translate the recommendation label into a boolean.

    'Polecam' gives True, 'Nie polecam' gives False and any other value
    (including None) gives None.
    """
    if reccomendation == 'Polecam':
        return True
    if reccomendation == 'Nie polecam':
        return False
    return None

## Translation

In [None]:

def translate(text,to_Lang="en",from_lang="pl"):
    """Translate a string, or every string in a list, with GoogleTranslator.

    Parameters:
        text: a string or a list of strings to translate.
        to_Lang: target language code.
        from_lang: source language code.

    Returns:
        A dict holding the original under the ``from_lang`` key and the
        translation under the ``to_Lang`` key (a list when ``text`` is a
        list), or None when ``text`` is empty or None.
    """
    if not text:
        return None
    # One translator instance serves every item instead of building a new
    # one for each element of the list.
    translator = GoogleTranslator(source=from_lang, target=to_Lang)
    if isinstance(text, list):
        translated = [translator.translate(item) for item in text]
    else:
        translated = translator.translate(text)
    return {from_lang: text, to_Lang: translated}

## Transformations applied to the extracted components

In [None]:
# Post-processing applied to the raw extracted value of each listed field;
# fields that are not listed keep their extracted value unchanged.
transformations = dict(
    recommendation=reccomend,  # label -> True / False / None
    score=rate,                # "value/maximum" string -> fraction
    helpful=int,
    unhelpful=int,
    content=translate,
    pros=translate,
    cons=translate,
)

## URL for the first page with opinions of product

In [None]:
# Ask for the product code; surrounding whitespace is stripped because the
# value is interpolated into both the request URL and the output file name.
product_id = input("Please provide Ceneo.pl code: ").strip()
# First page of the product's reviews; later pages are found via pagination.
url = f"https://www.ceneo.pl/{product_id}#tab=reviews"

# Extracting opinions from HTML code

In [None]:
# Walk through the paginated review pages, starting at `url`, and collect
# every opinion as a dict of extracted and transformed components.
all_opinions = []
while url:
    print(url)
    response = requests.get(url)
    page_dom = BeautifulSoup(response.text, 'html.parser')
    opinions = page_dom.select('.js_product-review')
    for opinion in opinions:
        # Each selectors entry holds the positional arguments for extract().
        single_opinion = {
            key: extract(opinion, *args)
            for key, args in selectors.items()
        }
        for key, transform in transformations.items():
            # A component missing from the page is extracted as None; leave
            # it as None instead of passing it to converters such as int(),
            # which would raise and abort the whole run.
            if single_opinion[key] is not None:
                single_opinion[key] = transform(single_opinion[key])
            print(single_opinion[key])
        all_opinions.append(single_opinion)
    # The last page has no "next" link, which ends the loop.
    next_href = extract(page_dom, 'a.pagination__next', 'href')
    url = "https://www.ceneo.pl" + next_href if next_href is not None else None


## Saving all opinions to JSON file

In [None]:
# Write all collected opinions to opinions/<product_id>.json.
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs('opinions', exist_ok=True)
# The context manager closes the file even when json.dump raises.
with open(f'opinions/{product_id}.json', 'w', encoding='UTF-8') as jf:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(all_opinions, jf, indent=4, ensure_ascii=False)