# Web-scraping script to get smartphone reviews data from site Rozetka.ua.

In [None]:
# Import necessary libraries
import pandas as pd
import scrapy
from scrapy import Selector
import requests
from bs4 import BeautifulSoup
import re
from datetime import date

In [32]:
# Get a list of links to products of particular category (smartphones)
fulllinks = []
for i in range (1, 19):
    url = 'https://rozetka.com.ua/ua/mobile-phones/c80003/page={};preset=smartfon/'.format(i)
    html = requests.get( url ).content
    sel = Selector( text=html )
    for link in sel.xpath('//a[@class="goods-tile__heading"]/@href').getall():
        fulllinks.append(link)

In [33]:
# Count number of links retrived
len(pd.DataFrame(fulllinks)[0].unique())

960

In [34]:
# Iterate through list of links to products, parse html to retrieve needed data and save it as list of lists  
dt = []
for i in fulllinks:
    html = requests.get( i +'/comments/' ).content
    html = html.replace(b'<br>', b'')
    html = html.replace(b'<br/>', b'')
    html = html.replace(b'\n', b'')
    sel = Selector( text=html )
    
    link_id = sel.xpath('//a[@class="button button_type_link comment__link"]/@href').getall()
    link_id = set(link_id)
    ids=[]
    for i in link_id:
        _, j = i.split('=')
        ids.append(j)
        
    title = sel.xpath('//h1[@class="product__title"]/text()').extract_first()
    price = sel.xpath('//script [@type="application/ld+json"][@data-seo="Product"]/text()').extract_first()
    price = re.search('"price":"(.*)","priceCurrency"', price).group(1)
    
    for id_number in ids:
        rewiew_date = sel.xpath('//a[@class="button button_type_link comment__link" and contains(@href,"{}")]//preceding::time[@class="comment__date"]/text()'.format(id_number)).getall()[-1]
        rewiew_text_in = sel.xpath('//div[@class="comment"]//a[@class="button button_type_link comment__link" and contains(@href,"{}")]/following::rz-comment-rating/following::*[1]'.format(id_number)).extract_first()

        rating_numbers=[]
        for star_number in range (5):
            path = str(sel.xpath('//rz-rating-stars[@class="comment__rating-stars"][@id="{}"]/*/li[@class="rating-stars__item"][@data-index-number="{}"]/*/path'.format(id_number, star_number)).extract_first())
            if path !="None":
                _, j = path.split('#')
                j, _ = j.split(')')
                rating_numbers.append(int(j))
        rewiew_feedback_usefull = sel.xpath('//a[@class="button button_type_link comment__link" and contains(@href,"{}")]//following::button[@aria-label="Полезный отзыв"]/text()'.format(id_number)).extract_first()
        rewiew_feedback_notusefull = sel.xpath('//a[@class="button button_type_link comment__link" and contains(@href,"{}")]//following::button[@aria-label="Неполезный отзыв"]/text()'.format(id_number)).extract_first()
        if rewiew_text_in.startswith("<p class"):
            rewiew_text = re.search('<p class="comment__text">(.*)</p>', rewiew_text_in).group(1)
            product_advatages_disadvantages = sel.xpath('//a[@class="button button_type_link comment__link" and contains(@href,"{}")]/following::p[@class="comment__text"]/following::*[1]'.format(id_number)).extract_first()
        else:
            rewiew_text = "None"
            product_advatages_disadvantages = rewiew_text_in
        
        if "Переваги" in product_advatages_disadvantages:
            product_advatages = re.search('Переваги: </dt><dd>(.*)</dd>', product_advatages_disadvantages).group(1)
        else:
            product_advatages = 'None'
        if "Недоліки" in product_advatages_disadvantages:
            product_disadvantages = re.search('Недоліки: </dt><dd>(.*)</dd>', product_advatages_disadvantages).group(1)
        else:
            product_disadvantages = 'None'
        link = re.sub(r'#id=[0-9]*', '#id={}'.format(id_number), i)
        
        dt.append([link, title, price, id_number, rewiew_date, rewiew_text, sum(rating_numbers), rewiew_feedback_usefull, rewiew_feedback_notusefull, product_advatages_disadvantages, product_advatages, product_disadvantages])    

In [35]:
# Transform retrieved  data to DataFrame
data = pd.DataFrame(dt, columns = ['comment_link', 'product_title', 'product_price','review_id', 'review_date', 'review_text', 'review_rating', 'thumbs_up', 'thumbs_down', 'product_advatages_disadvantages', 'product_advatages', 'product_disadvantages'])

In [36]:
# View characteristics of retrieved data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6440 entries, 0 to 6439
Data columns (total 12 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   comment_link                     6440 non-null   object
 1   product_title                    6440 non-null   object
 2   product_price                    6440 non-null   object
 3   review_id                        6440 non-null   object
 4   review_date                      6440 non-null   object
 5   review_text                      6440 non-null   object
 6   review_rating                    6440 non-null   int64 
 7   thumbs_up                        6440 non-null   object
 8   thumbs_down                      6440 non-null   object
 9   product_advatages_disadvantages  6440 non-null   object
 10  product_advatages                6440 non-null   object
 11  product_disadvantages            6440 non-null   object
dtypes: int64(1), object(11)
memory usa

In [37]:
# View first 5 rows of retrieved data
data.head(5)

Unnamed: 0,comment_link,product_title,product_price,review_id,review_date,review_text,review_rating,thumbs_up,thumbs_down,product_advatages_disadvantages,product_advatages,product_disadvantages
0,https://rozetka.com.ua/ua/samsung_galaxy_a52_8...,Мобільний телефон Samsung Galaxy A52 8/256 GB...,11999.0,49828473,сьогодні,Хороший телефон!,5,0,0,"<dl class=""comment__essentials""><!----><div cl...","Камера дисплей</dd></div><!----><div class=""co...",Нет
1,https://rozetka.com.ua/ua/samsung_galaxy_a52_8...,Мобільний телефон Samsung Galaxy A52 8/256 GB...,11999.0,49766292,01 квітня 2021,"Цена сильно завышена для пластикового корпуса,...",0,16,13,"<dl class=""comment__essentials""><!----><div cl...","За эту цену нет</dd></div><!----><div class=""c...","Проц, дисплей, корпус."
2,https://rozetka.com.ua/ua/samsung_galaxy_a52_8...,Мобільний телефон Samsung Galaxy A52 8/256 GB...,11999.0,49755293,31 березня 2021,"Хотел сегодня заказать два таких смарта, себе ...",0,13,29,"<dl class=""comment__essentials""><!----><!---->...",,"Даже не хотел смотреть, на сколько он крут в н..."
3,https://rozetka.com.ua/ua/samsung_galaxy_a52_8...,Мобільний телефон Samsung Galaxy A52 8/256 GB...,11999.0,49780976,03 квітня 2021,Телефон ужас что батарея что камера буду покуп...,0,2,12,"<rz-comment-conformity _nghost-sc118=""""><!----...",,
4,https://rozetka.com.ua/ua/samsung_galaxy_a52_8...,Мобільний телефон Samsung Galaxy A52 8/256 GB...,11999.0,49643404,20 березня 2021,"Вы шо травите!!! 8gb озу и 256 память, пушка.....",5,37,21,"<rz-comment-conformity _nghost-sc118=""""><!----...",,


In [42]:
# Write created DaraFrame to csv file
data.to_csv('./reviews_data/smartphone_reviews_{}.csv'.format(date.today()), index=False)

In [43]:
# View “prettified” html of product link with Beautiful soup (useful during debagging of html parsing)
text = sel.xpath('//*').getall()
listToStr = ' '.join(map(str, text))
soup = BeautifulSoup(listToStr)
soup =soup.prettify() 
print(soup)

<html lang="ru">
 <head>
  <title>
   ROZETKA  | Відгуки про Мобільний телефон Nubia Play 5G 8/128GB Black: обговорення, фото, відео. Купити Мобільний телефон Nubia Play 5G 8/128GB Black в Києві
  </title>
  <meta content=" Відгуки про Мобільний телефон Nubia Play 5G 8/128GB Black купити на ROZETKA. Оперативна доставка ✈ Гарантія якості ☑ Найкраща ціна $" name="description"/>
  <meta content="відгуки про Мобільний телефон Nubia Play 5G 8/128GB Black" name="keywords"/>
  <meta charset="utf-8"/>
  <script>
   dataLayer = [];
  </script>
  <script>
   const Cart = new Object({            purchases   : [],            setPurchases: function (purchases) {                this.purchases = purchases;            },            getPurchases: function () {                return this.purchases;            },        });
  </script>
  <script async="async" src="https://www.googletagservices.com/tag/js/gpt.js">
  </script>
  <script>
   var googletag = googletag || {};        googletag.cmd = googletag.