In [1]:
import pandas as pd
import numpy as np
import requests
from lxml import html
import xml.etree.ElementTree as et
from decimal import Decimal
from ratelimit import limits, sleep_and_retry

In [2]:
# ------------------ GET PRODUCT URLs
# Sitemap that lists the pages of the shop, product pages included.
url = 'https://www.rohlik.cz/sitemap.xml'

# Download the sitemap; fail loudly on an HTTP error instead of trying to
# parse an error page as XML.
resp = requests.get(url)
resp.raise_for_status()

# Parse the XML. Iterating an Element yields its direct children;
# Element.getchildren() was removed in Python 3.9, so list(tree) replaces it.
tree = et.fromstring(resp.content)
child = list(tree)

# The text of the first sub-element of every sitemap entry is taken as the
# page URL (presumably the <loc> tag -- confirm against the sitemap).
products = [i[0].text for i in child]

# Convert to a pandas Series (it supports regex filtering).
products = pd.Series(products)

# Keep only URLs where the domain is directly followed by at least one digit,
# i.e. product pages of the form '<numeric id>-<slug>'. The pattern is not
# anchored, so the '{1,8}' upper bound and the trailing '-*' do not narrow the
# match any further. The filtered Series keeps its original row labels.
products = products[products.str.contains('https://www.rohlik.cz/[0-9]{1,8}-*')]

  # Remove the CWD from sys.path while we load stuff.


In [3]:
# Preview the first ten product URLs.
products.head(10)

0    https://www.rohlik.cz/1296727-nivea-men-silver...
1    https://www.rohlik.cz/1296729-nivea-for-men-in...
2    https://www.rohlik.cz/1296749-nivea-intimo-sen...
3    https://www.rohlik.cz/1296751-nivea-creme-care...
4    https://www.rohlik.cz/1296953-odol-stoma-parad...
5    https://www.rohlik.cz/1296965-racio-chlebicky-...
6    https://www.rohlik.cz/1296977-nescafe-dolce-gu...
7    https://www.rohlik.cz/1297003-alpro-kokosovy-n...
8    https://www.rohlik.cz/1297005-alpro-sojovy-nap...
9    https://www.rohlik.cz/1297007-wasa-delikatess-...
dtype: object

In [4]:
# ------------------ TRY THE SCRIPT FOR SCRAPING PRICES ON 1 PRODUCT:
# `products` keeps its pre-filter row labels, so pick the product by position
# (.iloc) rather than by label; this matches the positional slices
# (e.g. products[160:170]) used elsewhere in the notebook.
prod_1 = products.iloc[160]
pageContent = requests.get(prod_1).content
tree = html.fromstring(pageContent)

prod_name = tree.xpath('//*[@class="redirect_link disabled"]//text()')

# Regular price: first try the plain price element ...
nopromo = tree.xpath('//*[@class="currentPrice"]//text()')

# ... and fall back to the struck-through (<del>) price when it is absent.
if len(nopromo) == 0:
    nopromo = tree.xpath('//*//del/text()')

# Two text nodes where the first contains 'cca': keep only the second one.
if len(nopromo) == 2 and ('cca' in nopromo[0]):
    nopromo = nopromo[1]

# Anything other than a single price (several text nodes, or nothing at all)
# is recorded as missing.
if (isinstance(nopromo, list) and len(nopromo) > 1) or (len(nopromo) == 0):
    nopromo = np.nan

promo = tree.xpath('//*[@class="actionPrice"]//text()')
if len(promo) == 0:
    promo = np.nan

prod_quantity = tree.xpath('//*[@class="detailQuantity"]//text()')
if len(prod_quantity) == 0:
    prod_quantity = np.nan

# Category links are joined into one 'parent-child' string.
prod_category = tree.xpath('//*[contains(@class, "redirect_link") and contains(@href, "/c")]//text()')
prod_cat = '-'.join(prod_category)

# Create a dictionary from the scraped data; `prod_name` is always a list,
# so scalar fields are broadcast to its length by the DataFrame constructor.
data = {'Product name': prod_name,
        'Regular price': nopromo,
        'Promo price': promo,
        'Quantity': prod_quantity,
        'Category': prod_cat}

# Build the result frame for this single product.
df = pd.DataFrame(data)

In [5]:
# Display the frame built from the single scraped product.
df

Unnamed: 0,Product name,Regular price,Promo price,Quantity,Category
0,HiPP Bio Boloňské lasagne,"69,90 Kč","59,90 Kč",250 g,Příkrmy-Masové


In [None]:
# ----------- FOR LOOP
# Scrape a small batch of products, collecting one frame per page.
# NOTE(review): the struck-through price and category XPaths in this cell
# differ from the ones used in the other scraping cells -- confirm which set
# matches the current page markup.
frames = []

for x in products[50:70]:
    pageContent = requests.get(x).content
    tree = html.fromstring(pageContent)
    prod_name = tree.xpath('//*[@class="redirect_link disabled"]//text()')

    # Regular price: plain price element first, struck-through price second.
    nopromo = tree.xpath('//*[@class="currentPrice"]//text()')

    if len(nopromo) == 0:
        nopromo = tree.xpath('//*[@class="actionPrice"]/del//text()')

    # Two text nodes where the first contains 'cca': keep only the second.
    if len(nopromo) == 2 and ('cca' in nopromo[0]):
        nopromo = nopromo[1]

    promo = tree.xpath('//*[@class="actionPrice"]//text()')

    if len(promo) == 0:
        promo = np.nan

    if len(nopromo) == 0:
        nopromo = np.nan

    prod_quantity = tree.xpath('//*[@class="detailQuantity"]//text()')
    if len(prod_quantity) == 0:
        prod_quantity = np.nan

    prod_category = tree.xpath('//*[@class="sc-1ywzolw-1 jQxyMp"]//text()')
    prod_cat = '-'.join(prod_category)

    data = {'Product name': prod_name,
            'Regular price': nopromo,
            'Promo price': promo,
            'Quantity': prod_quantity,
            'Category': prod_cat}
    frames.append(pd.DataFrame(data))

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration; concatenate once instead. An empty batch still yields an
# empty DataFrame, as before.
df = pd.concat(frames) if frames else pd.DataFrame()

In [15]:
# ----------- DEFINE A FUNCTION TO EXTRACT DATA
# Limit scraping to 10 products per minute (~ 25.5 hours of scraping).
# @limits on its own raises RateLimitException once the limit is exceeded;
# @sleep_and_retry turns that into "wait for the window to reset, then retry",
# so a long scraping run is throttled instead of aborted.
@sleep_and_retry
@limits(calls = 10, period = 60)
def extract_rohlik(x):
    """Scrape name, prices, quantity and category from one product page.

    Parameters
    ----------
    x : str
        URL of the product page to download.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns 'Product name', 'Regular price',
        'Promo price', 'Quantity', 'Category' and 'URL'. Fields that are not
        found on the page are np.nan.
    """
    pageContent = requests.get(x).content
    tree = html.fromstring(pageContent)
    prod_name = tree.xpath('//*[@class="redirect_link disabled"]//text()')

    # Regular price: plain price element first, struck-through (<del>) price
    # as the fallback.
    nopromo = tree.xpath('//*[@class="currentPrice"]//text()')

    if len(nopromo) == 0:
        nopromo = tree.xpath('//*//del/text()')

    # Two text nodes where the first contains 'cca': keep only the second.
    if len(nopromo) == 2 and ('cca' in nopromo[0]):
        nopromo = nopromo[1]

    # Anything other than a single price (several text nodes, or nothing at
    # all) is recorded as missing.
    if (isinstance(nopromo, list) and len(nopromo) > 1) or (len(nopromo) == 0):
        nopromo = np.nan

    promo = tree.xpath('//*[@class="actionPrice"]//text()')

    if len(promo) == 0:
        promo = np.nan

    prod_quantity = tree.xpath('//*[@class="detailQuantity"]//text()')
    if len(prod_quantity) == 0:
        prod_quantity = np.nan

    # Category links are joined into one 'parent-child' string.
    prod_category = tree.xpath('//*[contains(@class, "redirect_link") and contains(@href, "/c")]//text()')
    prod_cat = '-'.join(prod_category)

    data = {'Product name': prod_name,
            'Regular price': nopromo,
            'Promo price': promo,
            'Quantity': prod_quantity,
            'Category': prod_cat,
            'URL': x}
    return pd.DataFrame(data)

In [17]:
# Smoke-test the scraper on ten consecutive products and stack the results.
a = list(map(extract_rohlik, products[160:170]))
pd.concat(a, axis=0)

Unnamed: 0,Product name,Regular price,Promo price,Quantity,Category,URL
0,HiPP Bio Boloňské lasagne,"69,90 Kč","59,90 Kč",250 g,Příkrmy-Masové,https://www.rohlik.cz/1298009-hipp-bio-bolonsk...
0,HiPP Bio Jablečno - hroznová šťáva,"34,90 Kč",,200 ml,Nápoje-Ovocné šťávy,https://www.rohlik.cz/1298011-hipp-bio-jablecn...
0,HiPP Bio Hrušková šťáva,"34,90 Kč",,200 ml,Nápoje-Ovocné šťávy,https://www.rohlik.cz/1298013-hipp-bio-hruskov...
0,HiPP BIO Meruňkovo-Hruškový nektar,"34,90 Kč",,200 ml,Nápoje-Ovocné šťávy,https://www.rohlik.cz/1298015-hipp-bio-merunko...
0,HiPP Bio Švestkový nektar,"34,90 Kč",,200 ml,Nápoje-Ovocné šťávy,https://www.rohlik.cz/1298017-hipp-bio-svestko...
0,HiPP Bio Šťáva z červených plodů ovoce,"74,90 Kč",,500 ml,Nápoje-Ovocné šťávy,https://www.rohlik.cz/1298019-hipp-bio-stava-z...
0,HiPP Bio Jablečno - hroznová šťáva,"74,90 Kč",,500 ml,Nápoje-Ovocné šťávy,https://www.rohlik.cz/1298021-hipp-bio-jablecn...
0,HiPP Bio Jablečná šťáva s fenyklovým čajem,"49,90 Kč","46,90 Kč",500 ml,Nápoje-Ovocné šťávy,https://www.rohlik.cz/1298023-hipp-bio-jablecn...
0,HiPP Mama Nápoj v prášku pro kojící matky,"139,90 Kč",,200 g,Pro maminky a těhotné-Kojící čaje a nápoje,https://www.rohlik.cz/1298025-hipp-mama-napoj-...
0,HiPP Mamasanft Masážní olej na strie,"249,90 Kč",,100 ml,"Pro maminky a těhotné-Krémy na strie, celulitidu",https://www.rohlik.cz/1298029-hipp-mamasanft-m...
