In [80]:
import pandas as pd
import numpy as np
import requests
from lxml import html
import xml.etree.ElementTree as et
from decimal import Decimal

In [4]:
# ------------------ GET PRODUCT URLs

# Sitemap of the shop; it lists every page, product detail pages included.
url = 'https://www.rohlik.cz/sitemap.xml'

# Fetch the sitemap. The timeout keeps the cell from hanging indefinitely and
# raise_for_status() stops an HTTP error page from being fed to the XML parser.
resp = requests.get(url, timeout=30)
resp.raise_for_status()

# Parse the XML. Element.getchildren() was removed in Python 3.9; list(element)
# is the documented replacement and yields the same direct children.
tree = et.fromstring(resp.content)
child = list(tree)

# The first sub-element of each sitemap entry is taken to hold the page URL
# (presumably <loc> - TODO confirm against the live sitemap). Entries without
# any sub-element are skipped instead of raising IndexError.
products = [i[0].text for i in child if len(i) > 0]

# Convert to a pandas Series to get the vectorised regex helpers (.str).
# dtype=object keeps .str usable even when the list is empty.
products = pd.Series(products, dtype=object)

# Keep only product pages: a numeric id of 1-8 digits followed by '-' and the
# product slug. The dots are escaped so they match literally, the dash is
# required (the earlier '-*' also accepted "no dash at all"), and na=False
# drops entries whose URL text is missing rather than producing a NaN mask.
products = products[products.str.contains(r'https://www\.rohlik\.cz/[0-9]{1,8}-', na=False)]

  # This is added back by InteractiveShellApp.init_path()


In [6]:
# Preview the first rows of the filtered product URLs as a sanity check.
products.head()

0    https://www.rohlik.cz/1296727-nivea-men-silver...
1    https://www.rohlik.cz/1296729-nivea-for-men-in...
2    https://www.rohlik.cz/1296749-nivea-intimo-sen...
3    https://www.rohlik.cz/1296751-nivea-creme-care...
4    https://www.rohlik.cz/1296953-odol-stoma-parad...
dtype: object

In [81]:
# ------------------ TRY THE SCRIPT FOR SCRAPING PRICES ON 1 PRODUCT:
# Take the 7th product by position. `products` is a filtered Series that keeps
# its original index labels, so products[6] would be a label lookup that fails
# whenever sitemap entry 6 was filtered out; .iloc is purely positional.
prod_1 = products.iloc[6]
pageContent = requests.get(prod_1, timeout=30).content
tree = html.fromstring(pageContent)

# Every xpath() call below returns a list of text nodes (empty when the
# element is not on the page).
prod_name = tree.xpath('//*[@class="ProductDetail__productName__link redirect_link disabled"]//text()')

# Promotional price; replaced by a NaN scalar when the page shows none.
promo = tree.xpath('//*[@class="ProductDetail__actionPrice"]//text()')
if len(promo) == 0:
    promo = np.nan

# Regular price: read the "commonPrice" element first and fall back to
# "currentPrice" when it is absent.
nopromo = tree.xpath('//*[@class="ProductDetail__commonPrice"]//text()')
if len(nopromo) == 0:
    nopromo = tree.xpath('//*[@class="ProductDetail__currentPrice"]//text()')

# Package quantity; replaced by a NaN scalar when the page shows none.
prod_quantity = tree.xpath('//*[@class="ProductDetail__quantity"]//text()')
if len(prod_quantity) == 0:
    prod_quantity = np.nan

# Category breadcrumb: join all text nodes of the category list with '-'.
prod_category = tree.xpath('//*[@class="ProductDetail__categoryList"]//text()')
prod_cat = '-'.join(prod_category)

print([prod_name, promo, nopromo, prod_quantity, prod_cat])

NaN
[['Nescafé Dolce Gusto Lungo Intenso 16ks'], nan, ['149,90\xa0Kč\xa0'], None, 'Káva-Kapsle a pody']


In [58]:
# Collect the fields scraped for the single test product into one mapping.
# List-valued entries determine the number of rows; scalar entries (NaN, the
# category string) are broadcast across those rows by pandas.
data = {'Product name': prod_name,
        'Promo price': promo,
        'Regular price': nopromo,
        'Quantity': prod_quantity,
        'Category': prod_cat}

# Build the frame for this product.
df = pd.DataFrame(data)

# Show what stacking another batch of rows under df looks like.
# DataFrame.append() was removed in pandas 2.0, so pd.concat() is used.
# The result is only displayed; df itself is left unchanged.
pd.concat([df, pd.DataFrame(data)])

Unnamed: 0,Product name,Promo price,Regular price,Quantity,Category
0,Nescafé Dolce Gusto Lungo Intenso 16ks,,"149,90 Kč",,Káva-Kapsle a pody
0,Nescafé Dolce Gusto Lungo Intenso 16ks,,"149,90 Kč",,Káva-Kapsle a pody


In [84]:
# ----------- FOR LOOP
# Scrape the first 10 product pages and stack the results into one DataFrame.

# One small frame per product is collected here and concatenated once at the
# end. DataFrame.append() was removed in pandas 2.0, and growing a frame inside
# a loop copies all earlier rows on every iteration.
frames = []

for x in products[:10]:
    # The timeout keeps one unresponsive page from stalling the whole run.
    pageContent = requests.get(x, timeout=30).content
    tree = html.fromstring(pageContent)

    # Every xpath() call returns a list of text nodes (empty when the element
    # is not on the page).
    prod_name = tree.xpath('//*[@class="ProductDetail__productName__link redirect_link disabled"]//text()')

    # Regular price: read "commonPrice" first, fall back to "currentPrice".
    nopromo = tree.xpath('//*[@class="ProductDetail__commonPrice"]//text()')
    if len(nopromo) == 0:
        nopromo = tree.xpath('//*[@class="ProductDetail__currentPrice"]//text()')

    # Promotional price; NaN scalar when the page shows none.
    promo = tree.xpath('//*[@class="ProductDetail__actionPrice"]//text()')
    if len(promo) == 0:
        promo = np.nan

    # Package quantity; NaN scalar when the page shows none.
    prod_quantity = tree.xpath('//*[@class="ProductDetail__quantity"]//text()')
    if len(prod_quantity) == 0:
        prod_quantity = np.nan

    # Category breadcrumb: all text nodes of the category list joined by '-'.
    prod_category = tree.xpath('//*[@class="ProductDetail__categoryList"]//text()')
    prod_cat = '-'.join(prod_category)

    data = {'Product name': prod_name,
            'Regular price': nopromo,
            'Promo price': promo,
            'Quantity': prod_quantity,
            'Category': prod_cat}

    # List-valued fields set the row count; scalars are broadcast by pandas.
    frames.append(pd.DataFrame(data))

# Single concatenation. Each per-product frame keeps its own index, so the
# combined index repeats (0, 0, ...). With nothing scraped, df is an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

In [86]:
# Display the DataFrame assembled by the scraping loop.
df

Unnamed: 0,Product name,Regular price,Promo price,Quantity,Category
0,Nivea Men Silver Protect Kuličkový antiperspirant,"89,90 Kč","84,90 Kč",50 ml,Pánské-Kuličkové
0,Nivea For Men Invisible for black & white anti...,"89,90 Kč","84,90 Kč",150 ml,Pánské-Ve spreji
0,Nivea Intimo Sensitive sprchová emulze pro int...,"119,90 Kč",,250 ml,Dámské hygienické potřeby-Intimní hygiena
0,Nivea Creme Care tekuté mýdlo na ruce,"59,90 Kč",,250 ml,Mýdla-Tekutá
0,Odol Stoma Paradentol Ústní voda pro zdravé dásně,"89,90 Kč","84,90 Kč",500 ml,Ústní hygiena-Ústní vody
0,RACIO Chlebíčky rýžové,"13,90 Kč",,130 g,Racio a Knäckebrot-Pufované pečivo
0,Nescafé Dolce Gusto Lungo Intenso 16ks,"149,90 Kč",,,Káva-Kapsle a pody
0,Alpro Kokosový nápoj Original s rýží,"69,90 Kč","54,90 Kč",1 l,Mléko a mléčné nápoje-Rostlinné nápoje
0,Alpro Sójový Nápoj Original,"59,90 Kč",,1 l,Mléko a mléčné nápoje-Rostlinné nápoje
0,Wasa Delikatess celozrnný žitný křupavý chléb,"54,90 Kč",,270 g,Racio a Knäckebrot-Knäckebrot


In [None]:
# TODO: rate limiting - find a package to throttle how many prices are downloaded per minute (Rohlik's limit is 10 requests/min; we need about 10.4/min to finish everything within 24 h, which will hopefully still pass)
