In [None]:
import pandas as pd
import numpy as np
import requests
from lxml import html
import xml.etree.ElementTree as et
from decimal import Decimal

In [None]:
# ------------------ GET PRODUCT URLs
# Sitemap that lists the pages of the shop, product pages included.
url = 'https://www.rohlik.cz/sitemap.xml'

# Fetch the sitemap. The timeout keeps the cell from hanging indefinitely and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the XML parser below.
resp = requests.get(url, timeout=30)
resp.raise_for_status()

# Parse the XML. The direct children of the root are the sitemap entries.
# Element.getchildren() was removed in Python 3.9; list(element) is the
# supported way to materialise the children.
tree = et.fromstring(resp.content)
child = list(tree)

# The URL is read from the first sub-element of every entry (assumed to be
# <loc> - TODO confirm against the live sitemap). Entries that have no
# sub-elements are skipped instead of raising IndexError.
products = [entry[0].text for entry in child if len(entry)]

# Convert to a pandas Series to get vectorised regex matching through .str.
# dtype='object' keeps .str usable even when the list is empty.
products = pd.Series(products, dtype='object')

# Keep only product pages: 1-8 digits directly after the domain, followed by
# '-' and the product slug. Dots are escaped so they match literally, and the
# dash is required (the former '-*' also matched zero dashes, which made the
# digit/dash part of the pattern meaningless).
# na=False turns entries without text into False instead of NaN, which would
# otherwise make the boolean mask unusable.
product_url_pattern = r'https://www\.rohlik\.cz/[0-9]{1,8}-'
products = products[products.str.contains(product_url_pattern, regex=True, na=False)]

In [None]:
# Preview the first ten product URLs that passed the filter.
products.head(10)

In [None]:
# ------------------ TRY THE SCRIPT FOR SCRAPING PRICES ON 1 PRODUCT:
def _texts_to_scalar(texts, sep=' '):
    """Collapse an XPath ``text()`` result into a single cell value.

    Parameters
    ----------
    texts : list of str
        Text nodes returned by ``tree.xpath(...)``.
    sep : str
        Separator placed between the non-empty, whitespace-stripped nodes.

    Returns
    -------
    str or float
        The joined text, or ``np.nan`` when nothing (or only whitespace) matched.
    """
    parts = [text.strip() for text in texts if text.strip()]
    return sep.join(parts) if parts else np.nan


# `products` keeps the sitemap's original integer labels after filtering, so
# products[1] is a label lookup that raises KeyError unless the entry labelled
# 1 happens to be a product. Select by position instead.
prod_1 = products.iloc[0]

# Fetch the product page. Fail loudly on an HTTP error so that an error page
# is not silently scraped into a row of NaNs.
page_resp = requests.get(prod_1, timeout=30)
page_resp.raise_for_status()
pageContent = page_resp.content
tree = html.fromstring(pageContent)

prod_name = _texts_to_scalar(tree.xpath('//*[@class="redirect_link disabled"]//text()'))

# Regular price: shown as "currentPrice" when there is no promotion; otherwise
# fall back to the struck-through price under "actionPrice".
nopromo = tree.xpath('//*[@class="currentPrice"]//text()')
if len(nopromo) == 0:
    nopromo = tree.xpath('//*[@class="actionPrice"]/del//text()')

# When the price is split into a 'cca' prefix node and the value node,
# keep only the value.
if len(nopromo) == 2 and 'cca' in nopromo[0]:
    nopromo = [nopromo[1]]
nopromo = _texts_to_scalar(nopromo)

# Promo price: text under "actionPrice", excluding anything nested in <del>.
# The <del> part is the struck-through regular price already captured above;
# without the exclusion both prices end up in the promo column.
promo = _texts_to_scalar(
    tree.xpath('//*[@class="actionPrice"]//text()[not(ancestor::del)]')
)

prod_quantity = _texts_to_scalar(tree.xpath('//*[@class="detailQuantity"]//text()'))

# NOTE(review): this class name looks auto-generated and may change between
# site deployments - confirm it still matches the category breadcrumb.
prod_category = tree.xpath('//*[@class="sc-1ywzolw-1 jQxyMp"]//text()')
prod_cat = '-'.join(prod_category)

# One record per product; every value is a scalar (str or NaN).
data = {'Product name': prod_name,
        'Promo price': promo,
        'Regular price': nopromo,
        'Quantity': prod_quantity,
        'Category': prod_cat}

# Build the frame from a list of records. Wrapping the dict in a list gives
# exactly one row; passing the bare dict fails when all values are scalars or
# when list values have different lengths.
# DataFrame.append was removed in pandas 2.0 (and its result was discarded
# here anyway). To scrape many products, collect one dict per product in a
# list and call pd.DataFrame(records) once at the end.
df = pd.DataFrame([data])
df