In [1]:
import re
from lxml.html import fromstring
from unidecode import unidecode
from requests import Session

In [2]:
def get_tree_from_url(session, url, cookies=None):
    """
    Download ``url`` through ``session`` and parse the response body as HTML.

    Parameters
    ----------
    session : object exposing ``get(url, cookies=...)`` whose result has a
        ``.text`` attribute (this notebook passes a ``requests.Session``).
    url : address to request.
    cookies : optional mapping of cookies sent with this request. ``None``
        (the default) sends an empty mapping.

    Returns
    -------
    Whatever ``lxml.html.fromstring`` produces for the response text.
    """
    # Build the empty mapping per call: a ``{}`` default in the signature is
    # created once and would be the same object on every invocation.
    if cookies is None:
        cookies = {}
    response = session.get(url, cookies=cookies)
    return fromstring(response.text)


def extract_price(price):
    """
    Convert a price label such as ``"R$ 8,49"`` or ``"R$ 1.234,56"`` to a float.

    Whitespace, the ``R``/``$`` currency characters, the decimal comma and the
    ``.`` thousands separator are stripped; the remaining digits are read as
    an integer number of cents and divided by 100. The label is therefore
    expected to carry exactly two decimal digits.

    Raises
    ------
    ValueError
        If anything other than digits is left after stripping.
    """
    # "\." is included so that values >= 1.000,00 do not break int().
    cents = re.sub(r"\s|R|\$|,|\.", "", price)
    return int(cents) / 100


def process_text(text):
    """
    Normalise ``text``: pass it through ``unidecode``, remove every comma,
    then lowercase the result.
    """
    return unidecode(text).replace(',', '').lower()

In [3]:
# Global vars
# Site root: fetched by get_departments and prefixed to every extracted href.
BASE_URL = 'https://www.arasuper.com.br/'
# Absolute XPath of the menu lists; get_departments iterates the children of
# the second match (index 1).
ALL_DEPS = '//ul[@class="menu-principal__items menu-principal__col menu-principal__col--1"]'
# The three paths below are evaluated relative to a department menu node.
DEP_NAME = 'a/span/span'
DEP_URL = 'a/@href'
SUB_DEPS = 'div/div/ul/li' 

# Paths evaluated relative to each node matched by PRODUCTS.
PRODUCT_NAME = 'div/div[3]/div/p[@class="item-produto__name"]'
PRODUCT_BRAND = 'div/div[3]/div/p[@class="item-produto__brand"]'
# Alternative price containers; extract_products checks PRICE_ONE first,
# then PRICE_TWO, and falls back to PRICE_LEVE.
PRICE_ONE = 'div/div[3]/div/p[@class="item-produto__price just-one"]'
PRICE_TWO = 'div/div[3]/div/p[@class="item-produto__price"]'
PRICE_LEVE = 'div/div[3]/div/p[@class="item-produto__price price-leve"]'
# Spans read inside a PRICE_TWO / PRICE_LEVE container: the value stored as
# 'regular_price' and the value stored as 'price', respectively.
PRICE_REGULAR = 'span[@class="item-produto__price-de"]'
PRICE = 'span[@class="item-produto__price-por"]'
# Absolute XPath selecting one node per product on a listing page.
PRODUCTS =  '//div[@class="products-list"]/a'

In [4]:
def get_departments(session):
    """
    Build a ``{url: breadcrumb}`` dict from the menu found on BASE_URL.

    The menu is walked up to three levels deep. Only the deepest entry of
    each branch is recorded; its breadcrumb joins the names of its ancestors
    with ``' > '``. Sub-entries whose whole text is 'Todos' or 'Voltar' are
    ignored.
    """
    ignored_labels = [['Todos'], ['Voltar']]
    url_to_name = {}
    home = get_tree_from_url(session, BASE_URL)

    for top in home.xpath(ALL_DEPS)[1]:
        top_name = top.xpath(DEP_NAME)[0].text_content()
        children = top.xpath(SUB_DEPS)

        # Level 1 entry without children: record DEP
        if not children:
            top_url = BASE_URL + top.xpath(DEP_URL)[0]
            url_to_name[top_url] = top_name
            continue

        for child in children:
            if child.text_content().split() in ignored_labels:
                continue
            child_name = child.xpath(DEP_NAME)[0].text_content()
            grandchildren = child.xpath(SUB_DEPS)

            # Level 2 entry without children: record DEP > SUB
            if not grandchildren:
                child_url = BASE_URL + child.xpath(DEP_URL)[0]
                url_to_name[child_url] = ' > '.join((top_name, child_name))
                continue

            # Level 3 entries: record DEP > SUB > SUBSUB
            for grandchild in grandchildren:
                if grandchild.text_content().split() in ignored_labels:
                    continue
                leaf_name = grandchild.xpath('a/div')[1].text_content().strip()
                leaf_url = BASE_URL + grandchild.xpath(DEP_URL)[0]
                url_to_name[leaf_url] = ' > '.join((top_name, child_name, leaf_name))

    return url_to_name

In [11]:
def get_pages(session, department_url):
    """
    Lazily yield the parsed tree of each listing page of a department.

    Pages are requested as ``department_url + '?page=N'`` for N = 1, 2, ...
    Iteration stops at the first page containing a ``div`` whose class is
    "produto-lista empty-content"; that page itself is not yielded.
    """
    page_number = 1
    while True:
        page_url = department_url + '?page=' + str(page_number)
        tree = get_tree_from_url(session, page_url)
        if tree.xpath('//div[@class="produto-lista empty-content"]'):
            return
        yield tree
        page_number += 1

In [6]:
def extract_products(page):
    """
    Collect one dict per product node matched by PRODUCTS on ``page``.

    Each dict has the keys 'url', 'name', 'brand', 'price' and
    'regular_price'. When the product has a PRICE_ONE container, 'price' is
    read from it and 'regular_price' is None. Otherwise both values are read
    from the PRICE_TWO container, or from the PRICE_LEVE one if PRICE_TWO is
    absent.
    """
    def first_text(node, path):
        # Text of the first node matched by ``path`` under ``node``.
        return node.xpath(path)[0].text_content()

    rows = []
    for anchor in page.xpath(PRODUCTS):
        row = {
            'url': BASE_URL + anchor.xpath('@href')[0],
            'name': first_text(anchor, PRODUCT_NAME),
            'brand': first_text(anchor, PRODUCT_BRAND),
        }

        single = anchor.xpath(PRICE_ONE)
        if single:
            # Single price: there is no separate regular price
            row['price'] = extract_price(single[0].text_content())
            row['regular_price'] = None
        else:
            # Both two-price layouts share the same inner spans
            container = (anchor.xpath(PRICE_TWO) or anchor.xpath(PRICE_LEVE))[0]
            regular = extract_price(first_text(container, PRICE_REGULAR))
            row['price'] = extract_price(first_text(container, PRICE))
            row['regular_price'] = regular

        rows.append(row)

    return rows


In [7]:
# Single requests Session handed to get_departments / get_pages for every request.
session = Session()

In [12]:
    # Walk every department, then every listing page, printing each product dict.
    for department_url in get_departments(session):
        print(department_url)
        # Add timer here
        for page in get_pages(session, department_url):
            products = extract_products(page)
            for product in products:
                print(product)
            # Separator printed after each page of results
            print('--------------------------------')

https://www.arasuper.com.br/c/massas-resfriadas/580/
{'url': 'https://www.arasuper.com.br/p/massa-de-pastel-grande-romanha-500g/16331/', 'name': 'Massa De Pastel Grande Romanha 500g', 'brand': 'ROMANHA', 'price': 8.49, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/massa-de-lasanha-romanha-500g/16324/', 'name': 'Massa de Lasanha Romanha 500g', 'brand': 'ROMANHA', 'price': 9.99, 'regular_price': None}
--------------------------------
https://www.arasuper.com.br/c/torradas/562/
{'url': 'https://www.arasuper.com.br/p/torrada-bauducco-cereale-integral-128g/18849/', 'name': 'Torrada Bauducco Cereale Integral 128G', 'brand': 'BAUDUCCO', 'price': 5.39, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/torrada-bauducco-multigraos-142g/1077/', 'name': 'Torrada Bauducco Multigrãos 142G', 'brand': 'BAUDUCCO', 'price': 6.15, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/torrada-marilan-magic-toast-original-150g/1071/', 'name': 'Torrada Marilan Magic Toast

https://www.arasuper.com.br/c/polpa-de-tomate/475/
{'url': 'https://www.arasuper.com.br/p/pururuca-campilar-embalagem-140g/3518/', 'name': 'Pururuca Campilar Embalagem 140G', 'brand': 'CAMPILAR', 'price': 5.75, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/polpa-de-tomate-ole-520g/1086/', 'name': 'Polpa De Tomate Olé 520G', 'brand': 'OLE', 'price': 4.5, 'regular_price': None}
--------------------------------
https://www.arasuper.com.br/c/molho-tomate/474/
{'url': 'https://www.arasuper.com.br/p/extrato-tomate-dajuda-200g/19596/', 'name': 'Extrato Tomate Dajuda 200G', 'brand': 'DAJUDA', 'price': 2.25, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/molho-tomate-tarantella-ervas-sach-300g/19271/', 'name': 'Molho Tomate Tarantella Ervas Sach 300G ', 'brand': 'TARANTELLA', 'price': 2.99, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/molho-tomate-tarantella-tradicao-sach-300g/19273/', 'name': 'Molho Tomate Tarantella Tradicao Sach 300G ', 'brand'

https://www.arasuper.com.br/c/diet/426/
https://www.arasuper.com.br/c/instantaneas/455/
{'url': 'https://www.arasuper.com.br/p/macarrao-de-arroz-instantaneo-karui-sabor-galinha-caipira-sem-gluten-78g/19859/', 'name': 'Macarrão De Arroz Instantâneo Karui Sabor Galinha Caipira Sem Glúten 78G ', 'brand': 'KARUI', 'price': 2.99, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/macarrao-de-arroz-instantaneo-karui-sabor-carne-sem-gluten-78g/19844/', 'name': 'Macarrão De Arroz Instantâneo Karui Sabor Carne Sem Glúten 78G ', 'brand': 'KARUI', 'price': 2.99, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/mac-inst-ufo-nissin-95g-carne/17820/', 'name': 'MAC INST UFO NISSIN 95G CARNE', 'brand': 'NISSIN', 'price': 8.55, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/mac-inst-ufo-nissin-95g-curry/17819/', 'name': 'MAC INST UFO NISSIN 95G CURRY', 'brand': 'NISSIN', 'price': 8.55, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/macarrao-instanta

https://www.arasuper.com.br/c/sem-gluten/454/
{'url': 'https://www.arasuper.com.br/p/macarrao-de-arroz-integral-pena-urbano-sem-gluten-500g/7955/', 'name': 'Macarrão De Arroz Integral Pena Urbano Sem Glúten 500G', 'brand': 'URBANO', 'price': 5.3, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/macarrao-de-arroz-urbano-padre-nosso-pacote-500g/6683/', 'name': 'Macarrão De Arroz Urbano Padre Nosso Pacote 500G', 'brand': 'URBANO', 'price': 4.37, 'regular_price': 5.29}
{'url': 'https://www.arasuper.com.br/p/macarrao-de-arroz-integral-pena-urbano-sem-gluten-500g/3476/', 'name': 'Macarrão De Arroz Integral Pena Urbano Sem Glúten  500G', 'brand': 'URBANO', 'price': 7.49, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/macarrao-de-arroz-integral-parafuso-fusilli-urbano-embalagem-500g/3473/', 'name': 'Macarrão De Arroz Integral Parafuso/Fusilli Urbano Embalagem 500G', 'brand': 'URBANO', 'price': 7.49, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/massa

https://www.arasuper.com.br/c/preto/366/
{'url': 'https://www.arasuper.com.br/p/feijao-preto-zaeli-1kg/2364/', 'name': 'Feijão Preto Zaeli 1Kg', 'brand': 'ZAELI', 'price': 10.79, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/feijao-preto-tia-eliza-1kg/2365/', 'name': 'Feijão Preto Tia Eliza 1Kg', 'brand': 'TIA ELIZA', 'price': 8.99, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/feijao-preto-kumbuca-embalagem-1kg/253/', 'name': 'Feijão Preto Kumbuca Embalagem 1Kg', 'brand': 'KUMBUCA', 'price': 8.49, 'regular_price': None}
--------------------------------
https://www.arasuper.com.br/c/praia/365/
{'url': 'https://www.arasuper.com.br/p/feijao-de-praia-tipo-1-embalagem-1kg/254/', 'name': 'Feijão De Praia Tipo 1 Embalagem 1Kg', 'brand': 'KUMBUCA', 'price': 6.29, 'regular_price': None}
{'url': 'https://www.arasuper.com.br/p/feijao-praia-tia-eliza-1kg/139/', 'name': 'Feijão Praia Tia Eliza 1Kg', 'brand': 'TIA ELIZA', 'price': 6.79, 'regular_price': None}
-------

KeyboardInterrupt: 