In [1]:
import re
from requests import Session
from lxml.html import fromstring
from urllib.parse import urljoin

In [2]:
# Global vars
# Site root; get_departments prepends it to the hrefs it extracts.
BASE_URL = 'https://www.arasuper.com.br/'
# Absolute XPath for the main-menu <ul> lists. get_departments iterates over the
# children of the second match (index 1).
ALL_DEPS = '//ul[@class="menu-principal__items menu-principal__col menu-principal__col--1"]'
# The next three paths are relative: get_departments evaluates DEP_NAME and DEP_URL
# against both a department entry and a sub-department entry, and SUB_DEPS against
# a department entry to find its nested entries.
DEP_NAME = 'a/span/span'
DEP_URL = 'a/@href'
SUB_DEPS = 'div/div/ul/li' 

# Product-card XPaths.
# NOTE(review): extract_products refers to names such as PRODUCT_XPATH, NAME_XPATH,
# BRAND_XPATH, PRODUCT_ONE_XPATH, ... that are not defined in this cell; these
# constants look like their intended replacements - confirm.
# The PRODUCT_* and PRICE_ONE/TWO/LEVE paths are relative (presumably to one
# product <a> selected by PRODUCTS - TODO confirm).
PRODUCT_NAME = 'div/div[3]/div/p[@class="item-produto__name"]'
PRODUCT_BRAND = 'div/div[3]/div/p[@class="item-produto__brand"]'
# Three alternative price containers, distinguished only by their class attribute.
PRICE_ONE = 'div/div[3]/div/p[@class="item-produto__price just-one"]'
PRICE_TWO = 'div/div[3]/div/p[@class="item-produto__price"]'
PRICE_LEVE = 'div/div[3]/div/p[@class="item-produto__price price-leve"]'
# Relative <span> paths (presumably children of a price container - TODO confirm).
PRICE_REGULAR = 'span[@class="item-produto__price-de"]'
PRICE = 'span[@class="item-produto__price-por"]'
# Absolute XPath selecting the <a> children of the "products-list" div.
PRODUCTS =  '//div[@class="products-list"]/a'

In [3]:
# Utils
def get_tree_from_url(session, url):
    """Fetch ``url`` with ``session`` and parse the response text into an lxml HTML tree.

    Args:
        session: object exposing ``get(url)`` whose result has a ``.text`` attribute
            (a ``requests.Session`` in this notebook).
        url: address of the page to download.

    Returns:
        The root element produced by ``lxml.html.fromstring``.
    """
    html_text = session.get(url).text
    return fromstring(html_text)

def extract_price(price):
    """Convert a Brazilian-formatted price label into a float amount.

    The label is reduced to its digits by removing whitespace, the ``R$``
    currency marker, the decimal comma and the ``.`` thousands separator; the
    remaining digits are read as an integer number of cents.

    Args:
        price: text such as ``"R$ 26,98"`` or ``"R$ 1.234,56"``. The value must
            carry exactly two decimal digits, because the result is the digit
            string divided by 100.

    Returns:
        The price as a float (e.g. ``26.98``).

    Raises:
        ValueError: if anything other than digits is left after stripping.
    """
    # "." must be stripped as well: it is the thousands separator in pt-BR
    # amounts, and leaving it in made int() fail for prices >= 1.000,00.
    digits = re.sub(r"[\sR$.,]", "", price)
    return int(digits) / 100

def extract_url_code(url):
    """Return the numeric code that ends the path of ``url``.

    Args:
        url: address whose last path segment is a number, with or without a
            trailing slash (e.g. ``".../c/sal/546/"``).

    Returns:
        The code as a string of digits (e.g. ``"546"``).

    Raises:
        ValueError: if ``url`` does not end with a numeric segment.
    """
    # The trailing slash is optional so ".../546" and ".../546/" both work.
    code_match = re.search(r'/(\d+)/?$', url)
    if code_match is None:
        # Fail with a readable message instead of AttributeError on None.
        raise ValueError(f"URL does not end with a numeric code: {url!r}")
    return code_match.group(1)

def get_pages(url):
    """Yield the parsed tree of each listing page of ``url``, starting at ``?page=1``.

    Pages are downloaded one at a time through the module-level ``session``.
    Iteration ends at the first page that contains a
    ``<div class="produto-lista empty-content">`` element; that page is not yielded.
    """
    page_number = 1
    while True:
        page_url = ''.join((url, '?page=', str(page_number)))
        page_tree = get_tree_from_url(session, page_url)
        # A non-empty match means the listing has run out of products.
        if page_tree.xpath('//div[@class="produto-lista empty-content"]'):
            return
        yield page_tree
        page_number += 1
        


In [14]:
def get_departments(session):
    """
    Extract all departments from BASE_URL.

    This is a generator. For every department of the site menu it yields one
    ``(url, hierarchy, dep_id)`` tuple per sub-department, or a single tuple for
    the department itself when it has no sub-departments.

    Args:
        session: HTTP session handed to ``get_tree_from_url``.

    Yields:
        tuple: ``url`` is the absolute category URL, ``hierarchy`` is either
        ``"<department> > <sub-department>"`` or the bare department name, and
        ``dep_id`` is ``"COMP_"`` followed by the numeric code taken from the URL.
    """
    tree = get_tree_from_url(session, BASE_URL)
    # ALL_DEPS matches more than one menu list; the second one is the one walked.
    for dep in tree.xpath(ALL_DEPS)[1]:
        dep_name = dep.xpath(DEP_NAME)[0].text_content()
        sub_deps = dep.xpath(SUB_DEPS)

        if not sub_deps:
            # Department without children: emit the department itself.
            url = urljoin(BASE_URL, dep.xpath(DEP_URL)[0])
            yield (url, dep_name, f"COMP_{extract_url_code(url)}")
            continue

        for sub in sub_deps:
            # "Todos" and "Voltar" entries are menu navigation, not departments.
            if sub.text_content().split() in (['Todos'], ['Voltar']):
                continue
            sub_name = sub.xpath(DEP_NAME)[0].text_content()
            # urljoin copes with hrefs with or without a leading slash.
            url = urljoin(BASE_URL, sub.xpath(DEP_URL)[0])
            hierarchy = dep_name + ' > ' + sub_name
            yield (url, hierarchy, f"COMP_{extract_url_code(url)}")

In [5]:
# Single requests Session for the notebook: passed explicitly to get_departments
# and read as a global by get_pages.
session = Session()

In [15]:
# Print every (url, hierarchy, dep_id) tuple produced by the department crawl.
for department_row in get_departments(session):
    print(department_row)

('https://www.arasuper.com.br/c/massas-resfriadas/580/', 'Alimentos > Massas Resfriadas', 'COMP_580')
('https://www.arasuper.com.br/c/torradas/562/', 'Alimentos > Torradas', 'COMP_562')
('https://www.arasuper.com.br/c/sal/546/', 'Alimentos > Sal', 'COMP_546')
('https://www.arasuper.com.br/c/queijo-ralado/530/', 'Alimentos > Queijo Ralado', 'COMP_530')
('https://www.arasuper.com.br/c/oleo/484/', 'Alimentos > Óleo', 'COMP_484')
('https://www.arasuper.com.br/c/mostarda/481/', 'Alimentos > Mostarda', 'COMP_481')
('https://www.arasuper.com.br/c/polpa-de-tomate/475/', 'Alimentos > Polpa de Tomate', 'COMP_475')
('https://www.arasuper.com.br/c/molho-tomate/474/', 'Alimentos > Molho Tomate', 'COMP_474')
('https://www.arasuper.com.br/c/manteigas/460/', 'Alimentos > Manteigas', 'COMP_460')
('https://www.arasuper.com.br/c/maionese/457/', 'Alimentos > Maionese', 'COMP_457')
('https://www.arasuper.com.br/c/frutas/453/', 'Alimentos > Frutas', 'COMP_453')
('https://www.arasuper.com.br/c/leite-de-coco/

In [7]:
def get_pages(url):
    """Yield the parsed tree of each listing page of ``url``, printing each URL requested.

    This redefines the ``get_pages`` from the Utils cell; the only difference is
    that every page URL is printed before it is downloaded. Pages start at
    ``?page=1`` and are fetched through the module-level ``session``. Iteration
    ends at the first page that contains a
    ``<div class="produto-lista empty-content">`` element; that page is not yielded.
    """
    page_number = 1
    while True:
        page_url = ''.join((url, '?page=', str(page_number)))
        print(page_url)
        page_tree = get_tree_from_url(session, page_url)
        # A non-empty match means the listing has run out of products.
        if page_tree.xpath('//div[@class="produto-lista empty-content"]'):
            return
        yield page_tree
        page_number += 1

In [8]:
# Sample category used to exercise the pagination generator.
TEST_URL = 'https://www.arasuper.com.br/c/vinhos/351/'

for page_tree in get_pages(TEST_URL):
    print(page_tree)

https://www.arasuper.com.br/c/vinhos/351/?page=1
<Element html at 0x22388879ae0>
https://www.arasuper.com.br/c/vinhos/351/?page=2
<Element html at 0x22387874db0>
https://www.arasuper.com.br/c/vinhos/351/?page=3
<Element html at 0x22388879ae0>
https://www.arasuper.com.br/c/vinhos/351/?page=4
<Element html at 0x22388879bd0>
https://www.arasuper.com.br/c/vinhos/351/?page=5
<Element html at 0x2238787d950>
https://www.arasuper.com.br/c/vinhos/351/?page=6
<Element html at 0x22388879bd0>
https://www.arasuper.com.br/c/vinhos/351/?page=7


In [17]:
# DONT NEED
def extract_products(page):
    """Collect url, name, brand and prices of every product card on a listing page.

    Args:
        page: parsed lxml tree of a category listing page.

    Returns:
        list[dict]: one dict per product with the keys ``url``, ``name``,
        ``brand``, ``price``, ``exclusive_price`` and ``regular_price``.
        Prices are floats produced by ``extract_price``; the ones that do not
        apply to a card are ``None``.

    Raises:
        IndexError: if a card lacks the name, the brand, or all three of the
            price containers.
    """
    products_info = []
    # The XPath constants are the ones defined in the "Global vars" cell; the
    # *_XPATH names used previously are not defined anywhere in the notebook.
    for product in page.xpath(PRODUCTS):
        # urljoin copes with hrefs with or without a leading slash.
        product_url = urljoin(BASE_URL, product.xpath('@href')[0])
        product_name = product.xpath(PRODUCT_NAME)[0].text_content()
        product_brand = product.xpath(PRODUCT_BRAND)[0].text_content()

        one_price = product.xpath(PRICE_ONE)
        two_prices = product.xpath(PRICE_TWO)

        if one_price:  # One price
            regular_price = extract_price(one_price[0].text_content())
            price = None
            exclusive_price = None

        elif two_prices:  # Regular Price and Price
            prices = two_prices[0]
            regular_price = extract_price(prices.xpath(PRICE_REGULAR)[0].text_content())
            price = extract_price(prices.xpath(PRICE)[0].text_content())
            exclusive_price = None

        else:  # Only price LEVE
            prices = product.xpath(PRICE_LEVE)[0]
            regular_price = extract_price(prices.xpath(PRICE_REGULAR)[0].text_content())
            exclusive_price = extract_price(prices.xpath(PRICE)[0].text_content())
            price = None
            print('LEVE', price, regular_price, exclusive_price)

        products_info.append(
            {
                'url': product_url,
                'name': product_name,
                'brand': product_brand,
                'price': price,
                'exclusive_price': exclusive_price,
                'regular_price': regular_price
            }
        )

    return products_info

In [18]:
# Walk every page of the sample category and print each extracted product dict.
for page_tree in get_pages('https://www.arasuper.com.br/c/vinhos/351/'):
    print(page_tree)
    for product_info in extract_products(page_tree):
        print(product_info)
    print()

https://www.arasuper.com.br/c/vinhos/351/?page=1
<Element html at 0x22388919e50>
{'url': 'https://www.arasuper.com.br/p/vinho-nacional-galiotto-branco-seco-750ml/19485/', 'name': 'Vinho Nacional Galiotto Branco Seco 750ML  ', 'brand': 'GALIOTTO', 'price': None, 'exclusive_price': None, 'regular_price': 26.98}
{'url': 'https://www.arasuper.com.br/p/vinho-nacional-galiotto-tinto-seco-750ml/19483/', 'name': 'Vinho Nacional Galiotto Tinto Seco 750ML ', 'brand': 'GALIOTTO', 'price': None, 'exclusive_price': None, 'regular_price': 26.98}
{'url': 'https://www.arasuper.com.br/p/vinho-tinto-suave-nacional-galiotto-750ml/19482/', 'name': 'Vinho Tinto Suave Nacional Galiotto 750ML  ', 'brand': 'GALIOTTO', 'price': None, 'exclusive_price': None, 'regular_price': 26.98}
{'url': 'https://www.arasuper.com.br/p/vinho-tinto-seco-nacional-collina-750ml/19462/', 'name': 'Vinho Tinto Seco Nacional Collina 750ML ', 'brand': 'COLLINA', 'price': None, 'exclusive_price': None, 'regular_price': 17.99}
{'url': 

<Element html at 0x223878459a0>
LEVE None 17.98 15.99
{'url': 'https://www.arasuper.com.br/p/vinho-tinto-espanhol-dinastia-de-reyes-750ml/18461/', 'name': 'Vinho Tinto Espanhol Dinastia De Reyes 750ML ', 'brand': 'DINASTIA DE REYES', 'price': None, 'exclusive_price': None, 'regular_price': 71.9}
{'url': 'https://www.arasuper.com.br/p/vinho-espanhol-don-luciano-tempranillo-750ml/18447/', 'name': 'Vinho Espanhol Don Luciano Tempranillo 750ML', 'brand': 'DON LUCIANO', 'price': None, 'exclusive_price': None, 'regular_price': 35.79}
{'url': 'https://www.arasuper.com.br/p/vinho-chileno-casillero-diablo-sauvignon-blanco-750ml/18436/', 'name': 'Vinho Chileno Casillero Diablo Sauvignon Blanco 750ML ', 'brand': 'DIABLO', 'price': None, 'exclusive_price': None, 'regular_price': 85.35}
{'url': 'https://www.arasuper.com.br/p/vinho-tinto-suave-nacional-collina-750ml/18431/', 'name': 'Vinho Tinto Suave Nacional Collina 750ML ', 'brand': 'COLLINA', 'price': None, 'exclusive_price': None, 'regular_pric

<Element html at 0x2238787d8b0>
{'url': 'https://www.arasuper.com.br/p/vinho-tinto-brasileiro-marcus-james-pinotage-garrafa-750ml/3018/', 'name': 'Vinho Tinto Brasileiro Marcus James Pinotage Garrafa 750Ml', 'brand': 'MARCUS JAMES', 'price': None, 'exclusive_price': None, 'regular_price': 34.98}
{'url': 'https://www.arasuper.com.br/p/vinho-rose-quinta-do-morgado-suave-garrafa-750ml/2984/', 'name': 'Vinho Rosé Quinta Do Morgado Suave Garrafa 750Ml', 'brand': 'QUINTA DO MORGADO', 'price': None, 'exclusive_price': None, 'regular_price': 22.98}
{'url': 'https://www.arasuper.com.br/p/vinho-tinto-quinta-do-morgado-bordo-meio-seco-750ml/2924/', 'name': 'Vinho Tinto Quinta Do Morgado Bordo Meio Seco 750Ml', 'brand': 'QUINTA DO MORGADO', 'price': None, 'exclusive_price': None, 'regular_price': 28.98}
{'url': 'https://www.arasuper.com.br/p/vinho-jurupinga-dinalle-suave-branco-975ml/2727/', 'name': 'Vinho Jurupinga Dinalle Suave Branco 975Ml', 'brand': 'JURUPINGA', 'price': None, 'exclusive_price