# BOT AMAZON web scraping

## COMPILADO

In [145]:
import requests  # Biblioteca para fazer requisições HTTP, usada para acessar as páginas da Amazon
from lxml import html  # Biblioteca para parsing de HTML, usada para extrair informações de páginas
import pandas as pd  # Biblioteca para manipulação de dados, usada para criar e manipular DataFrames
import time  # Biblioteca para gerenciar o tempo, usada para controlar intervalos entre requisições
import re  # Biblioteca de expressões regulares, usada para manipulação de strings
from datetime import datetime  # Biblioteca para manipular datas, usada para adicionar data aos dados
import logging  # Biblioteca para criação de logs, usada para registrar eventos e erros
import random  # Biblioteca para gerar números aleatórios, usada para intervalos aleatórios entre requisições
import os  # Biblioteca para manipulação de sistemas operacionais, usada para criar diretórios

# Directory where the log file is written
log_directory = "C:/Users/ThiagoBizacha/Desktop/Projeto_Automacao_Coleta_Dados/logs/"
os.makedirs(log_directory, exist_ok=True)  # Create the log directory if it does not exist yet

# Configure logging to write into the directory above
logging.basicConfig(
    filename=os.path.join(log_directory, 'amazon_scraper.log'),  # Log file name and location
    level=logging.INFO,  # Record INFO-level events and above
    format='%(asctime)s - %(levelname)s - %(message)s'  # Message layout: timestamp, level, message
)

# Função para obter os links das categorias da página inicial (Elektronica, Software ou Boeken)
def get_category_links(url):
    """
    Fetch a listing page and return the category links found in its navigation tree.

    Parameters:
        url (str): URL of the Amazon listing page (e.g. the Best Sellers root page).

    Returns:
        list: One dict per category anchor with the keys 'category' (the anchor's
        stripped text) and 'link' (absolute URL on www.amazon.nl). Anchors without
        an href are skipped. An empty list is returned when the request fails.
    """
    # Local import keeps this block independent of the file-level import list.
    from urllib.parse import urljoin

    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    try:
        # The 10-second timeout keeps a stalled connection from hanging the run.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises for 4xx/5xx responses
    except requests.exceptions.RequestException as e:
        logging.error(f"Erro ao acessar a URL {url}: {e}")
        return []

    tree = html.fromstring(response.content)

    # Anchors inside the category navigation group.
    anchors = tree.xpath('//div[contains(@class, "_p13n-zg-nav-tree-all_style_zg-browse-group__88fbz")]//a')

    category_links = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # Without an href there is nothing to follow; plain concatenation
            # with None would raise TypeError.
            continue
        category_links.append({
            'category': anchor.text_content().strip(),
            # urljoin yields the same result as concatenation for "/path" hrefs
            # and also copes with absolute or slash-less hrefs.
            'link': urljoin('https://www.amazon.nl', href),
        })

    return category_links

# Função para extrair detalhes dos produtos de uma categoria específica
def extract_product_details(items, category):
    """
    Build one record per product element of a category listing.

    Parameters:
        items (list): Product elements; each must expose ``xpath(query)`` returning a list.
        category (str): Category name stored on every record.

    Returns:
        list: Dicts with the keys category, rank, asin, name, title, rating, reviews,
        symbol, value, image, link and date. Fields that are not found get a
        placeholder string ("No ID", "No title", "Not Available", ...).
    """
    products = []
    today_date = datetime.today().strftime('%Y-%m-%d')  # Same date stamp on every record

    for index, item in enumerate(items, start=1):
        # Product identifier (ASIN)
        asin_hits = item.xpath('.//@data-asin')
        product_id = asin_hits[0] if asin_hits else "No ID"

        # Ranking badge text; falls back to the 1-based loop index when absent
        badge_hits = item.xpath('.//span[@class="zg-bdg-text"]/text()')
        position = badge_hits[0].strip() if badge_hits else str(index)

        # Product image URL
        image_hits = item.xpath('.//img[contains(@class, "a-dynamic-image")]/@src')
        image_link = image_hits[0] if image_hits else "No image link"

        # Product title
        title_hits = item.xpath('.//a/span/div/text()')
        title = title_hits[0].strip() if title_hits else "No title"

        # Product page link (site-relative href prefixed with the host)
        link_hits = item.xpath('.//a[contains(@class, "a-link-normal")]/@href')
        product_link = "https://www.amazon.nl" + link_hits[0] if link_hits else "No product link"

        # Product slug: the first path segment of the href. An href with no
        # usable second element falls back to the placeholder instead of
        # raising IndexError.
        name = "No name"
        name_hits = item.xpath('.//a[@class="a-link-normal aok-block"]/@href')
        if name_hits:
            segments = name_hits[0].split('/')
            if len(segments) > 1 and segments[1]:
                name = segments[1]

        # Star rating text
        rating_hits = item.xpath('.//span[contains(@class, "a-icon-alt")]/text()')
        rating = rating_hits[0].strip() if rating_hits else "No rating"

        # Review count text
        review_hits = item.xpath('.//span[@class="a-size-small"]/text()')
        reviews = review_hits[0].strip() if review_hits else "No reviews"

        # Price split into currency symbol and numeric text
        price_hits = item.xpath('.//span[contains(@class, "p13n-sc-price")]/text()')
        if price_hits:
            price = price_hits[0].strip()
            # Everything that is not a digit or separator is the symbol; strip the
            # (possibly non-breaking) space that sits between symbol and amount.
            currency_symbol = ''.join(re.findall(r'[^\d.,]', price)).strip()
            value = ''.join(re.findall(r'[\d.,]+', price))
        else:
            currency_symbol = "Not Available"
            value = "Not Available"

        products.append({
            "category": category,
            "rank": position,
            "asin": product_id,
            "name": name,
            "title": title,
            "rating": rating,
            "reviews": reviews,
            "symbol": currency_symbol,
            "value": value,
            "image": image_link,
            "link": product_link,
            "date": today_date
        })

    return products

# Função que obtém os produtos mais vendidos em uma categoria específica, com retries para evitar falhas temporárias
def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3):
    """
    Download one category page and return its product records, retrying on failure.

    Parameters:
        url (str): URL of the category page.
        category (str): Category name passed on to extract_product_details.
        retries (int): Maximum number of request attempts.
        backoff_factor (float): Base delay in seconds; the wait before the next
            attempt is backoff_factor * 2**attempt_index.

    Returns:
        list: Product records for the category, or an empty list when no attempt
        succeeded (including when retries is zero or negative).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    response = None
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raises for 4xx/5xx responses
            break
        except requests.exceptions.RequestException as e:
            # Discard a response that arrived with an error status.
            response = None
            logging.error(f"Erro ao acessar {url} na tentativa {attempt+1}/{retries}: {e}")
            # Exponential backoff, but only when another attempt will follow.
            if attempt < retries - 1:
                time.sleep(backoff_factor * (2 ** attempt))

    if response is None:
        # Every attempt failed, or no attempt was made at all.
        return []

    tree = html.fromstring(response.content)

    # Product containers are the divs whose id contains "p13n-asin-index".
    items = tree.xpath('//div[contains(@id, "p13n-asin-index")]')
    logging.info(f"Encontrados {len(items)} itens na categoria {category}.")

    return extract_product_details(items, category)

# Função para salvar os dados consolidados em Excel
def save_to_excel_consolidated(products, directory_path):
    """
    Write all collected products to a single dated Excel workbook.

    Parameters:
        products (list): Product records from every processed category.
        directory_path (str): Directory that receives the workbook; created when missing.

    Returns:
        None. When ``products`` is empty a warning is logged and nothing is written.
    """
    if not products:
        logging.warning("Nenhum produto encontrado nas categorias.")
        return

    df = pd.DataFrame(products)

    # Make sure the target directory exists; to_excel does not create it.
    os.makedirs(directory_path, exist_ok=True)

    # File name carries the current date.
    today_date = datetime.today().strftime('%Y-%m-%d')
    filename = f"{directory_path}/extract_amazon_full_TESTE{today_date}.xlsx"

    df.to_excel(filename, index=False)
    logging.info(f"Dados consolidados salvos em {filename}")

# Função que processa uma categoria específica e adiciona a coluna "origin"
def process_category(category_url, category_type):
    """
    Scrape every subcategory of one listing and tag each product with its origin.

    Parameters:
        category_url (str): URL of the listing root page.
        category_type (str): Value stored under the "origin" key of every product
            (e.g. "best_sellers").

    Returns:
        list: Product records from all subcategories, each with an "origin" key.
    """
    collected = []

    for entry in get_category_links(category_url):
        category = entry['category']  # Subcategory name
        link = entry['link']  # Subcategory URL
        logging.info(f"Processando categoria: {category}")

        # Fetch the subcategory's products and stamp each one with its origin
        fetched = get_amazon_bestsellers(link, category)
        for record in fetched:
            record['origin'] = category_type
        collected += fetched

        # Random pause of up to one second between subcategory requests
        time.sleep(random.uniform(0, 1))

    return collected

# Função principal que coordena todo o processo
def extract_data():
    """
    Run the whole extraction: scrape the listings, save the raw records, return a DataFrame.

    Returns:
        pandas.DataFrame: All scraped products. When at least one product was found,
        the "reviews" column is converted to integers (thousands separators removed;
        "No reviews" and any other non-numeric text become 0). The Excel file is
        written from the raw records, so it keeps the original review strings.
    """
    directory_path = "C:/Users/ThiagoBizacha/Desktop/Projeto_Automacao_Coleta_Dados/data/output/bot_amazon"

    # Records from every processed listing
    all_products = []

    # Best Sellers
    base_url_bestsellers = "https://www.amazon.nl/gp/bestsellers/"
    all_products.extend(process_category(base_url_bestsellers, "best_sellers"))

    # Movers and Shakers (currently disabled)
    #base_url_movers_shakers = "https://www.amazon.nl/gp/movers-and-shakers/"
    #all_products.extend(process_category(base_url_movers_shakers, "movers_and_shakers"))

    # New Releases (currently disabled)
    #base_url_new_releases = "https://www.amazon.nl/gp/new-releases/"
    #all_products.extend(process_category(base_url_new_releases, "new_releases"))

    df_consolidated = pd.DataFrame(all_products)

    # An empty scrape yields a frame without columns; skip the conversion then
    # instead of failing with KeyError.
    if 'reviews' in df_consolidated.columns:
        review_text = df_consolidated['reviews'].astype(str).str.replace(r'[.,]', '', regex=True)
        # Non-numeric text (e.g. "No reviews") is coerced to NaN and counted as 0.
        df_consolidated['reviews'] = pd.to_numeric(review_text, errors='coerce').fillna(0).astype(int)

    # Save the raw records to one Excel file (logs a warning and writes nothing when empty)
    save_to_excel_consolidated(all_products, directory_path)

    logging.info("Processo finalizado!")

    return df_consolidated

# Run the full extraction when this code is executed as the main module
if __name__ == "__main__":
    extract_data()


## TRATAMENTO

In [1]:
import pandas as pd
import logging
import os

# Constants: log location and the extracted workbook this step reads
LOG_DIR = "C:/Users/ThiagoBizacha/Desktop/Projeto_Automacao_Coleta_Dados/logs/"
LOG_FILE = os.path.join(LOG_DIR, "transform_data.log")
DATA_FILE = "C:/Users/ThiagoBizacha/Desktop/Projeto_Automacao_Coleta_Dados/data/output/bot_amazon/extract_amazon_full_TESTE2024-09-16.xlsx"

# Create the log directory if it does not exist
os.makedirs(LOG_DIR, exist_ok=True)

# Logging configuration: INFO level, written to LOG_FILE
# NOTE(review): logging.basicConfig does nothing when the root logger already has
# handlers, e.g. if the scraper cell ran earlier in the same interpreter — confirm
# whether force=True is wanted here.
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def load_data(file_path):
    """Read an Excel file into a DataFrame; log and return None if anything fails."""
    try:
        result = pd.read_excel(file_path)
        logging.info(f"Dados carregados com sucesso de {file_path}")
    except Exception as e:
        logging.error(f"Erro ao carregar dados de {file_path}: {e}")
        result = None
    return result

def clean_data(df, max_value=200):
    """
    Clean and type the raw scraped columns, then keep only rows within the price cap.

    The conversions are written back onto ``df`` itself (the caller's frame gains the
    cleaned columns and a new ``currency`` column); the returned frame is an
    independent copy of the rows that pass the price filter.

    Parameters:
        df (pandas.DataFrame): Raw data with string columns ``rank``, ``rating``,
            ``reviews``, ``value`` and ``symbol``.
        max_value (float): Highest price kept. Rows whose price is above it, or
            could not be parsed, are dropped. Defaults to 200.

    Returns:
        pandas.DataFrame: Filtered copy of the cleaned data.
    """
    # "#1" -> "1"
    df['rank'] = df['rank'].str.replace('#', '', regex=False)

    # "4,5 van 5 sterren" -> 4.5; products without a rating get the fixed value 3
    df['rating'] = (
        df['rating']
        .str.replace(' van 5 sterren', '', regex=False)
        .replace('No rating', "3")
        .str.replace(",", ".", regex=False)
        .astype(float)
    )

    # "15.897" -> 15897; non-numeric text such as "No reviews" becomes NaN
    df['reviews'] = df['reviews'].str.replace('.', '', regex=False)
    df['reviews'] = pd.to_numeric(df['reviews'], errors='coerce')

    # "13,90" -> 13.9; anything unparsable (e.g. "Not Available") becomes NaN
    df['value'] = pd.to_numeric(df['value'].str.replace(',', '.', regex=False), errors='coerce').round(2)
    df['currency'] = df['symbol'].str.replace('€', 'EUR', regex=False)

    # Explicit copy: later column assignments on the result must not act on a
    # slice of ``df`` (which triggers SettingWithCopyWarning). NaN prices fail
    # the comparison and are dropped here as well.
    df = df[df['value'] <= max_value].copy()

    logging.info('Dados limpos e valores inválidos removidos.')
    return df

def normalize_column(df, column):
    """
    Min-max normalise one column to the range [0, 1].

    Parameters:
        df (pandas.DataFrame): Frame holding the column.
        column (str): Name of the numeric column to normalise.

    Returns:
        pandas.Series: (x - min) / (max - min) for each value. When every value is
        identical the range is zero; the result is then 0.0 for each value rather
        than the NaN that 0/0 would produce. Missing values stay missing.
    """
    values = df[column]
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # values - lowest is already all zeros here (NaN where the input was NaN).
        return (values - lowest).astype(float)
    return (values - lowest) / span

def calculate_fields(df):
    """
    Add the normalised columns, the weighted score and value_total to ``df``.

    New columns (written onto the frame that was passed in):
        normal_rating, normal_reviews, normal_value: min-max normalised inputs.
        score: int(1000 * (0.05*normal_rating + 0.7*normal_reviews + 0.25*normal_value)).
            A missing normalised component counts as 0, so rows without a review
            count still get a score.
        value_total: value * reviews rounded to 2 decimals (NaN when either is missing).

    Parameters:
        df (pandas.DataFrame): Cleaned data with numeric ``rating``, ``reviews``
            and ``value`` columns.

    Returns:
        pandas.DataFrame: The same frame, with the columns above added.
    """
    df['normal_rating'] = normalize_column(df, 'rating')
    df['normal_reviews'] = normalize_column(df, 'reviews')
    df['normal_value'] = normalize_column(df, 'value')

    # fillna(0) before the integer cast: astype(int) raises on NaN, and
    # "reviews" is NaN for products that had no review count.
    weighted = (0.05 * df['normal_rating'].fillna(0) +
                0.7 * df['normal_reviews'].fillna(0) +
                0.25 * df['normal_value'].fillna(0))
    df['score'] = (weighted * 1000).astype(int)
    df['value_total'] = (df['value'] * df['reviews']).round(2)

    logging.info('Campos calculados adicionados com sucesso.')
    return df

def transform_data(df):
    """Run the full transformation: clean the raw data, then add the calculated fields."""
    return calculate_fields(clean_data(df))

if __name__ == "__main__":
    # load_data returns None when the workbook cannot be read
    df = load_data(DATA_FILE)

    if df is not None:
        df_transformed = transform_data(df)
    else:
        logging.error("Falha ao carregar os dados.")

# Printed unconditionally, including when loading failed above
print("tratamento finalizado")




tratamento finalizado


In [149]:
df.head()

Unnamed: 0,category,rank,asin,name,title,rating,reviews,symbol,value,image,link,date,origin,currency
0,Amazon Renewed,1,B08TJ2LGB8,Apple-AirPods-Pro-generatie-Refurbished,Apple AirPods Pro (1e generatie) (Refurbished),3.6,805,€,198.99,https://images-eu.ssl-images-amazon.com/images...,https://www.amazon.nl/Apple-AirPods-Pro-genera...,2024-09-16,best_sellers,EUR
1,Amazon Renewed,2,B00NBR7962,Sony-Mdr-Zx110-Opvouwbare-Instapkoptelefoon-Ui...,Sony Mdr-Zx110/Wc(Ae) Opvouwbare Instapkoptele...,4.5,15905,€,13.9,https://images-eu.ssl-images-amazon.com/images...,https://www.amazon.nl/Sony-Mdr-Zx110-Opvouwbar...,2024-09-16,best_sellers,EUR
2,Amazon Renewed,3,B0C6PCB61S,Soundcore-draadloze-waterbestendig-microfoons-...,Soundcore by Anker P20i True draadloze Earbuds...,4.4,486,€,15.99,https://images-eu.ssl-images-amazon.com/images...,https://www.amazon.nl/Soundcore-draadloze-wate...,2024-09-16,best_sellers,EUR
3,Amazon Renewed,4,B09K1RVTV6,2021-Apple-iPad-10-2%E2%80%91inch-Wi-Fi,"2021 Apple iPad (10.2‑inch, Wi-Fi, 64GB) - Spa...",4.2,330,€,295.89,https://images-eu.ssl-images-amazon.com/images...,https://www.amazon.nl/2021-Apple-iPad-10-2%E2%...,2024-09-16,best_sellers,EUR
4,Amazon Renewed,5,B0CFFKHZCR,Seagate-Enterprise-Capacity-ST12000NM0127-Refu...,Seagate Enterprise Capacity v7 ST12000NM0127 -...,4.2,463,€,134.9,https://images-eu.ssl-images-amazon.com/images...,https://www.amazon.nl/Seagate-Enterprise-Capac...,2024-09-16,best_sellers,EUR


## SALVAR EXCEL


In [110]:
from datetime import datetime
 # Define o nome do arquivo com base na data atual
directory_path = "C:/Users/ThiagoBizacha/Desktop/Projeto_Automacao_Coleta_Dados/data/output/bot_amazon"
today_date = datetime.today().strftime('%Y-%m-%d')
filename = f"{directory_path}/base_amazon_full_teste{today_date}.xlsx"

# Save the DataFrame to an Excel file.
# NOTE(review): this writes `df`, not `df_transformed`; the PostgreSQL load below reads
# normal_*/score/value_total columns — confirm which frame is meant to be saved.
df.to_excel(filename, index=False)



In [None]:
df

## CARGA POSTGRESQL

In [24]:
import pandas as pd

df = pd.read_excel("C:\\Users\\ThiagoBizacha\\Desktop\\Projeto_Automacao_Coleta_Dados\\data\\output\\bot_amazon\\base_amazon_final_2024-09-16.xlsx")

In [25]:
import psycopg2

def load_to_postgresql(df):
    """
    Insert every row of ``df`` into public.amazon_nl_final in one transaction.

    Parameters:
        df (pandas.DataFrame): Must contain the columns category, rank, asin, name,
            title, rating, reviews, currency, value, image, link, date, origin,
            normal_rating, normal_reviews, normal_value, score and value_total.

    Raises:
        Any exception raised by psycopg2 while connecting or inserting. The cursor
        and connection are closed in all cases; nothing is committed unless every
        row was inserted.
    """
    columns = [
        'category', 'rank', 'asin', 'name', 'title', 'rating', 'reviews',
        'currency', 'value', 'image', 'link', 'date', 'origin',
        'normal_rating', 'normal_reviews', 'normal_value', 'score', 'value_total',
    ]
    insert_sql = """
        INSERT INTO public.amazon_nl_final (
            category, rank, asin, name, title, rating, reviews, currency, value, image, link, date, origin,
            normal_rating, normal_reviews, normal_value, score, value_total
        ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    # NOTE(review): credentials are hard-coded; consider reading them from the
    # environment or a config file that is not committed.
    conn = psycopg2.connect(
        host="localhost",
        database="proj_dropshipping",
        user="postgres",
        password="admin"
    )
    print("Conexão com o banco de dados realizada com sucesso!")

    try:
        cursor = conn.cursor()
        try:
            for _, row in df.iterrows():
                # Values are passed as query parameters, never formatted into the SQL.
                cursor.execute(insert_sql, tuple(row[col] for col in columns))
            # Single commit after all rows: a failure part-way leaves nothing committed.
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when an insert raised.
        conn.close()

    print("Dados carregados com sucesso no PostgreSQL.")
    
load_to_postgresql(df)

Conexão com o banco de dados realizada com sucesso!
Dados carregados com sucesso no PostgreSQL.


## BEST SELLERS

In [7]:
import requests
from lxml import html
import pandas as pd
import time
import re
from datetime import datetime

def get_category_links(url):
    """
    Fetch a listing page and return the category links found in its navigation tree.

    Parameters:
        url (str): URL of the listing root page.

    Returns:
        list: One dict per category anchor with the keys 'category' (stripped anchor
        text) and 'link' (absolute URL on www.amazon.nl). Anchors without an href
        are skipped. Empty list when the request fails or the status is not 200.
    """
    # Local import keeps this block independent of the cell's import list.
    from urllib.parse import urljoin

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve the page. Error: {e}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    tree = html.fromstring(response.content)
    anchors = tree.xpath('//div[contains(@class, "_p13n-zg-nav-tree-all_style_zg-browse-group__88fbz")]//a')

    category_links = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # Concatenating None would raise TypeError; nothing to follow anyway.
            continue
        category_links.append({
            'category': anchor.text_content().strip(),
            'link': urljoin('https://www.amazon.nl', href),
        })

    return category_links

def extract_product_details(items, category):
    """
    Build one record per product element of a category listing, printing each field.

    Parameters:
        items (list): Product elements; each must expose ``xpath(query)`` returning a list.
        category (str): Category name stored on every record.

    Returns:
        list: Dicts with the keys category, rank, asin, name, title, rating, reviews,
        symbol, value, image, link and date. Missing fields get a placeholder string.
    """
    products = []
    today_date = datetime.today().strftime('%Y-%m-%d')  # Today's date, stamped on every record

    for index, item in enumerate(items, start=1):
        # Product identifier (ASIN)
        asin_hits = item.xpath('.//@data-asin')
        product_id = asin_hits[0] if asin_hits else "No ID"
        print(f"Product ID: {product_id}")

        # Position in the list; falls back to the 1-based loop index
        badge_hits = item.xpath('.//span[@class="zg-bdg-text"]/text()')
        position = badge_hits[0].strip() if badge_hits else str(index)
        print(f"Posição: {position}")

        # Product image
        image_hits = item.xpath('.//img[contains(@class, "a-dynamic-image")]/@src')
        image_link = image_hits[0] if image_hits else "No image link"
        print(f"Image link: {image_link}")

        # Title and link
        title_hits = item.xpath('.//a/span/div/text()')
        title = title_hits[0].strip() if title_hits else "No title"
        print(f"Title: {title}")

        link_hits = item.xpath('.//a[contains(@class, "a-link-normal")]/@href')
        product_link = "https://www.amazon.nl" + link_hits[0] if link_hits else "No product link"
        print(f"Product link: {product_link}")

        # Name: first path segment of the href. An href without a usable second
        # element falls back to the placeholder instead of raising IndexError.
        name = "No name"
        name_hits = item.xpath('.//a[@class="a-link-normal aok-block"]/@href')
        if name_hits:
            segments = name_hits[0].split('/')
            if len(segments) > 1 and segments[1]:
                name = segments[1]
        print(f"Name: {name}")

        # Rating and review count
        rating_hits = item.xpath('.//span[contains(@class, "a-icon-alt")]/text()')
        rating = rating_hits[0].strip() if rating_hits else "No rating"
        print(f"Rating: {rating}")

        review_hits = item.xpath('.//span[@class="a-size-small"]/text()')
        reviews = review_hits[0].strip() if review_hits else "No reviews"
        print(f"Reviews: {reviews}")

        # Price, split into currency symbol and numeric text
        price_hits = item.xpath('.//span[contains(@class, "p13n-sc-price")]/text()')
        if price_hits:
            price = price_hits[0].strip()
            # Strip the (possibly non-breaking) space left between symbol and amount.
            currency_symbol = ''.join(re.findall(r'[^\d.,]', price)).strip()
            value = ''.join(re.findall(r'[\d.,]+', price))
        else:
            currency_symbol = "Not Available"
            value = "Not Available"
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        products.append({
            "category": category,
            "rank": position,
            "asin": product_id,
            "name": name,
            "title": title,
            "rating": rating,
            "reviews": reviews,
            "symbol": currency_symbol,
            "value": value,
            "image": image_link,
            "link": product_link,
            "date": today_date
        })

    return products

def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3):
    """
    Download one category page and return its product records, retrying on 503.

    Parameters:
        url (str): URL of the category page.
        category (str): Category name passed on to extract_product_details.
        retries (int): Maximum number of request attempts.
        backoff_factor (float): Base delay in seconds; the wait before the next
            attempt is backoff_factor * 2**attempt_index.

    Returns:
        list: Product records, or an empty list when the page could not be fetched
        (non-retryable status, attempts exhausted, or retries <= 0).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    response = None
    for i in range(retries):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            # Network-level failures are retried like a 503.
            response = None
            print(f"Failed to retrieve the page. Error: {e}. Retrying {i+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                # Any status other than 200/503 is treated as final.
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {i+1}/{retries}...")
        # Exponential backoff, skipped after the last attempt.
        if i < retries - 1:
            time.sleep(backoff_factor * (2 ** i))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    tree = html.fromstring(response.content)
    items = tree.xpath('//div[contains(@id, "p13n-asin-index")]')
    print(f"Found {len(items)} items in category {category}.")

    products = extract_product_details(items, category)
    return products

def save_to_excel(products, directory_path):
    """Write the product records to a dated best-sellers workbook in ``directory_path``."""
    today_date = datetime.today().strftime('%Y-%m-%d')
    filename = f"{directory_path}/bot_amazon_best_sellers_{today_date}.xlsx"
    pd.DataFrame(products).to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Start from the Best Sellers root page and collect its category links
    base_url = "https://www.amazon.nl/gp/bestsellers/"
    category_links = get_category_links(base_url)

    all_products = []
    for category_link in category_links:
        category = category_link['category']
        link = category_link['link']
        print(f"Processing category: {category}")
        products = get_amazon_bestsellers(link, category)
        all_products.extend(products)
        time.sleep(10)  # Fixed 10-second pause between category requests (presumably to avoid throttling — TODO confirm)

    # Directory where the Excel file will be saved
    directory_path = "C:/Users/ThiagoBizacha/Desktop/Projeto_Automacao_Coleta_Dados/data/output/bot_amazon"
    save_to_excel(all_products, directory_path)
    print("Finalizado!")


Processing category: Amazon Renewed
Found 30 items in category Amazon Renewed.
Product ID: B00NBR7962
Posição: #1
Image link: https://images-eu.ssl-images-amazon.com/images/I/61ZbQQiPo4L._AC_UL300_SR300,200_.jpg
Title: Sony Mdr-Zx110/Wc(Ae) Opvouwbare Instapkoptelefoon Met Uitstekend Geluid (30 Mm Driver), Wit
Product link: https://www.amazon.nl/Sony-Mdr-Zx110-Opvouwbare-Instapkoptelefoon-Uitstekend/dp/B00NBR7962/ref=zg_bs_g_amazon-renewed_d_sccl_1/257-2945123-2991735?psc=1
Name: Sony-Mdr-Zx110-Opvouwbare-Instapkoptelefoon-Uitstekend
Rating: 4,5 van 5 sterren
Reviews: 15.897
Currency Symbol: € , Value: 13,90
Product ID: B08TJ2LGB8
Posição: #2
Image link: https://images-eu.ssl-images-amazon.com/images/I/71eGsUPZm2L._AC_UL300_SR300,200_.jpg
Title: Apple AirPods Pro (1e generatie) (Refurbished)
Product link: https://www.amazon.nl/Apple-AirPods-Pro-generatie-Refurbished/dp/B08TJ2LGB8/ref=zg_bs_g_amazon-renewed_d_sccl_2/257-2945123-2991735?psc=1
Name: Apple-AirPods-Pro-generatie-Refurbished

## NEW RELEASES

In [5]:
import requests
from lxml import html
import pandas as pd
import time
import re
from datetime import datetime

def get_category_links(url):
    """
    Fetch a listing page and return the category links found in its navigation tree.

    Parameters:
        url (str): URL of the listing root page.

    Returns:
        list: One dict per category anchor with the keys 'category' (stripped anchor
        text) and 'link' (absolute URL on www.amazon.nl). Anchors without an href
        are skipped. Empty list when the request fails or the status is not 200.
    """
    # Local import keeps this block independent of the cell's import list.
    from urllib.parse import urljoin

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve the page. Error: {e}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    tree = html.fromstring(response.content)
    anchors = tree.xpath('//div[contains(@class, "_p13n-zg-nav-tree-all_style_zg-browse-group__88fbz")]//a')

    category_links = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # Concatenating None would raise TypeError; nothing to follow anyway.
            continue
        category_links.append({
            'category': anchor.text_content().strip(),
            'link': urljoin('https://www.amazon.nl', href),
        })

    return category_links

def extract_product_details(items, category):
    """
    Build one record per product element of a category listing, printing each field.

    Parameters:
        items (list): Product elements; each must expose ``xpath(query)`` returning a list.
        category (str): Category name stored on every record.

    Returns:
        list: Dicts with the keys category, rank, asin, name, title, rating, reviews,
        symbol, value, image, link and date. Missing fields get a placeholder string.
    """
    products = []
    today_date = datetime.today().strftime('%Y-%m-%d')  # Today's date, stamped on every record

    for index, item in enumerate(items, start=1):
        # Product identifier (ASIN)
        asin_hits = item.xpath('.//@data-asin')
        product_id = asin_hits[0] if asin_hits else "No ID"
        print(f"Product ID: {product_id}")

        # Position in the list; falls back to the 1-based loop index
        badge_hits = item.xpath('.//span[@class="zg-bdg-text"]/text()')
        position = badge_hits[0].strip() if badge_hits else str(index)
        print(f"Posição: {position}")

        # Product image
        image_hits = item.xpath('.//img[contains(@class, "a-dynamic-image")]/@src')
        image_link = image_hits[0] if image_hits else "No image link"
        print(f"Image link: {image_link}")

        # Title and link
        title_hits = item.xpath('.//a/span/div/text()')
        title = title_hits[0].strip() if title_hits else "No title"
        print(f"Title: {title}")

        link_hits = item.xpath('.//a[contains(@class, "a-link-normal")]/@href')
        product_link = "https://www.amazon.nl" + link_hits[0] if link_hits else "No product link"
        print(f"Product link: {product_link}")

        # Name: first path segment of the href. An href without a usable second
        # element falls back to the placeholder instead of raising IndexError.
        name = "No name"
        name_hits = item.xpath('.//a[@class="a-link-normal aok-block"]/@href')
        if name_hits:
            segments = name_hits[0].split('/')
            if len(segments) > 1 and segments[1]:
                name = segments[1]
        print(f"Name: {name}")

        # Rating and review count
        rating_hits = item.xpath('.//span[contains(@class, "a-icon-alt")]/text()')
        rating = rating_hits[0].strip() if rating_hits else "No rating"
        print(f"Rating: {rating}")

        review_hits = item.xpath('.//span[@class="a-size-small"]/text()')
        reviews = review_hits[0].strip() if review_hits else "No reviews"
        print(f"Reviews: {reviews}")

        # Price, split into currency symbol and numeric text
        price_hits = item.xpath('.//span[contains(@class, "p13n-sc-price")]/text()')
        if price_hits:
            price = price_hits[0].strip()
            # Strip the (possibly non-breaking) space left between symbol and amount.
            currency_symbol = ''.join(re.findall(r'[^\d.,]', price)).strip()
            value = ''.join(re.findall(r'[\d.,]+', price))
        else:
            currency_symbol = "Not Available"
            value = "Not Available"
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        products.append({
            "category": category,
            "rank": position,
            "asin": product_id,
            "name": name,
            "title": title,
            "rating": rating,
            "reviews": reviews,
            "symbol": currency_symbol,
            "value": value,
            "image": image_link,
            "link": product_link,
            "date": today_date
        })

    return products

def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3):
    """
    Download one category page and return its product records, retrying on 503.

    Parameters:
        url (str): URL of the category page.
        category (str): Category name passed on to extract_product_details.
        retries (int): Maximum number of request attempts.
        backoff_factor (float): Base delay in seconds; the wait before the next
            attempt is backoff_factor * 2**attempt_index.

    Returns:
        list: Product records, or an empty list when the page could not be fetched
        (non-retryable status, attempts exhausted, or retries <= 0).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    response = None
    for i in range(retries):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            # Network-level failures are retried like a 503.
            response = None
            print(f"Failed to retrieve the page. Error: {e}. Retrying {i+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                # Any status other than 200/503 is treated as final.
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {i+1}/{retries}...")
        # Exponential backoff, skipped after the last attempt.
        if i < retries - 1:
            time.sleep(backoff_factor * (2 ** i))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    tree = html.fromstring(response.content)
    items = tree.xpath('//div[contains(@id, "p13n-asin-index")]')
    print(f"Found {len(items)} items in category {category}.")

    products = extract_product_details(items, category)
    return products

def save_to_excel(products, directory_path):
    """Write the product records to a dated new-releases workbook in ``directory_path``."""
    today_date = datetime.today().strftime('%Y-%m-%d')
    filename = f"{directory_path}/bot_amazon_new_releases_{today_date}.xlsx"
    pd.DataFrame(products).to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Start from the New Releases root page and collect its category links
    base_url = "https://www.amazon.nl/gp/new-releases/"
    category_links = get_category_links(base_url)

    all_products = []
    for category_link in category_links:
        category = category_link['category']
        link = category_link['link']
        print(f"Processing category: {category}")
        products = get_amazon_bestsellers(link, category)
        all_products.extend(products)
        time.sleep(10)  # Fixed 10-second pause between category requests (presumably to avoid throttling — TODO confirm)

    # Directory where the Excel file will be saved
    directory_path = "C:/Users/ThiagoBizacha/Desktop/Projeto_Automacao_Coleta_Dados/data/output/bot_amazon"
    save_to_excel(all_products, directory_path)
    print("Finalizado!")


Processing category: Amazon Renewed
Found 18 items in category Amazon Renewed.
Product ID: B0D97YRH4L
Posição: #1
Image link: https://images-eu.ssl-images-amazon.com/images/I/51g1kjXSdDL._AC_UL300_SR300,200_.jpg
Title: Lenovo ThinkCentre M75q-1 Tiny PC Computer AMD Ryzen 5 Pro 3400GE, 16 GB RAM, 512 GB SSD, HDMI, Windows 11 Pro (gereviseerd)
Product link: https://www.amazon.nl/Lenovo-ThinkCentre-Computer-Windows-gereviseerd/dp/B0D97YRH4L/ref=zg_bsnr_g_amazon-renewed_d_sccl_1/261-6258010-4544442?psc=1
Name: Lenovo-ThinkCentre-Computer-Windows-gereviseerd
Rating: No rating
Reviews: No reviews
Currency Symbol: € , Value: 224,90
Product ID: B094PWBNSD
Posição: #2
Image link: https://images-eu.ssl-images-amazon.com/images/I/71BpqDlssjL._AC_UL300_SR300,200_.jpg
Title: Seagate IronWolf Pro ST16000NE000 interrne NAS Festplatte 16TB HDD, 3.5 Zoll, 7200 U/Min, CMR, 256 MB Cache, SATA 6GB/S, ST16000NE000 (Refurbished)
Product link: https://www.amazon.nl/Seagate-IronWolf-ST16000NE000-Festplatte-Re

## MOVERS AND SHAKERS

In [6]:
import requests
from lxml import html
import pandas as pd
import time
import re
from datetime import datetime

def get_category_links(url):
    """Fetch a listing page and return the links found in its category tree.

    Parameters:
        url (str): URL of the page whose category navigation is read.

    Returns:
        list: One dict per category anchor with the keys 'category' (anchor
        text) and 'link' (absolute URL). Empty when the request fails or
        the response status is not 200.
    """
    from urllib.parse import urljoin  # local import: this cell does not import urllib

    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    tree = html.fromstring(response.content)
    anchors = tree.xpath('//div[contains(@class, "_p13n-zg-nav-tree-all_style_zg-browse-group__88fbz")]//a')

    category_links = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # An anchor without href cannot be followed (and None cannot be
            # concatenated to the base URL).
            continue
        category_links.append({
            'category': anchor.text_content().strip(),
            # urljoin leaves already-absolute hrefs untouched.
            'link': urljoin('https://www.amazon.nl', href),
        })

    return category_links

def extract_product_details(items, category):
    """Build one record per product element of a listing page.

    Parameters:
        items: Iterable of parsed elements exposing ``xpath()``; each one is
            queried for the fields of a single product.
        category (str): Category name copied into every record.

    Returns:
        list: Dicts with the keys category, rank, asin, name, title, rating,
        reviews, symbol, value, image, link and date. Fields that are not
        found get a placeholder string.
    """
    from urllib.parse import urljoin, urlparse  # local import: not imported by this cell

    products = []
    # Today's date, stamped on every record.
    today_date = datetime.today().strftime('%Y-%m-%d')

    for index, item in enumerate(items, start=1):
        # Product identifier (first data-asin attribute found under the item).
        product_id = item.xpath('.//@data-asin')
        product_id = product_id[0] if product_id else "No ID"
        print(f"Product ID: {product_id}")

        # Position in the list; falls back to the enumeration index.
        position = item.xpath('.//span[@class="zg-bdg-text"]/text()')
        position = position[0].strip() if position else str(index)
        print(f"Posição: {position}")

        # Product image.
        image = item.xpath('.//img[contains(@class, "a-dynamic-image")]/@src')
        image_link = image[0] if image else "No image link"
        print(f"Image link: {image_link}")

        # Title and link.
        title = item.xpath('.//a/span/div/text()')
        title = title[0].strip() if title else "No title"
        print(f"Title: {title}")

        link = item.xpath('.//a[contains(@class, "a-link-normal")]/@href')
        # urljoin keeps absolute hrefs intact instead of double-prefixing them.
        product_link = urljoin("https://www.amazon.nl", link[0]) if link else "No product link"
        print(f"Product link: {product_link}")

        # Name: first path segment of the product href.
        name_href = item.xpath('.//a[@class="a-link-normal aok-block"]/@href')
        name = "No name"
        if name_href:
            segments = [seg for seg in urlparse(name_href[0]).path.split('/') if seg]
            if segments:
                name = segments[0]
        print(f"Name: {name}")

        # Ratings.
        rating = item.xpath('.//span[contains(@class, "a-icon-alt")]/text()')
        rating = rating[0].strip() if rating else "No rating"
        print(f"Rating: {rating}")

        reviews = item.xpath('.//span[@class="a-size-small"]/text()')
        reviews = reviews[0].strip() if reviews else "No reviews"
        print(f"Reviews: {reviews}")

        # Price: split into currency symbol and numeric text.
        price = item.xpath('.//span[contains(@class, "p13n-sc-price")]/text()')
        if price:
            price_text = price[0].strip()
            symbol_match = re.search(r'[^\d.,]+', price_text)
            value_match = re.search(r'[\d.,]+', price_text)
            # Strip the (non-breaking) space that separates symbol and amount,
            # and keep only the first amount so a range is not glued together.
            currency_symbol = symbol_match.group(0).strip() if symbol_match else ""
            value = value_match.group(0) if value_match else ""
        else:
            currency_symbol = "Not Available"
            value = "Not Available"
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        products.append({
            "category": category,
            "rank": position,
            "asin": product_id,
            "name": name,
            "title": title,
            "rating": rating,
            "reviews": reviews,
            "symbol": currency_symbol,
            "value": value,
            "image": image_link,
            "link": product_link,
            "date": today_date
        })

    return products

def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3):
    """Download one category listing page and extract its products.

    Parameters:
        url (str): URL of the category listing page.
        category (str): Category name, passed on to extract_product_details.
        retries (int): Maximum number of request attempts.
        backoff_factor (float): Base delay in seconds; the wait after failed
            attempt i (0-based) is backoff_factor * 2**i.

    Returns:
        list: Product dicts produced by extract_product_details, or an empty
        list when the page could not be retrieved.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Stays None when no attempt is made (retries <= 0) or the last attempt
    # raised, so the check after the loop never touches an unbound name.
    response = None
    for attempt in range(retries):
        try:
            # Timeout so a stalled connection cannot hang the whole run.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            response = None
            print(f"Failed to retrieve the page. Error: {exc}. Retrying {attempt+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                # Any status other than 503 is treated as permanent.
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {attempt+1}/{retries}...")
        if attempt < retries - 1:
            # Exponential backoff; pointless after the final attempt.
            time.sleep(backoff_factor * (2 ** attempt))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    tree = html.fromstring(response.content)
    items = tree.xpath('//div[contains(@id, "p13n-asin-index")]')
    print(f"Found {len(items)} items in category {category}.")

    products = extract_product_details(items, category)
    return products

def save_to_excel(products, directory_path):
    """Write the product records to a dated Excel file inside directory_path.

    The file is named bot_amazon_movers_and_shakers_<YYYY-MM-DD>.xlsx using
    today's date.

    Parameters:
        products: Records accepted by ``pandas.DataFrame`` (the callers in
            this cell pass a list of dicts).
        directory_path (str): Directory that receives the workbook; it is
            created when it does not exist yet.
    """
    import os  # local import: this cell does not import os

    if directory_path:
        # to_excel does not create missing folders and would fail otherwise.
        os.makedirs(directory_path, exist_ok=True)

    df = pd.DataFrame(products)
    today_date = datetime.today().strftime('%Y-%m-%d')
    filename = f"{directory_path}/bot_amazon_movers_and_shakers_{today_date}.xlsx"
    df.to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Entry point: walk every category linked from the "movers and shakers"
    # page, collect the product rows and write them all to one Excel workbook.
    base_url = "https://www.amazon.nl/gp/movers-and-shakers/"
    all_products = []

    category_links = get_category_links(base_url)
    for category_link in category_links:
        category, link = category_link['category'], category_link['link']
        print(f"Processing category: {category}")
        products = get_amazon_bestsellers(link, category)
        all_products += products
        # Pause 10 s before requesting the next category page.
        time.sleep(10)

    # Directory where the Excel file is saved.
    directory_path = "C:/Users/ThiagoBizacha/Desktop/Projeto_Automacao_Coleta_Dados/data/output/bot_amazon"
    save_to_excel(all_products, directory_path)
    print("Finalizado!")


Products saved to C:/Users/ThiagoBizacha/Desktop/Projeto_Automacao_Coleta_Dados/data/output/bot_amazon/bot_amazon_movers_and_shakers_2024-09-13.xlsx
Finalizado!


## BACKUP

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

def get_amazon_bestsellers(url, retries=5, backoff_factor=0.3):
    """Download a listing page and extract the products from its carousel cards.

    Parameters:
        url (str): URL of the listing page.
        retries (int): Maximum number of request attempts.
        backoff_factor (float): Base delay in seconds; the wait after failed
            attempt i (0-based) is backoff_factor * 2**i.

    Returns:
        list: One dict per ``li.a-carousel-card`` element with the keys title,
        currency_symbol, price_value, rating, image_link and product_link.
        Empty when the page could not be retrieved.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Stays None when no attempt is made (retries <= 0) or the last attempt
    # raised, so the check after the loop never touches an unbound name.
    response = None
    for attempt in range(retries):
        try:
            # Timeout so a stalled connection cannot hang the whole run.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            response = None
            print(f"Failed to retrieve the page. Error: {exc}. Retrying {attempt+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                # Any status other than 503 is treated as permanent.
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {attempt+1}/{retries}...")
        if attempt < retries - 1:
            # Exponential backoff; pointless after the final attempt.
            time.sleep(backoff_factor * (2 ** attempt))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    products = []

    # Selector for the product cards; adjust it if the page markup changes.
    items = soup.select("li.a-carousel-card")
    print(f"Found {len(items)} items.")

    for item in items:
        # Adjust these selectors as needed.
        title_tag = item.select_one("div.p13n-sc-truncate-desktop-type2")
        price_tag = item.select_one("span._cDEzb_p13n-sc-price_3mJ9Z")
        rating_tag = item.select_one("span.a-icon-alt")
        image_tag = item.select_one("img.a-dynamic-image")
        link_tag = item.select_one("a.a-link-normal")

        title = title_tag.get_text(strip=True) if title_tag is not None else "No title"
        print(f"Title: {title}")

        # Split the price into currency symbol and numeric text. A price
        # missing either part keeps the placeholder instead of raising.
        currency_symbol = "Not Available"
        value = "Not Available"
        if price_tag is not None:
            price_text = price_tag.get_text(strip=True)
            symbol_match = re.search(r'[^\d.,]+', price_text)
            value_match = re.search(r'[\d.,]+', price_text)
            if symbol_match:
                # Drop the (non-breaking) space between symbol and amount.
                currency_symbol = symbol_match.group(0).strip()
            if value_match:
                value = value_match.group(0)
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        rating = rating_tag.get_text(strip=True) if rating_tag is not None else "No rating"
        print(f"Rating: {rating}")

        image_link = image_tag.get('src') if image_tag is not None else "No image link"
        print(f"Image link: {image_link}")

        # An anchor may lack href; None cannot be appended to the base URL.
        href = link_tag.get('href') if link_tag is not None else None
        product_link = "https://www.amazon.nl" + href if href else "No product link"
        print(f"Product link: {product_link}")

        products.append({
            "title": title,
            "currency_symbol": currency_symbol,
            "price_value": value,
            "rating": rating,
            "image_link": image_link,
            "product_link": product_link
        })

    return products

def save_to_excel(products, filename):
    """Write the product records to an Excel workbook.

    Parameters:
        products: Records accepted by ``pandas.DataFrame`` (the caller in
            this cell passes a list of dicts).
        filename (str): Path of the workbook to write.
    """
    # Build the table and write it without the DataFrame index column.
    pd.DataFrame(products).to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Entry point: scrape the "new releases" landing page and save the rows
    # to an Excel file in the current working directory.
    url = "https://www.amazon.nl/gp/new-releases"
    products = get_amazon_bestsellers(url)
    save_to_excel(
        products,
        "amazon_new_releases.xlsx",
    )


Found 36 items.
Title: BSITSSS Watersensorische mat voor huisdier spelen, watersensorische speelmat voor katten en honden, verdikte sensorische watermat comfort, verkoelende kat waterspeelmat, speelgoed, grappige watermat
Currency Symbol: € , Value: 10,20
Rating: 2,1 van 5 sterren
Image link: https://images-eu.ssl-images-amazon.com/images/I/71Yr73bmSpL._AC_UL225_SR225,160_.jpg
Product link: https://www.amazon.nl/BSITSSS-Watersensorische-watersensorische-sensorische-waterspeelmat/dp/B0D7DKB389/ref=zg_bsnr_c_pet-supplies_d_sccl_1/259-0152082-0893144?pd_rd_w=VaZWc&content-id=amzn1.sym.f882a860-19f1-44df-b232-144e06421629&pf_rd_p=f882a860-19f1-44df-b232-144e06421629&pf_rd_r=JS8WCYZ26C2D8XAD6R28&pd_rd_wg=qXOon&pd_rd_r=f3d5b8fe-9092-44a0-91b0-c0b73434e4c1&pd_rd_i=B0D7DKB389&psc=1
Title: Draadloze Kat Water Fontein: Batterij Betrokken Roestvrij staal Pet Fontein, Automatische Draadloze Waterdispenser Binnenshuis, Metalen Kraan Fles voor Drinken, Oplaadbare Hond Water Bowl met 1 Filter
Currenc

### V2

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from lxml import html

def get_category_links(url):
    """Fetch a listing page and return the links found in its category tree.

    Parameters:
        url (str): URL of the page whose category navigation is read.

    Returns:
        list: One dict per category anchor with the keys 'category' (anchor
        text) and 'link' (absolute URL). Empty when the request fails or
        the response status is not 200.
    """
    from urllib.parse import urljoin  # local import: this cell does not import urllib

    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    anchors = soup.select('div._p13n-zg-nav-tree-all_style_zg-browse-group__88fbz a')

    category_links = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # anchor['href'] would raise KeyError for an anchor without href.
            continue
        category_links.append({
            'category': anchor.get_text(strip=True),
            # urljoin leaves already-absolute hrefs untouched.
            'link': urljoin('https://www.amazon.nl', href),
        })

    return category_links

def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3):
    """Download one category listing page and extract its products.

    Parameters:
        url (str): URL of the category listing page.
        category (str): Category name stored in every record.
        retries (int): Maximum number of request attempts.
        backoff_factor (float): Base delay in seconds; the wait after failed
            attempt i (0-based) is backoff_factor * 2**i.

    Returns:
        list: One dict per ``zg-grid-general-faceout`` element with the keys
        name, title, brand, currency_symbol, price_value, rating, reviews,
        platform, image_link, product_link and category. Empty when the page
        could not be retrieved.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Stays None when no attempt is made (retries <= 0) or the last attempt
    # raised, so the check after the loop never touches an unbound name.
    response = None
    for attempt in range(retries):
        try:
            # Timeout so a stalled connection cannot hang the whole run.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            response = None
            print(f"Failed to retrieve the page. Error: {exc}. Retrying {attempt+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                # Any status other than 503 is treated as permanent.
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {attempt+1}/{retries}...")
        if attempt < retries - 1:
            # Exponential backoff; pointless after the final attempt.
            time.sleep(backoff_factor * (2 ** attempt))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    tree = html.fromstring(response.content)
    products = []

    items = tree.xpath('//div[@class="zg-grid-general-faceout"]')
    print(f"Found {len(items)} items in category {category}.")

    # Title queries tried in order; the first one that matches wins.
    title_queries = (
        './/div[contains(@class, "p13n-sc-css-line-clamp-1")]/text()',
        './/span[contains(@class, "p13n-sc-css-line-clamp")]/text()',
        './/h2[contains(@class, "p13n-sc-css-line-clamp")]/text()',
        './/h3[contains(@class, "p13n-sc-css-line-clamp")]/text()',
    )

    for item in items:
        # Name: second '/'-separated piece of the product href.
        name_href = item.xpath('.//a[@class="a-link-normal aok-block"]/@href')
        href_parts = name_href[0].split('/') if name_href else []
        name = href_parts[1] if len(href_parts) > 1 else "No name"
        print(f"Name: {name}")

        title = "No title"
        for query in title_queries:
            found = item.xpath(query)
            if found:
                title = found[0].strip()
                break
        print(f"Title: {title}")

        # Brand and platform come from the same spans: the second text is
        # used as brand, the last one as platform.
        text_spans = item.xpath('.//span[contains(@class, "p13n-sc-text")]/text()')
        brand = text_spans[1].strip() if len(text_spans) > 1 else "No brand"
        print(f"Brand: {brand}")

        # Split the price into currency symbol and numeric text. A price
        # missing either part keeps the placeholder instead of raising.
        currency_symbol = "Not Available"
        value = "Not Available"
        price = item.xpath('.//span[contains(@class, "p13n-sc-price")]/text()')
        if price:
            price_text = price[0].strip()
            symbol_match = re.search(r'[^\d.,]+', price_text)
            value_match = re.search(r'[\d.,]+', price_text)
            if symbol_match:
                # Drop the (non-breaking) space between symbol and amount.
                currency_symbol = symbol_match.group(0).strip()
            if value_match:
                value = value_match.group(0)
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        rating = item.xpath('.//span[contains(@class, "a-icon-alt")]/text()')
        rating = rating[0].strip() if rating else "No rating"
        print(f"Rating: {rating}")

        reviews = item.xpath('.//a[contains(@class, "a-size-small") and contains(@class, "a-link-normal")]/text()')
        reviews = reviews[0].strip() if reviews else "No reviews"
        print(f"Reviews: {reviews}")

        platform = text_spans[-1].strip() if text_spans else "No platform"
        print(f"Platform: {platform}")

        image = item.xpath('.//img[contains(@class, "a-dynamic-image") or contains(@class, "s-image")]/@src')
        image_link = image[0] if image else "No image link"
        print(f"Image link: {image_link}")

        link = item.xpath('.//a[contains(@class, "a-link-normal") and contains(@class, "aok-block")]/@href')
        product_link = "https://www.amazon.nl" + link[0] if link else "No product link"
        print(f"Product link: {product_link}")

        products.append({
            "name": name,
            "title": title,
            "brand": brand,
            "currency_symbol": currency_symbol,
            "price_value": value,
            "rating": rating,
            "reviews": reviews,
            "platform": platform,
            "image_link": image_link,
            "product_link": product_link,
            "category": category
        })

    return products

def save_to_excel(products, filename):
    """Write the product records to an Excel workbook.

    Parameters:
        products: Records accepted by ``pandas.DataFrame`` (the caller in
            this cell passes a list of dicts).
        filename (str): Path of the workbook to write.
    """
    # Build the table and write it without the DataFrame index column.
    pd.DataFrame(products).to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Entry point: walk every category linked from the bestsellers page,
    # collect the product rows and write them all to one Excel workbook.
    base_url = "https://www.amazon.nl/gp/bestsellers/"
    all_products = []

    category_links = get_category_links(base_url)
    for category_link in category_links:
        category, link = category_link['category'], category_link['link']
        print(f"Processing category: {category}")
        products = get_amazon_bestsellers(link, category)
        all_products += products
        # Pause 10 s before requesting the next category page.
        time.sleep(10)

    save_to_excel(all_products, "amazon_bestsellers_by_category.xlsx")
    print("Finalizado!")


Processing category: Amazon Renewed
Found 30 items in category Amazon Renewed.
Name: Sony-Mdr-Zx110-Opvouwbare-Instapkoptelefoon-Uitstekend
Title: No title
Brand: No brand
Currency Symbol: € , Value: 14,95
Rating: 4,5 van 5 sterren
Reviews: No reviews
Platform: No platform
Image link: https://images-eu.ssl-images-amazon.com/images/I/61ZbQQiPo4L._AC_UL300_SR300,200_.jpg
Product link: https://www.amazon.nl/Sony-Mdr-Zx110-Opvouwbare-Instapkoptelefoon-Uitstekend/dp/B00NBR7962/ref=zg_bs_g_amazon-renewed_d_sccl_1/261-9612622-4262127?psc=1
Name: Apple-Magic-Keyboard-voor-9%E2%80%91inch
Title: No title
Brand: No brand
Currency Symbol: € , Value: 229,89
Rating: 4,4 van 5 sterren
Reviews: No reviews
Platform: No platform
Image link: https://images-eu.ssl-images-amazon.com/images/I/81UDF62AqHS._AC_UL300_SR300,200_.jpg
Product link: https://www.amazon.nl/Apple-Magic-Keyboard-voor-9%E2%80%91inch/dp/B0B2XSLDL7/ref=zg_bs_g_amazon-renewed_d_sccl_2/261-9612622-4262127?psc=1
Name: Seagate-Enterprise-Cap

KeyboardInterrupt: 

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from lxml import html

def get_category_links(url):
    """Fetch a listing page and return the links found in its category tree.

    Parameters:
        url (str): URL of the page whose category navigation is read.

    Returns:
        list: One dict per category anchor with the keys 'category' (anchor
        text) and 'link' (absolute URL). Empty when the request fails or
        the response status is not 200.
    """
    from urllib.parse import urljoin  # local import: this cell does not import urllib

    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    anchors = soup.select('div._p13n-zg-nav-tree-all_style_zg-browse-group__88fbz a')

    category_links = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # anchor['href'] would raise KeyError for an anchor without href.
            continue
        category_links.append({
            'category': anchor.get_text(strip=True),
            # urljoin leaves already-absolute hrefs untouched.
            'link': urljoin('https://www.amazon.nl', href),
        })

    return category_links

def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3):
    """Download one category listing page and extract its products.

    Parameters:
        url (str): URL of the category listing page.
        category (str): Category name stored in every record.
        retries (int): Maximum number of request attempts.
        backoff_factor (float): Base delay in seconds; the wait after failed
            attempt i (0-based) is backoff_factor * 2**i.

    Returns:
        list: One dict per ``zg-grid-general-faceout`` element with the keys
        "Produto Individual", "Posição na Lista", "Imagem do Produto",
        "Título", "Link", "Numero de estrelas", "Numero de avaliações",
        "Preço" and "category". Empty when the page could not be retrieved.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Stays None when no attempt is made (retries <= 0) or the last attempt
    # raised, so the check after the loop never touches an unbound name.
    response = None
    for attempt in range(retries):
        try:
            # Timeout so a stalled connection cannot hang the whole run.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            response = None
            print(f"Failed to retrieve the page. Error: {exc}. Retrying {attempt+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                # Any status other than 503 is treated as permanent.
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {attempt+1}/{retries}...")
        if attempt < retries - 1:
            # Exponential backoff; pointless after the final attempt.
            time.sleep(backoff_factor * (2 ** attempt))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    tree = html.fromstring(response.content)
    products = []

    items = tree.xpath('//div[@class="zg-grid-general-faceout"]')
    print(f"Found {len(items)} items in category {category}.")

    # Title queries tried in order; the first one that matches wins.
    title_queries = (
        './/div[contains(@class, "p13n-sc-css-line-clamp-1")]/text()',
        './/span[contains(@class, "p13n-sc-css-line-clamp")]/text()',
        './/h2[contains(@class, "p13n-sc-css-line-clamp")]/text()',
        './/h3[contains(@class, "p13n-sc-css-line-clamp")]/text()',
    )

    # index is the 1-based position of the item on the page.
    for index, item in enumerate(items, start=1):
        # Name: second '/'-separated piece of the product href.
        name_href = item.xpath('.//a[@class="a-link-normal aok-block"]/@href')
        href_parts = name_href[0].split('/') if name_href else []
        name = href_parts[1] if len(href_parts) > 1 else "No name"
        print(f"Name: {name}")

        title = "No title"
        for query in title_queries:
            found = item.xpath(query)
            if found:
                title = found[0].strip()
                break
        print(f"Title: {title}")

        # Brand is only printed; it is not part of the stored record.
        brand = item.xpath('.//span[contains(@class, "p13n-sc-text")]/text()')
        brand = brand[1].strip() if len(brand) > 1 else "No brand"
        print(f"Brand: {brand}")

        # Split the price into currency symbol and numeric text. A price
        # missing either part keeps the placeholder instead of raising.
        currency_symbol = "Not Available"
        value = "Not Available"
        price = item.xpath('.//span[contains(@class, "p13n-sc-price")]/text()')
        if price:
            price_text = price[0].strip()
            symbol_match = re.search(r'[^\d.,]+', price_text)
            value_match = re.search(r'[\d.,]+', price_text)
            if symbol_match:
                # Drop the (non-breaking) space between symbol and amount.
                currency_symbol = symbol_match.group(0).strip()
            if value_match:
                value = value_match.group(0)
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        rating = item.xpath('.//span[contains(@class, "a-icon-alt")]/text()')
        rating = rating[0].strip() if rating else "No rating"
        print(f"Rating: {rating}")

        reviews = item.xpath('.//a[contains(@class, "a-size-small") and contains(@class, "a-link-normal")]/text()')
        reviews = reviews[0].strip() if reviews else "No reviews"
        print(f"Reviews: {reviews}")

        image = item.xpath('.//img[contains(@class, "a-dynamic-image") or contains(@class, "s-image")]/@src')
        image_link = image[0] if image else "No image link"
        print(f"Image link: {image_link}")

        link = item.xpath('.//a[contains(@class, "a-link-normal") and contains(@class, "aok-block")]/@href')
        product_link = "https://www.amazon.nl" + link[0] if link else "No product link"
        print(f"Product link: {product_link}")

        products.append({
            "Produto Individual": name,
            "Posição na Lista": index,
            "Imagem do Produto": image_link,
            "Título": title,
            "Link": product_link,
            "Numero de estrelas": rating,
            "Numero de avaliações": reviews,
            "Preço": f"{currency_symbol} {value}",
            "category": category
        })

    return products

def save_to_excel(products, filename):
    """Write the product records to an Excel workbook.

    Parameters:
        products: Records accepted by ``pandas.DataFrame`` (the caller in
            this cell passes a list of dicts).
        filename (str): Path of the workbook to write.
    """
    # Build the table and write it without the DataFrame index column.
    pd.DataFrame(products).to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Entry point: walk every category linked from the bestsellers page,
    # collect the product rows and write them all to one Excel workbook.
    base_url = "https://www.amazon.nl/gp/bestsellers/"
    all_products = []

    category_links = get_category_links(base_url)
    for category_link in category_links:
        category, link = category_link['category'], category_link['link']
        print(f"Processing category: {category}")
        products = get_amazon_bestsellers(link, category)
        all_products += products
        # Pause 10 s before requesting the next category page.
        time.sleep(10)

    save_to_excel(all_products, "amazon_bestsellers_by_category.xlsx")
    print("Finalizado!")


Processing category: Amazon Renewed
Found 30 items in category Amazon Renewed.
Name: Apple-10-2-inch-Wi-Fi-Spacezwart-Refurbished
Title: No title
Brand: No brand
Currency Symbol: € , Value: 244,89
Rating: 4,4 van 5 sterren
Reviews: No reviews
Image link: https://images-eu.ssl-images-amazon.com/images/I/71F8udBqz3L._AC_UL300_SR300,200_.jpg
Product link: https://www.amazon.nl/Apple-10-2-inch-Wi-Fi-Spacezwart-Refurbished/dp/B08N89P2QZ/ref=zg_bs_g_amazon-renewed_d_sccl_1/258-7813849-1708323?psc=1
Name: Apple-iPhone-128GB-Sierra-Blue
Title: No title
Brand: No brand
Currency Symbol: € , Value: 595,00
Rating: 3,9 van 5 sterren
Reviews: No reviews
Image link: https://images-eu.ssl-images-amazon.com/images/I/61RAsVPOjxL._AC_UL300_SR300,200_.jpg
Product link: https://www.amazon.nl/Apple-iPhone-128GB-Sierra-Blue/dp/B09ML78C2J/ref=zg_bs_g_amazon-renewed_d_sccl_2/258-7813849-1708323?psc=1
Name: Apple-iPad-Air-64GB-Wi-Fi
Title: No title
Brand: No brand
Currency Symbol: € , Value: 211,89
Rating: 4,3 

KeyboardInterrupt: 

In [19]:
import requests
from lxml import html
import pandas as pd
import time
import re

def get_category_links(url):
    """Fetch a listing page and return the links found in its category tree.

    Parameters:
        url (str): URL of the page whose category navigation is read.

    Returns:
        list: One dict per category anchor with the keys 'category' (anchor
        text) and 'link' (absolute URL). Empty when the request fails or
        the response status is not 200.
    """
    from urllib.parse import urljoin  # local import: this cell does not import urllib

    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    tree = html.fromstring(response.content)
    anchors = tree.xpath('//div[contains(@class, "_p13n-zg-nav-tree-all_style_zg-browse-group__88fbz")]//a')

    category_links = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # An anchor without href cannot be followed (and None cannot be
            # concatenated to the base URL).
            continue
        category_links.append({
            'category': anchor.text_content().strip(),
            # urljoin leaves already-absolute hrefs untouched.
            'link': urljoin('https://www.amazon.nl', href),
        })

    return category_links

def extract_product_details(items):
    """Build one record per product element of a listing page.

    Parameters:
        items: Iterable of parsed elements exposing ``xpath()``; each one is
            queried for the fields of a single product.

    Returns:
        list: Dicts with the keys "Produto Individual", "Posição na Lista",
        "Imagem do Produto", "Título", "Link", "Numero de estrelas",
        "Numero de avaliações", "Símbolo da Moeda" and "Valor". Fields that
        are not found get a placeholder string.
    """
    from urllib.parse import urljoin  # local import: not imported by this cell

    products = []

    for index, item in enumerate(items, start=1):
        # Product identifier (first data-asin attribute found under the item).
        product_id = item.xpath('.//@data-asin')
        product_id = product_id[0] if product_id else "No ID"
        print(f"Product ID: {product_id}")

        # Position in the list; falls back to the enumeration index.
        position = item.xpath('.//span[@class="zg-bdg-text"]/text()')
        position = position[0].strip() if position else str(index)
        print(f"Posição: {position}")

        # Product image.
        image = item.xpath('.//img[contains(@class, "a-dynamic-image")]/@src')
        image_link = image[0] if image else "No image link"
        print(f"Image link: {image_link}")

        # Title and link.
        title = item.xpath('.//a/span/div/text()')
        title = title[0].strip() if title else "No title"
        print(f"Title: {title}")

        link = item.xpath('.//a[contains(@class, "a-link-normal")]/@href')
        # urljoin keeps absolute hrefs intact instead of double-prefixing them.
        product_link = urljoin("https://www.amazon.nl", link[0]) if link else "No product link"
        print(f"Product link: {product_link}")

        # Ratings.
        rating = item.xpath('.//span[contains(@class, "a-icon-alt")]/text()')
        rating = rating[0].strip() if rating else "No rating"
        print(f"Rating: {rating}")

        reviews = item.xpath('.//span[@class="a-size-small"]/text()')
        reviews = reviews[0].strip() if reviews else "No reviews"
        print(f"Reviews: {reviews}")

        # Price: split into currency symbol and numeric text.
        price = item.xpath('.//span[contains(@class, "p13n-sc-price")]/text()')
        if price:
            price_text = price[0].strip()
            symbol_match = re.search(r'[^\d.,]+', price_text)
            value_match = re.search(r'[\d.,]+', price_text)
            # Strip the (non-breaking) space that separates symbol and amount,
            # and keep only the first amount so a range is not glued together.
            currency_symbol = symbol_match.group(0).strip() if symbol_match else ""
            value = value_match.group(0) if value_match else ""
        else:
            currency_symbol = "Not Available"
            value = "Not Available"
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        products.append({
            "Produto Individual": product_id,
            "Posição na Lista": position,
            "Imagem do Produto": image_link,
            "Título": title,
            "Link": product_link,
            "Numero de estrelas": rating,
            "Numero de avaliações": reviews,
            "Símbolo da Moeda": currency_symbol,
            "Valor": value
        })

    return products

def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3):
    """Download one category listing page and extract its products.

    Parameters:
        url (str): URL of the category listing page.
        category (str): Category name, used in the progress message.
        retries (int): Maximum number of request attempts.
        backoff_factor (float): Base delay in seconds; the wait after failed
            attempt i (0-based) is backoff_factor * 2**i.

    Returns:
        list: Product dicts produced by extract_product_details, or an empty
        list when the page could not be retrieved.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Stays None when no attempt is made (retries <= 0) or the last attempt
    # raised, so the check after the loop never touches an unbound name.
    response = None
    for attempt in range(retries):
        try:
            # Timeout so a stalled connection cannot hang the whole run.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            response = None
            print(f"Failed to retrieve the page. Error: {exc}. Retrying {attempt+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                # Any status other than 503 is treated as permanent.
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {attempt+1}/{retries}...")
        if attempt < retries - 1:
            # Exponential backoff; pointless after the final attempt.
            time.sleep(backoff_factor * (2 ** attempt))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    tree = html.fromstring(response.content)
    items = tree.xpath('//div[contains(@id, "p13n-asin-index")]')
    print(f"Found {len(items)} items in category {category}.")

    products = extract_product_details(items)
    return products

def save_to_excel(products, filename):
    """Write the product records to an Excel workbook.

    Parameters:
        products: Records accepted by ``pandas.DataFrame`` (the caller in
            this cell passes a list of dicts).
        filename (str): Path of the workbook to write.
    """
    # Build the table and write it without the DataFrame index column.
    pd.DataFrame(products).to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Entry point: walk every category linked from the bestsellers page,
    # collect the product rows and write them all to one Excel workbook.
    base_url = "https://www.amazon.nl/gp/bestsellers/"
    all_products = []

    category_links = get_category_links(base_url)
    for category_link in category_links:
        category, link = category_link['category'], category_link['link']
        print(f"Processing category: {category}")
        products = get_amazon_bestsellers(link, category)
        all_products += products
        # Pause 10 s before requesting the next category page.
        time.sleep(10)

    save_to_excel(all_products, "amazon_bestsellers_by_category.xlsx")
    print("Finalizado!")


Processing category: Amazon Renewed
Found 30 items in category Amazon Renewed.
Product ID: B08N89P2QZ
Posição: #1
Image link: https://images-eu.ssl-images-amazon.com/images/I/71F8udBqz3L._AC_UL300_SR300,200_.jpg
Title: 2020 Apple iPad (10.2-inch, Wi-Fi, 32GB) Spacezwart (Refurbished)
Product link: https://www.amazon.nl/Apple-10-2-inch-Wi-Fi-Spacezwart-Refurbished/dp/B08N89P2QZ/ref=zg_bs_g_amazon-renewed_d_sccl_1/259-1305506-5100743?psc=1
Rating: 4,4 van 5 sterren
Reviews: 403
Currency Symbol: € , Value: 244,89
Product ID: B09ML78C2J
Posição: #2
Image link: https://images-eu.ssl-images-amazon.com/images/I/61RAsVPOjxL._AC_UL300_SR300,200_.jpg
Title: Apple iPhone 13 Pro, 128GB, Sierra Blue - (Refurbished)
Product link: https://www.amazon.nl/Apple-iPhone-128GB-Sierra-Blue/dp/B09ML78C2J/ref=zg_bs_g_amazon-renewed_d_sccl_2/259-1305506-5100743?psc=1
Rating: 3,9 van 5 sterren
Reviews: 403
Currency Symbol: € , Value: 595,00
Product ID: B07J4CMSVS
Posição: #3
Image link: https://images-eu.ssl-im

In [21]:
import requests
from lxml import html
import pandas as pd
import time
import re
from datetime import datetime

def get_category_links(url):
    """Fetch a listing page and return the links found in its category tree.

    Parameters:
        url (str): URL of the page whose category navigation is read.

    Returns:
        list: One dict per category anchor with the keys 'category' (anchor
        text) and 'link' (absolute URL). Empty when the request fails or
        the response status is not 200.
    """
    from urllib.parse import urljoin  # local import: this cell does not import urllib

    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    tree = html.fromstring(response.content)
    anchors = tree.xpath('//div[contains(@class, "_p13n-zg-nav-tree-all_style_zg-browse-group__88fbz")]//a')

    category_links = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # An anchor without href cannot be followed (and None cannot be
            # concatenated to the base URL).
            continue
        category_links.append({
            'category': anchor.text_content().strip(),
            # urljoin leaves already-absolute hrefs untouched.
            'link': urljoin('https://www.amazon.nl', href),
        })

    return category_links

def _first_match(node, expression, default, strip=False):
    """Return the first result of node.xpath(expression), or default when nothing matches."""
    matches = node.xpath(expression)
    if not matches:
        return default
    return matches[0].strip() if strip else matches[0]


def _split_price(price_text):
    """Split a price string into (currency symbol, numeric text)."""
    # Whitespace (including non-breaking spaces) is excluded so the symbol
    # does not carry the separator that sits between symbol and amount.
    symbol = ''.join(re.findall(r'[^\d.,\s]', price_text))
    value = ''.join(re.findall(r'[\d.,]+', price_text))
    return symbol, value


def extract_product_details(items, category):
    """
    Build one record per product node, printing each extracted field along the way.

    Parameters:
        items (iterable): Nodes exposing an ``xpath(expression)`` method that
            returns a list of strings (the caller passes lxml elements).
        category (str): Category label stored on every record.

    Returns:
        list: One dict per item with the keys "asim", "rank", "product_photo",
            "product_title", "link", "Name", "rating", "reviews", "symbol",
            "value", "category" and "date". Fields that cannot be found get a
            placeholder string ("No ID", "No title", "Not Available", ...);
            a missing rank falls back to the item's 1-based position.
    """
    # Function-scope import so this block does not depend on the cell's import list.
    from urllib.parse import urljoin, urlparse

    products = []
    # Today's date, stamped on every record.
    today_date = datetime.today().strftime('%Y-%m-%d')

    for index, item in enumerate(items, start=1):
        # Product identifier (first data-asin attribute found at or below the item node).
        product_id = _first_match(item, './/@data-asin', "No ID")
        print(f"Product ID: {product_id}")

        # Position in the list; falls back to the enumeration index.
        position = _first_match(item, './/span[@class="zg-bdg-text"]/text()', str(index), strip=True)
        print(f"Posição: {position}")

        # Product image.
        image_link = _first_match(item, './/img[contains(@class, "a-dynamic-image")]/@src', "No image link")
        print(f"Image link: {image_link}")

        # Title and link.
        title = _first_match(item, './/a/span/div/text()', "No title", strip=True)
        print(f"Title: {title}")

        href = _first_match(item, './/a[contains(@class, "a-link-normal")]/@href', None)
        # urljoin keeps an already-absolute href intact instead of doubling the host.
        product_link = urljoin("https://www.amazon.nl", href) if href else "No product link"
        print(f"Product link: {product_link}")

        # Name: first non-empty path segment of the product href. Going
        # through urlparse also covers absolute URLs, and the emptiness check
        # avoids an IndexError on hrefs that contain no path segment.
        name = "No name"
        name_href = _first_match(item, './/a[@class="a-link-normal aok-block"]/@href', None)
        if name_href:
            segments = [part for part in urlparse(name_href).path.split('/') if part]
            if segments:
                name = segments[0]
        print(f"Name: {name}")

        # Ratings.
        rating = _first_match(item, './/span[contains(@class, "a-icon-alt")]/text()', "No rating", strip=True)
        print(f"Rating: {rating}")

        reviews = _first_match(item, './/span[@class="a-size-small"]/text()', "No reviews", strip=True)
        print(f"Reviews: {reviews}")

        # Price, separated into currency symbol and numeric text.
        price = _first_match(item, './/span[contains(@class, "p13n-sc-price")]/text()', None, strip=True)
        if price is not None:
            currency_symbol, value = _split_price(price)
        else:
            currency_symbol = "Not Available"
            value = "Not Available"
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        products.append({
            # NOTE(review): "asim" looks like a typo for "asin"; the key is
            # kept unchanged because consumers of these records may rely on it.
            "asim": product_id,
            "rank": position,
            "product_photo": image_link,
            "product_title": title,
            "link": product_link,
            "Name": name,
            "rating": rating,
            "reviews": reviews,
            "symbol": currency_symbol,
            "value": value,
            "category": category,
            "date": today_date
        })

    return products

def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3, timeout=30):
    """
    Download one category page and return the product records found on it.

    Parameters:
        url (str): Address of the category page.
        category (str): Category label, used in messages and stored on each record.
        retries (int): Maximum number of request attempts.
        backoff_factor (float): Base delay in seconds; the wait before the
            next attempt is backoff_factor * 2 ** attempt_index.
        timeout (float): Seconds to wait for the server before an attempt
            is counted as failed.

    Returns:
        list: Records produced by extract_product_details for the page, or
            an empty list when the page could not be retrieved.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Initialised up front so that retries=0 (loop body never runs) reaches
    # the failure branch below instead of raising NameError.
    response = None
    for i in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Network-level failures are retried the same way as a 503.
            response = None
            print(f"Failed to retrieve the page. Error: {exc}. Retrying {i+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                # Any status other than 200/503 is not retried.
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {i+1}/{retries}...")

        # Exponential backoff; there is nothing to wait for after the last attempt.
        if i < retries - 1:
            time.sleep(backoff_factor * (2 ** i))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    tree = html.fromstring(response.content)
    items = tree.xpath('//div[contains(@id, "p13n-asin-index")]')
    print(f"Found {len(items)} items in category {category}.")

    products = extract_product_details(items, category)
    return products

def save_to_excel(products, filename):
    """
    Write the collected product records to an Excel file and report the path.

    Parameters:
        products: Data handed to ``pd.DataFrame`` (the script passes a list of dicts).
        filename (str): Destination passed to ``DataFrame.to_excel``.
    """
    # index=False leaves the DataFrame's row index out of the sheet.
    pd.DataFrame(products).to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Script entry point: discover the categories, scrape each one in turn,
    # then write everything that was collected to a single Excel file.
    base_url = "https://www.amazon.nl/gp/bestsellers/"
    category_links = get_category_links(base_url)

    all_products = []
    for category_link in category_links:
        category, link = category_link['category'], category_link['link']
        print(f"Processing category: {category}")

        products = get_amazon_bestsellers(link, category)
        all_products += products

        # Fixed 10-second pause between consecutive category requests.
        time.sleep(10)

    save_to_excel(all_products, "amazon_bestsellers_by_category.xlsx")
    print("Finalizado!")


Processing category: Amazon Renewed
Found 30 items in category Amazon Renewed.
Product ID: B08N89P2QZ
Posição: #1
Image link: https://images-eu.ssl-images-amazon.com/images/I/71F8udBqz3L._AC_UL300_SR300,200_.jpg
Title: 2020 Apple iPad (10.2-inch, Wi-Fi, 32GB) Spacezwart (Refurbished)
Product link: https://www.amazon.nl/Apple-10-2-inch-Wi-Fi-Spacezwart-Refurbished/dp/B08N89P2QZ/ref=zg_bs_g_amazon-renewed_d_sccl_1/259-2930722-3790644?psc=1
Name: Apple-10-2-inch-Wi-Fi-Spacezwart-Refurbished
Rating: 4,4 van 5 sterren
Reviews: 403
Currency Symbol: € , Value: 244,89
Product ID: B09ML78C2J
Posição: #2
Image link: https://images-eu.ssl-images-amazon.com/images/I/61RAsVPOjxL._AC_UL300_SR300,200_.jpg
Title: Apple iPhone 13 Pro, 128GB, Sierra Blue - (Refurbished)
Product link: https://www.amazon.nl/Apple-iPhone-128GB-Sierra-Blue/dp/B09ML78C2J/ref=zg_bs_g_amazon-renewed_d_sccl_2/259-2930722-3790644?psc=1
Name: Apple-iPhone-128GB-Sierra-Blue
Rating: 3,9 van 5 sterren
Reviews: 403
Currency Symbol: €