In [29]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

def get_amazon_bestsellers(url, retries=5, backoff_factor=0.3, timeout=10):
    """Fetch an Amazon.nl listing page and extract its carousel product cards.

    The page is requested up to ``retries`` times. An HTTP 503 or a
    network-level ``requests`` exception triggers another attempt after an
    exponentially growing pause; any other non-200 status aborts at once.

    Args:
        url: Address of the page to download.
        retries: Maximum number of download attempts.
        backoff_factor: Base delay in seconds; attempt ``i`` (0-based) is
            followed by a pause of ``backoff_factor * 2 ** i``.
        timeout: Seconds to wait for the server before an attempt is
            counted as failed.

    Returns:
        A list with one dict per ``li.a-carousel-card`` element, holding the
        keys ``title``, ``currency_symbol``, ``price_value``, ``rating``,
        ``image_link`` and ``product_link``. Missing pieces are replaced by
        placeholder strings. An empty list is returned when the page could
        not be downloaded.
    """
    # Standard-library helper; imported locally so the function is self-contained.
    from urllib.parse import urljoin

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Stays None when no attempt ever produced a response (e.g. retries=0).
    response = None
    for i in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Timeouts / connection errors are treated like a 503: try again.
            response = None
            print(f"Request failed ({exc}). Retrying {i+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {i+1}/{retries}...")
        # Exponential backoff, skipped once no further attempt will follow.
        if i < retries - 1:
            time.sleep(backoff_factor * (2 ** i))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    products = []

    # Each product is expected to live in one carousel card; adjust the
    # selector if the page markup changes.
    items = soup.select("li.a-carousel-card")
    print(f"Found {len(items)} items.")

    for item in items:
        # Adjust these selectors as needed when the page layout changes.
        title_tag = item.select_one("div.p13n-sc-truncate-desktop-type2")
        price_tag = item.select_one("span._cDEzb_p13n-sc-price_3mJ9Z")
        rating_tag = item.select_one("span.a-icon-alt")
        image_tag = item.select_one("img.a-dynamic-image")
        link_tag = item.select_one("a.a-link-normal")

        title = title_tag.get_text(strip=True) if title_tag else "No title"
        print(f"Title: {title}")

        # Split the price text into its currency symbol and numeric part.
        # re.search keeps the first match (like findall(...)[0]) but does not
        # raise when one of the two parts is absent.
        currency_symbol = "Not Available"
        value = "Not Available"
        if price_tag:
            price_text = price_tag.get_text(strip=True)
            symbol_match = re.search(r'[^\d.,]+', price_text)
            value_match = re.search(r'[\d.,]+', price_text)
            if symbol_match:
                currency_symbol = symbol_match.group(0)
            if value_match:
                value = value_match.group(0)
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        rating = rating_tag.get_text(strip=True) if rating_tag else "No rating"
        print(f"Rating: {rating}")

        # An <img> without a src attribute is reported like a missing image.
        image_link = (image_tag.get('src') if image_tag else None) or "No image link"
        print(f"Image link: {image_link}")

        # urljoin resolves relative hrefs against the site root and leaves
        # absolute hrefs alone; an anchor without href yields the placeholder.
        href = link_tag.get('href') if link_tag else None
        product_link = urljoin("https://www.amazon.nl", href) if href else "No product link"
        print(f"Product link: {product_link}")

        products.append({
            "title": title,
            "currency_symbol": currency_symbol,
            "price_value": value,
            "rating": rating,
            "image_link": image_link,
            "product_link": product_link
        })

    return products

def save_to_excel(products, filename):
    """Write the scraped product records to an Excel file.

    Args:
        products: Records passed straight to ``pd.DataFrame``.
        filename: Destination path handed to ``DataFrame.to_excel``.
    """
    # index=False keeps the DataFrame's row index out of the sheet.
    pd.DataFrame(products).to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Scrape the new-releases landing page and export the result to Excel.
    url = "https://www.amazon.nl/gp/new-releases"
    output_path = "amazon_new_releases.xlsx"

    products = get_amazon_bestsellers(url)
    save_to_excel(products, output_path)


Found 36 items.
Title: Bijengif reparatiecrème, vlekkenverwijderingscrème, bijengif huidtagverwijderaar, hydraterende vochtinbrengende crème voor alle huid
Currency Symbol: € , Value: 3,46
Rating: No rating
Image link: https://images-eu.ssl-images-amazon.com/images/I/61I0uPGWTcL._AC_UL225_SR225,160_.jpg
Product link: https://www.amazon.nl/reparatiecr%C3%A8me-vlekkenverwijderingscr%C3%A8me-huidtagverwijderaar-hydraterende-vochtinbrengende/dp/B0D6VJG91S/ref=zg_bsnr_c_beauty_d_sccl_1/260-0109900-4741351?pd_rd_w=8aMxO&content-id=amzn1.sym.f882a860-19f1-44df-b232-144e06421629&pf_rd_p=f882a860-19f1-44df-b232-144e06421629&pf_rd_r=AESPJ1TRZKA4JPGBCA0W&pd_rd_wg=J3SPl&pd_rd_r=194761f6-532e-444a-abb7-91c0b46acb6e&pd_rd_i=B0D6VJG91S&psc=1
Title: 2 Stuks Feromone Parfum Dames, Venom Feromoon parfum, Feromonen Parfum Voor Dames, Feromonen Parfum, Roll-On Feromon Parfum, Venom voor Haar, Feromon Parfum Vrouwen Parfum Dames
Currency Symbol: € , Value: 9,90
Rating: 3,2 van 5 sterren
Image link: https:/

In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from lxml import html

def get_category_links(url, timeout=10):
    """Collect the category names and URLs from the navigation tree of a page.

    Args:
        url: Address of the page whose category navigation is read.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{'category': <anchor text>, 'link': <absolute URL>}``
        dicts, one per anchor with an ``href`` inside the browse-group div.
        An empty list is returned when the request fails or the response
        status is not 200.
    """
    # Standard-library helper; imported locally so the function is self-contained.
    from urllib.parse import urljoin

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad status: message + [].
        print(f"Failed to retrieve the page. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    categories = soup.select('div._p13n-zg-nav-tree-all_style_zg-browse-group__88fbz a')

    # Anchors without an href are skipped instead of raising KeyError;
    # urljoin keeps absolute hrefs intact and resolves relative ones.
    category_links = [
        {
            'category': cat.get_text(strip=True),
            'link': urljoin('https://www.amazon.nl', cat['href']),
        }
        for cat in categories
        if cat.get('href')
    ]

    return category_links

def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3, timeout=10):
    """Fetch one category page and extract its product grid entries via XPath.

    The page is requested up to ``retries`` times. An HTTP 503 or a
    network-level ``requests`` exception triggers another attempt after an
    exponentially growing pause; any other non-200 status aborts at once.

    Args:
        url: Address of the category page to download.
        category: Label stored in every returned record and used in log output.
        retries: Maximum number of download attempts.
        backoff_factor: Base delay in seconds; attempt ``i`` (0-based) is
            followed by a pause of ``backoff_factor * 2 ** i``.
        timeout: Seconds to wait for the server before an attempt is
            counted as failed.

    Returns:
        A list with one dict per ``div.zg-grid-general-faceout`` element,
        holding the keys ``name``, ``title``, ``brand``, ``currency_symbol``,
        ``price_value``, ``rating``, ``reviews``, ``platform``,
        ``image_link``, ``product_link`` and ``category``. Missing pieces are
        replaced by placeholder strings. An empty list is returned when the
        page could not be downloaded.
    """
    # Standard-library helpers; imported locally so the function is self-contained.
    from urllib.parse import urljoin, urlparse

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Stays None when no attempt ever produced a response (e.g. retries=0).
    response = None
    for i in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Timeouts / connection errors are treated like a 503: try again.
            response = None
            print(f"Request failed ({exc}). Retrying {i+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {i+1}/{retries}...")
        # Exponential backoff, skipped once no further attempt will follow.
        if i < retries - 1:
            time.sleep(backoff_factor * (2 ** i))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    tree = html.fromstring(response.content)
    products = []

    items = tree.xpath('//div[@class="zg-grid-general-faceout"]')
    print(f"Found {len(items)} items in category {category}.")

    for item in items:
        # The product anchor's href feeds both the slug ("name") and the link.
        hrefs = item.xpath('.//a[@class="a-link-normal aok-block"]/@href')
        # Title, brand and platform are all read from the same line-clamp
        # divs: first text node -> title, second -> brand, last -> platform.
        clamp_texts = item.xpath('.//div[@class="_cDEzb_p13n-sc-css-line-clamp-1_1Fn1y"]/text()')
        price = item.xpath('.//span[contains(@class,"_cDEzb_p13n-sc-price_3mJ9Z") or contains(@class,"p13n-sc-price")]/text()')
        rating = item.xpath('.//span[@class="a-icon-alt"]/text()')
        reviews = item.xpath('.//a[@class="a-size-small a-link-normal"]/text()')
        image = item.xpath('.//img[contains(@class,"a-dynamic-image") or contains(@class,"s-image")]/@src')

        # The slug is the first path segment of the href. Going through
        # urlparse keeps this correct for absolute hrefs, and the length
        # check avoids an IndexError on hrefs without a '/'.
        path_parts = urlparse(str(hrefs[0])).path.split('/') if hrefs else []
        name = path_parts[1] if len(path_parts) > 1 and path_parts[1] else "No name"
        print(f"Name: {name}")

        title = clamp_texts[0].strip() if clamp_texts else "No title"
        print(f"Title: {title}")

        brand = clamp_texts[1].strip() if len(clamp_texts) > 1 else "No brand"
        print(f"Brand: {brand}")

        # Split the price text into its currency symbol and numeric part.
        # re.search keeps the first match (like findall(...)[0]) but does not
        # raise when one of the two parts is absent.
        currency_symbol = "Not Available"
        value = "Not Available"
        if price:
            price_text = price[0].strip()
            symbol_match = re.search(r'[^\d.,]+', price_text)
            value_match = re.search(r'[\d.,]+', price_text)
            if symbol_match:
                currency_symbol = symbol_match.group(0)
            if value_match:
                value = value_match.group(0)
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        rating = rating[0].strip() if rating else "No rating"
        print(f"Rating: {rating}")

        reviews = reviews[0].strip() if reviews else "No reviews"
        print(f"Reviews: {reviews}")

        platform = clamp_texts[-1].strip() if clamp_texts else "No platform"
        print(f"Platform: {platform}")

        # str() turns lxml's string-result object into a plain str.
        image_link = str(image[0]) if image else "No image link"
        print(f"Image link: {image_link}")

        # urljoin resolves relative hrefs against the site root and leaves
        # absolute hrefs alone.
        product_link = urljoin("https://www.amazon.nl", str(hrefs[0])) if hrefs else "No product link"
        print(f"Product link: {product_link}")

        products.append({
            "name": name,
            "title": title,
            "brand": brand,
            "currency_symbol": currency_symbol,
            "price_value": value,
            "rating": rating,
            "reviews": reviews,
            "platform": platform,
            "image_link": image_link,
            "product_link": product_link,
            "category": category
        })

    return products

def save_to_excel(products, filename):
    """Export the collected product records to an Excel file.

    Args:
        products: Records passed straight to ``pd.DataFrame``.
        filename: Destination path handed to ``DataFrame.to_excel``.
    """
    frame = pd.DataFrame(products)
    # index=False keeps the DataFrame's row index out of the sheet.
    frame.to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Discover the category pages, scrape each one, and export everything
    # into a single workbook.
    base_url = "https://www.amazon.nl/gp/new-releases/"
    category_links = get_category_links(base_url)

    all_products = []
    for position, category_link in enumerate(category_links, start=1):
        category = category_link['category']
        link = category_link['link']
        print(f"Processing category: {category}")
        products = get_amazon_bestsellers(link, category)
        all_products.extend(products)
        # Throttle: wait between consecutive category requests. No request
        # follows the last category, so the pause is skipped there.
        if position < len(category_links):
            time.sleep(10)

    save_to_excel(all_products, "amazon_bestsellers_by_category.xlsx")


Processing category: Amazon Renewed
Found 15 items in category Amazon Renewed.
Name: Xbox-Series-X-gecertificeerd-gereviseerd
Title: Amazon Renewed
Brand: Xbox Series X
Currency Symbol: Not Available, Value: Not Available
Rating: 4,7 van 5 sterren
Reviews: No reviews
Platform: Xbox Series X
Image link: https://images-eu.ssl-images-amazon.com/images/I/61BVvNo8E-L._AC_UL300_SR300,200_.jpg
Product link: https://www.amazon.nl/Xbox-Series-X-gecertificeerd-gereviseerd/dp/B0CG9V891L/ref=zg_bsnr_g_amazon-renewed_d_sccl_1/259-4960701-1899950?psc=1
Name: soundcore-omgevingsafhankelijke-ruisonderdrukking-telefoonstandaard-Refurbished
Title: No title
Brand: No brand
Currency Symbol: € , Value: 55,99
Rating: 4,1 van 5 sterren
Reviews: No reviews
Platform: No platform
Image link: https://images-eu.ssl-images-amazon.com/images/I/61e9Npl3upL._AC_UL300_SR300,200_.jpg
Product link: https://www.amazon.nl/soundcore-omgevingsafhankelijke-ruisonderdrukking-telefoonstandaard-Refurbished/dp/B0CYLW9JKL/ref=zg_

In [39]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

def get_category_links(url, timeout=10):
    """Read the category navigation of a page and return name/URL pairs.

    Args:
        url: Address of the page whose category navigation is read.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``{'category': <anchor text>, 'link': <absolute URL>}``
        dicts, one per anchor with an ``href`` inside the browse-group div.
        An empty list is returned when the request fails or the response
        status is not 200.
    """
    # Standard-library helper; imported locally so the function is self-contained.
    from urllib.parse import urljoin

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad status: message + [].
        print(f"Failed to retrieve the page. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    categories = soup.select('div._p13n-zg-nav-tree-all_style_zg-browse-group__88fbz a')

    category_links = []
    for cat in categories:
        href = cat.get('href')
        if not href:
            # An anchor without href cannot be followed; skip it rather than
            # raise KeyError.
            continue
        category_links.append({
            'category': cat.get_text(strip=True),
            # urljoin keeps absolute hrefs intact and resolves relative ones.
            'link': urljoin('https://www.amazon.nl', href),
        })

    return category_links

def get_amazon_bestsellers(url, category, retries=5, backoff_factor=0.3, timeout=10):
    """Fetch one category page and extract its product entries with CSS selectors.

    The page is requested up to ``retries`` times. An HTTP 503 or a
    network-level ``requests`` exception triggers another attempt after an
    exponentially growing pause; any other non-200 status aborts at once.

    Args:
        url: Address of the category page to download.
        category: Label stored in every returned record and used in log output.
        retries: Maximum number of download attempts.
        backoff_factor: Base delay in seconds; attempt ``i`` (0-based) is
            followed by a pause of ``backoff_factor * 2 ** i``.
        timeout: Seconds to wait for the server before an attempt is
            counted as failed.

    Returns:
        A list with one dict per matched grid/list element, holding the keys
        ``title``, ``currency_symbol``, ``price_value``, ``rating``,
        ``image_link``, ``product_link`` and ``category``. Missing pieces are
        replaced by placeholder strings. An empty list is returned when the
        page could not be downloaded.
    """
    # Standard-library helper; imported locally so the function is self-contained.
    from urllib.parse import urljoin

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # Stays None when no attempt ever produced a response (e.g. retries=0).
    response = None
    for i in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Timeouts / connection errors are treated like a 503: try again.
            response = None
            print(f"Request failed ({exc}). Retrying {i+1}/{retries}...")
        else:
            if response.status_code == 200:
                break
            if response.status_code != 503:
                print(f"Failed to retrieve the page. Status code: {response.status_code}")
                return []
            print(f"Failed to retrieve the page. Status code: 503. Retrying {i+1}/{retries}...")
        # Exponential backoff, skipped once no further attempt will follow.
        if i < retries - 1:
            time.sleep(backoff_factor * (2 ** i))

    if response is None or response.status_code != 200:
        print(f"Failed to retrieve the page after {retries} retries.")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    products = []

    # Two container selectors are tried; check the page's HTML structure if
    # neither matches anymore.
    items = soup.select("div.zg-grid-general-faceout, li.zg-item-immersion")
    print(f"Found {len(items)} items in category {category}.")

    for item in items:
        # Adjust these selectors as needed when the page layout changes.
        title_tag = item.select_one("div.p13n-sc-truncate-desktop-type2, div._cDEzb_p13n-sc-css-line-clamp-1_1Fn1y, span.zg-text-center-align, div.a-section.a-spacing-none.aok-relative h2 a")
        price_tag = item.select_one("span._cDEzb_p13n-sc-price_3mJ9Z, span.p13n-sc-price")
        rating_tag = item.select_one("span.a-icon-alt")
        image_tag = item.select_one("img.a-dynamic-image, img.s-image")
        link_tag = item.select_one("a.a-link-normal")

        title = title_tag.get_text(strip=True) if title_tag else "No title"
        print(f"Title: {title}")

        # Split the price text into its currency symbol and numeric part.
        # re.search keeps the first match (like findall(...)[0]) but does not
        # raise when one of the two parts is absent.
        currency_symbol = "Not Available"
        value = "Not Available"
        if price_tag:
            price_text = price_tag.get_text(strip=True)
            symbol_match = re.search(r'[^\d.,]+', price_text)
            value_match = re.search(r'[\d.,]+', price_text)
            if symbol_match:
                currency_symbol = symbol_match.group(0)
            if value_match:
                value = value_match.group(0)
        print(f"Currency Symbol: {currency_symbol}, Value: {value}")

        rating = rating_tag.get_text(strip=True) if rating_tag else "No rating"
        print(f"Rating: {rating}")

        # An <img> without a src attribute is reported like a missing image.
        image_link = (image_tag.get('src') if image_tag else None) or "No image link"
        print(f"Image link: {image_link}")

        # urljoin resolves relative hrefs against the site root and leaves
        # absolute hrefs alone; an anchor without href yields the placeholder.
        href = link_tag.get('href') if link_tag else None
        product_link = urljoin("https://www.amazon.nl", href) if href else "No product link"
        print(f"Product link: {product_link}")

        products.append({
            "title": title,
            "currency_symbol": currency_symbol,
            "price_value": value,
            "rating": rating,
            "image_link": image_link,
            "product_link": product_link,
            "category": category
        })

    return products

def save_to_excel(products, filename):
    """Dump the product records into an Excel file.

    Args:
        products: Records passed straight to ``pd.DataFrame``.
        filename: Destination path handed to ``DataFrame.to_excel``.
    """
    table = pd.DataFrame(products)
    # index=False keeps the DataFrame's row index out of the sheet.
    table.to_excel(filename, index=False)
    print(f"Products saved to {filename}")

if __name__ == "__main__":
    # Discover the category pages, scrape each one, and export everything
    # into a single workbook.
    base_url = "https://www.amazon.nl/gp/new-releases/"
    category_links = get_category_links(base_url)

    all_products = []
    for position, category_link in enumerate(category_links, start=1):
        category = category_link['category']
        link = category_link['link']
        print(f"Processing category: {category}")
        products = get_amazon_bestsellers(link, category)
        all_products.extend(products)
        # Throttle: wait between consecutive category requests. No request
        # follows the last category, so the pause is skipped there.
        if position < len(category_links):
            time.sleep(5)

    save_to_excel(all_products, "amazon_bestsellers_by_category.xlsx")


Processing category: Amazon Renewed
Found 15 items in category Amazon Renewed.
Title: Amazon Renewed
Currency Symbol: Not Available, Value: Not Available
Rating: 4,7 van 5 sterren
Image link: https://images-eu.ssl-images-amazon.com/images/I/61BVvNo8E-L._AC_UL300_SR300,200_.jpg
Product link: https://www.amazon.nl/Xbox-Series-X-gecertificeerd-gereviseerd/dp/B0CG9V891L/ref=zg_bsnr_g_amazon-renewed_d_sccl_1/262-5009164-3007629?psc=1
Title: No title
Currency Symbol: € , Value: 55,99
Rating: 4,1 van 5 sterren
Image link: https://images-eu.ssl-images-amazon.com/images/I/61e9Npl3upL._AC_UL300_SR300,200_.jpg
Product link: https://www.amazon.nl/soundcore-omgevingsafhankelijke-ruisonderdrukking-telefoonstandaard-Refurbished/dp/B0CYLW9JKL/ref=zg_bsnr_g_amazon-renewed_d_sccl_2/262-5009164-3007629?psc=1
Title: No title
Currency Symbol: Not Available, Value: Not Available
Rating: No rating
Image link: https://images-eu.ssl-images-amazon.com/images/I/71zhnL8uKKS._AC_UL300_SR300,200_.jpg
Product link: 