In [1]:
pip install requests beautifulsoup4



In [11]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Search-results URL for the "bags" query. It deliberately ends with
# "ref=sr_pg_" so that a page number can be appended directly to it.
base_url = (
    "https://www.amazon.in/s"
    "?k=bags"
    "&crid=2M096C61O4MLT"
    "&qid=1653308124"
    "&sprefix=ba%2Caps%2C283"
    "&ref=sr_pg_"
)

# How many result pages to walk through.
num_pages = 20

# Accumulator for scraped rows; one dict per product.
all_products = []

# Function to scrape individual product pages and extract additional information
def scrape_product_page(product_url, timeout=10):
    """Download a product page and extract a few detail fields from its HTML.

    Args:
        product_url: Absolute URL of the product page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the whole scrape.

    Returns:
        A 4-tuple ``(description, asin, product_description, manufacturer)``
        of strings. A field whose element is absent from the page is
        ``"N/A"``. All four are ``"N/A"`` when the request raises a network
        error or the response status is not 200.
    """
    not_available = ("N/A", "N/A", "N/A", "N/A")

    # NOTE(review): 'authority' names www.amazon.com although the scraper
    # targets www.amazon.in -- confirm whether this header is wanted at all.
    headers = {
        'Accept-Language': 'en-US,en;q=0.9',
        'authority': 'www.amazon.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }

    try:
        response = requests.get(product_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report and degrade to placeholders instead of letting a transient
        # network failure propagate to the caller.
        print(f"Failed to retrieve product page. Error: {exc}")
        return not_available

    if response.status_code != 200:
        print(f"Failed to retrieve product page. Status code: {response.status_code}")
        return not_available

    soup = BeautifulSoup(response.text, "html.parser")

    def text_of(selector):
        """Return the stripped text of the first match, or "N/A" if none."""
        element = soup.select_one(selector)
        return element.get_text(strip=True) if element else "N/A"

    # Modify these selectors according to the actual structure of the product page
    description = text_of("#productDescription")
    product_description = text_of("#productTitle")
    manufacturer = text_of("#bylineInfo")

    # The ASIN is read from the first element carrying a data-asin attribute.
    asin_element = soup.select_one("[data-asin]")
    asin = asin_element["data-asin"].strip() if asin_element else "N/A"

    return description, asin, product_description, manufacturer

# Loop through the pages
# Request settings shared by every listing-page fetch (loop-invariant).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
max_retries = 3
request_timeout = 10  # seconds; prevents a stalled connection from hanging the run

for page in range(1, num_pages + 1):
    # NOTE(review): Amazon search appears to paginate on the "page" query
    # parameter, while "ref=sr_pg_N" is only a referral tag; without "page="
    # every iteration would likely return the first results page. Confirm
    # against a live results URL.
    url = f"{base_url}{page}&page={page}"

    # Fetch the listing page, retrying on 503 Service Unavailable or on a
    # network-level failure, up to max_retries additional attempts.
    retries = 0
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
        except requests.RequestException as e:
            print(f"Request for page {page} failed: {e}")
            response = None

        if response is not None and response.status_code != 503:
            break
        if retries >= max_retries:
            break

        print(f"Failed to retrieve page {page}. Retrying...")
        time.sleep(5)  # Wait for 5 seconds before retrying
        retries += 1

    if response is None:
        print(f"Failed to retrieve page {page}. No response received.")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # Every search-result container on the page (one per listed item).
    products = soup.select(".s-result-item")

    for product in products:
        try:
            # Link to the product's own page; keep an already-absolute href
            # as is, otherwise resolve it against the site root.
            product_url_element = product.select_one(".s-line-clamp-2 a")
            if product_url_element:
                href = product_url_element['href']
                if href.startswith("http"):
                    product_url = href
                else:
                    product_url = "https://www.amazon.in" + href
            else:
                product_url = "N/A"

            product_name_element = product.select_one(".a-text-normal")
            product_name = product_name_element.get_text(strip=True) if product_name_element else "N/A"

            product_price_element = product.select_one(".a-offscreen")
            product_price = product_price_element.get_text(strip=True) if product_price_element else "N/A"

            # Some products may not have a rating or number of reviews, so
            # fall back to placeholders when those elements are absent.
            rating_element = product.select_one(".a-icon-star-small")
            rating = rating_element.get_text(strip=True) if rating_element else "N/A"

            num_reviews_element = product.select_one(".a-size-base")
            num_reviews = num_reviews_element.get_text(strip=True) if num_reviews_element else "0"

            # Scrape the individual product page for additional information
            if product_url != "N/A":
                description, asin, product_description, manufacturer = scrape_product_page(product_url)
                # Pause only after an actual request, to avoid overwhelming
                # the server without slowing down items that had no link.
                time.sleep(1)
            else:
                description, asin, product_description, manufacturer = "N/A", "N/A", "N/A", "N/A"

            # Add the extracted data to the all_products list as a dictionary
            all_products.append({
                "Product URL": product_url,
                "Product Name": product_name,
                "Product Price": product_price,
                "Rating": rating,
                "Number of reviews": num_reviews,
                "Description": description,
                "ASIN": asin,
                "Product Description": product_description,
                "Manufacturer": manufacturer,
            })

        except Exception as e:
            # Best-effort scrape: one malformed result must not abort the page.
            print(f"Error occurred while processing product on page {page}. Skipping the product.")
            print(f"Error: {e}")
            continue

# Save the data to a CSV file
csv_file = "amazon_bags_data.csv"

# Column order of the exported file; names match the keys of each row dict.
csv_columns = (
    "Product URL",
    "Product Name",
    "Product Price",
    "Rating",
    "Number of reviews",
    "Description",
    "ASIN",
    "Product Description",
    "Manufacturer",
)

# newline="" lets the csv module control line endings itself.
with open(csv_file, mode="w", newline="", encoding="utf-8") as out:
    writer = csv.DictWriter(out, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(all_products)

print(f"Data successfully scraped and exported to {csv_file}.")


Failed to retrieve page 1. Retrying...
Failed to retrieve page 1. Retrying...
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve page 2. Retrying...
Failed to retrieve page 2. Retrying...
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve page 3. Retrying...
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Failed to retrieve product page. Status code: 503
Faile