## Importing required libraries

In [25]:
import requests
import csv
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import random

In [27]:
# Base URL for Amazon's mobile accessories category (amazon.in search results).
# scrape_amazon() appends "&page=<n>" to this URL to walk the result pages.
BASE_URL = "https://www.amazon.in/s?rh=n%3A6612025031&fs=true&ref=lp_6612025031_sar"


import random

# Pool of browser User-Agent strings; get_random_headers() picks one at random
# for every outgoing request.
# NOTE(review): the pool mixes desktop and mobile agents. If the site serves
# different markup to mobile clients, the CSS-class selectors used by the
# scraper may not match on those responses -- confirm.
USER_AGENTS = [
    # Desktop User-Agents
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.224 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.6045.123 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.88 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:110.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7; rv:109.0) Gecko/20100101 Firefox/109.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/119.0.2140.60 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Version/17.0 Safari/537.36",

    # Mobile User-Agents (Android & iPhone)
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.224 Mobile Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_2 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 12; Samsung Galaxy S22) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.6045.123 Mobile Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 10; en-us; Redmi Note 9) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android 11; en-us; OnePlus Nord) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.88 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 8 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6190.71 Mobile Safari/537.36",
]

def get_random_headers():
    """Build HTTP request headers with a User-Agent drawn at random from USER_AGENTS.

    Returns:
        A new dict holding the "User-Agent" and "Accept-Language" headers.
    """
    headers = {}
    headers["User-Agent"] = random.choice(USER_AGENTS)
    headers["Accept-Language"] = "en-US, en;q=0.5"
    return headers

def get_soup(url, timeout=10):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The parsed document, or None when the request fails (connection
        error, timeout, or an HTTP error status). Failures are reported on
        stdout rather than raised.
    """
    try:
        response = requests.get(url, headers=get_random_headers(), timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None
    return BeautifulSoup(response.text, 'html.parser')

def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or "N/A" when the tag was not found."""
    if tag is None:
        return "N/A"
    return tag.text.strip()


def extract_product_info(item):
    """Extract product details for a single search-result item.

    The rating is read from the search-result item itself; name, price and
    seller are read from the product page, which is downloaded via get_soup().

    Args:
        item: A parsed search-result element exposing ``find()``.

    Returns:
        A dict with the keys "Product Name", "Price (INR)", "Rating" and
        "Seller Name" (missing values are "N/A"); None when the item has no
        usable product link or the product page reports it as unavailable;
        an empty dict when the product page could not be fetched. Callers
        can therefore filter on truthiness.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    try:
        rating = item.find("span", class_="a-icon-alt").text.split()[0]
    except (AttributeError, IndexError):
        # No rating element, or the element has no text to split.
        rating = "N/A"

    try:
        href = item.find("a", class_="a-link-normal s-line-clamp-4 s-link-style a-text-normal")["href"]
    except (AttributeError, TypeError, KeyError):
        # No matching link (find() returned None) or the link has no href:
        # this result is not a product, so skip it instead of crashing.
        return None
    # urljoin handles both site-relative and already-absolute hrefs.
    product_url = urljoin("https://www.amazon.in", href)

    # Visit product page to get seller info and stock status
    product_soup = get_soup(product_url)
    if not product_soup:
        return {}

    stock_tag = product_soup.find("span", class_="a-size-medium a-color-success")
    if stock_tag is not None:
        stock_status = stock_tag.text.strip()
        if "Currently unavailable" in stock_status or "Out of stock" in stock_status:
            return None  # Skip out-of-stock items

    return {
        "Product Name": _text_or_na(product_soup.find("span", id="productTitle")),
        "Price (INR)": _text_or_na(product_soup.find("span", class_="a-price-whole")),
        "Rating": rating,
        "Seller Name": _text_or_na(product_soup.find("a", id="sellerProfileTriggerId")),
    }

def scrape_amazon(max_pages=5, output_path="amazon_products.csv", max_workers=10):
    """Scrape several pages of search results and save the products to a CSV file.

    Each result page is fetched in turn; the items on a page are processed in
    parallel by a thread pool because every item needs its own product-page
    download.

    Args:
        max_pages: Number of result pages to request, starting at page 1.
        output_path: Path of the CSV file to write (overwritten if present).
        max_workers: Size of the thread pool used for product-page downloads.

    Returns:
        The list of product dicts that were written to the CSV file.
        Scraping stops early at the first result page that fails to load;
        whatever was collected so far is still written.
    """
    fieldnames = ["Product Name", "Price (INR)", "Rating", "Seller Name"]
    all_products = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for page in range(1, max_pages + 1):
            url = f"{BASE_URL}&page={page}"
            print(f"Scraping page {page}...")
            soup = get_soup(url)
            if not soup:
                break

            items = soup.find_all("div", class_="s-result-item")
            # extract_product_info returns None/{} for items to skip.
            all_products.extend(
                product
                for product in executor.map(extract_product_info, items)
                if product
            )

    # Save data to CSV
    with open(output_path, "w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_products)

    print(f"Scraping complete. Data saved to {output_path}.")
    return all_products

# Execute the scraper only when this file is run as a script, so importing it
# does not trigger any network requests or file writes.
if __name__ == "__main__":
    scrape_amazon()


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping complete. Data saved to amazon_products.csv.
