In [16]:
# Import libraries
import requests
from bs4 import BeautifulSoup

In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from urllib.parse import quote_plus

# ---------- CONFIGURABLE PART ----------
# Search terms; each one is scraped and labelled as its own category.
SEARCH_QUERIES = [
    "laptop",
    "smartphone",
    "headphones",
    "books",
]
# Site root that the search URLs are built on.
BASE_URL = "https://www.amazon.in"
# Number of result pages to request per search term.
MAX_PAGES = 3
# Destination CSV for the collected product rows.
OUTPUT_FILE = "amazon_products_bs4.csv"

# (User-Agent, Accept-Language) pairs; one header set is picked at random
# for every page request.
_HEADER_PROFILES = (
    ("Mozilla/5.0 (Windows NT 10.0; Win64; x64)", "en-US,en;q=0.9"),
    ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)", "en-US,en;q=0.8"),
    ("Mozilla/5.0 (X11; Linux x86_64)", "en-GB,en;q=0.7"),
)
HEADERS_LIST = [
    {"User-Agent": agent, "Accept-Language": language}
    for agent, language in _HEADER_PROFILES
]

# ---------- FUNCTION TO SCRAPE ONE PAGE ----------
def scrape_page(url, category, timeout=15):
    """Fetch one search-results page and build a record per result card.

    Args:
        url: Full URL of the search-results page to download.
        category: Label stored under "Category" in every returned record.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A list of dicts, one per matched result card, with the keys
        "Category", "Title", "Brand", "Price (INR)", "Rating",
        "Review Count", "Product Badge", "Product URL", "Image URL",
        "Delivery Info" and "Discount". A field is None when its element
        is not found in the card. An empty list is returned when the
        request raises, or when the response status is not 200.
    """
    # Function-scope import: the file's top-level imports only pull in
    # quote_plus from urllib.parse.
    from urllib.parse import urljoin

    print(f"Scraping: {url}")
    headers = random.choice(HEADERS_LIST)
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report and return "no products" so the caller can stop this
        # query while keeping everything scraped so far.
        print(f"Failed to fetch page: {url}, Error: {exc}")
        return []
    finally:
        # Pause after every attempt, successful or not, to space requests.
        time.sleep(random.uniform(2, 4))

    if response.status_code != 200:
        print(f"Failed to fetch page: {url}, Status Code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, "lxml")
    items = soup.select("div.s-main-slot div[data-component-type='s-search-result']")
    product_list = []

    for item in items:
        title_tag = item.select_one("h2 a")
        title = title_tag.get_text(strip=True) if title_tag is not None else None
        href = title_tag.get("href") if title_tag is not None else None
        # urljoin resolves relative hrefs against the page URL and leaves
        # already-absolute hrefs intact, unlike plain concatenation.
        link = urljoin(url, href) if href else None

        price = _format_price(
            _select_text(item, "span.a-price-whole"),
            _select_text(item, "span.a-price-fraction"),
        )

        image_tag = item.select_one("img.s-image")
        image_url = image_tag.get("src") if image_tag is not None else None

        product_list.append({
            "Category": category,
            "Title": title,
            "Brand": _select_text(item, "span.a-size-base-plus.a-color-base"),
            "Price (INR)": price,
            "Rating": _select_text(item, "span.a-icon-alt"),
            # NOTE(review): this selector is broad; it takes the first
            # span.a-size-base in the card, which may not be the review
            # count. Verify against the current page markup.
            "Review Count": _select_text(item, "span.a-size-base"),
            "Product Badge": _select_text(item, "span.s-label-popover-default"),
            "Product URL": link,
            "Image URL": image_url,
            "Delivery Info": _select_text(item, "span.a-color-base.a-text-bold"),
            "Discount": _select_text(
                item, "span.a-letter-space + span.a-size-base.a-color-secondary"
            ),
        })

    return product_list


def _select_text(parent, selector):
    """Return the stripped text of the first match for selector, or None."""
    tag = parent.select_one(selector)
    return tag.get_text(strip=True) if tag is not None else None


def _format_price(whole, fraction):
    """Combine whole and fractional price texts into a plain decimal string.

    Returns None when there is no usable whole part. A missing fraction
    yields the whole part alone rather than discarding the price.
    """
    if not whole:
        return None
    # Drop thousands separators. Depending on the markup, the whole part
    # may also end with its own decimal point; strip it so the result
    # never contains "..".
    amount = whole.replace(",", "").rstrip(".")
    if not amount:
        return None
    return f"{amount}.{fraction}" if fraction else amount

# ---------- MAIN SCRAPER LOOP ----------
def scrape_amazon(queries, max_pages=1):
    """Scrape search results for every query and return all product records.

    For each query, pages are requested in order starting at page 1 while
    the page number does not exceed ``max_pages``. A query is abandoned as
    soon as one of its pages yields no products; the remaining queries are
    still processed.

    Args:
        queries: Iterable of search terms; each term is also used as the
            category label passed to ``scrape_page``.
        max_pages: Highest page number to request per query.

    Returns:
        A single list containing the records from every scraped page.
    """

    def _page_batches(term):
        """Yield each non-empty page of results for ``term``, in page order."""
        page_no = 1
        while page_no <= max_pages:
            target = f"{BASE_URL}/s?k={quote_plus(term)}&page={page_no}"
            batch = scrape_page(target, term)
            if not batch:
                # An empty page means either no more results or a block.
                print("No products found or blocked. Stopping.")
                return
            yield batch
            page_no += 1

    collected = []
    for term in queries:
        print(f"\nScraping category: {term}")
        for batch in _page_batches(term):
            collected.extend(batch)
    return collected

# ---------- SAVE RESULTS ----------
def save_to_csv(products, filename):
    """Write the product records to a CSV file and report how many were saved.

    Args:
        products: Sequence of record dicts; each becomes one CSV row.
        filename: Path of the CSV file to write.
    """
    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(data=products).to_csv(filename, index=False)
    saved_count = len(products)
    print(f"Saved {saved_count} products to {filename}")

# ---------- RUNNING THE SCRAPER ----------
if __name__ == "__main__":
    # Announce the run, scrape every configured query, then persist the rows.
    print("Scraping Amazon.in for multiple categories using BeautifulSoup...\n")
    data = scrape_amazon(queries=SEARCH_QUERIES, max_pages=MAX_PAGES)
    save_to_csv(products=data, filename=OUTPUT_FILE)


Scraping Amazon.in for multiple categories using BeautifulSoup...


Scraping category: laptop
Scraping: https://www.amazon.in/s?k=laptop&page=1
Scraping: https://www.amazon.in/s?k=laptop&page=2
Scraping: https://www.amazon.in/s?k=laptop&page=3

Scraping category: smartphone
Scraping: https://www.amazon.in/s?k=smartphone&page=1
Scraping: https://www.amazon.in/s?k=smartphone&page=2
Scraping: https://www.amazon.in/s?k=smartphone&page=3

Scraping category: headphones
Scraping: https://www.amazon.in/s?k=headphones&page=1
Scraping: https://www.amazon.in/s?k=headphones&page=2
Scraping: https://www.amazon.in/s?k=headphones&page=3

Scraping category: books
Scraping: https://www.amazon.in/s?k=books&page=1
Scraping: https://www.amazon.in/s?k=books&page=2
Scraping: https://www.amazon.in/s?k=books&page=3
Saved 284 products to amazon_products_bs4.csv
