In [1]:
import os
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Search-results URL that every page request is built from; replace it with
# another Amazon search URL to scrape a different listing.
BASE_URL = "https://www.amazon.in/s?bbn=21541572031&rh=n%3A976419031&dc&qid=1733993930&rnid=3576079031&ref=sr_nr_n_0"

# Browser-like header values, kept as named pieces so each can be changed
# independently. They are sent with every request to reduce the chance of
# the scraper being blocked.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
_ACCEPT_LANGUAGE = "en-US,en;q=0.9"

HEADERS = {
    "User-Agent": _USER_AGENT,
    "Accept-Language": _ACCEPT_LANGUAGE,
}

# Helpers and entry point for fetching product details from one results page
def _parse_amount(text):
    """Convert a displayed amount such as "1,999." or "Rs. 1,999.00" to a number.

    Thousands separators are removed and a decimal part is honoured, so
    "1,999.00" yields 1999 rather than 199900. Returns an ``int`` for whole
    amounts, a ``float`` when there is a fractional part, and ``None`` when
    *text* contains no digits.
    """
    match = re.search(r"\d+(?:\.\d+)?", text.replace(",", ""))
    if match is None:
        return None
    amount = float(match.group())
    return int(amount) if amount.is_integer() else amount


def _parse_review_count(text):
    """Return *text* as an ``int`` when it is a plain count such as "1,234".

    Any other text is returned stripped, unchanged, so no scraped
    information is discarded at this stage.
    """
    cleaned = text.strip()
    digits = cleaned.replace(",", "")
    return int(digits) if re.fullmatch(r"[0-9]+", digits) else cleaned


def fetch_product_data(page, timeout=10):
    """Fetch one search-results page and extract a record per product.

    Args:
        page: Page number appended to ``BASE_URL`` as the ``page`` query
            parameter.
        timeout: Seconds to wait for the server before giving up on the
            request.

    Returns:
        A list of dicts with the keys "Title", "Price", "Original Price",
        "Discount (%)", "Rating", "Reviews", "Brand" and "Timestamp".
        The list is empty when the request fails or the response status is
        not 200. A product whose extraction raises is reported on stdout and
        skipped.
    """
    url = f"{BASE_URL}&page={page}"
    try:
        # Without a timeout a stalled connection would block the scrape forever.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to fetch page {page}: {e}")
        return []
    if response.status_code != 200:
        print(f"Failed to fetch page {page}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    products = soup.find_all("div", {"data-component-type": "s-search-result"})

    data = []
    for product in products:
        try:
            # Extract product title; an empty heading counts as missing
            title_tag = product.find("h2")
            title = (title_tag.text.strip() if title_tag else "") or "Title Not Available"

            # Extract price
            price_tag = product.find("span", "a-price-whole")
            price = _parse_amount(price_tag.text) if price_tag else None

            # Extract original price (the visible amount lives in a nested span)
            original_price_tag = product.find("span", "a-price a-text-price")
            offscreen_tag = (
                original_price_tag.find("span", "a-offscreen")
                if original_price_tag
                else None
            )
            original_price = _parse_amount(offscreen_tag.text) if offscreen_tag else None

            # Calculate discount percentage
            if price and original_price:
                discount = round(((original_price - price) / original_price) * 100, 2)
            else:
                discount = None

            # Extract rating (first whitespace-separated token of the alt text)
            rating_tag = product.find("span", "a-icon-alt")
            rating_words = rating_tag.text.split() if rating_tag else []
            rating = rating_words[0] if rating_words else None

            # Extract number of reviews
            reviews_tag = product.find("span", {"class": "a-size-base"})
            reviews = _parse_review_count(reviews_tag.text) if reviews_tag else None

            # Extract brand (first word of the title)
            brand = title.split()[0] if title != "Title Not Available" else None

            # Append the extracted data
            data.append({
                "Title": title,
                "Price": price,
                "Original Price": original_price,
                "Discount (%)": discount,
                "Rating": rating,
                "Reviews": reviews,
                "Brand": brand,
                "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })
        except Exception as e:
            # Best effort: one malformed product must not abort the whole page.
            print(f"Error while processing a product: {e}")

    return data

# Scrape result pages until enough products are collected or pages run out
MAX_PAGES = 20  # Adjust to fetch enough products
MAX_PRODUCTS = 200

all_data = []
for page in range(1, MAX_PAGES + 1):
    print(f"Fetching page {page}...")
    products = fetch_product_data(page)
    all_data.extend(products)

    # Stop if we have enough data
    if len(all_data) >= MAX_PRODUCTS:
        break

    # Pause to avoid overwhelming the server (no need after the final page)
    if page < MAX_PAGES:
        time.sleep(2)

# Save the data to a database file (append mode)
database_file = "amazon_products_database.csv"
df = pd.DataFrame(all_data[:MAX_PRODUCTS])  # Limit to MAX_PRODUCTS items

if df.empty:
    # Appending a frame without columns would add a blank line to the CSV.
    print("No products were scraped; nothing to save.")
else:
    # Write the header only when starting a new (missing or zero-byte) file.
    write_header = (
        not os.path.exists(database_file)
        or os.path.getsize(database_file) == 0
    )
    try:
        df.to_csv(database_file, mode='a', index=False, header=write_header)
        print(f"Data appended to '{database_file}'.")
    except Exception as e:
        print(f"Error saving data to the database: {e}")

# Load the database, coerce numeric columns, and drop incomplete or invalid rows
def clean_database(file):
    """Clean the CSV database at *file* and overwrite it with the result.

    Rows are removed when any value is missing, when any of the numeric
    columns ("Price", "Original Price", "Discount (%)", "Rating",
    "Reviews") cannot be read as a number, when the price is not positive,
    or when the rating lies outside 0-5. Review counts written with
    thousands separators (e.g. "1,234") are treated as valid numbers.

    Args:
        file: Path of the CSV file to read and then overwrite.

    Returns:
        None. Progress and any error are reported on stdout; errors are not
        re-raised.
    """
    numeric_columns = ['Price', 'Original Price', 'Discount (%)', 'Rating', 'Reviews']
    try:
        # Load the database
        data = pd.read_csv(file)
        print("Database loaded successfully.")

        # Drop rows with missing values
        data.dropna(inplace=True)

        # Strip thousands separators so counts like "1,234" survive the
        # numeric conversion instead of being coerced to NaN and dropped.
        data['Reviews'] = data['Reviews'].astype(str).str.replace(',', '', regex=False)

        # Convert numeric columns; anything unparseable becomes NaN
        for column in numeric_columns:
            data[column] = pd.to_numeric(data[column], errors='coerce')

        data.dropna(inplace=True)  # Drop rows with non-numeric data converted to NaN

        # Filter for valid numeric ranges
        data = data[(data['Price'] > 0) & data['Rating'].between(0, 5)]

        # Save the cleaned data back to the database
        data.to_csv(file, index=False)
        print("Database cleaned and saved successfully.")
    except Exception as e:
        # Best effort: report the problem and leave the file as it was read.
        print(f"Error cleaning the database: {e}")

# Clean the database file in place: clean_database reads it, drops rows with
# missing, non-numeric, or out-of-range values, and writes the result back.
clean_database(database_file)


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Data appended to 'amazon_products_database.csv'.
Database loaded successfully.
Database cleaned and saved successfully.
