In [1]:
import csv
import json
import time
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup

class AmazonScraper:
    """Scrape Amazon India search-result pages and export the products found.

    For each requested result page the scraper downloads the HTML, pulls the
    title, link, image, price, rating and review count out of every
    ``s-search-result`` container, and finally writes all rows to a JSON file
    and a CSV file.

    Args:
        search: Free-text search term typed by the user.
        pages: Number of result pages to fetch (anything ``int()`` accepts).
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to pause between consecutive page requests.
    """

    # Root used to resolve the relative product links found in result pages.
    SITE_URL = "https://www.amazon.in"

    def __init__(self, search, pages, timeout=10, delay=1):
        # Initialize the scraper with search term and number of pages to scrape
        self.search = search
        self.pages = int(pages)
        self.timeout = timeout
        self.delay = delay
        self.base_url = "https://www.amazon.in/s?k="
        self.headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
        }

    def scrape(self):
        """Fetch every result page, collect the products, and save them.

        Prints a message instead of writing files when nothing was extracted.
        """
        all_product_data = []

        for index, url in enumerate(self.generate_urls()):
            # Be polite and avoid getting blocked: pause *between* requests,
            # so there is no pointless wait after the final page.
            if index:
                time.sleep(self.delay)
            soup = self.fetch_html(url)
            all_product_data.extend(self.extract_product_data(soup))

        # Save the data to JSON and CSV files
        if all_product_data:
            self.save_to_json(all_product_data)
            self.save_to_csv(all_product_data)
        else:
            print("No information available. Try with a different search input.")

    def generate_urls(self):
        """Return one search URL per page, from page 1 up to ``self.pages``.

        The search term is percent-encoded (spaces become ``+``) so that
        characters such as ``&``, ``+`` or ``#`` cannot corrupt the query string.
        """
        query = quote_plus(self.search)
        return [f"{self.base_url}{query}&page={page}" for page in range(1, self.pages + 1)]

    def fetch_html(self, url):
        """Download ``url`` and return it parsed as ``BeautifulSoup``.

        Returns ``None`` (after printing the error) on any request failure,
        including HTTP error statuses and timeouts.
        """
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, "lxml")
        except requests.RequestException as e:
            print(f"Request error: {e}")
            return None

    def extract_product_data(self, soup):
        """Extract one dict per product from a parsed search-result page.

        Args:
            soup: Parsed page, or ``None`` when the fetch failed.

        Returns:
            A list of dicts with the keys ``Title``, ``Link``, ``Image link``,
            ``Price``, ``Rating`` and ``No of reviews``. Containers that lack a
            title link, an ``href`` or an image are skipped.
        """
        if not soup:
            return []

        product_data = []
        product_containers = soup.find_all('div', {'data-component-type': 's-search-result'})

        for container in product_containers:
            try:
                anchor = container.h2.a
                title = anchor.text.strip()  # Product title
                # urljoin copes with both relative and already-absolute hrefs.
                link = urljoin(self.SITE_URL, anchor['href'])  # Product link
                image = container.find('img', {'class': 's-image'})['src']  # Image link
                price = self.get_price(container)  # Product price

                rating_tag = container.find('span', {'class': 'a-icon-alt'})
                rating = rating_tag.text if rating_tag else "No rating available"  # Product rating

                # NOTE(review): this takes the first 'a-size-base' span in the
                # container; confirm against live markup that it is the review count.
                review_tag = container.find('span', {'class': 'a-size-base'})
                review_count = review_tag.text if review_tag else "No reviews"  # Number of reviews

                product_data.append({
                    'Title': title,
                    'Link': link,
                    'Image link': image,
                    'Price': price,
                    'Rating': rating,
                    'No of reviews': review_count
                })
            except (AttributeError, KeyError, TypeError):
                # AttributeError: missing <h2>/<a>; KeyError: missing href/src;
                # TypeError: image tag not found (subscripting None).
                # Skip the malformed container instead of aborting the scrape.
                continue

        return product_data

    def get_price(self, container):
        """Return the formatted price for ``container``.

        Combines the currency symbol with the whole part and, when present,
        the fractional part. Returns ``"Price not available"`` when the symbol
        or the whole part cannot be found.
        """
        try:
            price_whole = container.find('span', {'class': 'a-price-whole'})
            price_fraction = container.find('span', {'class': 'a-price-fraction'})
            price_symbol = container.find('span', {'class': 'a-price-symbol'})
            if price_whole and price_symbol:
                # The whole-part text may already end with the decimal separator
                # (presumably from a nested separator span - TODO confirm);
                # strip it so the result never contains "..".
                whole_text = price_whole.text.rstrip('.')
                if price_fraction:
                    return f"{price_symbol.text}{whole_text}.{price_fraction.text}"
                return f"{price_symbol.text}{whole_text}"
        except AttributeError:
            return "Price not available"
        return "Price not available"

    def save_to_json(self, product_data, path='scraped_products.json'):
        """Write ``product_data`` to ``path`` as indented UTF-8 JSON."""
        with open(path, 'w', encoding='utf-8') as json_file:
            # ensure_ascii=False keeps currency symbols readable in the file;
            # the file is opened as UTF-8, so this is safe.
            json.dump(product_data, json_file, indent=4, ensure_ascii=False)
        print(f"Data saved to {path}")

    def save_to_csv(self, product_data, path='scraped_products.csv'):
        """Write ``product_data`` to ``path`` as CSV with a header row.

        The column names are taken from the first row. Nothing is written
        when ``product_data`` is empty.
        """
        if not product_data:
            print("No data to save to CSV.")
            return

        keys = product_data[0].keys()
        with open(path, 'w', newline='', encoding='utf-8') as csv_file:
            dict_writer = csv.DictWriter(csv_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(product_data)
        print(f"Data saved to {path}")

# Input from the user
search_text = input("Enter the text to be searched: ")
no_of_pages = input("Enter the number of pages to scrape (1-20): ")

# A non-numeric page count is treated like an out-of-range one, so the user
# gets the "try again" message instead of an uncaught ValueError traceback.
try:
    page_count = int(no_of_pages)
except ValueError:
    page_count = 0

# Only 1 to 20 pages are accepted; anything else is rejected without scraping.
if 0 < page_count <= 20:
    scraper = AmazonScraper(search_text, page_count)
    scraper.scrape()
else:
    print("Try again. Page index error.")


Enter the text to be searched:  mobiles
Enter the number of pages to scrape (1-20):  4


Data saved to scraped_products.json
Data saved to scraped_products.csv


In [None]:
Import Statements: Import necessary libraries for web scraping, JSON, CSV handling, and time delay.
Class Initialization: Initialize the class with search term and number of pages to scrape. Set up the base URL and headers.
Scrape Method:
- Generate URLs for the given number of pages.
- Fetch HTML content and extract product data from each URL.
- Sleep for 1 second between requests to avoid getting blocked.
- Save the data to JSON and CSV files if any products were found; otherwise print a message.
Generate URLs Method: Create a list of URLs based on the number of pages.
Fetch HTML Method: Fetch the HTML content of a given URL and handle any request errors.
Extract Product Data Method: Extract necessary product data (title, link, image, price, rating, review count) from the HTML content.
Get Price Method: Extract the price of the product, considering both whole and fractional parts.
Save to JSON Method: Save the extracted product data to a JSON file.
Save to CSV Method: Save the extracted product data to a CSV file.
User Input and Scraper Initialization: Take user input for search term and number of pages, validate the input, and initialize the scraper.
Together, these notes describe each part of the scraper so that the code is easier to understand and maintain.