In [5]:
import random
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_product_urls(search_url, max_pages=1):
    """Collect product page URLs from Amazon search result pages.

    Requests pages 1..max_pages of ``search_url`` (by adding a ``page`` query
    parameter) and gathers the ``href`` of every anchor whose class attribute
    is ``a-link-normal s-no-outline``.

    Args:
        search_url: Search results URL, with or without an existing query
            string.
        max_pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of absolute product URLs in first-seen order with exact
        duplicates removed. Scraping stops at the first page that fails to
        load; URLs gathered before the failure are still returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
    }
    base_url = 'https://www.amazon.in'
    # Use '?' when the search URL has no query string yet, otherwise '&',
    # so the page parameter always produces a well-formed URL.
    separator = '&' if '?' in search_url else '?'
    product_urls = []
    seen_urls = set()

    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")
        paginated_url = f"{search_url}{separator}page={page}"

        try:
            response = requests.get(paginated_url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error accessing {paginated_url}: {e}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # Get product links on this page
        for link in soup.find_all('a', {'class': 'a-link-normal s-no-outline'}):
            href = link.get('href')
            if not href:
                # Anchors without an href cannot be followed; skipping them
                # avoids a TypeError from concatenating None.
                continue
            # urljoin leaves absolute hrefs untouched and resolves relative
            # ones against the site root.
            product_url = urljoin(base_url, href)
            if product_url not in seen_urls:
                seen_urls.add(product_url)
                product_urls.append(product_url)

        # Add a random delay (0.5 to 2 seconds) to avoid getting blocked
        time.sleep(random.uniform(0.5, 2))

    return product_urls

def get_product_data(url):
    """Fetch one product page and extract its name, ASIN and price.

    Args:
        url: Absolute URL of the product page.

    Returns:
        A dict with the keys ``'Product Name'``, ``'ASIN'`` and ``'Price'``.
        Any field whose element (or attribute) is missing from the page is
        ``None``. Returns ``None`` instead of a dict when the HTTP request
        fails; the error is printed rather than raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the product name
    title_tag = soup.find('span', {'id': 'productTitle'})
    product_name = title_tag.get_text().strip() if title_tag is not None else None

    # Extract the ASIN number. ``.get`` yields None when the input element
    # has no ``value`` attribute, where subscripting would raise KeyError.
    asin_tag = soup.find('input', {'id': 'ASIN'})
    asin = asin_tag.get('value') if asin_tag is not None else None

    # Extract the price (text of the first span with class "a-offscreen")
    price_tag = soup.find('span', {'class': 'a-offscreen'})
    price = price_tag.get_text().strip() if price_tag is not None else None

    return {'Product Name': product_name, 'ASIN': asin, 'Price': price}

def scrape_multiple_products(search_url, max_pages=1):
    """Scrape every product found on the given search pages into a DataFrame.

    Args:
        search_url: Search results URL handed to ``get_product_urls``.
        max_pages: Number of result pages to walk; forwarded unchanged.

    Returns:
        A ``pandas.DataFrame`` with one row per product for which
        ``get_product_data`` returned a truthy result.
    """
    records = []

    for product_url in get_product_urls(search_url, max_pages=max_pages):
        record = get_product_data(product_url)
        if record:
            # Failed fetches come back falsy and are left out of the table.
            records.append(record)
        # Pause 0.5 to 2 seconds after every product to avoid getting blocked.
        time.sleep(random.uniform(0.5, 2))

    return pd.DataFrame(records)

# Example usage: scrape the amazon.in search results for "ipad" (k=ipad).
search_url = 'https://www.amazon.in/s?k=ipad&crid=2Z6BZUDNJ03FH&sprefix=ipa%2Caps%2C248&ref=nb_sb_noss_2'
# Increase max_pages to scrape more products; every product URL found is
# fetched individually, with a random pause after each request.
product_data_df = scrape_multiple_products(search_url, max_pages=1)
# Plain-text dump of the resulting DataFrame.
print(product_data_df)


Scraping page 1...
                                         Product Name        ASIN  \
0   Apple iPad (10th Generation): with A14 Bionic ...  B0BJMGXLYZ   
1   Apple iPad Air 11″ (M2): Liquid Retina Display...  B0D3J7HK59   
2   Apple iPad (10th Generation): with A14 Bionic ...  B0BJMGXLYZ   
3   Apple iPad (10th Generation): with A14 Bionic ...  B0BJMSFMHH   
4   Apple iPad (10th Generation): with A14 Bionic ...  B0BJLDVX2S   
5   Apple iPad (10th Generation): with A14 Bionic ...  B0BJM3NBMT   
6   Apple iPad (10th Generation): with A14 Bionic ...  B0BJLDFNVL   
7   Apple iPad Air 11″ (M2): Liquid Retina Display...  B0D3J9HD7K   
8   Apple iPad (10th Generation): with A14 Bionic ...  B0BJLHP48C   
9   Apple iPad Pro 11″ (M4): Ultra Retina XDR Disp...  B0D3J8W62R   
10  Apple iPad (10th Generation): with A14 Bionic ...  B0BJLF8K57   
11  Apple iPad Air 11″ (M2): Liquid Retina Display...  B0D3J6BYPW   
12  Apple iPad Air (5th Generation): with M1 chip,...  B09V4JW485   
13  Apple iPad 

In [3]:
# Write the scraped table to product_data.csv (relative path), leaving out the
# DataFrame index. Requires product_data_df to exist in the kernel already,
# so run the scraping cell first.
product_data_df.to_csv('product_data.csv', index=False)
print("Data exported to 'product_data.csv'")

Data exported to 'product_data.csv'
