In [1]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Part 1: Scraping product listing pages
#
# Requests the first `pages_to_scrape` search-result pages for `search_term`
# and appends one dict per product card (URL, Name, Price, Rating, Reviews)
# to `data`. A field that is missing from a card is stored as "". Cards
# without a link are skipped, because Part 2 needs the href to build the
# detail-page URL.

base_url = "https://www.amazon.in/s"
search_term = "bags"
pages_to_scrape = 20

data = []

for page in range(1, pages_to_scrape + 1):
    params = {
        "k": search_term,
        # NOTE(review): crid / qid / sprefix look like values copied from a
        # single browser session -- confirm they are still needed.
        "crid": "2M096C61O4MLT",
        "qid": 1653308124,
        "sprefix": "ba,aps,283",
        "ref": f"sr_pg_{page}",
        "page": page
    }

    # requests applies no timeout by default, so a stalled connection would
    # block the cell forever. A failed page is reported and skipped instead
    # of aborting the whole run.
    try:
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Page {page}: request failed ({exc}); skipping")
        continue
    if not response.ok:
        # An error page contains no result cards; say so instead of silently
        # collecting nothing.
        # NOTE(review): the site may reject the default python-requests
        # User-Agent -- consider passing browser-like headers if this fires.
        print(f"Page {page}: HTTP {response.status_code}; skipping")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    products = soup.find_all("div", {"data-component-type": "s-search-result"})

    for product in products:
        link_element = product.find("a", class_="a-link-normal")
        url = link_element.get("href") if link_element is not None else None
        if not url:
            continue

        name_element = product.find("span", class_="a-size-medium")
        name = name_element.text.strip() if name_element is not None else ""

        price_element = product.find("span", class_="a-offscreen")
        price = price_element.text.strip() if price_element is not None else ""

        # Keep only the first whitespace-separated token of the rating text;
        # an empty text yields "" rather than an IndexError.
        rating_element = product.find("span", class_="a-icon-alt")
        rating_tokens = rating_element.text.split() if rating_element is not None else []
        rating = rating_tokens[0] if rating_tokens else ""

        # NOTE(review): this is the first span.a-size-base inside the card,
        # which may not be the review count -- verify against the page markup.
        reviews_element = product.find("span", class_="a-size-base")
        reviews = reviews_element.text.strip() if reviews_element is not None else ""

        data.append({
            "URL": url,
            "Name": name,
            "Price": price,
            "Rating": rating,
            "Reviews": reviews
        })

# Part 2: Scraping product URLs for additional information
#
# Fetches the detail page of each listing row in `data` (capped at 200) and
# appends one dict per successfully fetched page to `additional_data`, with
# keys URL, Description, ASIN, Product Description and Manufacturer. A field
# whose element is not found on the page is stored as "".


def _element_text(element):
    """Return the stripped text of a BeautifulSoup element, or "" if it is None."""
    return element.text.strip() if element is not None else ""


additional_data = []

for item in data[:200]:  # Limiting to 200 URLs for demonstration purposes
    url = "https://www.amazon.in" + item["URL"]

    # requests applies no timeout by default; a failed page is reported and
    # skipped so one bad URL does not abort the remaining fetches.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"{url}: request failed ({exc}); skipping")
        continue
    if not response.ok:
        print(f"{url}: HTTP {response.status_code}; skipping")
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    description = _element_text(soup.find("div", {"id": "feature-bullets"}))

    # `string=` is the current name of the deprecated `text=` argument. The
    # callable also accepts header cells whose text is padded with whitespace;
    # it receives the tag's .string, which is None for tags with mixed content.
    asin_header = soup.find(
        "th", string=lambda s: s is not None and s.strip() == "ASIN"
    )
    # The header may have no following <td>; treat that as a missing ASIN.
    asin_cell = asin_header.find_next_sibling("td") if asin_header is not None else None
    asin = _element_text(asin_cell)

    product_description = _element_text(soup.find("div", {"id": "productDescription"}))
    manufacturer = _element_text(soup.find("a", {"id": "bylineInfo"}))

    additional_data.append({
        "URL": url,
        "Description": description,
        "ASIN": asin,
        "Product Description": product_description,
        "Manufacturer": manufacturer
    })


In [7]:
# Combining data
#
# The detail records in `additional_data` carry absolute URLs
# ("https://www.amazon.in" + href), while the listing rows in `data` hold the
# raw href, so comparing the two directly never matches. Index every listing
# row under both forms and merge through that index. This also replaces the
# nested scan with a single dictionary lookup per detail record.
SITE_ROOT = "https://www.amazon.in"

rows_by_url = {}
for row in data:
    # The raw key keeps this cell idempotent: after one merge the row's "URL"
    # has already been replaced by the absolute form.
    for key in {row["URL"], SITE_ROOT + row["URL"]}:
        rows_by_url.setdefault(key, []).append(row)

for item in additional_data:
    # Records without a matching listing row are ignored, as before.
    for row in rows_by_url.get(item["URL"], ()):
        row.update(item)

# Exporting to CSV
df = pd.DataFrame(data)
df.to_csv("amazon_products.csv", index=False)