In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import urljoin
import time
import os
from dotenv import load_dotenv

load_dotenv()

class AmazonScraper:
    """Scrape Amazon India search results and the product pages they link to.

    Every scraped product is stored as a dict in ``self.data`` (see
    ``scrape_product_page`` for the keys). The list is never cleared, so
    repeated calls on the same instance keep accumulating records.
    """

    def __init__(self, timeout=10):
        """Create the HTTP session shared by all requests.

        Args:
            timeout: Seconds to wait for each HTTP response before the
                request is abandoned.
        """
        self.base_url = "https://www.amazon.in"
        self.timeout = timeout
        self.headers = {'Accept-Language': 'en-US, en;q=0.5'}

        # Only override the User-Agent when one is configured. A None header
        # value would be passed to requests instead of a usable string.
        user_agent = os.getenv('user_agent')
        if user_agent:
            self.headers['User-Agent'] = user_agent
        else:
            print("Warning: 'user_agent' environment variable is not set; "
                  "using the default requests User-Agent")

        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.data = []

    def _get_soup(self, url):
        """Fetch a webpage and return it as a BeautifulSoup object.

        Returns:
            The parsed page, or None when the request or parsing failed
            (the error is printed, not raised).
        """
        try:
            # Without a timeout a stalled connection would block the whole run.
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            # Best-effort boundary: report the failure and let the caller
            # skip this page instead of aborting the scrape.
            print(f"Error fetching {url}: {str(e)}")
            return None

    @staticmethod
    def _node_text(node):
        """Return the stripped text of *node*, or None when the node is missing."""
        if node is None:
            return None
        return node.text.strip()

    def _extract_product_links(self, search_url):
        """Extract absolute product links from a search results page.

        Returns:
            A list of URLs in page order; empty when the page could not be
            fetched or contains no matching anchors.
        """
        soup = self._get_soup(search_url)
        if not soup:
            return []

        anchors = soup.find_all(
            "a",
            attrs={'class': 'a-link-normal s-line-clamp-2 s-link-style a-text-normal'},
        )
        hrefs = (anchor.get('href') for anchor in anchors)
        # Search results use relative hrefs; resolve them against the site root.
        return [urljoin(self.base_url, href) for href in hrefs if href]

    def _extract_title(self, soup):
        """Extract the product title, or None when it is not on the page."""
        return self._node_text(soup.find("span", id='productTitle'))

    def _extract_price(self, soup):
        """Extract the whole-number part of the price as a digit string.

        Dots and thousands separators are removed; returns None when the
        price element is not on the page.
        """
        price = self._node_text(soup.find("span", class_='a-price-whole'))
        if price is None:
            return None
        return price.replace('.', '').replace(',', '').strip()

    def _extract_rating(self, soup):
        """Extract the product rating text, or None when it is not on the page."""
        trigger = soup.find("a", class_='a-popover-trigger a-declarative')
        if trigger is None:
            return None
        return self._node_text(trigger.find("span", class_="a-size-base a-color-base"))

    def _extract_num_ratings(self, soup):
        """Extract the number of ratings with the 'rating(s)' suffix removed.

        Returns None when the review-count element is not on the page.
        """
        node = soup.find("span", class_='a-size-base', id='acrCustomerReviewText')
        if node is None:
            return None
        return node.text.replace('ratings', '').replace('rating', '').strip()

    def _extract_manufacturer(self, soup):
        """Extract the brand from the product overview table.

        Scans the ``tr`` rows whose class contains ``po-`` for the one
        labelled "Brand". (The "Manufacturer" entry of the detail bullet
        list is a possible alternative source that is not used here.)

        Returns:
            The last whitespace-separated word of the brand value, or None
            when no usable Brand row exists.
        """
        for row in soup.select('tr[class*="po-"]'):
            label_span = row.find('span', class_='a-text-bold')
            if label_span is None:
                # A row without a label must not abort the search for Brand.
                continue
            if 'Brand' not in label_span.get_text(strip=True):
                continue

            value_span = row.find('span', class_='po-break-word')
            if value_span is None:
                return None
            words = value_span.get_text(strip=True).split()
            # NOTE(review): keeping only the last word assumes the format
            # "XXX Brand"; multi-word brands are truncated — confirm intent.
            return words[-1] if words else None

        return None

    def _extract_about_item(self, soup):
        """Extract the 'About this item' bullet points.

        Returns:
            The bullet texts joined by blank lines, or None when the
            section is missing or empty.
        """
        ul = soup.find('ul', class_='a-unordered-list a-vertical a-spacing-mini')
        if ul is None:
            return None

        about_items = []
        for li in ul.find_all('li', class_='a-spacing-mini'):
            item = li.find('span', class_='a-list-item')
            if item is None:
                continue
            text = item.get_text(strip=True).strip('"').replace('\n', ' ').strip()
            about_items.append(text)
        return "\n\n".join(about_items) if about_items else None

    def scrape_product_page(self, product_url):
        """Scrape all information from a single product page.

        Returns:
            A dict with the keys ``title``, ``price``, ``rating``,
            ``num_ratings``, ``manufacturer``, ``about_item`` and
            ``product_url`` (missing fields are None), or None when the
            page could not be fetched.
        """
        soup = self._get_soup(product_url)
        if not soup:
            return None

        return {
            'title': self._extract_title(soup),
            'price': self._extract_price(soup),
            'rating': self._extract_rating(soup),
            'num_ratings': self._extract_num_ratings(soup),
            'manufacturer': self._extract_manufacturer(soup),
            'about_item': self._extract_about_item(soup),
            'product_url': product_url
        }

    def scrape_search_pages(self, query, pages=18):
        """Scrape the products listed on multiple search result pages.

        Args:
            query: Search terms exactly as they should appear in the ``k``
                URL parameter (e.g. ``'photo+camera'``).
            pages: Number of result pages to walk, starting at page 1.

        Returns:
            A DataFrame built from every record in ``self.data``.
        """
        for page in range(1, pages + 1):
            print(f"Scraping page {page}...")
            search_url = f"{self.base_url}/s?k={query}&page={page}"
            product_links = self._extract_product_links(search_url)

            for link in product_links:
                product_data = self.scrape_product_page(link)
                if product_data:
                    self.data.append(product_data)
                time.sleep(1)  # Respectful delay

            # Save progress after each page
            self.save_to_csv()
            print(f"Page {page} completed. Total products collected: {len(self.data)}")

        return pd.DataFrame(self.data)

    def save_to_csv(self, filename='amazon_products2.csv'):
        """Save all data collected so far to *filename*, overwriting it."""
        df = pd.DataFrame(self.data)
        df.to_csv(filename, index=False)

# Example run: walk the search result pages for one query and report completion.
if __name__ == "__main__":
    scraper = AmazonScraper()
    df = scraper.scrape_search_pages('photo+camera', 18)
    print("Scraping completed. Data saved to amazon_products2.csv")

Scraping page 1...
Page 1 completed. Total products collected: 22
Scraping page 2...
Page 2 completed. Total products collected: 44
Scraping page 3...
Page 3 completed. Total products collected: 66
Scraping page 4...
Page 4 completed. Total products collected: 88
Scraping page 5...
Page 5 completed. Total products collected: 110
Scraping page 6...
Page 6 completed. Total products collected: 132
Scraping page 7...
Page 7 completed. Total products collected: 154
Scraping page 8...
Page 8 completed. Total products collected: 176
Scraping page 9...
Page 9 completed. Total products collected: 198
Scraping page 10...
Page 10 completed. Total products collected: 220
Scraping page 11...
Page 11 completed. Total products collected: 242
Scraping page 12...
Page 12 completed. Total products collected: 264
Scraping page 13...
Page 13 completed. Total products collected: 286
Scraping page 14...
Page 14 completed. Total products collected: 308
Scraping page 15...
Page 15 completed. Total products co

In [11]:
# Preview the first rows of the DataFrame (head() shows five rows by default).
df.head()

Unnamed: 0,title,price,rating,num_ratings,manufacturer,about_item,product_url
0,KODAK Mini Shot 2 Retro 4PASS 2-in-1 Instant C...,13699,3.9,5625,KODAK,Instant Camera + Photo Printer: The Kodak Mini...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
1,Fujifilm Instax Square SQ1 Camera - Glacier Blue,10499,4.2,2780,instax,Auto Exposure - Capture bright photos even in ...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
2,"TOYTONIC Kids Digital Camera in Pink with 3MP,...",828,3.9,72,TOYTONIC,📸 3MP High-Quality Camera: Capture sharp and v...,https://www.amazon.in/TOYTONIC-Digital-Camera-...
3,Toy Imagine Kids Digital Camera with 1080P Vid...,823,3.9,9,Imagine,📷 1080P Video & 1200W Photo: Capture precious ...,https://www.amazon.in/Toy-Imagine-Rechargeable...
4,"Focusify Digital Camera, 4K Ultra HD Cameras f...",2999,2.9,56,Focusify,📷[Exceptional Image Resolution]: Elevate your ...,https://www.amazon.in/Focusify-Digital-Photogr...


In [12]:
# Report the DataFrame dimensions as a (rows, columns) tuple.
df.shape

(404, 7)

In [3]:
# Show the full, untruncated URL stored under index label 4 of the product_url column.
df["product_url"][4]

'https://www.amazon.in/Focusify-Digital-Photography-Autofocus-Beginners/dp/B0D5H9XTZW/ref=sr_1_5?dib=eyJ2IjoiMSJ9.hctMKIYoKq56aCH6wScfTICo9k6ut27AzkJhwt7iJiGQubQcUcRcOTPbkuFm7DfAoRubvtlyU2NkORiCVWsH6crqx3mwrdAqXYRtwiwsCU3z5zv3m4KGx2cymsEM_UVOiWAByv0fGTSrO8a7IiEqWfh73z9XZcvU07KhLtZkKWniYbdoSthGCW3i71dkAXKAD-TDrx_EREH6rkudglvXnD-dfSwTDO8h1U96OzTl60Y.KN-tPdILM3CRCr5rSWc6tPeCEZLTRXGVDqYe_AVb1mA&dib_tag=se&keywords=photo+camera&qid=1744060436&sr=8-5'

In [13]:
# Preview the first ten rows of the DataFrame.
df.head(10)

Unnamed: 0,title,price,rating,num_ratings,manufacturer,about_item,product_url
0,KODAK Mini Shot 2 Retro 4PASS 2-in-1 Instant C...,13699,3.9,5625,KODAK,Instant Camera + Photo Printer: The Kodak Mini...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
1,Fujifilm Instax Square SQ1 Camera - Glacier Blue,10499,4.2,2780,instax,Auto Exposure - Capture bright photos even in ...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
2,"TOYTONIC Kids Digital Camera in Pink with 3MP,...",828,3.9,72,TOYTONIC,📸 3MP High-Quality Camera: Capture sharp and v...,https://www.amazon.in/TOYTONIC-Digital-Camera-...
3,Toy Imagine Kids Digital Camera with 1080P Vid...,823,3.9,9,Imagine,📷 1080P Video & 1200W Photo: Capture precious ...,https://www.amazon.in/Toy-Imagine-Rechargeable...
4,"Focusify Digital Camera, 4K Ultra HD Cameras f...",2999,2.9,56,Focusify,📷[Exceptional Image Resolution]: Elevate your ...,https://www.amazon.in/Focusify-Digital-Photogr...
5,TAMRAKTU PU Leather Protective Camera Case Pou...,799,3.3,3,,✨Stylish Protection: This PU leather case is f...,https://www.amazon.in/TAMRAKTU-PU-Leather-Prot...
6,Fujifilm Instax Mini 11 Instant Camera (Sky Blue),5998,4.2,29144,Fujifilm,Live Life and Play with five stylish colors\n\...,https://www.amazon.in/Fujifilm-Instax-Mini-Ins...
7,"Acuvar Digital Camera, 16 Megapixel Photo Came...",7721,3.2,40,Acuvar,GREAT CAMERA QUALITY FOR BEGINNERS AND KIDS: T...,https://www.amazon.in/Acuvar-Digital-Megapixel...
8,"Saneen Digital Camera for Photography, 4K 64MP...",9999,4.1,477,Saneen,4 K & 64 MP Camera for Photography：Explore a v...,https://www.amazon.in/Saneen-Digital-Photograp...
9,"One Click in Kids Digital Camera with 3MP, HD ...",622,5.0,8,Generic,📸 3MP High-Quality Camera: Capture sharp and v...,https://www.amazon.in/Digital-Camera-Screen-Se...


In [15]:
# Keep only the rows whose manufacturer value is exactly "Sony" (case-sensitive match).
df[df["manufacturer"]=="Sony"]

Unnamed: 0,title,price,rating,num_ratings,manufacturer,about_item,product_url
19,Sony Alpha ILCE 6100L 24.2 MP Mirrorless Digit...,61490,4.5,445,Sony,Real time eye AF for still + Animal Eye AF;Rea...,https://www.amazon.in/Sony-Mirrorless-Real-tim...
37,Sony Alpha ZV-E10L 24.2 Mega Pixel Interchange...,61490,4.4,549,Sony,Interchangeable-lens camera for vlogging\n\nLa...,https://www.amazon.in/Sony-ZV-E10L-Interchange...
71,Sony Alpha ILCE 6100L 24.2 MP Mirrorless Digit...,79920,4.6,3,Sony,Real time eye AF for still + Animal Eye AF;Rea...,https://www.amazon.in/Sony-Mirrorless-Tiltable...
93,Sony Alpha ILCE-6400L 24.2MP Mirrorless Camera...,73963,4.5,676,Sony,Real time eye AF and real time tracking;World ...,https://www.amazon.in/Sony-ILCE-6400L-Mirrorle...
94,Sony New Alpha ILCE-6100X (Previously ILCE-610...,73990,4.5,445,Sony,Real time eye AF for still + Animal Eye AF\n\n...,https://www.amazon.in/Sony-ILCE-6100X-Previous...
112,Sony Digital Camera ZV-1F for Content Creators...,45019,3.8,39,Sony,"Ultra-wide-angle 20 mm prime lens, ideal for g...",https://www.amazon.in/Sony-Creators-Vloggers-U...
136,"Sony Digital Camera ZV-1 Only (Compact, Video ...",58990,4.2,313,Sony,"20.1 MP stacked back illuminated 1"" Exmor RS C...",https://www.amazon.in/Sony-ZV-1-Microphone-Vlo...
