In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Storefront root; relative product links found in search results are
# prefixed with this to form absolute URLs.
BASE_URL = 'https://www.amazon.sg'

# Send a desktop-Chrome User-Agent so requests resemble a normal browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
}

# Category name -> search-results URL for that category.
# 'mobile' and 'ipad' are currently disabled; add them to the tuple below
# to scrape them as well.
PRODUCT_DICT = {
    category: f"{BASE_URL}/s?k={category}"
    for category in ('laptop', 'grocery')
}

def _parse_search_result(product):
    """Extract the listing fields from one search-result container.

    Args:
        product: A BeautifulSoup tag for a single
            ``data-component-type="s-search-result"`` element.

    Returns:
        A list ``[title, rating, review_count, price, delivery, link]``.
        Fields that cannot be located fall back to a placeholder string
        ("N/A", "Not found") or, for the price, an empty string.
    """
    title_element = product.find('h2', {'aria-label': True})
    title = title_element['aria-label'] if title_element is not None else "N/A"

    # The rating slot may exist without the inner alt-text span, so both
    # lookups are guarded before reading the text.
    rating_slot = product.find('i', {'data-cy': 'reviews-ratings-slot'})
    rating_span = (
        rating_slot.find('span', {'class': 'a-icon-alt'})
        if rating_slot is not None else None
    )
    rating = rating_span.text if rating_span is not None else "N/A"

    review_div = product.find('div', class_='a-row a-size-small')
    review_tag = (
        review_div.find('a', class_='a-link-normal s-underline-text s-underline-link-text s-link-style')
        if review_div is not None else None
    )
    # The review link may have no <span> child; guard it like the others.
    review_span = review_tag.find('span') if review_tag is not None else None
    review_count = review_span.text.strip() if review_span is not None else "Not found"

    # The price is rendered as up to three sibling spans (symbol, whole,
    # fraction); concatenate whichever ones are present.
    price_parts = (
        product.find('span', class_='a-price-symbol'),
        product.find('span', class_='a-price-whole'),
        product.find('span', class_='a-price-fraction'),
    )
    price = "".join(part.text.strip() for part in price_parts if part is not None)

    delivery_div = product.find('div', {'data-cy': 'delivery-recipe'})
    delivery_span = (
        delivery_div.find('span', {'aria-label': True})
        if delivery_div is not None else None
    )
    delivery = delivery_span['aria-label'] if delivery_span is not None else "Not found"

    link_tag = product.find('a', class_='a-link-normal')
    product_link = link_tag['href'] if link_tag is not None else "Not found"
    # Site-relative links are made absolute; anything else is kept verbatim.
    full_link = BASE_URL + product_link if product_link.startswith("/") else product_link

    return [title, rating, review_count, price, delivery, full_link]


def scrape_product_list(timeout=30):
    """Scrape the search-results page of every category in PRODUCT_DICT.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A pandas DataFrame with one row per search result and the columns
        ``prod_id, prod_type, title, rating, review_count, price, delivery,
        link``. ``prod_id`` is a running counter whose first value is 10001.
        A category whose page cannot be fetched is reported on stdout and
        contributes no rows.
    """
    product_data = []
    prod_id = 10000  # Incremented before use, so the first product gets 10001

    for category, url in PRODUCT_DICT.items():
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping category {category}: request failed ({exc})")
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Each search hit lives in its own 's-search-result' container.
        for product in soup.find_all('div', {'data-component-type': 's-search-result'}):
            prod_id += 1
            product_data.append([prod_id, category, *_parse_search_result(product)])

    return pd.DataFrame(product_data, columns=['prod_id', 'prod_type', 'title', 'rating', 'review_count', 'price', 'delivery', 'link'])


def scrape_product_details(df, timeout=30):
    """Fetch each product page and save its detail table, one CSV per category.

    For every category in PRODUCT_DICT, the rows of ``df`` with a matching
    ``prod_type`` are visited. The key/value rows of the product's overview
    table are collected into a dict keyed by the row label, together with
    the product's ``prod_id``. The collected dicts are written to
    ``amazon_<category>.csv`` via ``save_to_csv``.

    Args:
        df: DataFrame with at least the columns ``prod_id``, ``prod_type``
            and ``link`` (as produced by ``scrape_product_list``).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Products whose page cannot be fetched or whose table cannot
        be found are reported on stdout and skipped.
    """
    for category in PRODUCT_DICT:
        product_details_list = []

        for _, record in df[df['prod_type'] == category].iterrows():
            try:
                # The link may be the "Not found" placeholder or otherwise
                # unusable; requests signals that (and network errors or
                # timeouts) with a RequestException subclass.
                response = requests.get(record['link'], headers=HEADERS, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f"Skipping {record['prod_id']}: request failed ({exc})")
                continue

            soup = BeautifulSoup(response.text, "html.parser")

            parent_div = soup.find('div', class_=['a-section a-spacing-small a-spacing-top-small'])
            if not parent_div:
                print(f"Skipping {record['prod_id']}: Table not found")
                continue

            table = parent_div.find('table', class_='a-normal a-spacing-micro')
            if not table:
                print(f"Skipping {record['prod_id']}: Table not found")
                continue

            product_details = {'prod_id': record['prod_id']}

            for row in table.find_all('tr'):
                key_cell = row.find('td', class_='a-span3')
                value_cell = row.find('td', class_='a-span9')
                # Ignore rows that do not have both a label and a value cell
                # instead of failing on the missing one.
                if key_cell is None or value_cell is None:
                    continue
                product_details[key_cell.get_text(strip=True)] = value_cell.get_text(strip=True)

            product_details_list.append(product_details)

        df_details = pd.DataFrame(product_details_list)
        save_to_csv(df_details, f'amazon_{category}.csv')


def save_to_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV (without the index) and report it.

    Args:
        df: The DataFrame to persist.
        filename: Destination path handed to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=filename, index=False)
    confirmation = f"Saved: {filename}"
    print(confirmation)


def main():
    """Run the full scrape: listing pages first, then per-product details.

    The combined listing is written to ``amazon_products.csv`` before the
    detail pages are visited.
    """
    listing = scrape_product_list()
    save_to_csv(listing, 'amazon_products.csv')
    scrape_product_details(listing)


# Run the scraper only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Saved: amazon_products.csv
Skipping 10001: Table not found
Skipping 10002: Table not found
Skipping 10003: Table not found
Skipping 10004: Table not found
Skipping 10022: Table not found
Skipping 10024: Table not found
Skipping 10029: Table not found
Skipping 10035: Table not found
Skipping 10051: Table not found
Skipping 10055: Table not found
Skipping 10059: Table not found
Saved: amazon_mobile.csv
Skipping 10064: Table not found
Skipping 10069: Table not found
Skipping 10083: Table not found
Skipping 10085: Table not found
Skipping 10089: Table not found
Skipping 10090: Table not found
Skipping 10100: Table not found
Saved: amazon_ipad.csv
