In [74]:
# Cell 1: Import required libraries
import random
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

print("Libraries imported successfully")

Libraries imported successfully


In [75]:
# Cell 2: Request headers, search URL, and page count used by the scraping cells
HEADERS = {
    # Browser-like identification sent with every request.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/139.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# Search-results URL, written one query parameter per line for readability.
BASE_URL = (
    "https://www.amazon.com/s"
    "?k=playstation+5"
    "&crid=3G12O79UMR7B1"
    "&sprefix=playstation+5%2Caps%2C414"
    "&ref=nb_sb_noss_1"
)

# Number of search-result pages to scrape.
TOTAL_PAGES = 5

print("Headers and base URL set")

Headers and base URL set


In [79]:
# Cell 3: Collect product links from each search-results page
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can wait indefinitely

all_links = []
seen_links = set()  # exact URLs already collected, so no product page is fetched twice

for page in range(1, TOTAL_PAGES + 1):
    # Pass the page number as its own query parameter. Concatenating the bare
    # number onto BASE_URL only extended the trailing "ref" value
    # ("...noss_1" + "2"), so the URL never carried a page number at all.
    url = f"{BASE_URL}&page={page}"

    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure and move on instead of parsing an error page
        # or aborting the whole collection run.
        print(f"Page {page}: request failed ({exc})")
        time.sleep(random.uniform(1, 3))
        continue

    soup = BeautifulSoup(response.content, "html.parser")

    # Extract product links
    anchors = soup.select("a.a-link-normal.s-underline-text.s-link-style.a-text-normal")

    page_links = []
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            continue

        # Site-relative links are made absolute; absolute links are kept;
        # anything else (e.g. "javascript:" or fragment-only hrefs) is ignored.
        if href.startswith("/"):
            full_url = "https://www.amazon.com" + href
        elif href.startswith("http"):
            full_url = href
        else:
            continue

        if full_url in seen_links:
            continue
        seen_links.add(full_url)
        page_links.append(full_url)

    all_links.extend(page_links)
    print(f"Page {page}: Found {len(page_links)} links")

    # Random pause between page requests.
    time.sleep(random.uniform(1, 3))

print(f"Total links collected: {len(all_links)}")

Page 1: Found 5 links
Page 2: Found 15 links
Page 3: Found 7 links
Page 4: Found 7 links
Page 5: Found 5 links
Total links collected: 39


In [80]:
# Cell 4: Functions to extract product details
def get_title(soup):
    """Return the product title from a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped text of the ``<span id="productTitle">`` element, or an
        empty string when the page has no such element.
    """
    # An explicit None check replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt and any unrelated error.
    node = soup.find("span", attrs={"id": 'productTitle'})
    if node is None:
        return ""
    return node.get_text(strip=True)

def get_price(soup):
    """Return the price text from a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped text of the first ``<span class="a-price-whole">``
        element, or an empty string when the page has no such element.
    """
    # NOTE(review): only the "a-price-whole" span is read; if the fractional
    # part lives in a separate element it is not included -- confirm whether
    # cents are needed downstream.
    # An explicit None check replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt and any unrelated error.
    node = soup.find("span", attrs={'class': 'a-price-whole'})
    if node is None:
        return ""
    return node.get_text(strip=True)

def get_rating(soup):
    """Return the star-rating text from a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped text of the first ``<span class="a-icon-alt">`` element,
        or an empty string when the page has no such element.
    """
    # NOTE(review): the first "a-icon-alt" span on the page is used; presumably
    # that is the product's own rating -- confirm the class is specific enough.
    # An explicit None check replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt and any unrelated error.
    node = soup.find("span", attrs={'class': 'a-icon-alt'})
    if node is None:
        return ""
    return node.get_text(strip=True)

def get_review_count(soup):
    """Return the review-count text from a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped text of the ``<span id="acrCustomerReviewText">``
        element, or an empty string when the page has no such element.
    """
    # An explicit None check replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt and any unrelated error.
    node = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
    if node is None:
        return ""
    return node.get_text(strip=True)

def get_availability(soup):
    """Return the availability text from a parsed product page.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped text of the first ``<span>`` inside
        ``<div id="availability">``, or ``"Not Available"`` when either the
        div or its span is missing. An existing but empty span yields ``""``.
    """
    # Two lookups can each come back empty, so both are checked explicitly.
    # This replaces the former bare ``except:``, which also swallowed
    # KeyboardInterrupt and any unrelated error.
    container = soup.find("div", attrs={'id': 'availability'})
    status = container.find("span") if container is not None else None
    if status is None:
        return "Not Available"
    return status.get_text(strip=True)

In [81]:
# Cell 5: Scrape product details from all collected links
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can wait indefinitely

data = {"title":[], "price":[], "rating":[], "reviews":[], "availability":[]}

# For a quick test run, iterate over all_links[:10] instead of all_links
# to fetch only the first 10 products.
for idx, link in enumerate(all_links, 1):
    try:
        response = requests.get(link, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # A failed or rejected request yields no usable product page. Skip it
        # (no row is appended, so the columns stay aligned) instead of parsing
        # an error page or aborting the whole run.
        print(f"Skipped {idx}/{len(all_links)} products: {exc}")
    else:
        product_soup = BeautifulSoup(response.content, "html.parser")

        # One value per column for every successfully fetched page.
        data['title'].append(get_title(product_soup))
        data['price'].append(get_price(product_soup))
        data['rating'].append(get_rating(product_soup))
        data['reviews'].append(get_review_count(product_soup))
        data['availability'].append(get_availability(product_soup))

        print(f"Scraped {idx}/{len(all_links)} products")

    # Random pause between product requests, whether or not the last one succeeded.
    time.sleep(random.uniform(1, 2))

Scraped 1/39 products
Scraped 2/39 products
Scraped 3/39 products
Scraped 4/39 products
Scraped 5/39 products
Scraped 6/39 products
Scraped 7/39 products
Scraped 8/39 products
Scraped 9/39 products
Scraped 10/39 products
Scraped 11/39 products
Scraped 12/39 products
Scraped 13/39 products
Scraped 14/39 products
Scraped 15/39 products
Scraped 16/39 products
Scraped 17/39 products
Scraped 18/39 products
Scraped 19/39 products
Scraped 20/39 products
Scraped 21/39 products
Scraped 22/39 products
Scraped 23/39 products
Scraped 24/39 products
Scraped 25/39 products
Scraped 26/39 products
Scraped 27/39 products
Scraped 28/39 products
Scraped 29/39 products
Scraped 30/39 products
Scraped 31/39 products
Scraped 32/39 products
Scraped 33/39 products
Scraped 34/39 products
Scraped 35/39 products
Scraped 36/39 products
Scraped 37/39 products
Scraped 38/39 products
Scraped 39/39 products


In [82]:
# Cell 6: Build the DataFrame, drop rows without a title, and save to CSV
# Built as one chain (no inplace mutation) so re-running the cell always
# starts from `data` and produces the same frame.
df = (
    pd.DataFrame.from_dict(data)
    .replace({'title': ''}, np.nan)   # an empty title marks a page that yielded no product
    .dropna(subset=['title'])
)

# In Kaggle, files saved under /kaggle/working/ show up in Output. That
# directory does not exist elsewhere, so fall back to the current working
# directory instead of failing in to_csv.
kaggle_dir = Path("/kaggle/working")
output_dir = kaggle_dir if kaggle_dir.is_dir() else Path.cwd()
df.to_csv(output_dir / "amazon_data.csv", index=False)

print(f"Scraped {len(df)} products and saved to amazon_data.csv")

Scraped 39 products and saved to amazon_data.csv


In [83]:
# Display the DataFrame built in the previous cell (the cell's last expression is rendered as its output).
df

Unnamed: 0,title,price,rating,reviews,availability
0,$10 -PlayStation Store Gift Card [Digital Code],10.0,4.6 out of 5 stars,"271,535 ratings",Available now
1,PlayStation DualSense® Wireless Controller - W...,51.0,4.5 out of 5 stars,"5,325 ratings",In Stock
2,$150 PlayStation Store Gift Card [Digital Code],150.0,4.6 out of 5 stars,"271,535 ratings",Available now
3,PlayStation 5 Slim Ultra HD Blu-ray Disc Drive...,93.0,4.8 out of 5 stars,847 ratings,
4,007 First Light - Legacy Edition - PlayStation 5,299.0,4.4 out of 5 stars.,,"This item will be released on March 27, 2026."
5,Call of Duty®: Black Ops 7 - PlayStation 5,69.0,4.5 out of 5 stars.,,"This item will be released on November 14, 2025."
6,PS5/PS5 Slim/Pro Stand with Adjustable Cooling...,31.0,4.5 out of 5 stars,"1,951 ratings",In Stock
7,HDMI 2.1 Certified Cable Supports 10K 8K @60Hz...,69.0,4.7 out of 5 stars,"2,183 ratings",In Stock
8,8K Detachable HDMI Fiber Optic Cable 82Feet/25...,68.0,4.2 out of 5 stars,96 ratings,Only 8 left in stock - order soon.
9,"Wireless Gaming Headset for PC, PS5, PS4, Mac,...",24.0,4.4 out of 5 stars,595 ratings,In Stock
