In [1]:
# Import required libraries
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
import pandas as pd
import regex as re
import numpy as np
import requests
import string
import time

# Confirm successful imports
print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
# Set up Selenium Chrome Driver
from selenium.webdriver.chrome.options import Options

# Configure Chrome options
options = Options()
options.add_argument("--headless")  # Run in headless mode
options.add_argument("--disable-gpu")  # Disable GPU acceleration
options.add_argument("--no-sandbox")  # Avoid sandboxing issues
options.add_argument("--start-maximized")

# Provide the path to your chromedriver executable
service = Service("D:\Games\chromedriver-win64\chromedriver-win64\chromedriver.exe")  # Update with your ChromeDriver path
driver = webdriver.Chrome(service=service, options=options)

# Confirm Selenium driver setup
print("Selenium WebDriver set up successfully!")


  service = Service("D:\Games\chromedriver-win64\chromedriver-win64\chromedriver.exe")  # Update with your ChromeDriver path


Selenium WebDriver set up successfully!


In [4]:
# Function to extract Product Title
def get_title(soup):
    try:
        title = soup.find("span", attrs={"id": "productTitle"}).text.strip()
    except AttributeError:
        title = ""
    return title

# Function to extract Product Price
def get_price(soup):
    try:
        price = soup.find("span", attrs={"class": "a-price-whole"}).text.strip()
        fraction = soup.find("span", attrs={"class": "a-price-fraction"})
        if fraction:
            price += fraction.text.strip()
    except AttributeError:
        price = ""
    return price

# Function to extract Product Rating
def get_rating(soup):
    try:
        rating = soup.find("span", attrs={"class": "a-icon-alt"}).text.strip()
    except AttributeError:
        rating = ""
    return rating

# Function to extract Number of Reviews
def get_review_count(soup):
    try:
        reviews = soup.find("span", attrs={"id": "acrCustomerReviewText"}).text.strip()
    except AttributeError:
        reviews = ""
    return reviews

# Function to extract Availability Status
def get_availability(soup):
    try:
        availability = soup.find("div", attrs={"id": "availability"}).find("span").text.strip()
    except AttributeError:
        availability = "Not Available"
    return availability

print("Scraping functions defined successfully!")


Scraping functions defined successfully!


In [5]:
# Set up headers to mimic a browser
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

# Amazon search URL
search_query = "playstation 4"
URL = f"https://www.amazon.com/s?k={search_query.replace(' ', '+')}"

# Use Selenium to load the page
driver.get(URL)
time.sleep(3)  # Wait for the page to load

# Extract HTML content and parse with BeautifulSoup
html_content = driver.page_source
soup = BeautifulSoup(html_content, "html.parser")

# Find product links
product_links = soup.find_all("a", attrs={"class": "a-link-normal s-no-outline"})
links_list = ["https://www.amazon.com" + link["href"] for link in product_links]

print(f"Found {len(links_list)} product links.")


Found 20 product links.


In [6]:
# Initialize a dictionary to store product details
data = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}

# Loop through product links to extract details
for link in links_list[:10]:  # Limit to 10 products for testing
    driver.get(link)
    time.sleep(2)  # Wait for the page to load
    product_soup = BeautifulSoup(driver.page_source, "html.parser")
    
    # Collect product details
    data["title"].append(get_title(product_soup))
    data["price"].append(get_price(product_soup))
    data["rating"].append(get_rating(product_soup))
    data["reviews"].append(get_review_count(product_soup))
    data["availability"].append(get_availability(product_soup))

print("Product details extracted successfully!")


Product details extracted successfully!


In [6]:
# Convert the data dictionary to a Pandas DataFrame
amazon_df = pd.DataFrame(data)

# Replace empty strings with NaN and drop rows with missing titles
amazon_df.replace("", np.nan, inplace=True)
amazon_df.dropna(subset=["title"], inplace=True)

# Save the data to a CSV file
amazon_df.to_csv("amazon_products.csv", index=False)

# Display the DataFrame
print(amazon_df)


                                               title   price  \
0             Sony Playstation PS4 1TB Black Console  200.00   
1  PlayStation®4 Console – Call of Duty® Modern W...  379.90   
2  Playstation Sony 4, 500GB Slim System [CUH-221...  189.99   
3  PlayStation 4 Slim 1TB Console - Marvel's Spid...  200.00   
4                     PlayStation 4 Slim 1TB Console  369.99   
5                     PlayStation 4 Slim 1TB Console  249.99   
6  PlayStation 4 500GB Console - Call of Duty Bla...  180.00   
7  PlayStation 4 Slim 500GB Console - Uncharted 4...  243.99   
8  Sony PlayStation 4 Pro 1TB Console - Black (PS...  200.00   
9    PlayStation 4 Slim 500GB Console [Discontinued]  156.70   

               rating         reviews                         availability  
0  4.6 out of 5 stars   1,570 ratings   Only 1 left in stock - order soon.  
1  4.5 out of 5 stars     121 ratings                             In Stock  
2  4.3 out of 5 stars     374 ratings   Only 8 left in stock - o

In [7]:
# Close the browser session
driver.quit()
print("Selenium WebDriver session closed!")


Selenium WebDriver session closed!
