In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time

# Function to extract Product Title
def get_title(soup):
    try:
        title = soup.find("span", attrs={"id": 'productTitle'}).text.strip()
    except AttributeError:
        title = ""
    return title

# Function to extract Product Price
def get_price(soup):
    try:
        price = soup.find("span", attrs={'class': 'a-price-whole'}).text.strip()
        # Check if fractional price exists (e.g., ".99")
        fraction = soup.find("span", attrs={'class': 'a-price-fraction'})
        if fraction:
            price += fraction.text.strip()
    except AttributeError:
        price = ""
    return price

# Function to extract Product Rating
def get_rating(soup):
    try:
        rating = soup.find("span", attrs={'class': 'a-icon-alt'}).text.strip()
    except AttributeError:
        rating = ""
    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id': 'acrCustomerReviewText'}).text.strip()
    except AttributeError:
        review_count = ""
    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id': 'availability'}).find("span").text.strip()
    except AttributeError:
        available = "Not Available"
    return available

if __name__ == '__main__':
    # Set up Selenium WebDriver
    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    service = Service(r"D:\Games\chromedriver-win64\chromedriver-win64\chromedriver.exe")  # Replace with the path to your ChromeDriver
    driver = webdriver.Chrome(service=service, options=options)

    # URL to scrape
    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"

    # Open the URL in Selenium
    driver.get(URL)
    time.sleep(5)  # Allow time for the page to load completely

    # Get the page source and parse with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Fetch product links
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})
    links_list = ["https://www.amazon.com" + link.get('href') for link in links]

    # Data dictionary
    data = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}

    # Loop through each product link
    for link in links_list:
        driver.get(link)
        time.sleep(3)  # Allow time for the product page to load

        # Parse the product page
        product_soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract product details
        data['title'].append(get_title(product_soup))
        data['price'].append(get_price(product_soup))
        data['rating'].append(get_rating(product_soup))
        data['reviews'].append(get_review_count(product_soup))
        data['availability'].append(get_availability(product_soup))

    # Close the WebDriver
    driver.quit()

    # Create DataFrame and save to CSV
    amazon_df = pd.DataFrame(data)
    amazon_df['title'].replace('', np.nan, inplace=True)
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)

    print("Data scraped and saved to amazon_data.csv!")


Data scraped and saved to amazon_data.csv!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  amazon_df['title'].replace('', np.nan, inplace=True)


In [6]:
print(amazon_df)

                                                title   price  \
0              Sony Playstation PS4 1TB Black Console  200.00   
1   Playstation Sony 4, 500GB Slim System [CUH-221...  189.99   
2   PlayStation®4 Console – Call of Duty® Modern W...  379.90   
3   PlayStation 4 Slim 1TB Console - Marvel's Spid...  200.00   
4                      PlayStation 4 Slim 1TB Console  369.99   
5                      PlayStation 4 Slim 1TB Console  249.99   
6   PlayStation 4 500GB Console - Call of Duty Bla...  180.00   
7   Sony PlayStation 4 Pro 1TB Console - Black (PS...  200.00   
8     PlayStation 4 Slim 500GB Console [Discontinued]  156.70   
9   FASIGO PS4 Controller 2 Pack, Wireless PS4 Con...   31.98   
10  PlayStation 4 Slim 1TB Limited Edition Console...  299.99   
11  PlayStation 4 Slim 500GB Console - Uncharted 4...  244.95   
12  PS4 Controller Charger Dock Station, OIVO 1.8H...   14.99   
13    $250 PlayStation Store Gift Card [Digital Code]  250.00   
14  Sony Playstation 4 Sl

In [7]:
print(amazon_df.info())  # Shows column types and non-null counts


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         19 non-null     object
 1   price         19 non-null     object
 2   rating        19 non-null     object
 3   reviews       19 non-null     object
 4   availability  19 non-null     object
dtypes: object(5)
memory usage: 892.0+ bytes
None


In [8]:
from tabulate import tabulate

# Check the DataFrame structure
print(amazon_df)

# Print the table in a formatted way
print(tabulate(amazon_df, headers='keys', tablefmt='grid'))


                                                title   price  \
0              Sony Playstation PS4 1TB Black Console  200.00   
1   Playstation Sony 4, 500GB Slim System [CUH-221...  189.99   
2   PlayStation®4 Console – Call of Duty® Modern W...  379.90   
3   PlayStation 4 Slim 1TB Console - Marvel's Spid...  200.00   
4                      PlayStation 4 Slim 1TB Console  369.99   
5                      PlayStation 4 Slim 1TB Console  249.99   
6   PlayStation 4 500GB Console - Call of Duty Bla...  180.00   
7   Sony PlayStation 4 Pro 1TB Console - Black (PS...  200.00   
8     PlayStation 4 Slim 500GB Console [Discontinued]  156.70   
9   FASIGO PS4 Controller 2 Pack, Wireless PS4 Con...   31.98   
10  PlayStation 4 Slim 1TB Limited Edition Console...  299.99   
11  PlayStation 4 Slim 500GB Console - Uncharted 4...  244.95   
12  PS4 Controller Charger Dock Station, OIVO 1.8H...   14.99   
13    $250 PlayStation Store Gift Card [Digital Code]  250.00   
14  Sony Playstation 4 Sl

In [9]:
from tabulate import tabulate

# Convert DataFrame to a tabular format
print(tabulate(amazon_df, headers='keys', tablefmt='grid'))


+----+---------------------------------------------------------------------------------------------------------------------------------------------------+---------+--------------------+-----------------+------------------------------------+
|    | title                                                                                                                                             |   price | rating             | reviews         | availability                       |
|  0 | Sony Playstation PS4 1TB Black Console                                                                                                            |  200    | 4.6 out of 5 stars | 1,570 ratings   |                                    |
+----+---------------------------------------------------------------------------------------------------------------------------------------------------+---------+--------------------+-----------------+------------------------------------+
|  1 | Playstation Sony 4, 500GB Sli