In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Function to extract Product Title
def get_title(soup):
    try:
        title = soup.find("span", attrs={"id":'productTitle'})
        title_value = title.text
        title_string = title_value.strip()
    except AttributeError:
        title_string = ""
    return title_string

# Function to extract Product Price
def get_price(soup):
    try:
        price = soup.find("span", attrs={'id':'priceblock_ourprice'}).string.strip()
    except AttributeError:
        try:
            price = soup.find("span", attrs={'id':'priceblock_dealprice'}).string.strip()
        except:
            price = ""
    return price

# Function to extract Product Rating
def get_rating(soup):
    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""    
    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()
    except AttributeError:
        review_count = ""    
    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id':'availability'})
        available = available.find("span").string.strip()
    except AttributeError:
        available = "Not Available"    
    return available

# Function to extract Customer Name from Reviews
def get_customer_names(soup):
    try:
        names = soup.find_all("span", attrs={"class":"a-profile-name"})
        customer_names = [name.text.strip() for name in names]
    except AttributeError:
        customer_names = []
    return customer_names

# Function to extract Comment Tags from Reviews
def get_comment_tags(soup):
    try:
        tags = soup.find_all("span", attrs={"class":"a-size-base a-color-secondary"})
        comment_tags = [tag.text.strip() for tag in tags]
    except AttributeError:
        comment_tags = []
    return comment_tags

if __name__ == '__main__':
    HEADERS = {'User-Agent': 'your-user-agent', 'Accept-Language': 'en-US, en;q=0.5'}
    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"
    
    # HTTP Request
    try:
        webpage = requests.get(URL, headers=HEADERS)
        webpage.raise_for_status()  # To check if request was successful
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        exit()

    soup = BeautifulSoup(webpage.content, "html.parser")
    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})
    links_list = [link.get('href') for link in links]

    d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": [], "customer_names": [], "comment_tags": []}

    # Loop for extracting product details from each link
    for link in links_list:
        try:
            new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)
            new_webpage.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {link}: {e}")
            continue

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")
        
        # Function calls to display all necessary product information
        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))
        d['customer_names'].append(get_customer_names(new_soup))
        d['comment_tags'].append(get_comment_tags(new_soup))

    # Convert to DataFrame and clean
    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['title'].replace('', np.nan, inplace=True)
    amazon_df = amazon_df.dropna(subset=['title'])

    amazon_df.to_csv("amazon_data.csv", header=True, index=False)
    print("Data saved to amazon_data.csv")


Data saved to amazon_data.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  amazon_df['title'].replace('', np.nan, inplace=True)


In [4]:
amazon_df

Unnamed: 0,title,price,rating,reviews,availability,customer_names,comment_tags
0,PlayStation 4 Slim 1TB Console,,4.6 out of 5 stars,"15,804 ratings",Not Available,"[PS4 Slim Module5, , , , , , , , , , , Amazon ...","[15,804 global ratings]"
1,PlayStation®4 Console – Call of Duty® Modern W...,,4.5 out of 5 stars,158 ratings,Not Available,[Sony Playstation 4 PS4 Console (BEFORE YOU PU...,[158 global ratings]
2,Flagship Play Station 4 1TB HDD Only on Playst...,,4.5 out of 5 stars,214 ratings,Not Available,"[Amazon Customer, Manuel, Jami, Samuel Rivera,...",[214 global ratings]
3,DualShock 4 Wireless Controller for PlayStatio...,,4.6 out of 5 stars,"138,111 ratings",Not Available,[DualShock 4 Wireless Controller for PlayStati...,"[Color:, 138,111 global ratings]"
4,PlayStation 4 Jet Black (CUH-1200AB01) [Japan ...,,3.8 out of 5 stars,866 ratings,Usually ships within 6 to 7 days,"[Amazon Customer, Andrew Z., netnopaewx, Melan...",[866 global ratings]
5,"PS4 Controller Charger Dock Station, OIVO 1.8H...",,4.7 out of 5 stars,"40,893 ratings",Not Available,"[PS4 Controller Charger Station, , , , , , , ,...","[Color:, 40,893 global ratings]"
6,Wuthur 2 Pack Wireless Controller Compatible w...,,4.3 out of 5 stars,483 ratings,Not Available,[Wuthur 2 Pack Wireless Controller Compatible ...,[483 global ratings]
7,"PS4 Controller Charger Dock Station, PS4 Remot...",,4.7 out of 5 stars,"3,749 ratings",Not Available,"[PS4 Charger Station, , , , , , , , Amazon Cus...","[Color:, 3,749 global ratings]"
8,Mega Man Legacy Collection 2 - PlayStation 4,,4.6 out of 5 stars,503 ratings,In Stock,"[Amazon Customer, Amazon Customer, Rashaud, Ra...",[$18.83 Shipping & Import Fees Deposit to Indi...
9,SHINXIN 2 Pack Wireless Controller Compatible ...,,4.3 out of 5 stars,588 ratings,Not Available,"[2 Pack Wireless Controller, , Amazon Customer...","[Color:, 588 global ratings]"
