<div style= "background-color: #170000; padding: 20px; font-size: 20px;">
<h2 style= "color: white;"> Amazon Product Dataset</h2>
<p style="color: white;">Scraping Amazon Data into a Pandas DataFrame Using Python BeautifulSoup</p>
<img src= "https://m.media-amazon.com/images/G/02/gc/designs/livepreview/a_generic_10_uk_noto_email_v2016_uk-main._CB647744966_.png">
</div>

In [1]:
# Import modules

In [2]:
import datetime
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [21]:
# Create a function to extract all the product in the webpage

#Create functions to extract the elements from the webpage
def get_ProductName(soup):
    """Return the product title text from a parsed product page.

    Looks up the ``<span id="productTitle">`` element via ``soup.find`` and
    returns its ``.text`` with surrounding whitespace stripped.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped title, or ``''`` when the lookup raises
        ``AttributeError`` (for example because the span was not found).
    """
    try:
        title_tag = soup.find('span', attrs={'id': 'productTitle'})
        # title_tag is None when the span is absent; .text then raises
        # AttributeError, which is turned into the empty-string fallback.
        return title_tag.text.strip()
    except AttributeError:
        return ''
        
# Function to extract product price
def get_Price(soup):
    """Return the price text from a parsed product page.

    Reads the ``.string`` of the ``<span class="a-offscreen">`` element
    returned by ``soup.find`` and strips surrounding whitespace.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped price text, or ``''`` when any step raises
        ``AttributeError`` (span missing, or its ``.string`` is ``None``).
    """
    try:
        price_tag = soup.find('span', attrs={'class': 'a-offscreen'})
        raw_price = price_tag.string
        return raw_price.strip()
    except AttributeError:
        return ''

# Function to extract customer reviews

def get_Reviews(soup):
    """Return the review text from a parsed product page.

    Reads the ``.string`` of the ``<span class="a-icon-alt">`` element
    returned by ``soup.find`` and strips surrounding whitespace.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped review text, or ``''`` when any step raises
        ``AttributeError`` (span missing, or its ``.string`` is ``None``).
    """
    selector = {'class': 'a-icon-alt'}
    try:
        review_text = soup.find('span', attrs=selector).string
        review_text = review_text.strip()
    except AttributeError:
        # Nothing usable on the page: fall back to an empty value.
        review_text = ''
    return review_text

# Get product availability

def get_Availability(soup):
    """Return the availability text from a parsed product page.

    Finds the ``<div id="availability">`` element, then the ``<span>``
    returned by calling ``find('span')`` on it, and returns that span's
    ``.string`` with surrounding whitespace stripped.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped availability text. ``'Not In Stock'`` is returned
        whenever any step raises ``AttributeError`` -- that is, when the div
        or span is missing or the span's ``.string`` is ``None`` -- so the
        fallback covers every lookup failure, not only a page that says the
        item is unavailable.
    """
    try:
        availability_div = soup.find('div', attrs={'id': 'availability'})
        status_span = availability_div.find('span')
        return status_span.string.strip()
    except AttributeError:
        return 'Not In Stock'

# Extract purchase pattern

def get_QuantitySold(soup):
    """Return the purchase-pattern text from a parsed product page.

    Finds the ``<span id="social-proofing-faceout-title-tk_bought">``
    element, then the ``<span>`` returned by calling ``find('span')`` on it,
    and returns that inner span's ``.string`` with whitespace stripped.

    Args:
        soup: Parsed page object exposing ``find(name, attrs=...)``.

    Returns:
        The stripped text, or ``''`` when any step raises ``AttributeError``
        (outer or inner span missing, or the inner ``.string`` is ``None``).
    """
    outer_selector = {'id': 'social-proofing-faceout-title-tk_bought'}
    try:
        outer_span = soup.find('span', attrs=outer_selector)
        inner_span = outer_span.find('span')
        return inner_span.string.strip()
    except AttributeError:
        return ''


In [22]:
if __name__ == '__main__':
    # Scrape one Amazon search-results page, visit every product link found on
    # it, collect the product details into a DataFrame (amazon_df) and save
    # the result to 'amazon_product_data.csv'.

    # Seconds to wait for each HTTP response. Without a timeout, requests
    # waits indefinitely and a stalled connection would hang this cell.
    REQUEST_TIMEOUT = 30

    # Base used to resolve the product hrefs found on the search page
    BASE_URL = "https://www.amazon.com"

    # Browser-like headers sent with every request to access the webpage
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    # Search-results webpage
    URL = 'https://www.amazon.com/s?k=kitchen+products&_encoding=UTF8&content-id=amzn1.sym.7e3f7953-c549-4c8e-8fbf-1f916b11db3b&pd_rd_r=7b34ce4b-854c-47a4-b92a-a06683bd7907&pd_rd_w=dAGwy&pd_rd_wg=PWLjn&pf_rd_p=7e3f7953-c549-4c8e-8fbf-1f916b11db3b&pf_rd_r=WYHCHQVP8AGBX40AJDB4&ref=pd_hp_d_atf_unk'

    # Fetch the search page. Fail loudly on an HTTP error status so an error
    # response is not parsed as a results page and an empty CSV is not written
    # over a previous good one.
    webpage = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    webpage.raise_for_status()

    # Instantiate the soup object
    soup = BeautifulSoup(webpage.content, 'html.parser')

    # Fetch the product anchors from the search page
    links = soup.find_all('a', attrs={'class': 'a-link-normal s-no-outline'})

    # Store the hrefs, skipping anchors without one (link.get returns None
    # for a missing attribute, which cannot be turned into a URL)
    hrefs = (link.get('href') for link in links)
    links_list = [href for href in hrefs if href]

    # One list per extracted feature; the keys become the DataFrame columns
    heading = {'ProductName': [], 'Price': [], 'Reviews': [], 'Availability': [], 'QuantitySold': []}

    # Loop through to extract product details from each link
    for link in links_list:
        # urljoin resolves relative hrefs against BASE_URL and leaves
        # already-absolute hrefs unchanged
        product_url = urljoin(BASE_URL, link)
        try:
            new_webpage = requests.get(product_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # Best effort: one unreachable product page must not abort the run
            continue
        new_soup = BeautifulSoup(new_webpage.content, 'html.parser')

        # Call the extractor for each feature of interest
        heading['ProductName'].append(get_ProductName(new_soup))
        heading['Price'].append(get_Price(new_soup))
        heading['Reviews'].append(get_Reviews(new_soup))
        heading['Availability'].append(get_Availability(new_soup))
        heading['QuantitySold'].append(get_QuantitySold(new_soup))

    # Create a pandas dataframe for the dataset
    amazon_df = pd.DataFrame.from_dict(heading)
    # If the product name is empty, replace it with NaN. The result is assigned
    # back to the column: calling replace(..., inplace=True) on the selected
    # column is a chained in-place update that does not modify the frame under
    # pandas Copy-on-Write.
    amazon_df['ProductName'] = amazon_df['ProductName'].replace('', np.nan)
    # Drop rows without a product name; copy so the assignment below works on
    # an independent frame
    amazon_df = amazon_df.dropna(subset=['ProductName']).copy()
    # Set the price to 0.00 for rows whose availability is 'Not In Stock'
    amazon_df.loc[amazon_df['Availability'] == 'Not In Stock', 'Price'] = 0.00
    # Save data to csv file
    amazon_df.to_csv('amazon_product_data.csv', header=True, index=False)

In [23]:
amazon_df


Unnamed: 0,ProductName,Price,Reviews,Availability,QuantitySold
0,Fullstar Vegetable Chopper - Food Chopper - On...,$29.97,4.5 out of 5 stars,In Stock,20K+
1,"REALINN Under Sink Organizer and Storage, 2 Pa...",$45.99,4.4 out of 5 stars,In Stock,10K+
2,Ninja BL660 Professional Compact Smoothie & Fo...,$99.99,4.7 out of 5 stars,In Stock,6K+ bought in past month
3,KRAUS Kore 30-Inch Undermount Workstation 16 G...,$377.24,4.6 out of 5 stars,Only 2 left in stock - order soon.,200+ bought
4,"Mueller Pro-Series All-in-One, 12 Blade Mandol...",$39.99,4.5 out of 5 stars,In Stock,6K+ bought
...,...,...,...,...,...
59,Mythinglogic Metal Under Sink Organizers and S...,$49.99,4.2 out of 5 stars,In Stock,
60,Moreborn by Neakasa 12L Larger Capacity Electr...,$499.99,4.5 out of 5 stars,In Stock,50+
61,12Pcs Steel Wool Scrubber Pads for Cleaning Di...,$11.99,4.7 out of 5 stars,In Stock,3K+
62,Universal Kitchen Sink Stopper Strainer Combo ...,$9.99,4.2 out of 5 stars,In Stock,50+ bought in past month


In [24]:
amazon_df.head(5)

Unnamed: 0,ProductName,Price,Reviews,Availability,QuantitySold
0,Fullstar Vegetable Chopper - Food Chopper - On...,$29.97,4.5 out of 5 stars,In Stock,20K+
1,"REALINN Under Sink Organizer and Storage, 2 Pa...",$45.99,4.4 out of 5 stars,In Stock,10K+
2,Ninja BL660 Professional Compact Smoothie & Fo...,$99.99,4.7 out of 5 stars,In Stock,6K+ bought in past month
3,KRAUS Kore 30-Inch Undermount Workstation 16 G...,$377.24,4.6 out of 5 stars,Only 2 left in stock - order soon.,200+ bought
4,"Mueller Pro-Series All-in-One, 12 Blade Mandol...",$39.99,4.5 out of 5 stars,In Stock,6K+ bought
