# Amazon Product Scraping

We're extracting data on gaming laptops from the Amazon India website (https://www.amazon.in) and storing it in a .csv file.

You can perform data analysis on it and even pick the best-suited laptop for yourself 😉

We're using the BeautifulSoup library to extract the data from the website.

In [125]:
# Ensure that you have both beautifulsoup and requests installed
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import numpy as np
import re

In [126]:
# Some of the functions below need to remove the "#&lrm;" marker from the extracted strings

# Function to extract Laptop Title
def get_title(soup):
    """Return the product title found on a product page.

    The title is read from the ``<span id="productTitle">`` element, trimmed
    of surrounding whitespace, cleared of any "#&lrm;" marker and reduced to
    its ASCII characters. An empty string is returned when the title cannot
    be read for any reason.
    """
    try:
        title_span = soup.find("span", attrs={"id": 'productTitle'})
        trimmed = title_span.text.strip()
        without_marker = trimmed.replace("#&lrm;", "")
        # Round-trip through ASCII to discard every character outside that range.
        return without_marker.encode("ascii", "ignore").decode('ascii')
    except Exception:
        return ""

# Extracting the Laptop Image
def get_image(soup):
    """Return the ``src`` URL of the product image.

    The image is the ``<img>`` inside the ``<div id="imgTagWrapperId">``
    wrapper. "Not Available" is returned when the wrapper, the image tag or
    its ``src`` attribute cannot be read.
    """
    try:
        wrapper = soup.find("div", attrs={"id": "imgTagWrapperId"})
        return wrapper.img['src']
    except Exception:
        return "Not Available"

# Function to extract Laptop's Price
def get_price(soup):
    """Return the laptop's price as a string without thousands separators.

    The deal price (``a-price-whole`` span) is tried first. If it cannot be
    read, the price is taken from an ``a-offscreen`` span with its leading
    currency symbol removed. Commas and non-ASCII characters are stripped in
    both cases. "Not Available" is returned when neither lookup succeeds.
    """
    # ``div`` is None when the price container is absent; both lookups below
    # then fail with AttributeError and the fallback value is returned.
    div = soup.find("div", attrs={'id': 'corePriceDisplay_desktop_feature_div'})

    try:
        # Getting Deal Price
        internal_div = div.find("div", attrs={"class": "a-section a-spacing-none aok-align-center"})
        whole = internal_div.find("span", attrs={"class": "a-price-whole"})
        return whole.string.strip().replace(",", '').encode("ascii", "ignore").decode('ascii')
    except Exception:
        # No readable deal price: fall through to the regular price lookup.
        pass

    try:
        # If there is No Deal Price, then price
        # NOTE(review): select() takes a CSS selector; the ``attrs`` keyword
        # looks like it is not used for filtering, so this probably picks the
        # second <span> under ``div`` - confirm against the bs4 docs.
        internal_div = div.select("span", attrs={'class': 'a-section a-spacing-none aok-align-center'})[1]
        offscreen = internal_div.find("span", attrs={"class": "a-offscreen"})
        # [1:] drops the leading currency symbol.
        return offscreen.string.strip()[1:].replace(",", '').encode("ascii", "ignore").decode('ascii')
    except Exception:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long scraping run.
        return "Not Available"

# Function to extract Laptop's Rating
def get_rating(soup):
    """Return the first space-separated token of the rating text.

    The text is read from the span carrying ``data-hook="rating-out-of-text"``.
    "0" is returned when the span is missing or cannot be parsed.
    """
    rating_attrs = {"class": "a-size-medium a-color-base", "data-hook": "rating-out-of-text"}
    try:
        rating_span = soup.find("span", attrs=rating_attrs)
        words = rating_span.text.strip().split(" ")
        return words[0]
    except Exception:
        return "0"

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review count with thousands separators removed.

    The count is the first space-separated token of the
    ``<span id="acrCustomerReviewText">`` string. "0" is returned when the
    span is missing or its string cannot be read.
    """
    try:
        review_span = soup.find("span", attrs={"id": "acrCustomerReviewText"})
        first_token = review_span.string.strip().split(" ")[0]
        return first_token.replace(",", '')
    except Exception:
        return "0"

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={"id": "availability"}).text.strip().encode("ascii", "ignore").decode('ascii')
        
    except Exception as e:
        available = "Not Available"
        
    return available

def get_technical_details(df, soup):
    """Append one row of technical specifications to ``df`` and return it.

    The specifications are read from the table with id
    ``productDetails_techSpec_section_1``: ``<th>`` cells supply the column
    names and ``<td>`` cells the values, paired in order. Values have any
    "#&lrm;" marker removed and are reduced to ASCII characters.

    Exactly one row is appended per call. When the page has no such table
    the row is blank, so the returned frame stays row-aligned with any
    per-product lists a caller fills alongside it.

    Parameters:
        df: DataFrame accumulated so far (may be empty).
        soup: parsed product page.

    Returns:
        A new DataFrame consisting of ``df`` plus one row, re-indexed from 0.
    """
    table = soup.find("table", attrs={"id": "productDetails_techSpec_section_1"})

    dictionary = {}
    if table is not None:
        columns = table.find_all("th", attrs={"class": "a-color-secondary a-size-base prodDetSectionEntry"})
        rows = table.find_all("td", attrs={"class": "a-size-base prodDetAttrValue"})
        # get_text() also works for cells that contain nested tags, where
        # ``.string`` would be None.
        column_names = [column.get_text().strip() for column in columns]
        row_values = [
            row.get_text().strip().replace("#&lrm;", "").encode("ascii", "ignore").decode('ascii')
            for row in rows
        ]
        dictionary = dict(zip(column_names, row_values))

    # A one-element list yields a single row even when the dict is empty.
    dataframe = pd.DataFrame([dictionary])
    return pd.concat([df, dataframe], ignore_index=True)

In [127]:
if __name__ == '__main__':
    # Search-result pages to crawl. The original author noted 16 products per
    # results page and over 1000 laptops in total; only the first 5 pages are
    # visited here.
    domain = "https://www.amazon.in"
    url_links = [
        domain + f"/s?k=gaming+laptop&page={page}&crid=2HIK3N7L03AWH&qid=1680982200&sprefix=%2Caps%2C683&ref=sr_pg_{page}"
        for page in range(1, 6)
    ]

    # One list per output column; every product appends one entry to each.
    data = {
        "Title": [],
        "Image": [],
        "Price(Rs)": [],
        "Rating": [],
        "Number of Reviews": [],
        "Availability": [],
        "Link": [],
    }
    technical_data = pd.DataFrame()

    # Add your user agent
    # Go to Whatismybrowser.com then Homepage -> User Agents -> Parse User Agent
    # The same headers are sent with every request, so they are built once.
    Header = ({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})

    # Column name -> function that extracts that column from a product page,
    # listed in the order in which the columns are filled.
    extractors = (
        ('Title', get_title),
        ('Image', get_image),
        ('Price(Rs)', get_price),
        ('Rating', get_rating),
        ('Number of Reviews', get_review_count),
        ('Availability', get_availability),
    )

    for url in url_links:
        # Download the search-result page and parse its raw content.
        website = requests.get(url, headers=Header)
        soup = bs(website.content, "html.parser")

        # Anchor tags of the individual products, reduced to their hrefs.
        links = soup.find_all("a", class_="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal")
        link_list = [link['href'] for link in links]

        # Visit every product page and record its details.
        for link in link_list:
            product_url = domain + link
            new_webpage = requests.get(product_url, headers=Header)
            new_soup = bs(new_webpage.content, "html.parser")

            for field, extractor in extractors:
                data[field].append(extractor(new_soup))
            data['Link'].append(product_url)
            technical_data = get_technical_details(technical_data, new_soup)

In [128]:
# Lift the limit on displayed columns so wide frames are shown in full.
pd.set_option("display.max_columns", None)
# Preview the first two rows of the scraped technical details.
technical_data.head(2)

Unnamed: 0,Brand,Manufacturer,Series,Colour,Form Factor,Item Height,Item Width,Standing screen display size,Resolution,Product Dimensions,Batteries,Item model number,Processor Brand,Processor Type,Processor Speed,Processor Count,RAM Size,Memory Technology,Maximum Memory Supported,Memory Clock Speed,Hard Drive Size,Hard Disk Description,Audio Details,Speaker Description,Graphics Coprocessor,Graphics Chipset Brand,Graphics Card Description,Graphics RAM Type,Graphics Card Interface,Number of USB 3.0 Ports,Number of HDMI Ports,Operating System,Are Batteries Included,Lithium Battery Energy Content,Number of Lithium Ion Cells,Included Components,Country of Origin,Item Weight,Model,Model Name,Model Year,Memory Storage Capacity,Memory Slots Available,Flash Memory Installed Size,Ram Memory Installed Size,Ram Memory Technology,Computer Memory Type,Hard Drive Interface,Optical Drive Type,Processor model number,Hardware Platform,Hardware Interface,Graphics Card Ram Size,Compatible Devices,Special Features,Mounting Hardware,Number of items,Software included,Display Technology,Display Type,Audio Output Type,Power Source,Batteries Included,Batteries Required,Battery cell composition,Wireless Type,Refresh Rate,Total USB ports,Keyboard Description,Connector Type,Device interface - primary,Device type,Connectivity Type,Wattage,RAM memory maximum size,Image contrast ratio,Aspect Ratio,Screen Resolution,Microphone format,Microphone technology,Voltage,Battery Average Life,Digital storage capacity,Number of Ports,Response Time,Average Battery Life (in hours),Includes Rechargeable Battery,Number of Lithium Metal Cells,Does it contain liquid?,Data Transfer Rate,Has Auto Focus,Programmable Buttons,Number of USB 2.0 Ports,Hard Disk Rotational Speed,Average Battery Standby Life (in hours),Number of Audio-out Ports,Number of Ethernet Ports,Number of Microphone Ports,Supported Software,Battery Charge Cycles,Lithium Battery Weight,Cellular technology,Package Dimensions,Battery Average Life 
Standby,Card Reader,Imported By
0,HP,HP,15-fb0777TX,Mica Silver,Netbook,23 Millimeters,24 Centimeters,39.6 Centimetres,1920x1080 Pixels,35.8 x 24 x 2.3 cm; 2.37 Kilograms,1 Lithium Ion batteries required. (included),15-fb0777TX,AMD,Ryzen 5,4.2 GHz,1,8 GB,DDR4,16 GB,3200 MHz,512 GB,SSD,"Headphones, Speakers","Audio by B&O, Dual speakers",AMD Radeon RX 6500M,AMD,Dedicated,GDDR6,PCI Express,2.0,1.0,Windows 11 Home,Yes,52.5 Watt Hours,3.0,"Laptop, Power Adapter, User Manual",China,2 kg 370 g,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HP,HP,,,Ultra-Portable,,,15.6 Inches,1920 X 1080 (FHD) Pixels,36 x 25.6 x 2.3 cm; 2.23 Kilograms,1 Lithium Ion batteries required. (included),15-dk2012TX,Intel,Core i5,4.4 GHz,1,8 GB,,,,512 GB,SSD,,,NVIDIA GeForce RTX 3050,,Dedicated,GDDR6,PCI Express,,,Windows 10 Home,,,,,China,2 kg 230 g,15-dk2012TX,HP Pavilion Gaming Laptop 15-dk2012TX,2021.0,512 GB,2.0,512 GB,16 GB,DDR4,DDR4 SDRAM,Solid State,No Optical Drive,i5-11300H,PC,USB,4.0,Laptop,Backlit,"Laptop, 200 W Smart AC power adapter, user man...",1.0,Microsoft Office 365,LED,LED,"Headphones, Speakers",Battery Powered,Yes,Yes,Lithium Ion,Bluetooth,144 Hz,3.0,Gaming,"Bluetooth, Wi-Fi, HDMI",Keyboard,Gaming laptop,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Let's clean our technical_data for further processing.

In [129]:
# Report the size of the scraped table as (rows, columns).
print(technical_data.shape)
# Share of missing values in each column, expressed as a percentage.
percent_missing_col = technical_data.isna().mean() * 100
# Labels of the columns that contain at least one missing value.
drop_col = percent_missing_col.loc[percent_missing_col > 0].index
drop_col

(134, 106)


Index(['Series', 'Colour', 'Item Height', 'Item Width',
       'Standing screen display size', 'Resolution', 'Product Dimensions',
       'Batteries', 'Item model number', 'Processor Brand',
       ...
       'Number of Ethernet Ports', 'Number of Microphone Ports',
       'Supported Software', 'Battery Charge Cycles', 'Lithium Battery Weight',
       'Cellular technology', 'Package Dimensions',
       'Battery Average Life Standby', 'Card Reader', 'Imported By'],
      dtype='object', length=102)

In [130]:
# Replace NaN with an empty string in the four battery-life columns.
# The result is assigned back to the DataFrame: calling an inplace method on
# ``technical_data[column]`` is a chained operation that acts on a temporary
# object under pandas Copy-on-Write and would leave the NaNs in place.
for battery_column in (
    'Average Battery Life (in hours)',
    'Battery Average Life',
    'Average Battery Standby Life (in hours)',
    'Battery Average Life Standby',
):
    technical_data[battery_column] = technical_data[battery_column].replace(np.nan, '')

In [139]:
# Merge the four battery-life columns into 'Battery Average Life' by
# concatenating their values row by row, in the order listed.
battery_life_columns = [
    'Battery Average Life',
    'Average Battery Life (in hours)',
    'Average Battery Standby Life (in hours)',
    'Battery Average Life Standby',
]
combined_battery_life = technical_data[battery_life_columns[0]]
for battery_life_column in battery_life_columns[1:]:
    combined_battery_life = combined_battery_life + "" + technical_data[battery_life_column]
technical_data['Battery Average Life'] = combined_battery_life

In [140]:
# Names of the technical_data columns that are kept; every other scraped
# specification column is left out of the final dataset.
columns = [
    'Resolution',
    'Processor Brand',
    'Processor Type',
    'Processor Speed',
    'Memory Technology',
    'Hard Drive Size',
    'Hard Disk Description',
    'Graphics Coprocessor',
    'Graphics Card Description',
    'Graphics RAM Type',
    'Operating System',
    'Item Weight',
    'Model Year',
    'Computer Memory Type',
    'Graphics Card Ram Size',
    'Total USB ports',
    'Battery Average Life',
]

In [141]:
# Keep every row but only the columns named in ``columns``, in that order.
technical_data = technical_data.loc[:, columns]

Joining the two dataframes.

In [142]:
# Build the main table from the scraped lists and attach the technical
# details; join() matches the two frames on their row index.
df = pd.DataFrame.from_dict(data)
df = df.join(technical_data)

# Mark empty titles as missing. The result is assigned back because an
# inplace call on ``df['Title']`` is a chained operation that acts on a
# temporary object under pandas Copy-on-Write, which would leave the empty
# titles in place and let those rows survive the dropna below.
df['Title'] = df['Title'].replace('', np.nan)
df = df.dropna(subset=['Title'])    # Dropping those products which don't have a title

In [143]:
# Lift the limit on displayed columns so wide frames are shown in full.
pd.set_option("display.max_columns", None)
# Preview the first two rows of the combined dataset.
df.head(2)

Unnamed: 0,Title,Image,Price(Rs),Rating,Number of Reviews,Availability,Link,Resolution,Processor Brand,Processor Type,Processor Speed,Memory Technology,Hard Drive Size,Hard Disk Description,Graphics Coprocessor,Graphics Card Description,Graphics RAM Type,Operating System,Item Weight,Model Year,Computer Memory Type,Graphics Card Ram Size,Total USB ports,Battery Average Life
0,HP Victus Gaming Laptop AMD Ryzen 5 5600H 15.6...,https://m.media-amazon.com/images/I/41b3PmlPPu...,59490,4.5,2,In stock,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,1920x1080 Pixels,AMD,Ryzen 5,4.2 GHz,DDR4,512 GB,SSD,AMD Radeon RX 6500M,Dedicated,GDDR6,Windows 11 Home,2 kg 370 g,,,,,
1,HP Pavilion Gaming 11th Gen Intel Core i5 15.6...,https://m.media-amazon.com/images/I/41j6O0FtwT...,69057,3.8,39,In stock,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,1920 X 1080 (FHD) Pixels,Intel,Core i5,4.4 GHz,,512 GB,SSD,NVIDIA GeForce RTX 3050,Dedicated,GDDR6,Windows 10 Home,2 kg 230 g,2021.0,DDR4 SDRAM,4.0,3.0,


Saving the extracted data to a csv file

In [145]:
# Write the dataset to Laptop_data.csv in the working directory, with a
# header row and without the row index.
df.to_csv(
    "Laptop_data.csv",
    index=False,
    header=True,
)

Tip: You may need to clean the dataset before data analysis. All the best!