In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
# Function to extract Product Title
def get_title(soup):
    """Return the product title with surrounding whitespace removed.

    Looks up the <span id="productTitle"> element in ``soup`` and returns
    its text. Returns "" when the lookup raises AttributeError, which is
    what happens when ``find`` yields None because the span is absent.
    """
    try:
        title_tag = soup.find("span", attrs={"id": 'productTitle'})
        return title_tag.text.strip()
    except AttributeError:
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the text of the first <span class="a-price-whole">, stripped.

    Returns "" when no such span is found (``find`` returns None and the
    attribute access raises AttributeError).

    The previous version retried the exact same selector inside a nested
    bare ``except`` as a "deal price" fallback; that retry could never
    succeed, so it has been removed. If a real alternative selector for deal
    prices is identified, add it here as a second lookup.
    """
    try:
        return soup.find("span", attrs={'class': 'a-price-whole'}).text.strip()
    except AttributeError:
        return ""

# Function to extract Product Rating
def get_rating(soup):
    """Return the product's rating text, stripped, or "" if none is found.

    Two lookups are tried in order:
      1. <i class="a-icon a-icon-star a-star-4-5"> (this class string only
         matches that one specific star icon),
      2. the first <span class="a-icon-alt">.

    A lookup fails with AttributeError when ``find`` returns None or when
    the tag's ``.string`` is None. Only AttributeError is handled, so
    unrelated errors and KeyboardInterrupt are no longer silently swallowed.
    """
    try:
        rating = soup.find("i", attrs={'class': 'a-icon a-icon-star a-star-4-5'}).string.strip()
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class': 'a-icon-alt'}).string.strip()
        except AttributeError:
            rating = ""

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the stripped text of <span id="acrCustomerReviewText">.

    Returns "" when the lookup raises AttributeError, i.e. the span is
    missing or its ``.string`` is None.
    """
    try:
        count_tag = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        return count_tag.string.strip()
    except AttributeError:
        return ""

# Function to extract Availability Status
def get_availability(soup):
    """Return the availability text from the page, stripped.

    Reads the first <span> inside <div id="availability">. Returns
    "Not Available" when any step raises AttributeError (div missing, span
    missing, or the span's ``.string`` is None).
    """
    try:
        container = soup.find("div", attrs={'id': 'availability'})
        status_span = container.find("span")
        return status_span.string.strip()
    except AttributeError:
        return "Not Available"

def get_ASIN(soup):
    """Return the ASIN text from the product-details section.

    Two page layouts are handled:
      * <div id="detailBullets_feature_div">: third <span> of the fourth <li>;
      * otherwise <table id="productDetails_detailBullets_sections1">:
        first <td> of the first <tr>.

    The positions are fixed indices, so this assumes the ASIN always sits in
    that slot -- TODO confirm against real pages; a label-based lookup would
    be more reliable.

    Returns "Not Available" when neither container exists (AttributeError)
    or when the container has fewer entries than the fixed index expects
    (IndexError). The IndexError case previously propagated and aborted the
    whole scrape.
    """
    try:
        bullets = soup.find("div", attrs={'id': 'detailBullets_feature_div'})
        if bullets is None:
            table = soup.find("table", attrs={'id': 'productDetails_detailBullets_sections1'})
            asin = table.find_all('tr')[0].find_all('td')[0].text
        else:
            asin = bullets.find_all('li')[3].find_all('span')[2].text

    except (AttributeError, IndexError):
        asin = "Not Available"

    return asin

def get_prod_desc(soup):
    """Return the text of the first <span> inside <div id="productDescription">.

    Returns "Not Available" when the div contains no <span>, or when the
    lookup raises AttributeError (for example the div itself is missing).
    """
    fallback = "Not Available"
    try:
        description_div = soup.find("div", attrs={'id': 'productDescription'})
        spans = description_div.find_all('span')
        if len(spans) > 0:
            return spans[0].text
        return fallback
    except AttributeError:
        return fallback

def get_manufacturer(soup):
    """Return the manufacturer text from the product-details section.

    Two page layouts are handled:
      * <div id="detailBullets_feature_div">: third <span> of the third <li>;
      * otherwise <table id="productDetails_detailBullets_sections1">:
        first <td> of the fifth <tr>.

    The positions are fixed indices, so this assumes the manufacturer always
    sits in that slot -- TODO confirm against real pages; a label-based
    lookup would be more reliable.

    Returns "Not Available" when neither container exists (AttributeError)
    or when the container has fewer entries than the fixed index expects
    (IndexError). The IndexError case previously propagated and aborted the
    whole scrape.
    """
    try:
        bullets = soup.find("div", attrs={'id': 'detailBullets_feature_div'})
        if bullets is None:
            table = soup.find("table", attrs={'id': 'productDetails_detailBullets_sections1'})
            man = table.find_all('tr')[4].find_all('td')[0].text
        else:
            man = bullets.find_all('li')[2].find_all('span')[2].text

    except (AttributeError, IndexError):
        man = "Not Available"

    return man

In [3]:
# Column-oriented accumulator for the scraped products: one independent,
# initially empty list per output column, in the order the columns should
# appear in the resulting table.
info = {
    column: []
    for column in (
        "Product_URL",
        "Product_Name",
        "Product_Price",
        "Rating",
        "Number_of_Reviews",
        "ASIN",
        "Product_desc",
        "Manufacturer",
    )
}

In [1]:
# Search-results URL on amazon.in for the query k=bags (first results page).
URL = (
    "https://www.amazon.in/s"
    "?k=bags&crid=2M096C61O4MLT&qid=1653308124"
    "&sprefix=ba%2Caps%2C283&ref=sr_pg_1"
)

# HTTP request headers passed to requests.get; the User-Agent is a desktop
# Chrome string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

In [9]:
# Scrape search-result pages 1..20 and append one row per product to `info`.
#
# The page number is sent as a query parameter on each request instead of
# being spliced into URL by character offset. The old slicing fetched page 2
# twice, never fetched page 20, and produced page=110, page=1210, ... once
# the page number had two digits. URL is left unmodified by this cell.
for page in range(1, 21):
    pg = requests.get(URL, headers=headers, params={"page": page})
    soup = BeautifulSoup(pg.content, "html.parser")

    # Product links on the results page. Anchors without an href are skipped
    # because None cannot be joined onto the site root below.
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})
    links_list = [link.get('href') for link in links if link.get('href')]

    for link in links_list:
        product_url = "https://www.amazon.in" + link
        new_webpage = requests.get(product_url, headers=headers)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Exactly one value is appended to every column per product so the
        # lists in `info` stay the same length.
        info['Product_URL'].append(product_url)
        info['Product_Name'].append(get_title(new_soup))
        info['Product_Price'].append(get_price(new_soup))
        info['Rating'].append(get_rating(new_soup))
        info['Number_of_Reviews'].append(get_review_count(new_soup))
        info['ASIN'].append(get_ASIN(new_soup))
        info['Product_desc'].append(get_prod_desc(new_soup))
        info['Manufacturer'].append(get_manufacturer(new_soup))

    print(str(page) + " complete")

In [7]:
# Build the table from the scraped columns, drop products whose title could
# not be extracted (get_title returns "" for those), and write the CSV.
amazon_df = pd.DataFrame.from_dict(info)

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call acts on a temporary object
# under pandas Copy-on-Write and would leave the blank titles in place.
amazon_df['Product_Name'] = amazon_df['Product_Name'].replace('', np.nan)
amazon_df = amazon_df.dropna(subset=['Product_Name'])
amazon_df.to_csv("amazon_data.csv", header=True, index=False)