In [99]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

In [110]:
# Function to extract Product Title
def get_title(soup):

    try:
        # Outer Tag Object
        title = soup.find("span", attrs={"id":'productTitle'})
        
        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string

# Function to extract Product Price
def get_price(soup):

    try:
        
        price = soup.find("span", attrs={"class":'a-price aok-align-center'}).find("span", attrs={"class": "a-price-whole"}).get_text()[:-1]

    except AttributeError:

        try:
            # If there is some deal price
            price = soup.find("span", attrs={'id':'priceblock_dealprice'}).string.strip()

        except:
            price = ""

    return price

# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""

    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id':'availability'})
        available = available.find("span").string.strip()

    except AttributeError:
        available = "Not Available"

    return available

def get_asin(href,soup):
    try:
        asin = re.search(r'/[dg]p/([^/]+)', href, flags=re.IGNORECASE)
        if asin:
            asin_number=asin.group(1)
        if asin==None:
                s=soup.find("div", attrs={"id":'detailBullets_feature_div'}).ul.text
                L=s.split()
                for i in range(len(L)):
                    if len(L[i])==10:
                        if L[i].isupper()==True:
                            asin_number=L[i]
                            break

                return asin_number  
        else:
            return asin_number           
    except AttributeError:
            asin_number=""
            return asin_number
            
def get_product_description(soup):
    try:
        product_description=soup.find("div", attrs={"id":'feature-bullets'}).text.strip().split("   ")
    except AttributeError:
            product_description=""
    return product_description
    
    
def get_seller(soup):
    try:
        #seller=soup.find("div", attrs={"id":'merchant-info'}).find(attrs={"id":"sellerProfileTriggerId"}).text
        seller=soup.find("div", attrs={"id":'merchant-info'}).text[9:-28]
        return seller
    except AttributeError: 
        seller=""
        return seller
    
    
def get_manufacturer(soup):
    try:
        manufacturer=soup.find("table", attrs={"id":'productDetails_detailBullets_sections1'}).find_all(attrs={"class":'a-size-base prodDetAttrValue'})[2].text
        return manufacturer
    
    except AttributeError:
        try:
            a=len(soup.find("div", attrs={"id":'detailBulletsWrapper_feature_div'}).find_all("span"))
            correct='Manufacturer'
            for i in range(a):
                w=new_soup.find("div", attrs={"id":'detailBulletsWrapper_feature_div'}).find_all("span")[i].text.replace("                                    ","")[:12]
                if w==correct:
                    manufacturer=new_soup.find("div", attrs={"id":'detailBulletsWrapper_feature_div'}).find_all("span")[i+1].text
                    break
            return manufacturer
        except:
            manufacturer = ""
            return manufacturer
    
    

## for main page

In [108]:
if __name__ == '__main__':

    # add your user agent 
    # Headers for request
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 'Accept-Language': 'en-INR, en;q=0.5'}) 

    # The webpage URL
    amazon_url="https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1"

    # HTTP Request
    webpage = requests.get(amazon_url, headers=HEADERS)

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch links as List of Tag Objects
    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

    # Store the links
    links_list = []

    # Loop for extracting links from Tag Objects
    for link in links:
            links_list.append(link.get('href'))

    d = {"Product URL":[],"Product Name":[], "Product Price":[], "Rating":[], "Number of reviews":[],"Availability":[],"ASIN":[],"Product Description":[],"Sold by":[],"Manufacturer":[]}
    
    # Loop for extracting product details from each link 
    for link in links_list:
        new_webpage = requests.get("https://www.amazon.in" + link, headers=HEADERS)

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Function calls to display all necessary product information
        d["Product URL"].append("https://www.amazon.in"+link)
        d['Product Name'].append(get_title(new_soup))
        d['Product Price'].append(get_price(new_soup))
        d['Rating'].append(get_rating(new_soup))
        d['Number of reviews'].append(get_review_count(new_soup))
        d['Availability'].append(get_availability(new_soup))
        d['ASIN'].append(get_asin("https://www.amazon.in" + link,new_soup))
        d['Product Description'].append(get_product_description(new_soup))
        d['Sold by'].append(get_seller(new_soup))
        d['Manufacturer'].append(get_manufacturer(new_soup))
        
        


    
    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['Product Name'].replace('', np.nan, inplace=True)
    amazon_df = amazon_df.dropna(subset=['Product Name'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)


In [109]:
amazon_df

Unnamed: 0,Product URL,Product Name,Product Price,Rating,Number of reviews,Availability,ASIN,Product Description,Sold by,Manufacturer
0,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Anti Theft Faux Leather,499.0,3.9 out of 5 stars,51 ratings,In stock.,B08WLS5XBP,[Anti-Theft backpack has added safety features...,Sold by FAST FASHION BAGS and Delive,WE MANUFACTURE ALL KINDS OF BAGS FOR MORE DETA...
1,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,"URBAN TRIBE Havana 15.6"" inch Laptop Backpack ...",998.0,3.9 out of 5 stars,684 ratings,In stock.,B01LXNNFDF,"[Laptop Compatibility: Yes, Laptop size: 15.6 ...",BackpackInternational,Backpack International Pvt Ltd
2,https://www.amazon.in/American-Tourister-AMT-S...,American Tourister 32 Ltrs Black Casual Backpa...,1299.0,4.1 out of 5 stars,"51,370 ratings",In stock.,B07CJCGM1M,"[Laptop Compatibility: No, Strap Type: Adjusta...",RetailEZ Pvt Ltd,Samsonite
3,https://www.amazon.in/Wesley-Milestone-Waterpr...,Wesley Milestone 2.0 Casual Waterproof Laptop ...,565.0,4.3 out of 5 stars,"9,226 ratings",In stock.,B084JGJ8PF,[30L Capacity: The Backpack has a padded lapto...,Appario Retail Private Ltd,"Longani Trading Company, F-82, Sector-1, Bawa..."
4,https://www.amazon.in/Skybags-Brat-Black-Casua...,Skybags Brat Black 46 Cms Casual Backpack,669.0,4.1 out of 5 stars,"3,368 ratings",In stock.,B08Z1HHHTD,[Combination of functional & safety features i...,RetailEZ Pvt Ltd,VIP Industries Ltd
5,https://www.amazon.in/ADISA-Laptop-Backpack-Of...,ADISA 15.6 inch Laptop Backpack Office Bag Col...,499.0,3.9 out of 5 stars,462 ratings,In stock.,B09TPX22NF,[Material: Water Resistant Light-Weight Polyes...,Appario Retail Private Ltd,ADISA
6,https://www.amazon.in/Safari-Backpack-Resistan...,"Safari Flash Casual Backpack, 26 ltr Water Res...",699.0,3.9 out of 5 stars,"2,184 ratings",In stock.,B09B29F66W,[HIGH QUALITY FABRIC: Our lightweight yet dura...,RetailEZ Pvt Ltd,Safari Industries
7,https://www.amazon.in/Wesley-Milestone-Waterpr...,Wesley Unisex Milestone Casual Waterproof Lapt...,498.0,4.3 out of 5 stars,"12,103 ratings",In stock.,B07K8KLB3P,[Pattern Type : Pure Color || Closure Type : Z...,Appario Retail Private Ltd,"Longani Trading Company, F-82, Sector-1, Bawa..."
8,https://www.amazon.in/Wesley-Spartan-Hiking-Ra...,Wesley Spartan Unisex Travel Hiking Laptop Bag...,798.0,4.1 out of 5 stars,912 ratings,In stock.,B098QFF5TJ,[Laptop bag fits upto 17.3 inch laptop],Appario Retail Private Ltd,Wesley
9,https://www.amazon.in/FUR-JADEN-Leatherette-Po...,Fur Jaden Brown Textured Leatherette Stylish &...,889.0,4.3 out of 5 stars,"5,318 ratings",In stock.,B07M9BRCQ5,[MATERIAL - We use Highest Quality of Artifici...,RetailEZ Pvt Ltd,Fur Jaden


# for multiple pages

In [112]:
if __name__ == '__main__':

    # add your user agent 
    # Headers for request
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 'Accept-Language': 'en-INR, en;q=0.5'}) 

    # The webpage URL
    amazon_url="https://www.amazon.in/s?k=bags&crid=2M096C61O4MLT&qid=1653308124&sprefix=ba%2Caps%2C283&ref=sr_pg_1"

    # HTTP Request
    webpage = requests.get(amazon_url, headers=HEADERS)

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch links as List of Tag Objects
    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

    # Store the links
    links_list = []

    # Loop for extracting links from Tag Objects
    for link in links:
            links_list.append(link.get('href'))

    d = {"Product URL":[],"Product Name":[], "Product Price":[], "Rating":[], "Number of reviews":[],"Availability":[],"ASIN":[],"Product Description":[],"Sold by":[],"Manufacturer":[]}
    
    # Loop for extracting product details from each link 
    for link in links_list:
        new_webpage = requests.get("https://www.amazon.in" + link, headers=HEADERS)

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Function calls to display all necessary product information
        d["Product URL"].append("https://www.amazon.in"+link)
        d['Product Name'].append(get_title(new_soup))
        d['Product Price'].append(get_price(new_soup))
        d['Rating'].append(get_rating(new_soup))
        d['Number of reviews'].append(get_review_count(new_soup))
        d['Availability'].append(get_availability(new_soup))
        d['ASIN'].append(get_asin("https://www.amazon.in" + link,new_soup))
        d['Product Description'].append(get_product_description(new_soup))
        d['Sold by'].append(get_seller(new_soup))
        d['Manufacturer'].append(get_manufacturer(new_soup))
    
        
    for i in range(0,3):
        if i == 0:
            second_page_url="https://amazon.in" + soup.find_all("a", attrs={'class':'s-pagination-item s-pagination-button'})[0].get("href")
            webpage = requests.get(second_page_url, headers=HEADERS)
            soup=BeautifulSoup(webpage.content,"html.parser")
            # Fetch links as List of Tag Objects
            links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

            # Store the links
            links_list = []

            # Loop for extracting links from Tag Objects
            for link in links:
                links_list.append(link.get('href')) 
            for link in links_list:
                new_webpage = requests.get("https://www.amazon.in" + link, headers=HEADERS)

                new_soup = BeautifulSoup(new_webpage.content, "html.parser")

                # Function calls to display all necessary product information
                d["Product URL"].append("https://www.amazon.in"+link)
                d['Product Name'].append(get_title(new_soup))
                d['Product Price'].append(get_price(new_soup))
                d['Rating'].append(get_rating(new_soup))
                d['Number of reviews'].append(get_review_count(new_soup))
                d['Availability'].append(get_availability(new_soup))
                d['ASIN'].append(get_asin("https://www.amazon.in" + link,new_soup))
                d['Product Description'].append(get_product_description(new_soup))
                d['Sold by'].append(get_seller(new_soup))
                d['Manufacturer'].append(get_manufacturer(new_soup))
            
        else :
            next_page_url="https://amazon.in" + soup.find_all("a", attrs={'class':'s-pagination-item s-pagination-button'})[-1].get("href")
            webpage = requests.get(next_page_url, headers=HEADERS)
            soup=BeautifulSoup(webpage.content,"html.parser")
 
            links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

            # Store the links
            links_list = []

            # Loop for extracting links from Tag Objects
            for link in links:
                links_list.append(link.get('href')) 
            for link in links_list:
                new_webpage = requests.get("https://www.amazon.in" + link, headers=HEADERS)

                new_soup = BeautifulSoup(new_webpage.content, "html.parser")

                # Function calls to display all necessary product information
                d["Product URL"].append("https://www.amazon.in"+link)
                d['Product Name'].append(get_title(new_soup))
                d['Product Price'].append(get_price(new_soup))
                d['Rating'].append(get_rating(new_soup))
                d['Number of reviews'].append(get_review_count(new_soup))
                d['Availability'].append(get_availability(new_soup))
                d['ASIN'].append(get_asin("https://www.amazon.in" + link,new_soup))
                d['Product Description'].append(get_product_description(new_soup))
                d['Sold by'].append(get_seller(new_soup))
                d['Manufacturer'].append(get_manufacturer(new_soup))

    
    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['Product Name'].replace('', np.nan, inplace=True)
    amazon_df = amazon_df.dropna(subset=['Product Name'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)


In [113]:
amazon_df

Unnamed: 0,Product URL,Product Name,Product Price,Rating,Number of reviews,Availability,ASIN,Product Description,Sold by,Manufacturer
0,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,"URBAN TRIBE Havana 15.6"" inch Laptop Backpack ...",998,3.9 out of 5 stars,684 ratings,In stock.,B01LXNNFDF,"[Laptop Compatibility: Yes, Laptop size: 15.6 ...",BackpackInternational,Backpack International Pvt Ltd
1,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Urban Tribe Jumbo Backpack with 17.3 inch Gami...,1498,4.1 out of 5 stars,198 ratings,In stock.,B01MSRADRY,[Separate 17.3 inch Laptop compartments for be...,BackpackInternational,BACKPACK INTERNATIONAL PVT LTD.
2,https://www.amazon.in/American-Tourister-AMT-S...,American Tourister 32 Ltrs Black Casual Backpa...,1299,4.1 out of 5 stars,"51,370 ratings",In stock.,B07CJCGM1M,"[Laptop Compatibility: No, Strap Type: Adjusta...",RetailEZ Pvt Ltd,Samsonite
3,https://www.amazon.in/Wesley-Milestone-Waterpr...,Wesley Milestone 2.0 Casual Waterproof Laptop ...,565,4.3 out of 5 stars,"9,226 ratings",In stock.,B084JGJ8PF,[30L Capacity: The Backpack has a padded lapto...,Appario Retail Private Ltd,"Longani Trading Company, F-82, Sector-1, Bawa..."
4,https://www.amazon.in/ADISA-Laptop-Backpack-Of...,ADISA 15.6 inch Laptop Backpack Office Bag Col...,499,3.9 out of 5 stars,462 ratings,In stock.,B09TPX22NF,[Material: Water Resistant Light-Weight Polyes...,Appario Retail Private Ltd,ADISA
...,...,...,...,...,...,...,...,...,...,...
95,https://www.amazon.in/HP-Lightweight-300-Grey-...,HP Lightweight 300 Grey Laptops Backpack with ...,1002,4.1 out of 5 stars,67 ratings,In stock.,B08CY44PVD,[Protect your precious 15.6-inch laptop with t...,Appario Retail Private Ltd,Korrun India Pvt. Ltd.
96,https://www.amazon.in/Nivia-6853BKGR-Polyester...,NIVIA Basic Duffle Polyester Bag/ Gym Bags/ Ad...,245,3.9 out of 5 stars,329 ratings,In stock.,B07NLB6FRS,[Strong stitching at major stress points provi...,RetailEZ Pvt Ltd,"Nivia, Freewill Sports Pvt Ltd. - 391 Leather..."
97,https://www.amazon.in/Classic-Leather-Laptop-B...,Gear Classic 20L Faux Leather Water Resistant ...,999,4.4 out of 5 stars,"2,268 ratings",In stock.,B07G4CYC74,"[Outer material: synthetic, color: tan; Waterp...",RetailEZ Pvt Ltd,Gear Merchandise PVT LTD
98,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,COSMUS Madison waterproof Multipurpose Backpac...,1196,4.2 out of 5 stars,"2,166 ratings",In stock.,B078JVT384,"[Care Instructions: Hand Wash Only, Large mul...",COSMUS LIFESTYLE PVT. LTD.,Cosmus


- some of the functions I hard coded to get the desire result 