In [16]:
from bs4 import BeautifulSoup
import openpyxl
import os

# Provide the full path to the directory containing your HTML files
html_directory = "html"

# Get a sorted list of HTML file names
html_file_names = sorted([f for f in os.listdir(html_directory) if f.endswith(".html")])

# Initialize a dictionary to store data for each file
file_data = {}

# Loop through all HTML files in the directory
for html_file_name in html_file_names:
    with open(os.path.join(html_directory, html_file_name), "r", encoding="utf-8") as html_file:
        content = html_file.read()

    # Parse HTML using BeautifulSoup
    soup = BeautifulSoup(content, "html.parser")

    # Find all divs with the specified class
    divs = soup.find_all("div", class_="s-card-container s-overflow-hidden aok-relative puis-wide-grid-style puis-wide-grid-style-t2 puis-include-content-margin puis puis-v3b48cl1js792724v4d69zlbwph s-latency-cf-section s-card-border")

    # Initialize a list to store data for this file
    data = []

    # Extract information from each div
    for div in divs:
        name_element = div.find("span", class_="a-size-medium a-color-base a-text-normal")
        name = name_element.get_text() if name_element else ""

        price_element = div.find("span", class_="a-price-whole")
        price = price_element.get_text() if price_element else ""

        rating_element = div.find("span", class_="a-size-base puis-normal-weight-text")
        rating = rating_element.get_text() if rating_element else ""
        
        reviews_element = div.find("span", class_="a-size-base s-underline-text")
        reviews = reviews_element.get_text() if reviews_element else ""

        url_element = div.find("a", class_="a-link-normal")  # Assuming the URL is within an anchor tag
        url = url_element.get("href") if url_element else ""

        data.append((name, price, rating,reviews, url))

    # Store the data in the dictionary
    file_data[html_file_name] = data

# Create an Excel workbook and add data to a sheet
workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = "Amazon Products"

# Write header
sheet.append(["Name", "Price", "Rating","Reviews", "URL"])

# Write data to the sheet
for html_file_name, data in file_data.items():
    for name, price, rating,reviews,url in data:
        sheet.append([name, price, rating,reviews, url])

# Save the Excel file
workbook.save("amazon_data.xlsx")


In [20]:
import pandas as pd 
df=pd.read_excel("amazon_data.xlsx")
df

Unnamed: 0,Name,Price,Rating,Reviews,URL
0,FUR\n JADEN Anti Theft Number Lock Backpack Ba...,679,4.0,"(5,611)",https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
1,"Zebronics\n Camp1, 14 Liters, 1 Compartment La...",528,3.0,(6),https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
2,Skybags Brat Black 46 Cms Casual Backpack,672,4.1,"(5,525)",https://www.amazon.in/Skybags-Brat-Black-Casua...
3,TRUE\n HUMAN EMPEROR® Anti-Theft backpack With...,599,3.7,"(2,462)",https://www.amazon.in/TRUE-HUMAN-Anti-Theft-ba...
4,American Tourister Fizz Large Size 32 Ltrs Cas...,1199,4.0,"(55,249)",https://www.amazon.in/American-Tourister-AMT-S...
...,...,...,...,...,...
319,"Zebronics\n Camp1, 14 Liters, 1 Compartment La...",528,3.0,(6),https://www.amazon.in/Zebronics-Compartment-Ba...
320,Skybags Kick Unisex Adjustable Strap Polyester...,1360,4.0,(70),https://www.amazon.in/Skybags-Unisex-Adjustabl...
321,Gear Workspace 18L Water Resistant Laptop Bag/...,907,4.2,(345),https://www.amazon.in/Gear-Workspace-Briefcase...
322,uppercase\n 22 Ltrs Large (14.6 inch) Laptop B...,1600,4.4,(31),https://www.amazon.in/sspa/click?ie=UTF8&spc=M...
