In [73]:
from bs4 import BeautifulSoup
import pandas as pd

# List of file paths containing HTML content of each page
file_paths = [
    "C:/Users/shalu/OneDrive/Desktop/webbb/page2.html",
    "C:/Users/shalu/OneDrive/Desktop/webbb/page1.html",
    "C:/Users/shalu/OneDrive/Desktop/webbb/page.html",
    "C:/Users/shalu/OneDrive/Desktop/webbb/page3.html",
    "C:/Users/shalu/OneDrive/Desktop/webbb/page4.html"
]

# Initialize lists to store data
product_names = []
product_prices = []
product_reviews_list = []
product_description_list = []

# Iterate over each file path
for file_path in file_paths:
    # Read the HTML file
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all divs with specified class
    product_divs = soup.find_all("div", class_="cPHDOP col-12-12")

    # Extract data for each product
    for div in product_divs:
        # Find product name
        name_span = div.find("div", class_="KzDlHZ")
        product_name = name_span.get_text().strip() if name_span else ""

        # Find product price
        price_span = div.find("div", class_="Nx9bqj _4b5DiR")
        product_price = price_span.get_text().strip() if price_span else ""

        # Find product reviews
        reviews_span = div.find("div", class_="XQDdHH")
        product_reviews = reviews_span.get_text().strip() if reviews_span else ""

        # Find product description
        desc_span = div.find("div", class_="_6NESgJ")
        product_description = desc_span.get_text().strip() if price_span else ""

        # Append data to respective lists
        product_names.append(product_name)
        product_prices.append(product_price)
        product_reviews_list.append(product_reviews)
        product_description_list.append(product_description)

# Create a DataFrame
data = {
    "Product Name": product_names,
    "Product Price": product_prices,
    "Product Reviews": product_reviews_list,
    "Product Description": product_description_list
}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv("flipkart_products.csv", index=False)


In [74]:
df.head()

Unnamed: 0,Product Name,Product Price,Product Reviews,Product Description
0,,,,
1,"Canon EOS 3000D DSLR Camera 1 Camera Body, 18 ...","₹32,990",4.4,"Self-Timer, Type C and Mini HDMI, 9 Auto Focus..."
2,FUJIFILM Instax Mini 9 Instant Camera,"₹4,999",4.3,Exposure Mode: AutomaticView Finder: YesSelf T...
3,dji Action 2 Power Combo Power Combo with Modu...,"₹15,990",4.2,Effective Pixels: 12 MP4K1 Year Manufacturer W...
4,SONY Alpha ILCE-6100 APS-C Mirrorless Camera B...,"₹51,989",4.6,"High-resolution 4K movie recording, Choose you..."


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Product Name         145 non-null    object
 1   Product Price        145 non-null    object
 2   Product Reviews      145 non-null    object
 3   Product Description  145 non-null    object
dtypes: object(4)
memory usage: 4.7+ KB


In [76]:
df.shape

(145, 4)

In [77]:
df.isnull().sum()

Product Name           0
Product Price          0
Product Reviews        0
Product Description    0
dtype: int64

In [78]:
import numpy as np
df.replace("", np.nan, inplace=True)

In [79]:
df.head(25)

Unnamed: 0,Product Name,Product Price,Product Reviews,Product Description
0,,,,
1,"Canon EOS 3000D DSLR Camera 1 Camera Body, 18 ...","₹32,990",4.4,"Self-Timer, Type C and Mini HDMI, 9 Auto Focus..."
2,FUJIFILM Instax Mini 9 Instant Camera,"₹4,999",4.3,Exposure Mode: AutomaticView Finder: YesSelf T...
3,dji Action 2 Power Combo Power Combo with Modu...,"₹15,990",4.2,Effective Pixels: 12 MP4K1 Year Manufacturer W...
4,SONY Alpha ILCE-6100 APS-C Mirrorless Camera B...,"₹51,989",4.6,"High-resolution 4K movie recording, Choose you..."
5,SONY Alpha ILCE-6400L APS-C Mirrorless Camera ...,"₹74,489",4.6,"4K movies and pro-level features, Natural-look..."
6,GoPro HERO12 DualLCDScreens 5.3K60 UltraHDVide...,"₹37,990",4.0,Effective Pixels: 27 MP5K2 Years Warranty
7,dji Osmo Action 4 Adventure Combo Sports and A...,"₹40,990",4.7,Effective Pixels: 12 MP4K1 Year Warranty
8,Canon EOS 200D II DSLR Camera EF-S18-55mm IS STM,"₹54,990",4.5,"Vari-angle Touch Screen LCD, Dual pixel CMOS A..."
9,Canon EOS R50 Mirrorless Camera Body with RF -...,"₹66,990",4.6,"4K 30p (6K oversampled) & FHD 120p, Up to 15 f..."


In [80]:
# Check for missing values again
df.isna().sum()

Product Name           25
Product Price          25
Product Reviews        39
Product Description    25
dtype: int64

In [81]:
cleaned_df = df.dropna()

In [82]:
cleaned_df.head()

Unnamed: 0,Product Name,Product Price,Product Reviews,Product Description
1,"Canon EOS 3000D DSLR Camera 1 Camera Body, 18 ...","₹32,990",4.4,"Self-Timer, Type C and Mini HDMI, 9 Auto Focus..."
2,FUJIFILM Instax Mini 9 Instant Camera,"₹4,999",4.3,Exposure Mode: AutomaticView Finder: YesSelf T...
3,dji Action 2 Power Combo Power Combo with Modu...,"₹15,990",4.2,Effective Pixels: 12 MP4K1 Year Manufacturer W...
4,SONY Alpha ILCE-6100 APS-C Mirrorless Camera B...,"₹51,989",4.6,"High-resolution 4K movie recording, Choose you..."
5,SONY Alpha ILCE-6400L APS-C Mirrorless Camera ...,"₹74,489",4.6,"4K movies and pro-level features, Natural-look..."


In [83]:
# Reset the index of the DataFrame
cleaned_df.reset_index(drop=True, inplace=True)

# Check for duplicates
duplicate_rows = cleaned_df.duplicated()

# Count the number of duplicates
num_duplicates = duplicate_rows.sum()

# Display the number of duplicates
print("Number of duplicate rows:", num_duplicates)

# Display the duplicate rows themselves (if any)
if num_duplicates > 0:
    print("Duplicate rows:")
    print(cleaned_df[duplicate_rows])
else:
    print("No duplicate rows found.")


Number of duplicate rows: 7
Duplicate rows:
                                         Product Name Product Price  \
14             FUJIFILM Instax Mini 11 Instant Camera        ₹5,999   
20  FUJIFILM Instax Treasure Box Mini 11 Instant C...        ₹6,499   
67  lezzie E88 Drone WiFi Camera Drone Remote Cont...        ₹2,479   
69             FUJIFILM Instax Mini 11 Instant Camera        ₹5,999   
77             FUJIFILM Instax Mini 11 Instant Camera        ₹5,999   
83  FUJIFILM Instax Treasure Box Mini 11 Instant C...        ₹6,499   
90  FUJIFILM Instax Treasure Box Mini 11 Instant C...        ₹6,499   

   Product Reviews                                Product Description  
14             4.3  Exposure Mode: ManualView Finder: YesSelf Time...  
20             4.4  Exposure Mode: AutomaticView Finder: YesSelf T...  
67             4.3  Type: Professional DroneControl Range: 103 mBa...  
69             4.3  Exposure Mode: ManualView Finder: YesSelf Time...  
77             4.3  Exposur

In [84]:
cleaned_df.drop_duplicates(inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df.drop_duplicates(inplace=True)


In [85]:
# Reset the index of the DataFrame
cleaned_df.reset_index(drop=True, inplace=True)

# Check for duplicates
duplicate_rows = cleaned_df.duplicated()

# Count the number of duplicates
num_duplicates = duplicate_rows.sum()

# Display the number of duplicates
print("Number of duplicate rows:", num_duplicates)

# Display the duplicate rows themselves (if any)
if num_duplicates > 0:
    print("Duplicate rows:")
    print(cleaned_df[duplicate_rows])
else:
    print("No duplicate rows found.")


Number of duplicate rows: 0
No duplicate rows found.


In [86]:
# Convert the 'Product Price' column to string type
cleaned_df['Product Price'] = cleaned_df['Product Price'].astype(str).copy()

# Remove currency symbols and commas from the 'Product Price' column
cleaned_df['Product Price'] = cleaned_df['Product Price'].str.replace('₹', '').str.replace(',', '')

# Convert the 'Product Price' column to numeric values
cleaned_df['Product Price'] = pd.to_numeric(cleaned_df['Product Price'], errors='coerce')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Product Price'] = cleaned_df['Product Price'].astype(str).copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Product Price'] = cleaned_df['Product Price'].str.replace('₹', '').str.replace(',', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Product Price'] 

In [87]:
cleaned_df.head()

Unnamed: 0,Product Name,Product Price,Product Reviews,Product Description
0,"Canon EOS 3000D DSLR Camera 1 Camera Body, 18 ...",32990,4.4,"Self-Timer, Type C and Mini HDMI, 9 Auto Focus..."
1,FUJIFILM Instax Mini 9 Instant Camera,4999,4.3,Exposure Mode: AutomaticView Finder: YesSelf T...
2,dji Action 2 Power Combo Power Combo with Modu...,15990,4.2,Effective Pixels: 12 MP4K1 Year Manufacturer W...
3,SONY Alpha ILCE-6100 APS-C Mirrorless Camera B...,51989,4.6,"High-resolution 4K movie recording, Choose you..."
4,SONY Alpha ILCE-6400L APS-C Mirrorless Camera ...,74489,4.6,"4K movies and pro-level features, Natural-look..."


In [88]:
# Save the cleaned data to a CSV file
cleaned_df.to_csv('cleaned.csv', index=False)