In [12]:
import pandas as pd
from bs4 import BeautifulSoup

# Load the combined HTML file
with open('myntra_watches_all_combined.html', 'r', encoding='utf-8') as file:
    combined_html = file.read()

# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(combined_html, 'html.parser')

# Find all <ul> elements with the class 'results-base'
results = soup.find_all('ul', class_='results-base')

# List to hold the extracted data
data = []

# Iterate through the results and extract data
for result in results:
    # Extract data from each result
    items = result.find_all('li')
    for item in items:
        # Extract specific data from each <li> element
        product_id = item.get('id')
        product_name = item.find('h4', class_='product-product').text if item.find('h4', class_='product-product') else 'N/A'
        product_brand_name = item.find('h3').text if item.find('h3') else 'N/A'
        product_price = item.find('div', class_='product-price').text if item.find('div', class_='product-price') else 'N/A'
        
        # Extract ratings
        ratings_div = item.find('div', class_='product-ratingsContainer')
        if ratings_div:
            ratings = ratings_div.text.strip()  # Modify this as needed to match the exact format
        else:
            ratings = 'N/A'
        
        # Append data to the list
        data.append({
            'ID': product_id,
            'Watch Name': product_name,
            'Brand Name': product_brand_name,
            'Product Price': product_price,
            'Ratings': ratings
        })

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data)

# Print the DataFrame
df


Unnamed: 0,ID,Watch Name,Brand Name,Product Price,Ratings
0,24295998,Unisex Couple Watch Gift Set,JOKER & WITCH,Rs. 1724Rs. 6899(75% OFF),4|62
1,24296002,Unisex Couple Watches Gift Set,JOKER & WITCH,Rs. 1655Rs. 7199(77% OFF),4.3|23
2,29847143,Unisex Analogue Watches,Kool Kidz,Rs. 715Rs. 795(10% OFF),
3,24286024,Analogue Watch Gift Set,Daniel Klein,Rs. 2590Rs. 7000(63% OFF),
4,13759602,Blaine & Kurt Couple Watches Gift Set,JOKER & WITCH,Rs. 1599Rs. 6398(75% OFF),4.3|99
...,...,...,...,...,...
240,,,,,
241,,,,,
242,,,,,
243,,,,,


In [13]:
myntra_watches_data=df.drop_duplicates()
myntra_watches_data

Unnamed: 0,ID,Watch Name,Brand Name,Product Price,Ratings
0,24295998,Unisex Couple Watch Gift Set,JOKER & WITCH,Rs. 1724Rs. 6899(75% OFF),4|62
1,24296002,Unisex Couple Watches Gift Set,JOKER & WITCH,Rs. 1655Rs. 7199(77% OFF),4.3|23
2,29847143,Unisex Analogue Watches,Kool Kidz,Rs. 715Rs. 795(10% OFF),
3,24286024,Analogue Watch Gift Set,Daniel Klein,Rs. 2590Rs. 7000(63% OFF),
4,13759602,Blaine & Kurt Couple Watches Gift Set,JOKER & WITCH,Rs. 1599Rs. 6398(75% OFF),4.3|99
...,...,...,...,...,...
235,24765920,Women Bracelet Watch Gift Set,JOKER & WITCH,Rs. 2199Rs. 10999(80% OFF),
236,17788600,Women Watch Gift Set,JOKER & WITCH,Rs. 1632Rs. 7097(77% OFF),
237,23802460,Women Watch Gift Set,JOKER & WITCH,Rs. 2092Rs. 9099(77% OFF),
238,709052,Men Dial Watch,Fastrack,Rs. 2026Rs. 2895(30% OFF),4|183


In [11]:
myntra_watches_data['Product Price'].head(5)

0    Rs. 1724Rs. 6899(75% OFF)
1                     Rs. 9995
2    Rs. 1678Rs. 7298(77% OFF)
3    Rs. 1795Rs. 1995(10% OFF)
4    Rs. 3836Rs. 4795(20% OFF)
Name: Product Price, dtype: object

In [21]:
import pandas as pd
import re

# Assuming 'myntra_watches_data' is your DataFrame

# Function to extract original price, discount price, and discount percent
def extract_price_info(price_str):
    discount_price = "Not available"
    original_price = "Not available"
    discount_percent = "Not available"
    
    # Check for discount information
    if "(" in price_str:
        match = re.search(r"\((.*?)\)", price_str)
        if match:
            discount_percent = match.group(1)
        prices = re.findall(r"\d+", price_str)
        if len(prices) == 2:
            discount_price = prices[0]
            original_price = prices[1]
    else:
        prices = re.findall(r"\d+", price_str)
        if len(prices) == 1:
            original_price = prices[0]
    
    return original_price, discount_price, discount_percent

# Apply the function to the 'Product Price' column and create new columns
myntra_watches_data[['Original Price', 'Discount Price', 'Discount Percent']] = myntra_watches_data['Product Price'].apply(lambda x: pd.Series(extract_price_info(x)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  myntra_watches_data[['Original Price', 'Discount Price', 'Discount Percent']] = myntra_watches_data['Product Price'].apply(lambda x: pd.Series(extract_price_info(x)))


Unnamed: 0,ID,Watch Name,Brand Name,Product Price,Ratings,Discount Price,Original Price,Discount Percent
0,24295998,Unisex Couple Watch Gift Set,JOKER & WITCH,Rs. 1724Rs. 6899(75% OFF),4|62,Not available,Not available,75% OFF
1,24296002,Unisex Couple Watches Gift Set,JOKER & WITCH,Rs. 1655Rs. 7199(77% OFF),4.3|23,Not available,Not available,77% OFF
2,29847143,Unisex Analogue Watches,Kool Kidz,Rs. 715Rs. 795(10% OFF),,Not available,Not available,10% OFF
3,24286024,Analogue Watch Gift Set,Daniel Klein,Rs. 2590Rs. 7000(63% OFF),,Not available,Not available,63% OFF
4,13759602,Blaine & Kurt Couple Watches Gift Set,JOKER & WITCH,Rs. 1599Rs. 6398(75% OFF),4.3|99,Not available,Not available,75% OFF
...,...,...,...,...,...,...,...,...
235,24765920,Women Bracelet Watch Gift Set,JOKER & WITCH,Rs. 2199Rs. 10999(80% OFF),,Not available,Not available,80% OFF
236,17788600,Women Watch Gift Set,JOKER & WITCH,Rs. 1632Rs. 7097(77% OFF),,Not available,Not available,77% OFF
237,23802460,Women Watch Gift Set,JOKER & WITCH,Rs. 2092Rs. 9099(77% OFF),,Not available,Not available,77% OFF
238,709052,Men Dial Watch,Fastrack,Rs. 2026Rs. 2895(30% OFF),4|183,Not available,Not available,30% OFF


In [31]:
myntra_watch_data[['j', 'people rate']] = df['Ratings'].str.split('|', expand=True)

# Optionally, you can convert the new columns to integers if needed
#df['rating'] = df['j'].astype(int)
#df['people rates'] = df['people rate'].astype(int)

# Drop the original 'data' column if no longer needed
myntra_watches_data = myntra_watch_data.drop(['j','Ratings'],axis=1)
myntra_watches_data

Unnamed: 0,ID,Watch Name,Brand Name,Product Price,Discount Percent,ratings,people rate
0,24295998,Unisex Couple Watch Gift Set,JOKER & WITCH,Rs. 1724Rs. 6899(75% OFF),75% OFF,4,62
1,24296002,Unisex Couple Watches Gift Set,JOKER & WITCH,Rs. 1655Rs. 7199(77% OFF),77% OFF,4.3,23
2,29847143,Unisex Analogue Watches,Kool Kidz,Rs. 715Rs. 795(10% OFF),10% OFF,,
3,24286024,Analogue Watch Gift Set,Daniel Klein,Rs. 2590Rs. 7000(63% OFF),63% OFF,,
4,13759602,Blaine & Kurt Couple Watches Gift Set,JOKER & WITCH,Rs. 1599Rs. 6398(75% OFF),75% OFF,4.3,99
...,...,...,...,...,...,...,...
235,24765920,Women Bracelet Watch Gift Set,JOKER & WITCH,Rs. 2199Rs. 10999(80% OFF),80% OFF,,
236,17788600,Women Watch Gift Set,JOKER & WITCH,Rs. 1632Rs. 7097(77% OFF),77% OFF,,
237,23802460,Women Watch Gift Set,JOKER & WITCH,Rs. 2092Rs. 9099(77% OFF),77% OFF,,
238,709052,Men Dial Watch,Fastrack,Rs. 2026Rs. 2895(30% OFF),30% OFF,4,183


In [35]:
myntra_watches_data[['price', 'discount']] = df['Product Price'].str.split('(', expand=True)
myntra_watchess_data=myntra_watches_data.drop(['Product Price','discount'],axis=1)
myntra_watchess_data

Unnamed: 0,ID,Watch Name,Brand Name,Discount Percent,ratings,people rate,price
0,24295998,Unisex Couple Watch Gift Set,JOKER & WITCH,75% OFF,4,62,Rs. 1724Rs. 6899
1,24296002,Unisex Couple Watches Gift Set,JOKER & WITCH,77% OFF,4.3,23,Rs. 1655Rs. 7199
2,29847143,Unisex Analogue Watches,Kool Kidz,10% OFF,,,Rs. 715Rs. 795
3,24286024,Analogue Watch Gift Set,Daniel Klein,63% OFF,,,Rs. 2590Rs. 7000
4,13759602,Blaine & Kurt Couple Watches Gift Set,JOKER & WITCH,75% OFF,4.3,99,Rs. 1599Rs. 6398
...,...,...,...,...,...,...,...
235,24765920,Women Bracelet Watch Gift Set,JOKER & WITCH,80% OFF,,,Rs. 2199Rs. 10999
236,17788600,Women Watch Gift Set,JOKER & WITCH,77% OFF,,,Rs. 1632Rs. 7097
237,23802460,Women Watch Gift Set,JOKER & WITCH,77% OFF,,,Rs. 2092Rs. 9099
238,709052,Men Dial Watch,Fastrack,30% OFF,4,183,Rs. 2026Rs. 2895


In [48]:
myntra_watchess_data[['rr','discount price','original price']] = myntra_watchess_data['price'].str.split('.', expand=True)
myntra_watchesss_data=myntra_watchess_data.drop(['rr','price'],axis=1)
myntra_watchesss_data.head(20)

Unnamed: 0,ID,Watch Name,Brand Name,Discount Percent,ratings,people rate,discount price,original price
0,24295998,Unisex Couple Watch Gift Set,JOKER & WITCH,75% OFF,4.0,62,1724Rs,6899.0
1,24296002,Unisex Couple Watches Gift Set,JOKER & WITCH,77% OFF,4.3,23,1655Rs,7199.0
2,29847143,Unisex Analogue Watches,Kool Kidz,10% OFF,,,715Rs,795.0
3,24286024,Analogue Watch Gift Set,Daniel Klein,63% OFF,,,2590Rs,7000.0
4,13759602,Blaine & Kurt Couple Watches Gift Set,JOKER & WITCH,75% OFF,4.3,99,1599Rs,6398.0
5,24081380,Women Watch Gift Set,JOKER & WITCH,77% OFF,4.3,23,1678Rs,7298.0
6,24081466,Women Watch Gift Set,JOKER & WITCH,77% OFF,3.5,11,1517Rs,6598.0
7,12906412,Star & Marco Couple Watches,JOKER & WITCH,76% OFF,4.2,26,1823Rs,7598.0
8,24081602,Women Watch Gift Set,JOKER & WITCH,75% OFF,,,1424Rs,5699.0
9,29892796,Unisex Kids Watch,YOUNG MISSION,15% OFF,,,977Rs,1150.0


In [42]:
split_result = myntra_watchess_data['price'].str.split('.', expand=True)
print(split_result.head())


    0        1      2
0  Rs   1724Rs   6899
1  Rs   1655Rs   7199
2  Rs    715Rs    795
3  Rs   2590Rs   7000
4  Rs   1599Rs   6398


In [46]:
# save the DataFrame to a CSV file
myntra_watchesss_data.to_csv('myntra watchess details.csv', index=False)