In [1]:
import pandas as pd
# Load the raw product catalogue. on_bad_lines='skip' makes pandas skip
# malformed lines instead of raising on them.
df = pd.read_csv(
    'data/raw/Fashion Dataset.csv',
    on_bad_lines='skip',
)


In [2]:
# --- Initial Exploration ---
# Plain-text preview of the first rows.
print("Dataset Head:", df.head(), sep="\n")

Dataset Head:
   Index        p_id                                               name  \
0      0  17048614.0  Khushal K Women Black Ethnic Motifs Printed Ku...   
1      1  16524740.0  InWeave Women Orange Solid Kurta with Palazzos...   
2      2  16331376.0  Anubhutee Women Navy Blue Ethnic Motifs Embroi...   
3      3  14709966.0  Nayo Women Red Floral Printed Kurta With Trous...   
4      4  11056154.0   AHIKA Women Black & Green Printed Straight Kurta   

    price     colour      brand  \
0  5099.0      Black  Khushal K   
1  5899.0     Orange    InWeave   
2  4899.0  Navy Blue  Anubhutee   
3  3699.0        Red       Nayo   
4  1350.0      Black      AHIKA   

                                                 img  ratingCount  avg_rating  \
0  http://assets.myntassets.com/assets/images/170...       4522.0    4.418399   
1  http://assets.myntassets.com/assets/images/165...       1081.0    4.119334   
2  http://assets.myntassets.com/assets/images/163...       1752.0    4.161530   


In [3]:
# Column dtypes and non-null counts; df.info() prints its own summary.
print()
print("Dataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14330 entries, 0 to 14329
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Index         14330 non-null  int64  
 1   p_id          14312 non-null  float64
 2   name          14312 non-null  object 
 3   price         14312 non-null  float64
 4   colour        14309 non-null  object 
 5   brand         14312 non-null  object 
 6   img           14312 non-null  object 
 7   ratingCount   6581 non-null   float64
 8   avg_rating    6581 non-null   float64
 9   description   14312 non-null  object 
 10  p_attributes  14312 non-null  object 
dtypes: float64(4), int64(1), object(6)
memory usage: 1.2+ MB


In [4]:
# Number of missing entries in every column.
print("\nMissing Values:", df.isnull().sum(), sep="\n")


Missing Values:
Index              0
p_id              18
name              18
price             18
colour            21
brand             18
img               18
ratingCount     7749
avg_rating      7749
description       18
p_attributes      18
dtype: int64


In [5]:
# The dataset is large enough that incomplete rows can simply be discarded:
# drop every row that is missing its price, image URL, or name.
df.dropna(how='any', subset=['price', 'img', 'name'], inplace=True)


In [6]:
# Re-check the per-column missing counts after dropping incomplete rows.
print("\nMissing Values:", df.isnull().sum(), sep="\n")


Missing Values:
Index              0
p_id               0
name               0
price              0
colour             3
brand              0
img                0
ratingCount     7731
avg_rating      7731
description        0
p_attributes       0
dtype: int64


In [7]:
# Remove the image URL (a local copy of the images is used instead) and the
# product's unique ID, which is not used afterwards.
df.drop(columns=["p_id", "img"], inplace=True)

In [8]:
# Preview the frame after removing the two columns.
df.head(n=5)

Unnamed: 0,Index,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes
0,0,Khushal K Women Black Ethnic Motifs Printed Ku...,5099.0,Black,Khushal K,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
1,1,InWeave Women Orange Solid Kurta with Palazzos...,5899.0,Orange,InWeave,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
2,2,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,4899.0,Navy Blue,Anubhutee,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
3,3,Nayo Women Red Floral Printed Kurta With Trous...,3699.0,Red,Nayo,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
4,4,AHIKA Women Black & Green Printed Straight Kurta,1350.0,Black,AHIKA,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size..."


In [9]:
# Derive each product's local image path from its 'Index' value: Images/<Index>.jpg
df['image_path'] = df['Index'].apply('Images/{}.jpg'.format)
df.head(n=5)

Unnamed: 0,Index,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,image_path
0,0,Khushal K Women Black Ethnic Motifs Printed Ku...,5099.0,Black,Khushal K,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...",Images/0.jpg
1,1,InWeave Women Orange Solid Kurta with Palazzos...,5899.0,Orange,InWeave,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...",Images/1.jpg
2,2,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,4899.0,Navy Blue,Anubhutee,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...",Images/2.jpg
3,3,Nayo Women Red Floral Printed Kurta With Trous...,3699.0,Red,Nayo,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...",Images/3.jpg
4,4,AHIKA Women Black & Green Printed Straight Kurta,1350.0,Black,AHIKA,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size...",Images/4.jpg


In [10]:
import ast
import json

# --- Define a safe parsing function ---
def parse_attributes(attr_string) -> dict:
    """Parse a stringified Python dict of product attributes.

    Parameters
    ----------
    attr_string : Any
        Raw cell value. Expected to be a string holding a Python dict
        literal; non-string values (e.g. float NaN or None) are accepted.

    Returns
    -------
    dict
        The parsed dictionary, or an empty dict when the value is not a
        string, cannot be parsed as a Python literal, or parses to something
        other than a dict.
    """
    # Missing cells are not strings (they may be a float NaN)
    if not isinstance(attr_string, str):
        return {}
    try:
        # ast.literal_eval only evaluates Python literals, never arbitrary code
        parsed = ast.literal_eval(attr_string)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Malformed input: literal_eval is documented to raise any of these,
        # e.g. TypeError for an unhashable dict key such as "{[1]: 2}"
        return {}
    # A valid literal that is not a dict (list, str, number, ...) would break
    # callers that rely on .keys() / .get(), so treat it like a missing value
    return parsed if isinstance(parsed, dict) else {}

# --- Parse every row of the raw attribute column ---
# The parsed result of each row is stored in a new 'attributes_dict' column
df['attributes_dict'] = df['p_attributes'].map(parse_attributes)


In [11]:
# --- Sanity check: compare the raw string with its parsed form for the first row ---
print("Original string:", df['p_attributes'].iloc[0], sep="\n")
print("\nParsed dictionary:", df['attributes_dict'].iloc[0], sep="\n")

Original string:
{'Add-Ons': 'NA', 'Body Shape ID': '443,333,324,424', 'Body or Garment Size': 'Garment Measurements in', 'Bottom Closure': 'Slip-On', 'Bottom Fabric': 'Viscose Rayon', 'Bottom Pattern': 'Printed', 'Bottom Type': 'Palazzos', 'Character': 'NA', 'Dupatta': 'With Dupatta', 'Dupatta Border': 'Solid', 'Dupatta Fabric': 'Viscose Rayon', 'Dupatta Pattern': 'Printed', 'Main Trend': 'Indie Prints', 'Neck': 'Mandarin Collar', 'Number of Pockets': 'NA', 'Occasion': 'Festive', 'Ornamentation': 'NA', 'Pattern Coverage': 'Placement', 'Print or Pattern Type': 'Ethnic Motifs', 'Sleeve Length': 'Three-Quarter Sleeves', 'Sleeve Styling': 'Regular Sleeves', 'Slit Detail': 'NA', 'Stitch': 'Ready to Wear', 'Sustainable': 'Regular', 'Technique': 'Screen', 'Top Design Styling': 'Regular', 'Top Fabric': 'Viscose Rayon', 'Top Hemline': 'Flared', 'Top Length': 'Calf Length', 'Top Pattern': 'Printed', 'Top Shape': 'Anarkali', 'Top Type': 'Kurta', 'Waistband': 'Elasticated', 'Wash Care': 'Machine 

In [12]:
# Preview the frame, now including the parsed attribute column.
df.head(n=5)

Unnamed: 0,Index,name,price,colour,brand,ratingCount,avg_rating,description,p_attributes,image_path,attributes_dict
0,0,Khushal K Women Black Ethnic Motifs Printed Ku...,5099.0,Black,Khushal K,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...",Images/0.jpg,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
1,1,InWeave Women Orange Solid Kurta with Palazzos...,5899.0,Orange,InWeave,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...",Images/1.jpg,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
2,2,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,4899.0,Navy Blue,Anubhutee,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...",Images/2.jpg,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
3,3,Nayo Women Red Floral Printed Kurta With Trous...,3699.0,Red,Nayo,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...",Images/3.jpg,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
4,4,AHIKA Women Black & Green Printed Straight Kurta,1350.0,Black,AHIKA,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size...",Images/4.jpg,"{'Body Shape ID': '424', 'Body or Garment Size..."


In [13]:
# A parsed entry supports ordinary dict lookups, e.g. .get() on the first row
print(
    f"\nExample Access - Neck Style: {df['attributes_dict'].iloc[0].get('Neck')}"
)


Example Access - Neck Style: Mandarin Collar


In [14]:
from collections import Counter

# --- Tally how many products carry each attribute key ---
all_keys = Counter(
    key
    for attributes in df['attributes_dict']
    for key in attributes.keys()
)

# --- Show the attribute keys that occur most often ---
print("Most common attributes across all products:")
print(all_keys.most_common(30))  # top 30 keys

Most common attributes across all products:
[('Occasion', 13213), ('Wash Care', 13103), ('Sustainable', 13010), ('Body or Garment Size', 10959), ('Pattern', 10506), ('Fabric', 10426), ('Print or Pattern Type', 10037), ('Closure', 8528), ('Number of Pockets', 8443), ('Length', 8287), ('Main Trend', 7901), ('Sleeve Length', 7807), ('Type', 7636), ('Character', 6869), ('Surface Styling', 6322), ('Neck', 6321), ('Body Shape ID', 6286), ('Weave Type', 5351), ('Add-Ons', 5173), ('Hemline', 5048), ('Ornamentation', 4734), ('Multipack Set', 4362), ('Wedding', 4220), ('Technique', 3624), ('Fabric 2', 3481), ('Sleeve Styling', 3451), ('Lining', 3060), ('Fit', 3032), ('Features', 2961), ('Transparency', 2880)]


In [15]:
# --- Attributes chosen for extraction ---
# These should be visually identifiable

selected_attributes = [
    'Neck',
    'Sleeve Length',
    'Print or Pattern Type',
    'Hemline',
    'Pattern',
    'Sleeve Styling'
]

# --- Build one cleaned column per selected attribute ---
for attr in selected_attributes:
    # Products lacking the key get 'Unknown'; the literal string 'NA' and any
    # null values are then standardized to 'Unknown' as well.
    df[attr] = (
        df['attributes_dict']
        .apply(lambda attributes: attributes.get(attr, 'Unknown'))
        .replace('NA', 'Unknown')
        .fillna('Unknown')
    )

# --- Show the new, structured columns ---
print(df[['Index'] + selected_attributes].head(10))

   Index             Neck          Sleeve Length Print or Pattern Type  \
0      0  Mandarin Collar  Three-Quarter Sleeves         Ethnic Motifs   
1      1      Square Neck             Sleeveless                 Solid   
2      2       Round Neck  Three-Quarter Sleeves         Ethnic Motifs   
3      3       Round Neck  Three-Quarter Sleeves         Ethnic Motifs   
4      4       Round Neck  Three-Quarter Sleeves         Ethnic Motifs   
5      5  Mandarin Collar           Long Sleeves                 Solid   
6      6     Keyhole Neck  Three-Quarter Sleeves                Floral   
7      7       Round Neck           Long Sleeves                Floral   
8      8       Round Neck  Three-Quarter Sleeves                Floral   
9      9       Round Neck  Three-Quarter Sleeves         Ethnic Motifs   

    Hemline       Pattern   Sleeve Styling  
0   Unknown       Unknown  Regular Sleeves  
1   Unknown       Unknown  Shoulder Straps  
2   Unknown       Unknown       No Sleeves  
3   U

In [16]:
# Inspect the ten most frequent values of every extracted attribute
for attr in selected_attributes:
    print(
        f"\nValue counts for '{attr}':",
        df[attr].value_counts().nlargest(10),
        sep="\n",
    )


Value counts for 'Neck':
Neck
Unknown            8170
Round Neck         2721
V-Neck              983
Mandarin Collar     368
Shoulder Straps     298
Hood                278
Shirt Collar        247
Boat Neck           189
Square Neck         169
High Neck           118
Name: count, dtype: int64

Value counts for 'Sleeve Length':
Sleeve Length
Unknown                  6505
Long Sleeves             2635
Three-Quarter Sleeves    2086
Short Sleeves            1545
Sleeveless               1541
Name: count, dtype: int64

Value counts for 'Print or Pattern Type':
Print or Pattern Type
Unknown          4283
Solid            3928
Floral           1691
Ethnic Motifs    1334
Geometric         524
Striped           388
Abstract          249
Self Design       195
Checked           182
Colourblocked     172
Name: count, dtype: int64

Value counts for 'Hemline':
Hemline
Unknown            9264
Straight           2406
Flared             1667
Ribbed              409
Curved              288
Asymmetric

In [17]:
# Drop the raw attribute string and the parsed dictionary column.
# DataFrame.drop(..., inplace=True) returns None, so its result must not be
# assigned: mutate `df` first, then keep a snapshot of it as `df_final`.
df.drop(columns=['p_attributes', 'attributes_dict'], inplace=True)
df_final = df.copy()
df.head(2)

Unnamed: 0,Index,name,price,colour,brand,ratingCount,avg_rating,description,image_path,Neck,Sleeve Length,Print or Pattern Type,Hemline,Pattern,Sleeve Styling
0,0,Khushal K Women Black Ethnic Motifs Printed Ku...,5099.0,Black,Khushal K,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,Images/0.jpg,Mandarin Collar,Three-Quarter Sleeves,Ethnic Motifs,Unknown,Unknown,Regular Sleeves
1,1,InWeave Women Orange Solid Kurta with Palazzos...,5899.0,Orange,InWeave,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,Images/1.jpg,Square Neck,Sleeveless,Solid,Unknown,Unknown,Shoulder Straps


In [19]:
import os
# Keep only the products whose image file is present on disk.
# NOTE(review): 'image_path' values are relative paths, so os.path.exists resolves
# them against the current working directory. The recorded output of the next cell
# is an empty frame, which suggests no file was found there; confirm the image
# directory before relying on the filtered result.
has_image = df['image_path'].apply(os.path.exists).astype(bool)
if not has_image.any():
    # Make the "everything was filtered out" case visible instead of silent
    print(
        f"WARNING: none of the {len(df)} image paths exist on disk; "
        "the filtered DataFrame will be empty. "
        "Check the image directory and the current working directory."
    )
# .copy() gives an independent frame, so later column assignments do not act on a slice
df = df[has_image].copy()

In [20]:
# Preview the rows that remain after the image-existence filter.
df.head(n=5)

Unnamed: 0,Index,name,price,colour,brand,ratingCount,avg_rating,description,image_path,Neck,Sleeve Length,Print or Pattern Type,Hemline,Pattern,Sleeve Styling
