**Data Preprocessing**

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [5]:
# Load the product styles table from CSV
styles_csv_path = "/content/styles.csv"
df = pd.read_csv(styles_csv_path)

In [6]:
# Preview the first five rows of the loaded table
df.head(5)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [7]:
# Remove the 'year' column; drop() returns a new frame that is bound back to df
df = df.drop(columns='year')

In [8]:
# Preview the first five rows after removing 'year'
df.head(5)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,Casual,Puma Men Grey T-shirt


In [9]:
# Summarise the frame: per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44446 entries, 0 to 44445
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  44446 non-null  int64 
 1   gender              44446 non-null  object
 2   masterCategory      44446 non-null  object
 3   subCategory         44446 non-null  object
 4   articleType         44446 non-null  object
 5   baseColour          44431 non-null  object
 6   season              44425 non-null  object
 7   usage               44129 non-null  object
 8   productDisplayName  44439 non-null  object
dtypes: int64(1), object(8)
memory usage: 3.1+ MB


In [10]:
# Number of distinct values in each column
df.nunique()

id                    44446
gender                    5
masterCategory            7
subCategory              45
articleType             143
baseColour               46
season                    4
usage                     8
productDisplayName    31138
dtype: int64

In [11]:
# Count the missing values in every column
null_mask = df.isnull()
null_mask.sum()

id                      0
gender                  0
masterCategory          0
subCategory             0
articleType             0
baseColour             15
season                 21
usage                 317
productDisplayName      7
dtype: int64

In [12]:
# Share of missing values per column, expressed as a percentage of all rows
null_counts = df.isnull().sum()
null_counts / len(df) * 100

id                    0.000000
gender                0.000000
masterCategory        0.000000
subCategory           0.000000
articleType           0.000000
baseColour            0.033749
season                0.047248
usage                 0.713225
productDisplayName    0.015749
dtype: float64

In [13]:
# Fill the missing values.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[column]: the in-place form acts on an
# intermediate selection, raises a chained-assignment FutureWarning in
# pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.

# Categorical columns: replace nulls with the column's most frequent value (mode)
for column in ['baseColour', 'season', 'usage']:
    df[column] = df[column].fillna(df[column].mode()[0])

# productDisplayName is free text, so use an explicit placeholder instead
df['productDisplayName'] = df['productDisplayName'].fillna('Unknown Product')

In [14]:
# Re-count missing values per column after the fill step
remaining_nulls = df.isnull().sum()
remaining_nulls

id                    0
gender                0
masterCategory        0
subCategory           0
articleType           0
baseColour            0
season                0
usage                 0
productDisplayName    0
dtype: int64

In [15]:
# Summary statistics for every column; include='all' reports the object columns
# (count/unique/top/freq) alongside the numeric 'id' column
df.describe(include='all')

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName
count,44446.0,44446,44446,44446,44446,44446,44446,44446,44446
unique,,5,7,45,143,46,4,8,31139
top,,Men,Apparel,Topwear,Tshirts,Black,Summer,Casual,Lucera Women Silver Earrings
freq,,22165,21400,15405,7070,9747,21497,34731,82
mean,29692.63135,,,,,,,,
std,17048.234982,,,,,,,,
min,1163.0,,,,,,,,
25%,14770.25,,,,,,,,
50%,28609.5,,,,,,,,
75%,44678.75,,,,,,,,


**Exploratory Data Analysis (EDA)**

In [16]:
#EDA
!pip install sweetviz
import sweetviz as sv
report = sv.analyze(df)
report.show_html('EDA.html')

Collecting sweetviz
  Downloading sweetviz-2.3.1-py3-none-any.whl (15.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sweetviz
Successfully installed sweetviz-2.3.1


                                             |          | [  0%]   00:00 -> (? left)

Report EDA.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


**Feature Engineering**

In [17]:
import re
# Use the first whitespace-separated token of productDisplayName as the brand.
# NOTE(review): multi-word brands are cut to their first word
# (e.g. "Peter England ..." yields "Peter").
name_tokens = df['productDisplayName'].str.split()
df['brand'] = name_tokens.str[0]

In [18]:
# Preview the first five rows with the new 'brand' column
df.head(5)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName,brand
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,Casual,Turtle Check Men Navy Blue Shirt,Turtle
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,Casual,Peter England Men Party Blue Jeans,Peter
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,Casual,Titan Women Silver Watch,Titan
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,Casual,Manchester United Men Solid Black Track Pants,Manchester
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,Casual,Puma Men Grey T-shirt,Puma


In [19]:
# Collect the distinct baseColour values
base_colours = df['baseColour']
unique_colors = base_colours.unique()
unique_colors

array(['Navy Blue', 'Blue', 'Silver', 'Black', 'Grey', 'Green', 'Purple',
       'White', 'Beige', 'Brown', 'Bronze', 'Teal', 'Copper', 'Pink',
       'Off White', 'Maroon', 'Red', 'Khaki', 'Orange', 'Coffee Brown',
       'Yellow', 'Charcoal', 'Gold', 'Steel', 'Tan', 'Multi', 'Magenta',
       'Lavender', 'Sea Green', 'Cream', 'Peach', 'Olive', 'Skin',
       'Burgundy', 'Grey Melange', 'Rust', 'Rose', 'Lime Green', 'Mauve',
       'Turquoise Blue', 'Metallic', 'Mustard', 'Taupe', 'Nude',
       'Mushroom Brown', 'Fluorescent Green'], dtype=object)

In [20]:
# Colour grouping: collapse the fine-grained baseColour shades into broad groups.
# Each group lists the shades that belong to it; the table is then inverted
# into the shade -> group lookup used for the mapping.
color_groups = {
    'Blue': ['Navy Blue', 'Blue', 'Teal', 'Turquoise Blue'],
    'Black': ['Black', 'Charcoal'],
    'White': ['White', 'Cream', 'Off White', 'Skin'],
    'Grey': ['Grey', 'Silver', 'Metallic', 'Grey Melange', 'Steel'],
    'Red': ['Red', 'Burgundy', 'Rose', 'Orange'],
    'Brown': ['Maroon', 'Coffee Brown', 'Copper', 'Rust', 'Mushroom Brown', 'Nude', 'Bronze'],
    'Yellow': ['Yellow', 'Beige', 'Gold', 'Mustard', 'Khaki', 'Taupe', 'Tan'],
    'Green': ['Green', 'Olive', 'Lime Green', 'Fluorescent Green', 'Sea Green'],
    'Purple': ['Purple', 'Magenta', 'Lavender', 'Mauve'],
    'Pink': ['Pink', 'Peach'],
}
color_mapping = {
    shade: group
    for group, shades in color_groups.items()
    for shade in shades
}

# Shades without a key in color_mapping ('Brown' and 'Multi' among the values
# listed above) come back as NaN from map() and keep their original
# baseColour through the fillna() fallback.
mapped_groups = df['baseColour'].map(color_mapping)
df['colorGroup'] = mapped_groups.fillna(df['baseColour'])

In [21]:
# Display the frame with the added 'brand' and 'colorGroup' columns
df

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,usage,productDisplayName,brand,colorGroup
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,Casual,Turtle Check Men Navy Blue Shirt,Turtle,Blue
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,Casual,Peter England Men Party Blue Jeans,Peter,Blue
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,Casual,Titan Women Silver Watch,Titan,Grey
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,Casual,Manchester United Men Solid Black Track Pants,Manchester,Black
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,Casual,Puma Men Grey T-shirt,Puma,Grey
...,...,...,...,...,...,...,...,...,...,...,...
44441,17036,Men,Footwear,Shoes,Casual Shoes,White,Summer,Casual,Gas Men Caddy Casual Shoe,Gas,White
44442,6461,Men,Footwear,Flip Flops,Flip Flops,Red,Summer,Casual,Lotto Men's Soccer Track Flip Flop,Lotto,Red
44443,18842,Men,Apparel,Topwear,Tshirts,Blue,Fall,Casual,Puma Men Graphic Stellar Blue Tshirt,Puma,Blue
44444,46694,Women,Personal Care,Fragrance,Perfume and Body Mist,Blue,Spring,Casual,Rasasi Women Blue Lady Perfume,Rasasi,Blue


In [22]:
# Number of distinct values in the extracted 'brand' column
df['brand'].nunique()

479

In [23]:
# Number of distinct colour groups produced by the mapping
df['colorGroup'].nunique()

11

In [24]:
# One-hot encoding: expand the listed categorical columns into 0/1 integer
# indicator columns (the source columns are replaced by the indicators)
one_hot_columns = ['gender', 'masterCategory', 'season', 'usage', 'colorGroup']
df = pd.get_dummies(df, columns=one_hot_columns, dtype='int')
df.head()

Unnamed: 0,id,subCategory,articleType,baseColour,productDisplayName,brand,gender_Boys,gender_Girls,gender_Men,gender_Unisex,...,colorGroup_Blue,colorGroup_Brown,colorGroup_Green,colorGroup_Grey,colorGroup_Multi,colorGroup_Pink,colorGroup_Purple,colorGroup_Red,colorGroup_White,colorGroup_Yellow
0,15970,Topwear,Shirts,Navy Blue,Turtle Check Men Navy Blue Shirt,Turtle,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,39386,Bottomwear,Jeans,Blue,Peter England Men Party Blue Jeans,Peter,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,59263,Watches,Watches,Silver,Titan Women Silver Watch,Titan,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21379,Bottomwear,Track Pants,Black,Manchester United Men Solid Black Track Pants,Manchester,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,53759,Topwear,Tshirts,Grey,Puma Men Grey T-shirt,Puma,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [25]:
# Encode the remaining text columns as integer labels
from sklearn import preprocessing

# Keep one fitted encoder per column. Re-using a single LabelEncoder for all
# columns refits it on every fit_transform call, so only the classes of the
# last column would survive and the earlier columns could no longer be decoded
# with inverse_transform.
label_encoders = {}

for column in ['subCategory','articleType','baseColour','brand','productDisplayName']:
    label_encoder = preprocessing.LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# After the loop, `label_encoder` is the encoder fitted on 'productDisplayName'
# (the last column), which matches the state the single shared encoder had.

In [26]:
# Display the frame after one-hot and label encoding
df

Unnamed: 0,id,subCategory,articleType,baseColour,productDisplayName,brand,gender_Boys,gender_Girls,gender_Men,gender_Unisex,...,colorGroup_Blue,colorGroup_Brown,colorGroup_Green,colorGroup_Grey,colorGroup_Multi,colorGroup_Pink,colorGroup_Purple,colorGroup_Red,colorGroup_White,colorGroup_Yellow
0,15970,38,104,25,27790,436,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,39386,6,56,2,20707,326,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,59263,42,140,37,27555,427,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21379,6,128,1,16437,271,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,53759,38,134,13,22209,342,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44441,17036,30,19,44,11128,170,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
44442,6461,11,39,33,15900,262,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
44443,18842,38,134,2,22183,342,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
44444,46694,12,91,2,23964,354,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
