### Fetching the Dataset

In [1]:
# One-time setup: both commands are left commented out. Uncomment them to install
# the Kaggle CLI and download the dataset archive.
# NOTE(review): the download presumably needs Kaggle API credentials — confirm.
# NOTE(review): the loading cell reads 'data/styles.csv', so the downloaded archive
# presumably has to be extracted into data/ first — confirm.
# %pip install kaggle
# !kaggle datasets download -d paramaggarwal/fashion-product-images-small

Collecting kaggle
  Downloading kaggle-1.6.14.tar.gz (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m696.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tqdm (from kaggle)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Downloading tqdm-4.66.4-py3-non

### Data Preprocessing

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [29]:
# Load data/styles.csv into a DataFrame.
# on_bad_lines='skip' makes the parser drop malformed rows without raising or
# warning, so the loaded row count can be lower than the file's line count.
data = pd.read_csv(
    'data/styles.csv',
    sep=',',
    on_bad_lines='skip',
)
# Preview the first rows
data.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [24]:
# List the column labels of the loaded table.
data.columns

Index(['id', 'gender', 'masterCategory', 'subCategory', 'articleType',
       'baseColour', 'season', 'year', 'usage', 'productDisplayName'],
      dtype='object')

In [30]:
# Dimensions of the table as (rows, columns).
data.shape

(44424, 10)

In [33]:
# Per-column tally of missing entries
data.isna().sum()

id                      0
gender                  0
masterCategory          0
subCategory             0
articleType             0
baseColour             15
season                 21
year                    1
usage                 317
productDisplayName      7
dtype: int64

In [34]:
# Remove every row that contains at least one missing value.
# The result is assigned back to `data` instead of using inplace=True, so the
# step is a plain expression rather than a cross-cell in-place mutation.
# The kept rows retain their original index labels (the index is not reset).
data = data.dropna()

In [35]:
# Re-check the dimensions (rows, columns) after dropping rows with missing values.
data.shape

(44077, 10)

#### Check Unique Values

In [43]:
# Distinct values of 'masterCategory', in order of first appearance
pd.unique(data['masterCategory'])

array(['Apparel', 'Accessories', 'Footwear', 'Personal Care',
       'Free Items', 'Sporting Goods', 'Home'], dtype=object)

In [44]:
# Distinct values of 'subCategory', in order of first appearance
pd.unique(data['subCategory'])

array(['Topwear', 'Bottomwear', 'Watches', 'Socks', 'Shoes', 'Belts',
       'Flip Flops', 'Bags', 'Innerwear', 'Sandal', 'Shoe Accessories',
       'Fragrance', 'Jewellery', 'Lips', 'Saree', 'Eyewear', 'Scarves',
       'Dress', 'Loungewear and Nightwear', 'Wallets', 'Apparel Set',
       'Headwear', 'Mufflers', 'Skin Care', 'Makeup', 'Free Gifts',
       'Ties', 'Accessories', 'Nails', 'Beauty Accessories',
       'Water Bottle', 'Skin', 'Eyes', 'Bath and Body', 'Gloves',
       'Sports Accessories', 'Cufflinks', 'Sports Equipment', 'Stoles',
       'Hair', 'Perfumes', 'Home Furnishing', 'Umbrellas', 'Wristbands',
       'Vouchers'], dtype=object)

In [52]:
# Count the distinct values in the column 'articleType'.
# Only the count is displayed; a bare expression that is not the last line of a
# cell produces no output, so a separate `.unique()` call here would be discarded.
data['articleType'].nunique()

142

In [46]:
# Distinct values of 'baseColour', in order of first appearance
pd.unique(data['baseColour'])

array(['Navy Blue', 'Blue', 'Silver', 'Black', 'Grey', 'Green', 'Purple',
       'White', 'Beige', 'Brown', 'Bronze', 'Teal', 'Copper', 'Pink',
       'Off White', 'Maroon', 'Red', 'Khaki', 'Orange', 'Yellow',
       'Charcoal', 'Gold', 'Steel', 'Tan', 'Multi', 'Magenta', 'Lavender',
       'Sea Green', 'Cream', 'Peach', 'Olive', 'Skin', 'Burgundy',
       'Coffee Brown', 'Grey Melange', 'Rust', 'Rose', 'Lime Green',
       'Mauve', 'Turquoise Blue', 'Metallic', 'Mustard', 'Taupe', 'Nude',
       'Mushroom Brown', 'Fluorescent Green'], dtype=object)

In [47]:
# Distinct values of 'season', in order of first appearance
pd.unique(data['season'])

array(['Fall', 'Summer', 'Winter', 'Spring'], dtype=object)

In [48]:
# Distinct values of 'year', in order of first appearance
pd.unique(data['year'])

array([2011., 2012., 2016., 2017., 2015., 2014., 2010., 2013., 2018.,
       2019., 2007., 2009., 2008.])

In [49]:
# Distinct values of 'usage', in order of first appearance
pd.unique(data['usage'])

array(['Casual', 'Ethnic', 'Formal', 'Sports', 'Smart Casual', 'Travel',
       'Party', 'Home'], dtype=object)

In [51]:
# Count the distinct values in the column 'productDisplayName'.
# Only the count is displayed; a bare expression that is not the last line of a
# cell produces no output, so a separate `.unique()` call here would be discarded.
data['productDisplayName'].nunique()

30806