## Data Preparation ##

We will split the given data into train and test sets. Each product has a corresponding year. We will keep all products with an even year (e.g., 2012) in the train set and the remaining ones in the test set.

In [2]:
# imports
import pandas as pd
import os

In [5]:
DATA_DIR = "D:\\Projects\\Datasets\\fashion-product-images"
# Some rows of styles.csv have more than the expected 10 fields; skip those rows
# (with a warning) instead of failing the whole load.
styles_csv = os.path.join(DATA_DIR, "styles.csv")
try:
    # pandas >= 1.3: `on_bad_lines` replaces `error_bad_lines`, which was removed in pandas 2.0
    data = pd.read_csv(styles_csv, on_bad_lines="warn")
except TypeError:
    # older pandas rejects the unknown `on_bad_lines` keyword; fall back to the legacy flag
    data = pd.read_csv(styles_csv, error_bad_lines=False)

b'Skipping line 6044: expected 10 fields, saw 11\nSkipping line 6569: expected 10 fields, saw 11\nSkipping line 7399: expected 10 fields, saw 11\nSkipping line 7939: expected 10 fields, saw 11\nSkipping line 9026: expected 10 fields, saw 11\nSkipping line 10264: expected 10 fields, saw 11\nSkipping line 10427: expected 10 fields, saw 11\nSkipping line 10905: expected 10 fields, saw 11\nSkipping line 11373: expected 10 fields, saw 11\nSkipping line 11945: expected 10 fields, saw 11\nSkipping line 14112: expected 10 fields, saw 11\nSkipping line 14532: expected 10 fields, saw 11\nSkipping line 15076: expected 10 fields, saw 12\nSkipping line 29906: expected 10 fields, saw 11\nSkipping line 31625: expected 10 fields, saw 11\nSkipping line 33020: expected 10 fields, saw 11\nSkipping line 35748: expected 10 fields, saw 11\nSkipping line 35962: expected 10 fields, saw 11\nSkipping line 37770: expected 10 fields, saw 11\nSkipping line 38105: expected 10 fields, saw 11\nSkipping line 38275: ex

In [6]:
# summary statistics of the numeric columns (id, year); a lower count for `year`
# than for `id` means some rows have no year
data.describe()

Unnamed: 0,id,year
count,44424.0,44423.0
mean,29696.334301,2012.806497
std,17049.490518,2.12648
min,1163.0,2007.0
25%,14768.75,2011.0
50%,28618.5,2012.0
75%,44683.25,2015.0
max,60000.0,2019.0


#### Note: #### 
After skipping the malformed lines, we are left with 44424 records. Note, however, that there are 44441 images.

In [25]:
# preview the first rows to see which columns are available
data.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


In [34]:
# Even years form the train set; everything else forms the test set. That includes
# rows with a missing year, because NaN % 2 == 0 evaluates to False.
# Explicit boolean masks are used rather than unpacking groupby() output, so the
# assignment does not rely on the False/True group order and does not raise when
# one of the two groups happens to be empty.
is_even_year = data['year'] % 2 == 0
train_set = data[is_even_year]
test_set = data[~is_even_year]

In [35]:
# report how many rows ended up in each split
n_train_rows, n_test_rows = train_set.shape[0], test_set.shape[0]
print("Size of Train Set: ", n_train_rows)
print("Size of Test Set: ", n_test_rows)

Size of Train Set:  23788
Size of Test Set:  20636


In [36]:
# sanity check: the rows shown here should all have an even year
train_set.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt
6,30805,Men,Apparel,Topwear,Shirts,Green,Summer,2012.0,Ethnic,Fabindia Men Striped Green Shirt
7,26960,Women,Apparel,Topwear,Shirts,Purple,Summer,2012.0,Casual,Jealous 21 Women Purple Shirt


## Sub-splits ##
We will split the train data into two sets: one for pre-training and one for fine-tuning.
We will use the 20 most frequent classes (article types) for pre-training and the remaining classes for fine-tuning.

In [46]:
# rank every article type by its number of rows in the train set, most frequent first
# (the top 20 are sliced off this ranking later)
article_type_counts = train_set.groupby(['articleType']).size().reset_index(name='counts')
top_classes = article_type_counts.sort_values('counts', ascending=False)

In [47]:
# show the ranking of article types by frequency in the train set
top_classes

Unnamed: 0,articleType,counts
99,Tshirts,2749
105,Watches,2482
14,Casual Shoes,1550
43,Kurtas,1140
73,Shirts,1130
...,...,...
101,Umbrellas,1
62,Rain Trousers,1
64,Robe,1
83,Suits,1


In [44]:
# the 20 most frequent article types (first 20 rows of the ranking)
top_classes.head(20)

Unnamed: 0,articleType,counts
99,Tshirts,2749
105,Watches,2482
14,Casual Shoes,1550
43,Kurtas,1140
73,Shirts,1130
92,Tops,1059
84,Sunglasses,1042
31,Handbags,1006
80,Sports Shoes,881
33,Heels,775


In [58]:
# article types in ranked order: the first 20 go to pre-training, the rest to fine-tuning
ranked_article_types = top_classes['articleType']
pretrain_set = train_set[train_set['articleType'].isin(ranked_article_types.iloc[:20])]
finetune_set = train_set[train_set['articleType'].isin(ranked_article_types.iloc[20:])]

In [59]:
# report the number of rows in each sub-split
n_pretrain_rows, n_finetune_rows = pretrain_set.shape[0], finetune_set.shape[0]
print("Records with top 20 classes: ", n_pretrain_rows)
print("Records with remainig classes: ", n_finetune_rows)

Records with top 20 classes:  18556
Records with remainig classes:  5232


## Save Files ##

In [60]:
# write every split next to the source data; the DataFrame index is written as the
# first (unnamed) column because to_csv is called with its defaults
splits_to_save = {
    "train_set.csv": train_set,
    "test_set.csv": test_set,
    "top20classes_set.csv": pretrain_set,
    "remainingclasses_set.csv": finetune_set,
}
for csv_name, split_frame in splits_to_save.items():
    split_frame.to_csv(os.path.join(DATA_DIR, csv_name))

## Data Loader ##

In [146]:
import glob
import numpy as np
import torch
import os
import csv
import pandas as pd
from PIL import Image
from torchvision import transforms
from torch.utils.data.dataset import Dataset


#pytorch Dataset class to load provided data
class FashionDataset(Dataset):
    """PyTorch Dataset over one CSV split of the fashion-product data.

    Every item is a tuple ``(image, label)``: the image is opened from
    ``<data_dir>/images/<id>.jpg`` and the label is the integer assigned to the
    row's ``articleType``.

    Attributes:
        data: DataFrame with the CSV rows whose image file exists on disk.
        label_map: dict mapping each ``articleType`` found in the CSV to an
            integer, numbered in order of first appearance in that CSV.
            NOTE: the mapping is built per CSV file, so the same article type
            can get a different integer in datasets built from different files.
        transform: optional callable applied to the opened PIL image.
        data_dir: folder containing the csv files and the "images" folder.
    """

    def __init__(self, data_dir, csv_path, transform=None):
        """
        Arguments:
            - data_dir : path to folder containing csv files and "images" folder
            - csv_path : path to csv file, relative to data_dir (i.e test_set.csv)
            - transform : (optional) callable applied to every image in __getitem__
        """
        self.data_dir = data_dir
        self.transform = transform
        self.data = pd.read_csv(os.path.join(data_dir, csv_path))
        # filter the rows beforehand, as some images are not there in the folder
        self.generate_image_list(data_dir)

    @staticmethod
    def _image_path(data_dir, image_id):
        """Return the path of the image file that belongs to `image_id`."""
        return os.path.join(data_dir, "images", str(image_id) + ".jpg")

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position `index`.

        Raises IndexError for an out-of-range position, which is what ends a
        plain ``for`` loop over the dataset.
        """
        # read the two needed cells straight from their columns instead of
        # materialising the whole row twice
        image_id = self.data['id'].iloc[index]
        article_type = self.data['articleType'].iloc[index]

        image = Image.open(self._image_path(self.data_dir, image_id))
        # to read the image in grayscale instead, append .convert('L') to the call above

        if self.transform:
            image = self.transform(image)
        return (image, self.label_map[article_type])

    def __len__(self):
        """Number of rows that have an image on disk."""
        return len(self.data)

    def generate_image_list(self, data_dir):
        """Build `label_map` and drop the rows whose image file is missing.

        Arguments:
            - data_dir : folder whose "images" sub-folder is checked for each row's
              ``<id>.jpg`` (this argument is used, not the notebook-level DATA_DIR)
        """
        # Labels are numbered in order of first appearance over ALL rows of the
        # csv, including rows dropped below, so the numbering does not depend
        # on which image files happen to be present.
        self.label_map = {}
        for article_type in self.data['articleType']:
            # len() is evaluated before a new key is inserted, so it is the next free id
            self.label_map.setdefault(article_type, len(self.label_map))

        # keep only the rows that have an image file; one mask replaces dropping
        # rows one by one while iterating over the frame
        has_image = [
            os.path.exists(self._image_path(data_dir, image_id))
            for image_id in self.data['id']
        ]
        self.data = self.data[pd.Series(has_image, index=self.data.index, dtype=bool)]
        print("Found {} images in {} for provided csv file.".format(len(self.data), data_dir))




In [147]:

# for testing purpose
if __name__ == "__main__":
    # smoke test: load the remaining-classes split, print the labels of the first
    # ten samples and open the tenth image in the default viewer
    preprocess = transforms.Compose([transforms.Resize((280,280)), transforms.ToTensor()])
    data_loader = FashionDataset(DATA_DIR, "remainingclasses_set.csv", transform=preprocess)
    for i, (img, label) in enumerate(data_loader, start=1):
        print(label)
        d = transforms.ToPILImage()(img)
        if i == 10:
            d.show()
            break


Found 5230 images in D:\Projects\Datasets\fashion-product-images for provided csv file.
0
1
2
3
4
5
6
7
8
9
