# Data preparation for building a classifier

``Input:``

root-dir

    |- label 1
        |- image 1
        |- image 2
        ...
    |- label 2
        |- image 1
        |- image 2
        ...
    ...

``Output:``

train-dir

    |- label 1
        |- image 1
        |- image 3
        ...
    |- label 2
        |- image 1
        |- image 3
        ...
    ...
    
test-dir

    |- label 1
        |- image 2
        |- image 4
        ...
    |- label 2
        |- image 2
        |- image 4
        ...
    ...

### Algorithm

1. Get image paths
2. Randomly split them into train and test
    - fix percentage test images for each class
3. Copy images to train and test directories
4. Save dataframes with class label and image path both for train and test
    - we will use with torch's dataloader

In [1]:
%matplotlib inline

import os
from tqdm import tqdm
import shutil
import numpy as np
import pandas as pd
import imageio

In [3]:
DATA_DIR = './data/'
TRAIN_DIR = './Train/'
TEST_DIR = './Test/'

### Prepare data

##### Get class labels

In [4]:
classes = os.listdir(DATA_DIR)

len(classes)

14

##### List number of each class

In [5]:
files = {}

for cl in classes:
    class_files = os.listdir(os.path.join(DATA_DIR, cl))
    files[cl] = [os.path.join(DATA_DIR, cl, f) for f in class_files]
    print(cl, len(class_files))

lecoq 301
crocs 296
sbenu 362
descente 111
zeepseen 233
converse 479
barefoot 248
newbalance 422
adidas 687
puma 655
nike 659
drmartens 493
reebok 777
vans 453


##### Clean subdirs

In [6]:
!rm -r $TRAIN_DIR

!rm -r $TEST_DIR

##### Create subdirs and copy random images into Train and Test

In [46]:
NUM_TEST = 30

In [47]:
for cl in classes:
    train_subdir = os.path.join(TRAIN_DIR, cl)
    if not os.path.exists(train_subdir):
        os.makedirs(train_subdir)
    test_subdir = os.path.join(TEST_DIR, cl)
    if not os.path.exists(test_subdir):
        os.makedirs(test_subdir)

In [52]:
filepaths = pd.DataFrame(columns=['filepath','subset'])

for lab,fp in files.items():
    tmp = pd.DataFrame(fp, index=[lab]*len(fp), columns=['filepath'])
    test_prob = NUM_TEST / len(fp)
    tmp['subset'] = np.random.choice([TRAIN_DIR,TEST_DIR], size=len(fp), p=[1 - test_prob, test_prob])
    filepaths = filepaths.append(tmp)
    
filepaths.reset_index(inplace=True)
filepaths.head()

Unnamed: 0,index,filepath,subset
0,zeepseen,./data/zeepseen/41.jpg,./Train/
1,zeepseen,./data/zeepseen/48755.jpg,./Train/
2,zeepseen,./data/zeepseen/min001012.jpg,./Train/
3,zeepseen,./data/zeepseen/39.JPG,./Train/
4,zeepseen,./data/zeepseen/pla001996.jpg,./Train/


##### Select only valid extensions

In [53]:
filepaths = filepaths[(filepaths['filepath'].str.endswith('jpg')) | (filepaths['filepath'].str.endswith('jpeg'))]
filepaths.shape

(6161, 3)

In [54]:
filepaths['subset'].value_counts()

./Train/    5747
./Test/      414
Name: subset, dtype: int64

In [55]:
filepaths.groupby(['index','subset'])['filepath'].nunique()

index       subset  
adidas      ./Test/      27
            ./Train/    660
barefoot    ./Test/      32
            ./Train/    211
converse    ./Test/      31
            ./Train/    447
crocs       ./Test/      35
            ./Train/    261
descente    ./Test/      26
            ./Train/     85
drmartens   ./Test/      26
            ./Train/    467
lecoq       ./Test/      28
            ./Train/    273
newbalance  ./Test/      27
            ./Train/    395
nike        ./Test/      36
            ./Train/    623
puma        ./Test/      39
            ./Train/    615
reebok      ./Test/      29
            ./Train/    748
sbenu       ./Test/      33
            ./Train/    329
vans        ./Test/      22
            ./Train/    431
zeepseen    ./Test/      23
            ./Train/    202
Name: filepath, dtype: int64

### Copy images

##### Create destination filepath and copy

In [56]:
filepaths['fname'] = filepaths['filepath'].apply(lambda x: x.split('/')[-1])

filepaths['dest'] = filepaths.apply(lambda x: os.path.join(x['subset'], x['index'], x['fname']), axis=1)

In [58]:
for i,row in tqdm(filepaths.iterrows()):
    img_shape = imageio.imread(row['filepath']).shape
    if (len(img_shape) == 3) & (img_shape[-1] == 3):
        _ = shutil.copy(row['filepath'], row['dest'])
    else:
        print('Could not copy %s' % row['filepath'])

Could not copy ./data/crocs/187.jpg
Could not copy ./data/crocs/91.jpg
Could not copy ./data/sbenu/513.jpg
Could not copy ./data/vans/268.jpg
Could not copy ./data/nike/602.jpg
Could not copy ./data/nike/102.jpg
Could not copy ./data/nike/185.jpg
Could not copy ./data/nike/199.jpg
Could not copy ./data/nike/153.jpg
Could not copy ./data/descente/394.jpg
Could not copy ./data/descente/56.jpg
Could not copy ./data/descente/288.jpg
Could not copy ./data/descente/18.jpg
Could not copy ./data/descente/301.jpg
Could not copy ./data/descente/195.jpg
Could not copy ./data/adidas/162.jpg
Could not copy ./data/adidas/312.jpg
Could not copy ./data/adidas/348.jpg
Could not copy ./data/adidas/473.jpg
Could not copy ./data/adidas/137.jpg
Could not copy ./data/adidas/247.jpg
Could not copy ./data/adidas/438.jpg
Could not copy ./data/adidas/173.jpg


  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Could not copy ./data/converse/152.jpg


  " Skipping tag %s" % (size, len(data), tag))


Could not copy ./data/lecoq/54.jpg
Could not copy ./data/lecoq/63.jpg


##### Save dataframes with image paths and labels

In [71]:
train_df = pd.DataFrame(columns=['class','filepath'])
test_df = pd.DataFrame(columns=['class','filepath'])

for cl in classes:
    train_dest = os.path.join(TRAIN_DIR, cl)
    test_dest = os.path.join(TEST_DIR, cl)
    train_filepaths = [os.path.join(train_dest, fp) for fp in os.listdir(train_dest)]
    test_filepaths = [os.path.join(test_dest, fp) for fp in os.listdir(test_dest)]
    train_df = train_df.append(pd.DataFrame({'class': cl,
                                             'filepath': train_filepaths}))
    test_df = test_df.append(pd.DataFrame({'class': cl,
                                             'filepath': test_filepaths}))

In [72]:
train_df.shape, test_df.shape

((5722, 2), (413, 2))

In [73]:
train_df.sample(n=len(train_df), replace=False).to_csv('./train_df.csv', index=False)
test_df.sample(n=len(test_df), replace=False).to_csv('./test_df.csv', index=False)