In [5]:
%matplotlib inline

In [36]:
import os
from tqdm import tqdm
import shutil
import numpy as np
import pandas as pd
import imageio

In [6]:
DATA_DIR = './data/'
TRAIN_DIR = './Train/'
TEST_DIR = './Test/'

### Prepare data

##### Get class labels

In [67]:
classes = os.listdir(DATA_DIR)

len(classes)

14

##### List number of each class

In [68]:
files = {}

for cl in classes:
    class_files = os.listdir(os.path.join(DATA_DIR, cl))
    files[cl] = [os.path.join(DATA_DIR, cl, f) for f in class_files]
    print(cl, len(class_files))

lecoq 301
crocs 296
sbenu 362
descente 111
zeepseen 233
converse 479
barefoot 248
newbalance 422
adidas 687
puma 655
nike 659
drmartens 493
reebok 777
vans 453


##### Clean subdirs

In [69]:
!rm -r $TRAIN_DIR

!rm -r $TEST_DIR

##### Create subdirs and copy random images into Train and Test

In [70]:
NUM_TRAIN = 100
NUM_TEST = 30

In [71]:
for cl in classes:
    train_subdir = os.path.join(TRAIN_DIR, cl)
    if not os.path.exists(train_subdir):
        os.makedirs(train_subdir)
    test_subdir = os.path.join(TEST_DIR, cl)
    if not os.path.exists(test_subdir):
        os.makedirs(test_subdir)

In [72]:
for cl,fl in tqdm(files.items()):
    select_train = np.random.choice(fl, size=NUM_TRAIN, replace=False,).tolist()
    select_train = list(filter(lambda x: x if x.endswith('jpg') or x.endswith('jpeg') else None, select_train))
    select_test = np.random.choice(fl, size=NUM_TEST, replace=False,).tolist()
    select_test = list(filter(lambda x: x if x.endswith('jpg') or x.endswith('jpeg') else None, select_test))
    train_dest = os.path.join(TRAIN_DIR, cl)
    test_dest = os.path.join(TEST_DIR, cl)
    for tr in select_train:
        img_shape = imageio.imread(tr).shape
        if (len(img_shape) == 3) & (img_shape[-1] == 3):
            _ = shutil.copy(tr, train_dest)
        else:
            print('Could not copy %s' % tr)
    for te in select_test:
        img_shape = imageio.imread(te).shape
        if (len(img_shape) == 3) & (img_shape[-1] == 3):
            _ = shutil.copy(te, test_dest)
        else:
            print('Could not copy %s' % te)




  0%|          | 0/14 [00:00<?, ?it/s][A[A[A

Could not copy ./data/descente/394.jpg
Could not copy ./data/descente/18.jpg
Could not copy ./data/descente/195.jpg
Could not copy ./data/descente/288.jpg
Could not copy ./data/descente/301.jpg





  7%|▋         | 1/14 [00:01<00:18,  1.42s/it][A[A[A


 14%|█▍        | 2/14 [00:04<00:24,  2.01s/it][A[A[A

Could not copy ./data/nike/602.jpg





 21%|██▏       | 3/14 [00:08<00:28,  2.58s/it][A[A[A


 29%|██▊       | 4/14 [00:13<00:31,  3.19s/it][A[A[A


 36%|███▌      | 5/14 [00:17<00:30,  3.40s/it][A[A[A


 43%|████▎     | 6/14 [00:19<00:23,  2.98s/it][A[A[A


 50%|█████     | 7/14 [00:21<00:18,  2.66s/it][A[A[A

Could not copy ./data/sbenu/513.jpg





 57%|█████▋    | 8/14 [00:22<00:13,  2.32s/it][A[A[A

Could not copy ./data/adidas/137.jpg
Could not copy ./data/adidas/312.jpg
Could not copy ./data/adidas/348.jpg





 64%|██████▍   | 9/14 [00:27<00:14,  2.95s/it][A[A[A

Could not copy ./data/adidas/438.jpg
Could not copy ./data/crocs/187.jpg





 71%|███████▏  | 10/14 [00:29<00:10,  2.73s/it][A[A[A


 79%|███████▊  | 11/14 [00:31<00:08,  2.70s/it][A[A[A

Could not copy ./data/lecoq/54.jpg
Could not copy ./data/lecoq/63.jpg





 86%|████████▌ | 12/14 [00:36<00:06,  3.13s/it][A[A[A


 93%|█████████▎| 13/14 [00:39<00:03,  3.09s/it][A[A[A


100%|██████████| 14/14 [00:41<00:00,  2.84s/it][A[A[A


[A[A[A

##### Save dataframes with image paths and labels

In [73]:
train_df = pd.DataFrame(columns=['class','filepath'])
test_df = pd.DataFrame(columns=['class','filepath'])

for cl in classes:
    train_dest = os.path.join(TRAIN_DIR, cl)
    test_dest = os.path.join(TEST_DIR, cl)
    train_filepaths = [os.path.join(train_dest, fp) for fp in os.listdir(train_dest)]
    test_filepaths = [os.path.join(test_dest, fp) for fp in os.listdir(test_dest)]
    train_df = train_df.append(pd.DataFrame({'class': cl,
                                             'filepath': train_filepaths}))
    test_df = test_df.append(pd.DataFrame({'class': cl,
                                             'filepath': test_filepaths}))

In [74]:
train_df.shape, test_df.shape

((1381, 2), (418, 2))

In [75]:
train_df.sample(n=len(train_df), replace=False).to_csv('./train_df.csv', index=False)
test_df.sample(n=len(test_df), replace=False).to_csv('./test_df.csv', index=False)