In [1]:
import numpy as np
import pandas as pd
from scipy import misc

import os
import shutil
from tqdm import tqdm 

# Delete rubbish

In [2]:
# the folder from train.zip file; set the TRAIN_DIR environment variable to
# point at a different location without editing the notebook
dir_train = os.environ.get('TRAIN_DIR', '/home/ubuntu/train/')

In [None]:
# remove non-images
# Each deletion is guarded so the cell can be re-run after the targets are
# already gone (an unguarded os.remove / shutil.rmtree raises in that case).
stray_file = os.path.join(dir_train, '198.spider/RENAME2')
if os.path.isfile(stray_file):
    os.remove(stray_file)

# '056.dog/greg' is a nested folder; '257.clutter' is the class with noise,
# which we don't need
for unwanted_dir in ('056.dog/greg', '257.clutter'):
    unwanted_path = os.path.join(dir_train, unwanted_dir)
    if os.path.isdir(unwanted_path):
        shutil.rmtree(unwanted_path)

# Collect metadata

In [3]:
# os.walk yields dir_train itself first; everything after it is a sub-directory
subdirs = list(os.walk(dir_train))[1:]

# collect train metadata: one row per image, in the order
# [directory, file name, height, width, channels, byte size, dtype]
train_metadata = []

for dir_path, _, files in tqdm(subdirs):

    # os.path.basename rather than splitting on '/', so the folder name is
    # extracted correctly whatever path separator os.walk produced
    dir_name = os.path.basename(dir_path)

    for file_name in files:
        # skip hidden files
        if file_name.startswith('.'):
            continue

        # read image
        # NOTE(review): scipy.misc.imread is not available in recent SciPy
        # releases - confirm the SciPy version this notebook is pinned to.
        image = misc.imread(os.path.join(dir_path, file_name))

        # arrays without a third axis are recorded as single-channel
        if len(image.shape) == 3:
            height, width, channels = image.shape
        else:
            height, width, channels = image.shape[0], image.shape[1], 1

        # append image metadata to list
        train_metadata.append(
            [dir_name, file_name, height, width, channels, image.nbytes, image.dtype]
        )

100%|██████████| 256/256 [01:09<00:00,  2.93it/s]


# Explore metadata

In [4]:
# One row per image. Naming the columns in the constructor (instead of
# assigning M.columns afterwards) keeps an empty train_metadata list from
# failing on a column-count mismatch.
M = pd.DataFrame(
    train_metadata,
    columns=['directory', 'img_name', 'height', 'width', 'channels', 'byte_size', 'bit_depth'],
)

# directory names are split on '.': the first piece is the class number,
# the last piece is the class name
directory_parts = M['directory'].str.split('.')
M['category_name'] = directory_parts.str[-1].str.lower()
M['img_extension'] = M['img_name'].str.split('.').str[-1]
M['category_number'] = directory_parts.str[0].astype('int64')

# remove '101' from some category names
# Only a trailing '101' is stripped (together with the character in front of
# it); a plain substring test would also chop four characters off any name
# that merely contains '101' somewhere else.
M['category_name'] = M['category_name'].apply(
    lambda name: name[:-4] if name.endswith('101') else name
)

In [5]:
# number of images that do not have exactly 3 channels
# (images without a channel axis were recorded with channels == 1)
M['channels'].ne(3).sum()

298

In [6]:
# distinct file extensions among the collected images
M['img_extension'].unique()

array(['jpg'], dtype=object)

In [7]:
# distinct array dtypes recorded for the collected images
M['bit_depth'].unique()

array([dtype('uint8')], dtype=object)

In [8]:
# how many distinct category names there are
M['category_name'].nunique()

256

# Create decoder

In [9]:
# class number -> class name
# The grouped Series is indexed by category name and holds that category's
# number, so each (index, value) pair is flipped to key the dict by number.
# Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
number_by_name = M.groupby('category_name').category_number.first()
decode = {number: name for name, number in number_by_name.items()}

In [10]:
# Persist the number -> name mapping. np.save stores a dict as a pickled
# object array, so reading it back presumably needs
# np.load('decode.npy', allow_pickle=True) followed by [()] to unwrap the
# dict - TODO confirm against the NumPy version in use.
np.save('decode.npy', decode)

# Split data

In [11]:
# 20 images per class go to the validation split V.
# One seeded RandomState is shared by all groups: the split is identical on
# every fresh run, while each class still gets its own draw from the stream.
# NOTE: this cell rebinds M, so re-running it without rebuilding M first
# would carve another 20 images per class out of the remaining rows.
split_rng = np.random.RandomState(0)
V = (
    M.groupby('category_name', group_keys=False)
    .apply(lambda x: x.sample(n=20, replace=False, random_state=split_rng))
    .sort_index()
)
# every row that was not sampled stays in M as the training split
M = M.drop(V.index, axis=0)

In [12]:
# number of rows in the training split
M.shape[0]

16980

In [13]:
# number of rows in the validation split
V.shape[0]

5120

# Save split

In [14]:
# write the training split's metadata to CSV, leaving out the DataFrame index
M.to_csv('train_metadata.csv', index=False)

In [15]:
# write the validation split's metadata to CSV, leaving out the DataFrame index
V.to_csv('val_metadata.csv', index=False)