In [1]:
# Load the autoreload extension (reloading it if it is already loaded).
%reload_ext autoreload
# Mode 2: re-import all modules before each cell runs, so edits to imported source files are picked up.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
from fastai import *
from fastai.vision import *
from pathlib import Path
import os, sys
import pandas as pd
import shutil
from sklearn.externals.joblib import Parallel, delayed

# Create Imagenet Style Folders

It's much easier to convert our file structure to the ImageNet-style folder layout than to create a custom `ImageDataBunch`. The ImageNet format is:

path\
  train\
    class1\
    class2\
    ...
  valid\
    class1\
    class2\
    ...
  test\
  
Whereas our format is:

path\
  labels.csv
  train\
    (all classes)
  test\
    (all classes)
  val\
    (all classes)

## Directions

1. Download gzipped data files
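2. Untar all the files under the base data directory so that each dataset has its own directory containing `labels.csv` and the `train`, `val` and `test` folders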

In [6]:
# Root directory that holds one extracted directory per dataset.
base_path = Path('/home/jupyter/data')

# Every dataset comes in a full and a "_tiny" variant; list each pair together.
orig_datasets = [
    stem + suffix
    for stem in ('uo_dress', 'cifar10', 'mnist', 'fashion_mnist')
    for suffix in ('', '_tiny')
]

# (source split name, destination split name) pairs; only 'val' is renamed, to 'valid'.
split_names = list(zip(('train', 'val', 'test'),
                       ('train', 'valid', 'test')))

# Converted datasets are written under this directory.
dst_base = base_path.joinpath('imagenet_style')

## Create output dir

In [7]:
# Convert every dataset from the "labels.csv + split folders" layout into
# ImageNet-style folders: <dst_base>/<dataset>/<split>/<label>/<file>
# (the test split is kept flat, without per-label sub-folders).
for dataset in orig_datasets:
    print('Copying {}'.format(dataset))
    # File names from labels.csv are joined onto this directory when copying.
    src_path = base_path/dataset
    dst_path = dst_base/dataset
    os.makedirs(dst_path, exist_ok=True)

    # Read labels file (no header row; the two columns are named 'name' and 'label')
    df = pd.read_csv(src_path/'labels.csv', delimiter=',', header=None, names=['name','label'])

    # Get unique classes
    class_labels = df.label.unique()

    # Create output directories and copy the files, one split at a time
    for src_split, dst_split in split_names:
        print('\tCopying {}/{}'.format(dataset, dst_split))

        # A row belongs to this split only when the split name is a whole path
        # component of its file name. A plain substring test would also put e.g.
        # 'train/festival_01.jpg' into the 'val' split and leak data across splits.
        # NOTE(review): assumes names in labels.csv look like '<split>/<file>' - confirm.
        in_split = df.name.map(lambda name: src_split in Path(str(name)).parts).astype(bool)

        for label in class_labels:
            # Train/valid files go into a per-label folder; test files share one folder.
            if dst_split == 'test':
                save_path = dst_path/dst_split
            else:
                save_path = dst_path/dst_split/str(label)
            print('\t\tCopying {}'.format(save_path))

            # Create directory
            os.makedirs(save_path, exist_ok=True)

            # Get the file names of this split that carry this label
            matches = df[in_split & (df.label == label)].name.tolist()

            # Parallel copy files to proper locations
            Parallel(n_jobs=-1)(delayed(shutil.copy)(src_path/fname, save_path) for fname in matches)

    

Copying uo_dress
	Copying uo_dress/train
		Copying /home/jupyter/data/imagenet_style/uo_dress/train/not_dress
		Copying /home/jupyter/data/imagenet_style/uo_dress/train/solid_color
		Copying /home/jupyter/data/imagenet_style/uo_dress/train/floral
		Copying /home/jupyter/data/imagenet_style/uo_dress/train/stripes
	Copying uo_dress/valid
		Copying /home/jupyter/data/imagenet_style/uo_dress/valid/not_dress
		Copying /home/jupyter/data/imagenet_style/uo_dress/valid/solid_color
		Copying /home/jupyter/data/imagenet_style/uo_dress/valid/floral
		Copying /home/jupyter/data/imagenet_style/uo_dress/valid/stripes
	Copying uo_dress/test
		Copying /home/jupyter/data/imagenet_style/uo_dress/test
		Copying /home/jupyter/data/imagenet_style/uo_dress/test
		Copying /home/jupyter/data/imagenet_style/uo_dress/test
		Copying /home/jupyter/data/imagenet_style/uo_dress/test
Copying uo_dress_tiny
	Copying uo_dress_tiny/train
		Copying /home/jupyter/data/imagenet_style/uo_dress_tiny/train/floral
		Copying /h