# Create "tiny" datasets from larger sets
This generates a smaller train/validation dataset with N samples per class (default: 100). It is used to evaluate how well platforms perform with small input data.

In [None]:
import os
import sys
import clarifai
import pandas as pd
import numpy as np
import datetime
import pickle
import time
import shutil
import csv

# Config Params
1. List the datasets you want to make tiny versions of
2. Set the path to the root directory that contains the dataset folders

In [None]:
# Root directory that holds one folder per dataset (replace the placeholder)
root_data = '{PATH_TO_ROOT_DATASETS}'

# Folder names, relative to root_data, of the datasets to shrink
datasets = ['fashion_mnist', 'cifar10', 'uo_dress', 'mnist']

# Number of train/val samples kept for each class
n_per_class = 100

# Appended to the dataset name to form the output folder name
suffix = 'tiny'

# Loop through datasets and make tiny train/val versions

In [None]:
# For every configured dataset, build a "<name>_<suffix>" copy that keeps at most
# n_per_class randomly chosen train/val files per class plus the whole test split,
# and write a labels.csv listing exactly the files that were copied.
for d in datasets:
    dataset_path = os.path.join(root_data, d)
    save_path = os.path.join(root_data, '{}_{}'.format(d, suffix))

    # Create the output folder and its train/val/test subfolders.
    # exist_ok=True tolerates re-runs while still surfacing real failures
    # (e.g. permission errors) that a bare "except: pass" would hide.
    for split in ('train', 'val', 'test'):
        os.makedirs(os.path.join(save_path, split), exist_ok=True)

    # Read in file locations and labels before touching the output CSV, so a
    # missing or unreadable input does not leave an empty labels.csv behind.
    # Column 0 is the path relative to the dataset folder, column 1 the class.
    labels = pd.read_csv(os.path.join(dataset_path, 'labels.csv'), header=None, dtype=str)
    # Non-capturing group: same matches, without pandas' "match groups" warning
    train_files = labels[labels[0].str.contains(r'(?:train|val)')].values
    test_files = labels[labels[0].str.contains('test')].values

    # Open CSV; newline='' is what the csv module expects, otherwise blank
    # rows appear on platforms that translate line endings.
    labels_file = os.path.join(save_path, 'labels.csv')
    with open(labels_file, "w", newline='') as fid:
        writer = csv.writer(fid)

        # Fill in training, select up to n_per_class at random for each class
        # (np.unique already returns the labels sorted)
        class_labels = np.unique(train_files[:, 1])
        for cls in class_labels:
            # get class subset
            tfiles = train_files[train_files[:, 1] == cls, :]
            # Sampling without replacement cannot draw more rows than exist,
            # so small classes are kept in full instead of raising ValueError
            n_keep = min(n_per_class, len(tfiles))
            inds = np.random.choice(len(tfiles), size=n_keep, replace=False)
            keep_files = tfiles[inds, :]

            # Copy files over
            for p in keep_files[:, 0]:
                dst = os.path.join(save_path, p)
                # The relative path may contain subfolders beyond train/val/test
                os.makedirs(os.path.dirname(dst), exist_ok=True)
                shutil.copyfile(os.path.join(dataset_path, p), dst)

            writer.writerows(keep_files)

        # Copy all of test over
        for p in test_files[:, 0]:
            dst = os.path.join(save_path, p)
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            shutil.copyfile(os.path.join(dataset_path, p), dst)

        writer.writerows(test_files)
