# Google AutoML Input Generation
Google AutoML accepts a CSV file that describes the input data. This notebook reads each dataset's `labels.csv` and converts it into the Google AutoML input format.

In [5]:
import os
import sys
import pandas as pd
import numpy as np
import csv

# Config Params

In [92]:
# Dataset folder names to convert; each one is resolved relative to `root_data`
# locally and relative to `remote_dir` in the bucket.
datasets = [
    'fashion_mnist_tiny',
    'cifar10_tiny',
    'uo_dress_tiny',
    'mnist_tiny',
]

# Local directory that holds one sub-folder per dataset (placeholder value).
root_data = '{PATH_TO_ROOT_DATASETS}'

# Cloud Storage prefix under which the same dataset folders live (placeholder value).
remote_dir = 'gs://PATH/TO/BUCKET/WHERE/DATA/LIVES'

# Number of test rows randomly sampled per dataset.
max_test = 100

# Create the input datasets
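Loops through each dataset and writes a `google_inputs.csv` file into the dataset folder, with one `SET,remote_path,label` row per entry of `labels.csv`.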

In [103]:
def _gcs_join(base, rel):
    """Join two Cloud Storage path pieces with exactly one '/' between them.

    `os.path.join` is deliberately not used for `gs://` URIs: it inserts
    backslashes on Windows and discards `base` when `rel` starts with '/'.
    """
    return '{}/{}'.format(base.rstrip('/'), rel.lstrip('/'))


def _split_rows(labels, split_dir, split_tag, remote_path, max_rows=None):
    """Build the AutoML CSV rows for one split of a dataset.

    Args:
        labels: DataFrame read from `labels.csv` without a header; column 0
            holds the relative file path and column 1 the label, both as str.
        split_dir: Substring identifying the split in the path, e.g. 'train/'.
        split_tag: Value written to the first CSV column, e.g. 'TRAIN'.
        remote_path: Cloud Storage prefix prepended to every relative path.
        max_rows: If given, randomly keep at most this many rows (sampled
            without replacement via `np.random.choice`).

    Returns:
        A list of `[split_tag, remote_file_path, label]` lists; empty when the
        split has no rows.
    """
    # Literal substring match; rows whose path cell is missing are skipped
    # instead of breaking the boolean mask.
    mask = labels[0].str.contains(split_dir, regex=False, na=False)
    entries = labels[mask].values
    if len(entries) == 0:
        return []

    if max_rows is not None:
        # Never ask for more samples than exist: with replace=False that
        # raises ValueError.
        n_keep = min(max_rows, len(entries))
        keep_inds = np.random.choice(len(entries), n_keep, replace=False)
        entries = entries[keep_inds, :]
        if len(entries) == 0:
            return []

    # Per the original author's note, single-character labels are not accepted
    # by Google as-is, so they get an 'a_' prefix.
    # NOTE(review): the decision is taken from the first row of each split
    # only, exactly as before; splits could disagree if label lengths are
    # mixed - confirm this cannot happen for the datasets in use.
    needs_prefix = len(entries[0, 1]) == 1

    rows = []
    for rel_path, label in zip(entries[:, 0], entries[:, 1]):
        if needs_prefix:
            label = 'a_{}'.format(label)
        rows.append([split_tag, _gcs_join(remote_path, rel_path), label])
    return rows


for d in datasets:
    dataset_path = os.path.join(root_data, d)
    remote_path = _gcs_join(remote_dir, d)

    # Read in file locations and labels
    labels = pd.read_csv(os.path.join(dataset_path, 'labels.csv'), header=None, dtype=str)

    train_files = _split_rows(labels, 'train/', 'TRAIN', remote_path)
    val_files = _split_rows(labels, 'val/', 'VALIDATION', remote_path)
    # Only the test split is subsampled, to at most `max_test` rows.
    test_files = _split_rows(labels, 'test/', 'TEST', remote_path, max_rows=max_test)

    files = train_files + val_files + test_files

    # Write file
    labels_file = os.path.join(dataset_path, 'google_inputs.csv')
    # newline='' lets the csv module control line endings (otherwise extra
    # blank rows appear on Windows).
    with open(labels_file, "w", newline="") as fid:
        writer = csv.writer(fid)
        writer.writerows(files)


In [105]:
# Row count for the last dataset in `datasets` only: `files` is rebound on
# every iteration of the conversion loop, so earlier datasets are not included.
len(files)

1100