# Create Zipped Dataset for Salesforce Einstein
Salesforce Einstein supports zip-file uploads, which appeared to be the fastest way to load a dataset. This notebook creates one zip file per dataset; a separate script in this directory uploads them. Once the zips are created, see `upload_zip_datasets.sh`.

In [None]:
import os, sys
import pandas as pd
import requests
import numpy as np
import shutil
import requests

# Config Params
Set datasets of interest, root dataset location, and save directory

In [None]:
# Names of the datasets to package; each one is joined onto root_data below
datasets = [
    'fashion_mnist_tiny',
    'cifar10_tiny',
    'mnist_tiny',
    'uo_dress_tiny',
]

# Directory that contains the source datasets (placeholder - fill in before running)
root_data = '{ROOT_DATASET_DIRECTORY}'

# Directory that receives the generated output (placeholder - fill in before running)
save_dir = '{SAVE_DIRECTORY}'

# Create the Zip Files

In [None]:
# For every configured dataset: read labels.csv, copy the train/val images into
# one sub-directory per class under save_dir/<dataset>/, then zip that directory
# to save_dir/<dataset>.zip.
for d in datasets:
    print('CREATING ZIP FOR DATASET {}...'.format(d))
    dataset_path = os.path.join(root_data, d)

    # Get filenames and ground truth.
    # labels.csv is read without a header row: column 0 is the file path
    # (joined onto dataset_path below), column 1 is the class label.
    labels = pd.read_csv(os.path.join(dataset_path, 'labels.csv'), header=None, dtype=str)
    # Plain alternation instead of '(train|val)': a capturing group makes
    # pandas emit a UserWarning, and the matched rows are the same.
    train_files = labels[labels[0].str.contains(r'train|val')].values
    # Test rows are selected but not copied into the archive by this cell.
    test_files = labels[labels[0].str.contains('test')].values

    # Assign labels
    y_train = train_files[:, 1]
    class_labels = np.sort(np.unique(y_train))

    # Make output directory (re-running the cell on an existing tree is fine)
    dataset_out = os.path.join(save_dir, d)
    os.makedirs(dataset_out, exist_ok=True)

    # Copy files to uploadable format: one directory per class label
    for cls in class_labels:
        # Grab files
        print('Grabbing files from class: {} ...'.format(cls))
        inds_match = y_train == cls
        # Build each source path exactly once; joining dataset_path a second
        # time duplicates the prefix whenever root_data is a relative path.
        files_temp = [os.path.join(dataset_path, c) for c in train_files[inds_match, 0]]
        print('{} files found.'.format(len(files_temp)))

        # Make directory
        cls_dir = os.path.join(dataset_out, str(cls))
        os.makedirs(cls_dir, exist_ok=True)

        # Copy all matching files into that directory
        for i, full_file_name in enumerate(files_temp):
            # Parentheses are required: `i+1 % 100` parses as `i + (1 % 100)`,
            # which is never 0, so progress would never be reported.
            if (i + 1) % 100 == 0:
                print('{}/{}'.format(i + 1, len(files_temp)))
            try:
                shutil.copy(full_file_name, cls_dir)
            except OSError as err:
                # Best effort: report the failing file and keep going, without
                # swallowing KeyboardInterrupt or programming errors.
                print('Error copying {}: {}'.format(full_file_name, err))

    # Zip the directory; make_archive appends '.zip' to output_filename
    output_filename = os.path.join(save_dir, d)
    shutil.make_archive(output_filename, 'zip', dataset_out)