# Clarifai Dataset Evaluation Notebook

In [None]:
import os
import sys
import clarifai
import pandas as pd
import numpy as np
import datetime
import pickle
import time

from clarifai.rest import ClarifaiApp
from clarifai.rest import Image as ClImage

# Config Params
This script assumes you have already manually uploaded the datasets into different groups, and that you have an API key for each dataset.

In [None]:
# Update with your API keys
# Maps each dataset name to the API key used to open its Clarifai app.
# The dataset name is also joined onto root_data to locate the dataset's
# directory, so it must match the directory name on disk.
datasets = {
    'fashion_mnist_10p': '{DATASET_SPECIFIC_API_KEY}',    
    'cifar10_20p': '{DATASET_SPECIFIC_API_KEY}',    
    'uo_dress': '{DATASET_SPECIFIC_API_KEY}',
    'cifar10': '{DATASET_SPECIFIC_API_KEY}',
    'fashion_mnist': '{DATASET_SPECIFIC_API_KEY}',
    'mnist': '{DATASET_SPECIFIC_API_KEY}',
    'fashion_mnist_tiny': '{DATASET_SPECIFIC_API_KEY}',
    'cifar10_tiny': '{DATASET_SPECIFIC_API_KEY}',
    'mnist_tiny': '{DATASET_SPECIFIC_API_KEY}',
    'uo_dress_tiny': '{DATASET_SPECIFIC_API_KEY}'
    }
# Dataset names (keys of `datasets`) that the execution loop should skip.
skip_sets = []  # List ones to skip here
# Parent directory; each dataset is expected at <root_data>/<dataset name>.
root_data = '{PATH_TO_ROOT_DATASETS}'
# Number of images sent per bulk upload request and per predict request.
n_batch = 128

# Support Functions

In [None]:
def divide_chunks(l, n):
    """Yield consecutive slices of ``l`` holding at most ``n`` items each.

    Slices are produced lazily, in order; the last one is shorter when
    ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    for start in starts:
        stop = start + n
        yield l[start:stop]

## The main train/test function

In [None]:
def train_and_test(dataset_path, api_key):
    """Train a Clarifai model on one dataset and pickle its test predictions.

    The dataset directory must contain a header-less ``labels.csv`` whose
    first column is an image path relative to ``dataset_path`` and whose
    second column is the concept (class) label. Rows whose path contains
    ``train`` or ``val`` are used for training; rows whose path contains
    ``test`` are held out and only used for prediction.

    Steps: create a model named ``<dataset dir name>_<timestamp>``, upload
    the training images in chunks, train, wait a fixed 240 seconds, predict
    on the test images in chunks, and pickle the results.

    Relies on the module-level ``n_batch`` (images per request) and
    ``divide_chunks`` helper.

    Args:
        dataset_path: Directory of the dataset; its base name prefixes the
            model name.
        api_key: API key passed to ``ClarifaiApp``.

    Returns:
        The name of the pickle file written to the current working
        directory (``<model_name>-results.p``). The pickled dict holds
        ``y_true``, ``y_pred``, ``scores``, ``class_labels``,
        ``model_name``, ``model``, ``train_files`` and ``test_files``.
    """
    # The dataset name is the last component of the dataset directory
    d = os.path.basename(dataset_path)

    #
    # Get application using API Key
    #
    app = ClarifaiApp(api_key=api_key)

    #
    # Read in file locations and labels
    #
    labels = pd.read_csv(os.path.join(dataset_path, 'labels.csv'), header=None, dtype=str)
    # Non-capturing group: a capturing group makes pandas warn that the
    # pattern "has match groups" even though only a boolean mask is wanted.
    train_files = labels[labels[0].str.contains(r'(?:train|val)')].values
    test_files = labels[labels[0].str.contains('test')].values

    #
    # Load in training images
    # Create array of clarifai image objects (i.e. read in images to memory)
    #
    print('Loading training images into memory...')
    image_objs = []
    for row in train_files:
        tfile = os.path.join(dataset_path, row[0])
        # Each training image carries its label as a concept and its path
        # as metadata.
        image_objs.append(
            ClImage(filename=tfile, concepts=[row[1]], metadata={'filename': tfile})
        )
    print('Number of training images: {}'.format(len(image_objs)))

    #
    # Create new model
    #
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    model_name = '{}_{}'.format(d, timestamp)
    concepts = np.unique(train_files[:, 1]).tolist()  # Get list of concept names
    model = app.models.create(model_name, concepts=concepts)
    print('Model Name: {}'.format(model_name))
    print('Concepts: {}'.format(concepts))

    #
    # Upload training data in chunks
    #
    chunks = list(divide_chunks(image_objs, n_batch))
    for i, chunk in enumerate(chunks):
        print('Bulk upload chunk {} of {}...'.format(i + 1, len(chunks)))
        app.inputs.bulk_create_images(chunk)

    #
    # Train model
    #
    print('Training model ...')
    model = app.models.get(model_name)  # Retrieve in case it expired
    model.train()
    # NOTE: fixed wait; nothing here verifies that training actually finished.
    print('Sleeping for a few minutes to make sure model is done training ...')
    time.sleep(240)

    #
    # Load in prediction images (no concepts/metadata attached)
    #
    print('Loading prediction images into memory...')
    image_objs_test = [
        ClImage(filename=os.path.join(dataset_path, row[0])) for row in test_files
    ]
    print('Number of prediction images: {}'.format(len(image_objs_test)))

    #
    # Bulk predict on holdout test data in chunks
    #
    y_true = test_files[:, 1]
    class_labels = np.sort(np.unique(y_true))
    model = app.models.get(model_name)  # Retrieve in case it expired

    # Chunk it up!
    y_pred = []
    scores = []
    chunks = list(divide_chunks(image_objs_test, n_batch))
    for i, chunk in enumerate(chunks):
        print('Predicting on chunk {} of {}...'.format(i + 1, len(chunks)))
        m = model.predict(chunk)

        # Parse out predictions
        for out in m['outputs']:
            pred_set = out['data']['concepts']

            # Order the scores alphabetically by concept name so that every
            # row of `scores` uses the same column order.
            t_names = np.array([z['name'] for z in pred_set])
            t_scores = np.array([z['value'] for z in pred_set])
            sort_inds = np.argsort(t_names)
            t_names = t_names[sort_inds]
            t_scores = t_scores[sort_inds]

            # Predicted label is the concept with the highest score
            y_pred.append(t_names[np.argmax(t_scores)])
            scores.append(t_scores)
    scores = np.array(scores)

    #
    # Save results
    #
    save_file = '{}-results.p'.format(model_name)
    save_dict = {
        'y_true': y_true,
        'y_pred': y_pred,
        'scores': scores,
        'class_labels': class_labels,
        'model_name': model_name,
        'model': model,
        'train_files': train_files,
        'test_files': test_files
        }
    with open(save_file, 'wb') as f:
        pickle.dump(save_dict, f)
    print('Saved to {}'.format(save_file))

    return save_file

# Execute Train/Test on Each Dataset

In [None]:
# Train and test every configured dataset that is not listed in skip_sets.
for d, api_key in datasets.items():
    if d not in skip_sets:
        # Each dataset lives in the sub-directory of root_data named after it
        dataset_path = os.path.join(root_data, d)
        print('EXECUTING DATASET: {}'.format(dataset_path))

        # Full upload / train / predict / save pipeline for this dataset
        train_and_test(dataset_path, api_key)

        # Remind the user about the manual step remaining in the web UI
        print('Done! Be sure to press EVALUATE on the UI since that cannot be performed programmatically.\n\n\n\n')