# Azure Dataset Evaluation Notebook

In [None]:
import os
import sys
import clarifai
import pandas as pd
import numpy as np
import datetime
import pickle
import time
import requests
import json
from joblib import Parallel, delayed


# Usage:
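1. Replace `root_data` with the path to the root directory that contains your datasets.
2. Insert the training key for your project (`training_key`).
3. Insert the prediction key for your project (`prediction_key`).
4. Comment out any entries in the `dataset_domains` dict that you're not interested in (or add their names to `skip_sets`).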

# Config Params

In [None]:
# Dataset names (keys of `dataset_domains`) to skip when executing the run.
skip_sets = []

# Root directory; each dataset lives in a sub-directory named after its key.
root_data = '{PATH_TO_ROOT_DATASETS}'
# Number of images sent per bulk-upload request.
n_batch = 64
# Total training-image budget per project; it is split evenly across classes.
max_total = 50000

# Replace with keys
training_key = '{TRAINING_KEY}'
prediction_key = '{PREDICTION_KEY}'

# Hardcoded URLs. These need updating if the API version or region changes.
url_train = 'https://southcentralus.api.cognitive.microsoft.com/customvision/v2.0/Training'
url_pred = 'https://southcentralus.api.cognitive.microsoft.com/customvision/v2.0/Prediction'

# It's suggested you select the appropriate domain for your dataset
retail_domain_id = 'b30a91ae-e3c1-4f73-a81e-c270bff27c39'
general_domain_id = 'ee85a74c-405e-4adc-bb47-ffa8ca0c9f31'

# Maps dataset directory name -> domain id used when creating its project.
# Every entry ends with a comma so that adding, removing or commenting out a
# line can never silently break the dict literal.
dataset_domains = {
    'fashion_mnist_10p': general_domain_id,
    'cifar10_20p': general_domain_id,
    'uo_dress': retail_domain_id,
    'cifar10': general_domain_id,
    'fashion_mnist': general_domain_id,
    'mnist': general_domain_id,
    'fashion_mnist_tiny': general_domain_id,
    'cifar10_tiny': general_domain_id,
    'uo_dress_tiny': retail_domain_id,
    'mnist_tiny': general_domain_id,
}

# Setup

In [None]:
# Request headers carrying the API keys: one for the training endpoint and
# one for the prediction endpoint.
header_train, header_pred = (
    {'Training-Key': training_key},
    {'Prediction-Key': prediction_key},
)

# Support Functions

In [None]:
def divide_chunks(l, n):
    """Yield consecutive slices of ``l`` that hold at most ``n`` items each.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.
    """
    starts = range(0, len(l), n)
    for start in starts:
        stop = start + n
        yield l[start:stop]

## The main train/test function

In [None]:
def _upload_class_images(url_upload, file_paths, tag_id, max_per_class):
    """Bulk-upload one class's training images in chunks of ``n_batch`` files.

    Args:
        url_upload: Image-upload URL of the target project.
        file_paths: Paths of the image files belonging to the class.
        tag_id: Id of the tag to attach to every uploaded image.
        max_per_class: Upload budget for the class; uploading stops once the
            next chunk would go past it.
    """
    chunks = list(divide_chunks(file_paths, n_batch))
    for i, chunk in enumerate(chunks):
        print('Bulk upload chunk {} of {} ({} items) ...'.format(i+1, len(chunks), len(chunk)))

        # Track every opened file separately from the multipart dict so that
        # all of them get closed even if an open() or the request fails.
        opened = []
        handles = {}
        try:
            for path in chunk:
                fh = open(path, 'rb')
                opened.append(fh)
                handles[os.path.basename(path)] = fh

            # Assign tag from lookup
            params = {'tagIds': tag_id}

            # Make the request and print response status
            r = requests.post(url_upload, headers=header_train, params=params, files=handles)
            print(r)
        finally:
            for fh in opened:
                fh.close()

        # Don't upload past max per class
        if (i+1)*n_batch > (max_per_class-n_batch):
            break


def _wait_for_training(url_status, max_tries=60, poll_secs=30):
    """Poll the project's iteration list until the first one is 'Completed'.

    Args:
        url_status: URL returning the project's iterations.
        max_tries: Maximum number of polls before giving up.
        poll_secs: Seconds to sleep between polls.

    Returns:
        The last status response received (``None`` if ``max_tries`` is 0).
    """
    r = None
    for attempt in range(max_tries):
        print('Try #{} of {}...'.format(attempt+1, max_tries))
        r = requests.get(url_status, headers=header_train)
        try:
            status = r.json()[0]['status']
        except (ValueError, LookupError, TypeError):
            # Body was not JSON, or not the expected non-empty list of dicts.
            status = 'error'
        print(r)
        if status == 'Completed':
            print('Training Complete!')
            return r
        print('Status: {}. Waiting for model training to complete ...'.format(status))
        time.sleep(poll_secs)

    # Best effort: the caller carries on with the last response it got.
    print('WARNING: training did not report Completed after {} tries.'.format(max_tries))
    return r


def _predict_image(filename, url_query, params):
    """Send one image to the prediction endpoint.

    Args:
        filename: Path of the image file to classify.
        url_query: Prediction URL of the project.
        params: Query parameters for the request (carries the iteration id).

    Returns:
        Tuple ``(basename, predicted_tag, scores)``. ``scores`` holds the
        probabilities ordered by sorted tag name. ``predicted_tag`` and
        ``scores`` are ``None`` when the request did not return HTTP 200.
    """
    basename = os.path.basename(filename)
    # Each call owns its file handle and payload dict, so nothing is shared
    # between parallel workers.
    with open(filename, 'rb') as image:
        r = requests.post(url_query, headers=header_pred, params=params,
                          files={'imageData': image})

    # Error check
    t_pred = None
    t_scores = None
    if r.status_code == 200:
        # Extract predictions
        pred_set = r.json()['predictions']

        # Get scores from prediction set, sorted by tag name
        t_names = np.array([z['tagName'] for z in pred_set])
        t_scores = np.array([z['probability'] for z in pred_set])
        sort_inds = np.argsort(t_names)
        t_names = t_names[sort_inds]
        t_scores = t_scores[sort_inds]
        t_pred = t_names[np.argmax(t_scores)]
    else:
        print('Error occured on prediction: {}. Skipping save.'.format(filename))

    return basename, t_pred, t_scores


def train_and_test(dataset_path, domain_id):
    """Train a project on one dataset, predict its test split, pickle results.

    ``dataset_path`` must contain a header-less ``labels.csv`` whose first
    column is an image path relative to ``dataset_path`` and whose second
    column is the class label. Rows whose path contains 'train' or 'val' are
    used for training; rows whose path contains 'test' are used for testing.

    Relies on the module-level settings ``url_train``, ``url_pred``,
    ``header_train``, ``header_pred``, ``n_batch`` and ``max_total``.

    Args:
        dataset_path: Directory of the dataset.
        domain_id: Domain id passed to the API when creating the project.

    Returns:
        Name of the pickle file the results were written to.
    """
    labels = pd.read_csv(os.path.join(dataset_path, 'labels.csv'), header=None, dtype=str)
    # No capture group in the pattern: pandas warns when a pattern passed to
    # str.contains has match groups.
    train_files = labels[labels[0].str.contains('train|val')].values
    test_files = labels[labels[0].str.contains('test')].values

    y_train = train_files[:, 1]
    class_labels = np.sort(np.unique(y_train))

    # Create a new project. The name comes from the dataset directory itself
    # rather than from a loop variable that happens to exist at module level.
    dataset_name = os.path.basename(os.path.normpath(dataset_path))
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    project_name = '{}_{}'.format(dataset_name, timestamp)
    url_create = '{}/projects'.format(url_train)
    params = {'name': project_name,
              'domainId': domain_id
             }

    # Call the API
    print(url_create)
    r = requests.post(url_create, headers=header_train, params=params)
    print(r.status_code)

    # Get ID
    project_settings = r.json()
    project_id = project_settings['id']
    print('Project Name: {}. ID: {}'.format(project_name, project_id))
    print(project_settings)

    # Assign to multi-class instead of multi-label
    url_project = '{}/projects/{}'.format(url_train, project_id)
    print(url_project)
    project_settings['settings']['classificationType'] = 'Multiclass'
    # json= makes requests serialise the body and send it with a
    # 'Content-Type: application/json' header.
    r = requests.patch(url_project, headers=header_train, json=project_settings)
    print(r)

    # Create tags
    tag_ids = {}
    url_tag = '{}/projects/{}/tags'.format(url_train, project_id)
    for c in class_labels:
        params = {'name': c}
        r = requests.post(url_tag, headers=header_train, params=params)
        tag_ids[c] = r.json()['id']
        print(c, tag_ids[c], r)

    # Load in files for training
    url_upload = '{}/projects/{}/images'.format(url_train, project_id)

    # Per-class upload budget: an even share of max_total, minus one.
    max_per_class = (max_total/len(class_labels))-1
    for cls in class_labels:
        print('Uploading files from class: {} ...'.format(cls))
        files_temp = train_files[y_train == cls, 0]
        files_temp = [os.path.join(dataset_path, c) for c in files_temp]
        print('{} files found.'.format(len(files_temp)))
        _upload_class_images(url_upload, files_temp, tag_ids[cls], max_per_class)

    # Train the model
    url_run_training = '{}/projects/{}/train'.format(url_train, project_id)
    r = requests.post(url_run_training, headers=header_train)
    print(r)
    print(r.json())

    # Assign model ID
    model_id = r.json()['id']
    print('Trained Model ID: {}'.format(model_id))

    # Wait for training to complete
    url_status = '{}/projects/{}/iterations'.format(url_train, project_id)
    r = _wait_for_training(url_status)

    # Save off iteration ID for prediction
    iteration_id = r.json()[0]['id']
    print('Iteration ID: {}'.format(iteration_id))

    # Assign ground truth for test
    y_true = test_files[:, 1]

    # Load in files for prediction
    #url_query = '{}/{}/image'.format(url_pred, project_id) # with storing
    url_query = '{}/{}/image/nostore'.format(url_pred, project_id)  # No storing
    params = {'iterationId': iteration_id}

    test_filenames = [os.path.join(dataset_path, c) for c in test_files[:, 0]]
    print('{} prediction files found.'.format(len(test_filenames)))

    # Run parallel calls to make faster
    t_start = time.time()
    with Parallel(n_jobs=-1, verbose=5) as parallel:
        results = parallel(delayed(_predict_image)(f, url_query, params)
                           for f in test_filenames)
    t_elapsed = time.time() - t_start
    print(len(results))
    print('{:0.3f} secs elapsed for predicting {} images'.format(t_elapsed, len(test_filenames)))

    # Parse the parallel output
    returned_files = [res[0] for res in results]
    y_pred = [res[1] for res in results]
    scores = np.array([res[2] for res in results])

    print('Number of predictions: {}'.format(len(y_pred)))
    print('Number of fails: {}.'.format(sum([yy is None for yy in y_pred])))

    #
    # Save results
    #
    save_file = '{}-results.p'.format(project_name)
    save_dict = {
        'y_true': y_true,
        'y_pred': y_pred,
        'scores': scores,
        'class_labels': class_labels,
        'model_name': project_name,
        'model': None,
        'train_files': train_files,
        'test_files': test_files,
        'returned_files': returned_files
        }
    with open(save_file, 'wb') as f:
        pickle.dump(save_dict, f)
    print('Saved to {}'.format(save_file))

    return save_file

# Execute Train/Test on Each Dataset

In [None]:
# Run the full train/test routine once for every configured dataset that is
# not listed in skip_sets.
for d, domain_id in dataset_domains.items():
    if d not in skip_sets:
        # The dataset directory is named after its key in dataset_domains.
        dataset_path = os.path.join(root_data, d)
        print('EXECUTING DATASET: {}'.format(dataset_path))

        # Create the project, train it, predict the test split, save results.
        train_and_test(dataset_path, domain_id)

        print('Done!')