### Notebook for analyzing cross-validation results

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import pickle

#### Load results pickle

In [3]:
def _read_pickle(filename):
    """Return the object stored in the pickle file at ``filename``.

    The file is opened in binary mode and closed again before returning.
    NOTE: unpickling can execute arbitrary code, so only point this at
    result files you produced yourself.
    """
    with open(filename, 'rb') as pkl_file:
        return pickle.load(pkl_file)

In [91]:
def read_files(base_datadir=None, expname=None, clf_keys=(), cv=None):
    """Load every result pickle for each classifier of one experiment.

    Files are looked up as ``<base_datadir>/<expname>/cv-<cv>/<clf_key>/*.pkl``.

    Parameters
    ----------
    base_datadir : str
        Root directory holding the cross-validation results. Must be given
        whenever ``clf_keys`` is non-empty.
    expname : str
        Experiment name (sub-directory of ``base_datadir``).
    clf_keys : iterable of str
        Classifier sub-directory names, e.g. ``['SVM', 'RF']``.
    cv : int
        Value used to build the ``cv-<cv>`` directory name.

    Returns
    -------
    dict
        Maps each classifier key to the list of unpickled objects, ordered by
        file path. A classifier without matching files maps to an empty list.

    Notes
    -----
    Files are unpickled via ``_read_pickle``; pickle can execute arbitrary
    code, so only load trusted result files.
    """
    dicts = {}
    for clf_key in clf_keys:
        datadir = os.path.join(
            base_datadir,
            "{}/cv-{}/{}".format(str(expname), str(cv), clf_key),
        )
        # glob returns paths in arbitrary, filesystem-dependent order; sort so
        # that positional access to the loaded results is reproducible.
        filelist = sorted(glob.glob(os.path.join(datadir, "*.pkl")))
        dicts[clf_key] = [_read_pickle(ifile) for ifile in filelist]
    return dicts

In [133]:
# RI autoencoder: load the cv-4 result pickles of experiment '67011582'
# for each of the four classifiers.
ri_dicts = read_files(
    base_datadir='./cross_validation',
    expname='67011582',
    clf_keys=['SVM', "MLP", "RF", "ADABOOST"],
    cv=4,
)

In [134]:
# NRI autoencoder: load the cv-4 result pickles of experiment
# 'm2_02_global_2000_2018_band28_29_31' for each of the four classifiers.
nri_dicts = read_files(
    base_datadir='./cross_validation',
    expname='m2_02_global_2000_2018_band28_29_31',
    clf_keys=['SVM', "MLP", "RF", "ADABOOST"],
    cv=4,
)

In [94]:
ri_dicts['SVM'][0]['idx-1742']

{'fit_time': array([0.21437693, 0.19412374, 0.17950153]),
 'score_time': array([0.12303901, 0.10711384, 0.13404226]),
 'test_accuracy': array([1., 1., 1.]),
 'test_precision': array([1., 1., 1.]),
 'test_recall': array([1., 1., 1.]),
 'test_f1': array([1., 1., 1.]),
 'test_average_precision': array([1., 1., 1.]),
 'test_roc_auc': array([1., 1., 1.])}

In [95]:
ri_dicts['SVM'][0].keys()

dict_keys(['idx-1742', 'idx-1947', 'idx-629', 'idx-864', 'idx-1243', 'idx-1634', 'idx-848', 'idx-822', 'idx-634', 'idx-522', 'idx-344', 'idx-1508', 'idx-62', 'idx-352', 'idx-749', 'idx-1192', 'idx-431', 'idx-1743', 'idx-1803', 'idx-724', 'idx-167', 'idx-273', 'idx-1523', 'idx-733', 'idx-41'])

#### Analysis
The following metrics are extracted from each cross-validation result:

- `test_accuracy`
- `test_precision`
- `test_recall`
- `test_f1`

In [117]:
def decode_dict(dicts=None, clf_keys=(), nfiles=4):
    """Collect per-fold test metrics for each classifier into flat arrays.

    Parameters
    ----------
    dicts : dict, optional
        Maps a classifier key to a list of per-file results. Each per-file
        result is a dict whose values are themselves dicts holding metric
        arrays under ``'test_accuracy'``, ``'test_precision'``,
        ``'test_recall'`` and ``'test_f1'``. ``None`` is treated as empty.
    clf_keys : iterable of str
        Classifier keys to decode; each must be present in ``dicts``.
    nfiles : int or None
        Number of leading per-file results to use for each classifier
        (default 4). ``None`` uses every available file.

    Returns
    -------
    dict
        For each classifier ``k``: ``k + '-acc'``, ``k + '-precision'``,
        ``k + '-recall'`` and ``k + '-f1'``, each the concatenation of the
        corresponding metric arrays over the selected files and entries.

    Raises
    ------
    IndexError
        If a classifier has fewer than ``nfiles`` per-file results.
    KeyError
        If a classifier key is missing from ``dicts``.
    ValueError
        Raised by ``np.concatenate`` when no array was collected for a metric.
    """
    # (metric name inside a result entry, suffix of the output key)
    metric_suffixes = (
        ('test_accuracy', '-acc'),
        ('test_precision', '-precision'),
        ('test_recall', '-recall'),
        ('test_f1', '-f1'),
    )
    if dicts is None:
        dicts = {}

    result_dicts = {}
    for clf_key in clf_keys:
        file_results = dicts[clf_key]
        count = len(file_results) if nfiles is None else nfiles
        if count > len(file_results):
            raise IndexError(
                "classifier {!r} has {} result file(s) but nfiles={}".format(
                    clf_key, len(file_results), nfiles)
            )

        collected = {metric: [] for metric, _ in metric_suffixes}
        for i in range(count):
            for scores in file_results[i].values():
                for metric, _ in metric_suffixes:
                    # Entries lacking a metric are skipped rather than failing.
                    if metric in scores:
                        collected[metric].append(scores[metric])

        for metric, suffix in metric_suffixes:
            result_dicts[clf_key + suffix] = np.concatenate(collected[metric])
    return result_dicts

In [135]:
ri_results = decode_dict(ri_dicts,clf_keys=['SVM', "MLP", "RF", "ADABOOST"] )

In [136]:
nri_results = decode_dict(nri_dicts,clf_keys=['SVM', "MLP", "RF", "ADABOOST"] )

In [125]:
ri_results.keys()

dict_keys(['SVM-acc', 'SVM-precision', 'SVM-recall', 'SVM-f1', 'MLP-acc', 'MLP-precision', 'MLP-recall', 'MLP-f1', 'RF-acc', 'RF-precision', 'RF-recall', 'RF-f1', 'ADABOOST-acc', 'ADABOOST-precision', 'ADABOOST-recall', 'ADABOOST-f1'])

#### Make pandas frame

In [128]:
def stats_df(result_dicts):
    """Summarize each entry of ``result_dicts`` by its mean and standard deviation.

    Parameters
    ----------
    result_dicts : dict
        Maps a column name to an array-like of numbers.

    Returns
    -------
    pandas.DataFrame
        Two rows, ``'mean'`` (``np.mean``) and ``'std'`` (``np.std``), with one
        float column per key of ``result_dicts``, in the dict's key order.
    """
    names = list(result_dicts.keys())
    means = [np.mean(result_dicts[name]) for name in names]
    stds = [np.std(result_dicts[name]) for name in names]
    # reshape keeps the (2, 0) layout when there are no columns at all
    table = np.array([means, stds], dtype=float).reshape(2, len(names))
    return pd.DataFrame(table, index=['mean', 'std'], columns=names)

In [137]:
ri_df = stats_df(ri_results)

In [130]:
# cv-3
# NOTE(review): this cell ran as In [130], before the current cv=4 load cell (In [133]),
# so the table stored below presumably comes from an earlier load with cv=3. Re-running the
# notebook as written will show the cv=4 statistics here instead — confirm before citing.
ri_df

Unnamed: 0,SVM-acc,SVM-precision,SVM-recall,SVM-f1,MLP-acc,MLP-precision,MLP-recall,MLP-f1,RF-acc,RF-precision,RF-recall,RF-f1,ADABOOST-acc,ADABOOST-precision,ADABOOST-recall,ADABOOST-f1
mean,0.955819,0.946523,0.984417,0.959915,0.761792,0.803781,0.794972,0.74398,0.621917,0.66233,0.654139,0.602946,0.759514,0.802425,0.778806,0.746152
std,0.082433,0.106092,0.072699,0.075175,0.175266,0.197899,0.298573,0.231919,0.094289,0.14693,0.278112,0.164397,0.12354,0.159165,0.255191,0.162875


In [139]:
# cv-4
ri_df

Unnamed: 0,SVM-acc,SVM-precision,SVM-recall,SVM-f1,MLP-acc,MLP-precision,MLP-recall,MLP-f1,RF-acc,RF-precision,RF-recall,RF-f1,ADABOOST-acc,ADABOOST-precision,ADABOOST-recall,ADABOOST-f1
mean,0.955208,0.950578,0.973889,0.957615,0.800417,0.854316,0.816111,0.78187,0.640139,0.688665,0.671944,0.625542,0.789444,0.817188,0.822222,0.783977
std,0.080519,0.099336,0.082908,0.077415,0.179563,0.18753,0.285168,0.233146,0.089123,0.15174,0.264732,0.151315,0.137197,0.166044,0.236389,0.169989


In [144]:
# Average of the per-classifier mean accuracies (RI autoencoder results)
np.mean([ri_df.loc['mean', clf + '-acc'] for clf in ['SVM', "MLP", "RF", "ADABOOST"]])

0.7963020833333334

---------

In [138]:
nri_df = stats_df(nri_results)

In [132]:
# cv-3
# NOTE(review): this cell ran as In [132], before the current cv=4 load cell (In [134]),
# so the table stored below presumably comes from an earlier load with cv=3. Re-running the
# notebook as written will show the cv=4 statistics here instead — confirm before citing.
nri_df

Unnamed: 0,SVM-acc,SVM-precision,SVM-recall,SVM-f1,MLP-acc,MLP-precision,MLP-recall,MLP-f1,RF-acc,RF-precision,RF-recall,RF-f1,ADABOOST-acc,ADABOOST-precision,ADABOOST-recall,ADABOOST-f1
mean,0.959097,0.941207,1.0,0.965877,0.717778,0.627749,0.834444,0.700896,0.805208,0.810758,0.819722,0.807137,0.907431,0.90466,0.951528,0.913269
std,0.081346,0.112416,0.0,0.066293,0.221485,0.333906,0.358454,0.327031,0.14986,0.159851,0.168922,0.151554,0.126704,0.1414,0.137441,0.126853


In [140]:
# cv-4
nri_df

Unnamed: 0,SVM-acc,SVM-precision,SVM-recall,SVM-f1,MLP-acc,MLP-precision,MLP-recall,MLP-f1,RF-acc,RF-precision,RF-recall,RF-f1,ADABOOST-acc,ADABOOST-precision,ADABOOST-recall,ADABOOST-f1
mean,0.925069,0.931608,0.942083,0.922554,0.680347,0.608778,0.721528,0.619381,0.780208,0.777591,0.811389,0.782775,0.930069,0.939507,0.944861,0.93309
std,0.133365,0.138088,0.176372,0.149602,0.211671,0.34447,0.413004,0.349805,0.129218,0.137789,0.175515,0.138707,0.130611,0.134078,0.135906,0.123217


In [145]:
# Average of the per-classifier mean accuracies (NRI autoencoder results)
np.mean([nri_df.loc['mean', clf + '-acc'] for clf in ['SVM', "MLP", "RF", "ADABOOST"]])

0.8289236111111111