In [1]:
from mlaut.analyze_results import AnalyseResults
from mlaut.data import Data
import pandas as pd
from mlaut.estimators.estimators import instantiate_default_estimators
from mlaut.analyze_results.scores import ScoreAccuracy

import matplotlib.pyplot as plt
# Allow pandas to render up to 1000 rows before truncating a displayed table.
pd.set_option('display.max_rows', 1000)

  from ._conv import register_converters as _register_converters


In [2]:
# Open the two HDF5 stores used by the analysis: the input store is opened
# with mode 'r' and the results store with mode 'a'.
data = Data()
input_io = data.open_hdf5('data/delgado.h5', mode='r')
out_io = data.open_hdf5('data/delgado-classification.h5', mode='a')

# Point the analyser at both stores and at the groups inside them that hold
# the original datasets and the saved predictions.
analyze = AnalyseResults(
    hdf5_input_io=input_io,
    hdf5_output_io=out_io,
    input_h5_original_datasets_group='openml/',
    output_h5_predictions_group='experiments/predictions/',
)


### All datasets

In [3]:
# Default classification estimators and the accuracy metric used to score them.
estimators = instantiate_default_estimators(['Classification'])
score_accuracy = ScoreAccuracy()

# prediction_errors returns three objects: `errors_per_estimator` feeds every
# summary and statistical test below, and the `_df` variant is the per-dataset
# table displayed in the "Per dataset" section.
errors_per_estimator, errors_per_dataset_per_estimator, errors_per_dataset_per_estimator_df = \
    analyze.prediction_errors(score_accuracy, estimators)



#### Simple average and standard error

In [4]:
# Average score and standard error for each estimator; rounded to three
# decimals for display only (the stored table keeps full precision).
avg_and_std_error = analyze.average_and_std_error(errors_per_estimator)
avg_and_std_error.round(decimals=3)

Unnamed: 0,avg_score,std_error
BaselineClassifier,0.42,0.019
NeuralNetworkDeepClassifier,0.671,0.021
GaussianNaiveBayes,0.675,0.019
BernoulliNaiveBayes,0.707,0.015
PassiveAggressiveClassifier,0.758,0.016
GradientBoostingClassifier,0.79,0.016
K_Neighbours,0.805,0.013
SVC,0.818,0.013
BaggingClassifier,0.819,0.014
RandomForestClassifier,0.83,0.013


#### Average Rank

In [5]:
# Average rank of each estimator.
# NOTE(review): ascending=False presumably gives the best (lowest) rank to the
# highest score, which matches the displayed ordering -- confirm in mlaut.
avg_rank = analyze.ranks(errors_per_estimator, ascending=False)
avg_rank.round(decimals=1)

Unnamed: 0,avg_rank
RandomForestClassifier,2.8
SVC,3.4
BaggingClassifier,3.8
K_Neighbours,4.1
GradientBoostingClassifier,4.9
PassiveAggressiveClassifier,5.4
NeuralNetworkDeepClassifier,6.7
BernoulliNaiveBayes,6.9
GaussianNaiveBayes,7.3
BaselineClassifier,9.7


#### Training time

In [6]:
# Average training time per estimator (displayed), plus a second result that
# is kept but not used again in this notebook.
(avg_training_time,
 trainig_time_per_dataset) = analyze.average_training_time(estimators)
avg_training_time

Unnamed: 0,avg training time (in sec)
BaselineClassifier,0.001
GaussianNaiveBayes,0.004
BernoulliNaiveBayes,0.006
NeuralNetworkDeepClassifier,3.249
BaggingClassifier,19.103
PassiveAggressiveClassifier,21.346
RandomForestClassifier,34.491
GradientBoostingClassifier,144.499
K_Neighbours,192.382
SVC,6006.24


#### Merge average score, rank and training time

In [7]:
# Combine rank, score/standard error and training time into one summary table,
# joining the three tables on their shared index.
avg_metrics = (
    avg_rank
    .merge(avg_and_std_error, left_index=True, right_index=True)
    .merge(avg_training_time, left_index=True, right_index=True)
)
avg_metrics


Unnamed: 0,avg_rank,avg_score,std_error,avg training time (in sec)
RandomForestClassifier,2.8,0.83,0.013,34.491
SVC,3.4,0.818,0.013,6006.24
BaggingClassifier,3.8,0.819,0.014,19.103
K_Neighbours,4.1,0.805,0.013,192.382
GradientBoostingClassifier,4.9,0.79,0.016,144.499
PassiveAggressiveClassifier,5.4,0.758,0.016,21.346
NeuralNetworkDeepClassifier,6.7,0.671,0.021,3.249
BernoulliNaiveBayes,6.9,0.707,0.015,0.006
GaussianNaiveBayes,7.3,0.675,0.019,0.004
BaselineClassifier,9.7,0.42,0.019,0.001


#### Cohen's d

In [8]:
# Pairwise Cohen's d between estimators; rounded for display only.
cohens_d = analyze.cohens_d(errors_per_estimator)
cohens_d.round(decimals=2)

estimator_2,BaselineClassifier,BernoulliNaiveBayes,GaussianNaiveBayes,GradientBoostingClassifier,K_Neighbours,NeuralNetworkDeepClassifier,PassiveAggressiveClassifier,RandomForestClassifier,SVC
estimator_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BaggingClassifier,-2.245,-0.733,-0.821,-0.186,-0.099,-0.765,-0.386,0.077,-0.01
BaselineClassifier,,1.566,1.256,1.972,2.187,1.148,1.804,2.332,2.25
BernoulliNaiveBayes,,,-0.178,0.504,0.649,-0.181,0.312,0.817,0.73
GaussianNaiveBayes,,,,0.62,0.748,-0.017,0.45,0.894,0.82
GradientBoostingClassifier,,,,,0.097,-0.586,-0.187,0.26,0.18
K_Neighbours,,,,,,-0.697,-0.3,0.179,0.09
NeuralNetworkDeepClassifier,,,,,,,0.43,0.829,0.76
PassiveAggressiveClassifier,,,,,,,,0.463,0.38
RandomForestClassifier,,,,,,,,,-0.08


#### t-test

In [9]:
# Pairwise t-test between estimators. The second return value is the table
# form (statistic and p-value per pair), shown here rounded to three decimals.
(t_test,
 t_test_df) = analyze.t_test(errors_per_estimator)
t_test_df.round(decimals=3)

Unnamed: 0_level_0,BaggingClassifier,BaggingClassifier,BaselineClassifier,BaselineClassifier,BernoulliNaiveBayes,BernoulliNaiveBayes,GaussianNaiveBayes,GaussianNaiveBayes,GradientBoostingClassifier,GradientBoostingClassifier,K_Neighbours,K_Neighbours,NeuralNetworkDeepClassifier,NeuralNetworkDeepClassifier,PassiveAggressiveClassifier,PassiveAggressiveClassifier,RandomForestClassifier,RandomForestClassifier,SVC,SVC
Unnamed: 0_level_1,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val
BaggingClassifier,0.0,1.0,17.095,0.0,5.581,0.0,6.255,0.0,1.418,0.157,0.754,0.452,5.825,0.0,2.943,0.004,-0.585,0.559,0.049,0.961
BaselineClassifier,-17.095,0.0,0.0,1.0,-11.926,0.0,-9.567,0.0,-15.021,0.0,-16.656,0.0,-8.745,0.0,-13.742,0.0,-17.761,0.0,-17.13,0.0
BernoulliNaiveBayes,-5.581,0.0,11.926,0.0,0.0,1.0,1.356,0.176,-3.839,0.0,-4.939,0.0,1.382,0.168,-2.373,0.018,-6.223,0.0,-5.567,0.0
GaussianNaiveBayes,-6.255,0.0,9.567,0.0,-1.356,0.176,0.0,1.0,-4.72,0.0,-5.697,0.0,0.133,0.894,-3.424,0.001,-6.81,0.0,-6.243,0.0
GradientBoostingClassifier,-1.418,0.157,15.021,0.0,3.839,0.0,4.72,0.0,0.0,1.0,-0.74,0.46,4.464,0.0,1.427,0.155,-1.979,0.049,-1.381,0.169
K_Neighbours,-0.754,0.452,16.656,0.0,4.939,0.0,5.697,0.0,0.74,0.46,0.0,1.0,5.31,0.0,2.285,0.023,-1.361,0.175,-0.709,0.479
NeuralNetworkDeepClassifier,-5.825,0.0,8.745,0.0,-1.382,0.168,-0.133,0.894,-4.464,0.0,-5.31,0.0,0.0,1.0,-3.277,0.001,-6.317,0.0,-5.809,0.0
PassiveAggressiveClassifier,-2.943,0.004,13.742,0.0,2.373,0.018,3.424,0.001,-1.427,0.155,-2.285,0.023,3.277,0.001,0.0,1.0,-3.526,0.001,-2.915,0.004
RandomForestClassifier,0.585,0.559,17.761,0.0,6.223,0.0,6.81,0.0,1.979,0.049,1.361,0.175,6.317,0.0,3.526,0.001,0.0,1.0,0.638,0.524
SVC,-0.049,0.961,17.13,0.0,5.567,0.0,6.243,0.0,1.381,0.169,0.709,0.479,5.809,0.0,2.915,0.004,-0.638,0.524,0.0,1.0


#### Sign test

In [10]:
# Pairwise sign test between estimators; the second return value is the
# table form that is displayed and later exported.
(sign_test,
 sign_test_df) = analyze.sign_test(errors_per_estimator)
sign_test_df

Unnamed: 0_level_0,BaggingClassifier,BaggingClassifier,BaselineClassifier,BaselineClassifier,BernoulliNaiveBayes,BernoulliNaiveBayes,GaussianNaiveBayes,GaussianNaiveBayes,GradientBoostingClassifier,GradientBoostingClassifier,K_Neighbours,K_Neighbours,NeuralNetworkDeepClassifier,NeuralNetworkDeepClassifier,PassiveAggressiveClassifier,PassiveAggressiveClassifier,RandomForestClassifier,RandomForestClassifier,SVC,SVC
Unnamed: 0_level_1,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val,t_stat,p_val
BaggingClassifier,0.0,1.0,11.741,0.0,5.534,0.0,5.911,0.0,1.236,0.216,0.902,0.367,5.275,0.0,2.824,0.005,-0.686,0.493,0.003,0.998
BaselineClassifier,-11.741,0.0,0.0,1.0,-9.792,0.0,-8.455,0.0,-10.983,0.0,-11.656,0.0,-7.971,0.0,-10.62,0.0,-11.954,0.0,-11.766,0.0
BernoulliNaiveBayes,-5.534,0.0,9.792,0.0,0.0,1.0,0.865,0.387,-4.018,0.0,-4.827,0.0,0.417,0.677,-2.712,0.007,-6.131,0.0,-5.62,0.0
GaussianNaiveBayes,-5.911,0.0,8.455,0.0,-0.865,0.387,0.0,1.0,-4.551,0.0,-5.263,0.0,-0.322,0.748,-3.267,0.001,-6.477,0.0,-5.954,0.0
GradientBoostingClassifier,-1.236,0.216,10.983,0.0,4.018,0.0,4.551,0.0,0.0,1.0,-0.338,0.735,4.068,0.0,1.565,0.117,-1.74,0.082,-1.131,0.258
K_Neighbours,-0.902,0.367,11.656,0.0,4.827,0.0,5.263,0.0,0.338,0.735,0.0,1.0,4.632,0.0,2.013,0.044,-1.539,0.124,-0.89,0.373
NeuralNetworkDeepClassifier,-5.275,0.0,7.971,0.0,-0.417,0.677,0.322,0.748,-4.068,0.0,-4.632,0.0,0.0,1.0,-2.773,0.006,-5.838,0.0,-5.344,0.0
PassiveAggressiveClassifier,-2.824,0.005,10.62,0.0,2.712,0.007,3.267,0.001,-1.565,0.117,-2.013,0.044,2.773,0.006,0.0,1.0,-3.477,0.001,-2.894,0.004
RandomForestClassifier,0.686,0.493,11.954,0.0,6.131,0.0,6.477,0.0,1.74,0.082,1.539,0.124,5.838,0.0,3.477,0.001,0.0,1.0,0.702,0.483
SVC,-0.003,0.998,11.766,0.0,5.62,0.0,5.954,0.0,1.131,0.258,0.89,0.373,5.344,0.0,2.894,0.004,-0.702,0.483,0.0,1.0


#### t-test with Bonferroni correction

In [11]:
# Pairwise t-test with Bonferroni correction, returned as a boolean table.
# NOTE(review): True presumably marks pairs whose difference stays significant
# after the correction -- confirm against the mlaut implementation.
t_test_bonferroni_df = analyze.t_test_with_bonferroni_correction(errors_per_estimator)
t_test_bonferroni_df

Unnamed: 0,BaggingClassifier,BaselineClassifier,BernoulliNaiveBayes,GaussianNaiveBayes,GradientBoostingClassifier,K_Neighbours,NeuralNetworkDeepClassifier,PassiveAggressiveClassifier,RandomForestClassifier,SVC
BaggingClassifier,False,True,True,True,False,False,True,False,False,False
BaselineClassifier,True,False,True,True,True,True,True,True,True,True
BernoulliNaiveBayes,True,True,False,False,True,True,False,False,True,True
GaussianNaiveBayes,True,True,False,False,True,True,False,False,True,True
GradientBoostingClassifier,False,True,True,True,False,False,True,False,False,False
K_Neighbours,False,True,True,True,False,False,True,False,False,False
NeuralNetworkDeepClassifier,True,True,False,False,True,True,False,False,True,True
PassiveAggressiveClassifier,False,True,False,False,False,False,False,False,False,False
RandomForestClassifier,False,True,True,True,False,False,True,False,False,False
SVC,False,True,True,True,False,False,True,False,False,False


#### Wilcoxon test

In [12]:
# Pairwise Wilcoxon test between estimators. Only the second return value
# (the table with a statistic and p-value per pair) is used; the first is
# bound to `a` and never read again in this notebook.
(a,
 wilcoxon_df_multiindex) = analyze.wilcoxon_test(errors_per_estimator)
wilcoxon_df_multiindex

  z = (T - mn - correction) / se


Unnamed: 0_level_0,BaggingClassifier,BaggingClassifier,BaselineClassifier,BaselineClassifier,BernoulliNaiveBayes,BernoulliNaiveBayes,GaussianNaiveBayes,GaussianNaiveBayes,GradientBoostingClassifier,GradientBoostingClassifier,K_Neighbours,K_Neighbours,NeuralNetworkDeepClassifier,NeuralNetworkDeepClassifier,PassiveAggressiveClassifier,PassiveAggressiveClassifier,RandomForestClassifier,RandomForestClassifier,SVC,SVC
Unnamed: 0_level_1,statistic,p_val,statistic,p_val,statistic,p_val,statistic,p_val,statistic,p_val,statistic,p_val,statistic,p_val,statistic,p_val,statistic,p_val,statistic,p_val
BaggingClassifier,0.0,,6.0,0.0,464.5,0.0,429.5,0.0,1098.0,0.0,2072.0,0.016,579.0,0.0,1032.0,0.0,1204.0,0.0,2740.5,0.765
BaselineClassifier,6.0,0.0,0.0,,62.5,0.0,439.0,0.0,24.5,0.0,1.0,0.0,87.0,0.0,13.5,0.0,0.0,0.0,0.0,0.0
BernoulliNaiveBayes,464.5,0.0,62.5,0.0,0.0,,2404.0,0.053,1159.5,0.0,315.0,0.0,2848.0,0.359,1383.0,0.0,165.5,0.0,254.5,0.0
GaussianNaiveBayes,429.5,0.0,439.0,0.0,2404.0,0.053,0.0,,1120.0,0.0,365.0,0.0,2701.0,0.231,1001.0,0.0,151.5,0.0,144.0,0.0
GradientBoostingClassifier,1098.0,0.0,24.5,0.0,1159.5,0.0,1120.0,0.0,0.0,,2405.0,0.133,901.0,0.0,2550.5,0.055,604.5,0.0,1711.0,0.0
K_Neighbours,2072.0,0.016,1.0,0.0,315.0,0.0,365.0,0.0,2405.0,0.133,0.0,,412.0,0.0,1244.0,0.0,1109.0,0.0,1601.5,0.001
NeuralNetworkDeepClassifier,579.0,0.0,87.0,0.0,2848.0,0.359,2701.0,0.231,901.0,0.0,412.0,0.0,0.0,,1547.0,0.0,271.0,0.0,266.0,0.0
PassiveAggressiveClassifier,1032.0,0.0,13.5,0.0,1383.0,0.0,1001.0,0.0,2550.5,0.055,1244.0,0.0,1547.0,0.0,0.0,,613.0,0.0,533.5,0.0
RandomForestClassifier,1204.0,0.0,0.0,0.0,165.5,0.0,151.5,0.0,604.5,0.0,1109.0,0.0,271.0,0.0,613.0,0.0,0.0,,2050.5,0.028
SVC,2740.5,0.765,0.0,0.0,254.5,0.0,144.0,0.0,1711.0,0.0,1601.5,0.001,266.0,0.0,533.5,0.0,2050.5,0.028,0.0,


#### Friedman test

In [13]:
# Friedman test across all estimators. The first return value is discarded;
# the table holds a single row with the test statistic and its p-value.
_, friedman_test_df = analyze.friedman_test(errors_per_estimator)
friedman_test_df

Unnamed: 0,statistic,p_value
0,534.76,0.0


In [14]:
# Nemenyi post-hoc test: pairwise comparison of the estimators, wrapped in a
# DataFrame so it can be displayed and exported like the other tables.
# NOTE(review): the diagonal shows -1.0 in the rendered output, presumably a
# placeholder for self-comparisons -- confirm.
nemeniy_test = analyze.nemenyi(errors_per_estimator)
nemeniy_test_df = pd.DataFrame(data=nemeniy_test)
nemeniy_test_df

Unnamed: 0,BaggingClassifier,BaselineClassifier,BernoulliNaiveBayes,GaussianNaiveBayes,GradientBoostingClassifier,K_Neighbours,NeuralNetworkDeepClassifier,PassiveAggressiveClassifier,RandomForestClassifier,SVC
BaggingClassifier,-1.0,0.0,0.002,0.0,0.998,1.0,0.001,0.652,1.0,1.0
BaselineClassifier,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BernoulliNaiveBayes,0.002,0.0,-1.0,1.0,0.094,0.028,1.0,0.744,0.0,0.002
GaussianNaiveBayes,0.0,0.0,1.0,-1.0,0.017,0.003,1.0,0.393,0.0,0.0
GradientBoostingClassifier,0.998,0.0,0.094,0.017,-1.0,1.0,0.054,0.991,0.957,0.997
K_Neighbours,1.0,0.0,0.028,0.003,1.0,-1.0,0.014,0.939,0.995,1.0
NeuralNetworkDeepClassifier,0.001,0.0,1.0,1.0,0.054,0.014,-1.0,0.624,0.0,0.001
PassiveAggressiveClassifier,0.652,0.0,0.744,0.393,0.991,0.939,0.624,-1.0,0.331,0.643
RandomForestClassifier,1.0,0.0,0.0,0.0,0.957,0.995,0.0,0.331,-1.0,1.0
SVC,1.0,0.0,0.002,0.0,0.997,1.0,0.001,0.643,1.0,-1.0


### Per dataset

In [15]:
# Loss and standard error for every (dataset, estimator) pair. The display is
# capped by the display.max_rows option set in the first cell.
errors_per_dataset_per_estimator_df

Unnamed: 0,Unnamed: 1,loss,std_error
abalone,BaggingClassifier,0.377,0.013
abalone,BaselineClassifier,0.667,0.013
abalone,BernoulliNaiveBayes,0.449,0.013
abalone,GaussianNaiveBayes,0.440,0.013
abalone,GradientBoostingClassifier,0.389,0.013
abalone,K_Neighbours,0.365,0.013
abalone,NeuralNetworkDeepClassifier,0.373,0.013
abalone,PassiveAggressiveClassifier,0.373,0.013
abalone,RandomForestClassifier,0.365,0.013
abalone,SVC,0.362,0.013


## Save tables to $\LaTeX$

In [16]:
# Export every results table as a LaTeX fragment for the paper.
#
# The tables are listed once and written in a single loop. The original cell
# repeated the same `with open(...)` stanza per table and wrote
# 't_test_bonferroni.tex' twice with identical content.
tables_dir = '../mlaut_paper/tables/'

# (file name, table) pairs. File names are kept exactly as before because the
# paper sources may reference them.
# NOTE(review): 'wilxocon_test.tex' and 'nemeniy_test.tex' are misspelt
# ("wilcoxon", "nemenyi"); rename them only together with the LaTeX sources.
latex_tables = [
    ('avg_and_st_error.tex', avg_and_std_error),        # average and standard error
    ('avg_rank.tex', avg_rank),                         # average rank
    ('avg_metrics.tex', avg_metrics),                   # rank, score and training time
    ('cohens_d.tex', cohens_d),                         # Cohen's d
    ('t_test.tex', t_test_df),                          # t-test
    ('sign_test.tex', sign_test_df),                    # sign test
    ('t_test_bonferroni.tex', t_test_bonferroni_df),    # t-test with Bonferroni correction
    ('wilxocon_test.tex', wilcoxon_df_multiindex),      # Wilcoxon test
    ('friedman_test.tex', friedman_test_df),            # Friedman test
    ('nemeniy_test.tex', nemeniy_test_df),              # Nemenyi test
]

for file_name, table in latex_tables:
    with open(tables_dir + file_name, 'w') as tf:
        tf.write(table.to_latex())

# Errors per dataset per estimator: this table is exported as a longtable.
# The caption is spliced in by replacing only the first newline of the
# generated LaTeX, so it lands directly after the first line of the output
# (presumably the \begin{longtable}{...} line).
errors_latex = errors_per_dataset_per_estimator_df.to_latex(longtable=True)
errors_latex = errors_latex.replace(
    '\n', '\n\\caption{Errors per dataset and estimator}\\\\\n', 1)
with open(tables_dir + 'errors_per_dataset_per_estimator.tex', 'w') as tf:
    tf.write(errors_latex)