In [1]:
from mleap.analyze_results import AnalyseResults
from mleap.data import Data
import pandas as pd
from mleap.estimators.estimators import instantiate_default_estimators

import matplotlib.pyplot as plt
# Raise the pandas display limit so long result tables are shown in full
# (up to 1000 rows) rather than being truncated in the cell output.
pd.set_option('display.max_rows', 1000)

  from ._conv import register_converters as _register_converters
  return f(*args, **kwds)


In [2]:
# Open the two HDF5 stores used by the analysis: the original datasets
# (mode 'r') and the file holding the experiment predictions (mode 'a').
data = Data()
input_io = data.open_hdf5('data/delgado.hdf5', mode='r')
out_io = data.open_hdf5('data/classification.hdf5', mode='a')

# Build the analysis helper on top of both stores; the two group paths tell it
# where the original datasets and the saved predictions live inside the files.
analyze = AnalyseResults(
    hdf5_output_io=out_io,
    hdf5_input_io=input_io,
    input_h5_original_datasets_group='delgado_datasets/',
    output_h5_predictions_group='experiments/predictions/',
)


### All datasets

In [3]:
# Compute the prediction errors across all datasets; this single result is the
# input to every summary statistic and significance test in the cells below.
# NOTE(review): 'mean_squared_error' is normally a regression metric, while the
# estimators compared here are classifiers — confirm this metric is intended.
error_all_datasets = analyze.calculate_error_all_datasets(metric='mean_squared_error')

#### Simple average

In [4]:
# Summarise the errors per estimator as an average and a standard deviation
# (the `avg` and `std` columns of the displayed frame, one row per estimator).
res_df = analyze.calculate_average_std(error_all_datasets)
res_df

Unnamed: 0,avg,std
BaggingClassifier,9.678659,56.127002
RandomForestClassifier,10.411836,57.872823
LogisticRegression,12.885971,74.313996
GaussianNaiveBayes,13.72331,69.132686
BernoulliNaiveBayes,16.919236,98.644095
Lasso,17.556042,99.268908
LassoLars,17.618406,99.500173
RidgeRegression,17.728869,100.070785
PassiveAggressiveClassifier,20.1881,118.21405
GradientBoostingClassifier,24.04833,145.624963


#### Cohen's d

In [5]:
# Cohen's d effect size for pairs of estimators; each row of the displayed
# frame is labelled "<estimator>-<estimator>" for the pair being compared.
cohens_d = analyze.cohens_d(error_all_datasets)
cohens_d

Unnamed: 0,Cohen's d
BaggingClassifier-BaselineClassifier,0.195295
BaselineClassifier-RandomForestClassifier,-0.190966
BaselineClassifier-LogisticRegression,-0.174502
BaselineClassifier-GaussianNaiveBayes,-0.170901
BaggingClassifier-SVC,0.159332
RandomForestClassifier-SVC,0.153453
BaselineClassifier-BernoulliNaiveBayes,-0.14816
BaselineClassifier-Lasso,-0.14469
BaselineClassifier-LassoLars,-0.144317
BaselineClassifier-RidgeRegression,-0.143624


#### t-test

In [6]:
# Pairwise t-test between estimators. The call returns two objects; only the
# DataFrame form (columns: pair, t_statistic, p_value) is displayed here.
t_test, t_test_df = analyze.t_test(error_all_datasets)
t_test_df

Unnamed: 0,pair,t_statistic,p_value
0,BaggingClassifier - BaselineClassifier,-1.51275,0.131659
1,BaggingClassifier - BernoulliNaiveBayes,-0.698861,0.485316
2,BaggingClassifier - GaussianNaiveBayes,-0.497562,0.619249
3,BaggingClassifier - GradientBoostingClassifier,-1.008618,0.314174
4,BaggingClassifier - Lasso,-0.756702,0.449971
5,BaggingClassifier - LassoLars,-0.761348,0.447197
6,BaggingClassifier - LogisticRegression,-0.37727,0.706307
7,BaggingClassifier - PassiveAggressiveClassifier,-0.879746,0.379877
8,BaggingClassifier - RandomForestClassifier,-0.099623,0.920727
9,BaggingClassifier - RidgeRegression,-0.768594,0.44289


#### Sign test

In [7]:
# Pairwise sign test between estimators. The call returns two objects; only
# the DataFrame form is displayed here.
# NOTE(review): the displayed statistic column is named `t_statistic` even
# though this is a sign test — presumably a label reused from the t-test
# output; confirm in AnalyseResults.
sign_test, sign_test_df = analyze.sign_test(error_all_datasets)
sign_test_df

Unnamed: 0,pair,t_statistic,p_value
0,BaggingClassifier - BaselineClassifier,-7.38292,1.54855e-13
1,BaggingClassifier - BernoulliNaiveBayes,-2.830119,0.004653064
2,BaggingClassifier - GaussianNaiveBayes,-3.367309,0.0007590546
3,BaggingClassifier - GradientBoostingClassifier,-0.557392,0.5772596
4,BaggingClassifier - Lasso,-1.354454,0.1755917
5,BaggingClassifier - LassoLars,-0.562902,0.5735018
6,BaggingClassifier - LogisticRegression,-0.867769,0.385521
7,BaggingClassifier - PassiveAggressiveClassifier,-1.528007,0.1265107
8,BaggingClassifier - RandomForestClassifier,0.46832,0.6395561
9,BaggingClassifier - RidgeRegression,-0.539027,0.5898685


#### t-test with Bonferroni correction

In [8]:
# Pairwise t-test with Bonferroni correction for multiple comparisons. The
# displayed frame carries only the pair label and its p-value.
t_test_bonferroni, t_test_bonferroni_df = analyze.t_test_with_bonferroni_correction(error_all_datasets)
t_test_bonferroni_df

Unnamed: 0,pair,p_value
0,BaggingClassifier - BaselineClassifier,1.0
1,BaggingClassifier - BernoulliNaiveBayes,1.0
2,BaggingClassifier - GaussianNaiveBayes,1.0
3,BaggingClassifier - GradientBoostingClassifier,1.0
4,BaggingClassifier - Lasso,1.0
5,BaggingClassifier - LassoLars,1.0
6,BaggingClassifier - LogisticRegression,1.0
7,BaggingClassifier - PassiveAggressiveClassifier,1.0
8,BaggingClassifier - RandomForestClassifier,1.0
9,BaggingClassifier - RidgeRegression,1.0


#### Wilcoxon test

In [9]:
# Pairwise Wilcoxon test between estimators. The call returns two objects;
# only the DataFrame form (columns: pair, statistic, p_value) is displayed.
wilcoxon_test, wilcoxon_test_df = analyze.wilcoxon_test(error_all_datasets)
wilcoxon_test_df

Unnamed: 0,pair,statistic,p_value
0,BaggingClassifier - BaselineClassifier,134.0,5.407341e-20
1,BaggingClassifier - BernoulliNaiveBayes,1258.0,8.733553e-10
2,BaggingClassifier - GaussianNaiveBayes,670.0,3.874445e-14
3,BaggingClassifier - GradientBoostingClassifier,1640.0,4.059521e-05
4,BaggingClassifier - Lasso,3010.0,0.07837665
5,BaggingClassifier - LassoLars,3606.0,0.9498837
6,BaggingClassifier - LogisticRegression,2665.0,0.06151065
7,BaggingClassifier - PassiveAggressiveClassifier,1672.0,2.388333e-05
8,BaggingClassifier - RandomForestClassifier,2025.0,0.2059505
9,BaggingClassifier - RidgeRegression,3557.0,0.7298601


#### Friedman test

In [10]:
# Friedman test over all estimators at once; the displayed frame has a single
# row holding the test statistic and its p-value.
friedman_test, friedman_test_df = analyze.friedman_test(error_all_datasets)
friedman_test_df

Unnamed: 0,statistic,p_value
0,479.086023,9.21217e-96


In [11]:
# Nemenyi test: the result is wrapped in a DataFrame for display and renders
# as an estimator-by-estimator matrix whose diagonal entries are -1.0.
# NOTE(review): the variable is spelled `nemeniy_test` while the method is
# `nemenyi`; the name is left unchanged in case later cells reference it.
nemeniy_test = analyze.nemenyi(error_all_datasets)
pd.DataFrame(nemeniy_test)

Unnamed: 0,BaggingClassifier,BaselineClassifier,BernoulliNaiveBayes,GaussianNaiveBayes,GradientBoostingClassifier,Lasso,LassoLars,LogisticRegression,PassiveAggressiveClassifier,RandomForestClassifier,RidgeRegression,SVC
BaggingClassifier,-1.0,1.330535e-08,0.729783,0.382436,0.9999995,0.999787,1.0,0.999978,0.995824,1.0,1.0,0.991555
BaselineClassifier,1.330535e-08,-1.0,0.01287,0.07768,7.213707e-07,1.2e-05,1.469692e-07,3e-06,0.000101,6.439114e-10,1.168595e-07,0.000187
BernoulliNaiveBayes,0.7297835,0.01286963,-1.0,1.0,0.9478722,0.994196,0.8836575,0.982193,0.99965,0.4863279,0.8717527,0.999888
GaussianNaiveBayes,0.382436,0.07768015,1.0,-1.0,0.7354808,0.922646,0.593989,0.854209,0.984081,0.1825839,0.5729845,0.991474
GradientBoostingClassifier,0.9999995,7.213707e-07,0.947872,0.735481,-1.0,1.0,1.0,1.0,0.999974,0.9998811,1.0,0.999901
Lasso,0.9997869,1.211277e-05,0.994196,0.922646,1.0,-1.0,0.9999963,1.0,1.0,0.9953856,0.999994,1.0
LassoLars,1.0,1.469692e-07,0.883657,0.593989,1.0,0.999996,-1.0,1.0,0.999681,0.9999933,1.0,0.999128
LogisticRegression,0.9999778,3.492093e-06,0.982193,0.854209,1.0,1.0,0.9999999,-1.0,0.999999,0.9989053,0.9999998,0.999996
PassiveAggressiveClassifier,0.9958239,0.0001013333,0.99965,0.984081,0.9999742,1.0,0.9996813,0.999999,-1.0,0.9668151,0.9995708,1.0
RandomForestClassifier,1.0,6.439114e-10,0.486328,0.182584,0.9998811,0.995386,0.9999933,0.998905,0.966815,-1.0,0.9999958,0.946709


### Per dataset

In [12]:
# Break the error down per dataset. The call returns two objects; the
# displayed DataFrame has one row per (dataset, estimator) pair with `score`
# and `std` columns.
# NOTE(review): uses the same 'mean_squared_error' metric as the all-datasets
# analysis — confirm it is the intended metric for these classifiers.
error_per_dataset, error_per_dataset_df = analyze.calculate_error_per_dataset(metric='mean_squared_error')
error_per_dataset_df

Unnamed: 0,Unnamed: 1,score,std
abalone,BaggingClassifier,0.703766,0.843316
abalone,BaselineClassifier,1.174110,1.527226
abalone,BernoulliNaiveBayes,0.871763,1.194485
abalone,GaussianNaiveBayes,0.782328,0.961718
abalone,GradientBoostingClassifier,0.706337,0.843328
abalone,Lasso,0.643337,0.540242
abalone,LassoLars,0.613381,0.822697
abalone,LogisticRegression,0.715011,0.917391
abalone,PassiveAggressiveClassifier,0.756408,0.673529
abalone,RandomForestClassifier,0.706337,0.888549
