# Multiple hypothesis testing

In [0]:
from __future__ import division

import numpy as np
import pandas as pd

from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests 

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

The C4.5 classifier and three of its modifications are considered: with optimization of the hyperparameter m, with optimization of the hyperparameter cf, and with simultaneous optimization of both hyperparameters. These four classifiers were compared on 14 data sets. On each data set, the AUC of each classifier was calculated. The data are stored in the following file:

AUCs.txt

In [2]:
# Tab-separated table of AUC scores: one row per dataset, one column per classifier.
AUCS_URL = 'https://raw.githubusercontent.com/OzmundSedler/100-Days-Of-ML-Code/master/week_12/datasets/AUCs.txt'
aucs = pd.read_csv(AUCS_URL, sep='\t')
aucs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
Unnamed: 0    14 non-null object
C4.5          14 non-null float64
C4.5+m        14 non-null float64
C4.5+cf       14 non-null float64
C4.5+m+cf     14 non-null float64
dtypes: float64(4), object(1)
memory usage: 640.0+ bytes


In [3]:
# Preview the first rows: the dataset name followed by one AUC column per classifier.
aucs.head()

Unnamed: 0.1,Unnamed: 0,C4.5,C4.5+m,C4.5+cf,C4.5+m+cf
0,adult (sample),0.763,0.768,0.771,0.798
1,breast cancer,0.599,0.591,0.59,0.569
2,breast cancer wisconsin,0.954,0.971,0.968,0.967
3,cmc,0.628,0.661,0.654,0.657
4,ionosphere,0.882,0.888,0.886,0.898


In [4]:
# Summary statistics of the numeric (AUC) columns.
aucs.describe()

Unnamed: 0,C4.5,C4.5+m,C4.5+cf,C4.5+m+cf
count,14.0,14.0,14.0,14.0
mean,0.804929,0.820429,0.808786,0.827214
std,0.160187,0.158583,0.167566,0.154548
min,0.583,0.583,0.563,0.569
25%,0.63625,0.6665,0.624,0.673
50%,0.8285,0.863,0.876,0.8865
75%,0.9505,0.96875,0.96025,0.96575
max,1.0,1.0,1.0,1.0


In [0]:
# Give the unnamed first column a meaningful label; the four classifier
# columns keep the names they already had in the file.
classifier_names = ['C4.5', 'C4.5+m', 'C4.5+cf', 'C4.5+m+cf']
aucs.columns = ['Dataset'] + classifier_names

Using the Wilcoxon signed-rank test, compare every pair of classifiers. Find the two classifiers whose difference is the most statistically significant.

In [0]:
# Pairwise Wilcoxon signed-rank tests on the per-dataset AUCs of every pair of
# classifiers (all columns after the first, which holds the dataset name).
#
# The rows are collected in a list and the frame is built once at the end.
# Filling an empty DataFrame cell by cell via .loc leaves every column with
# object dtype, so the statistic and p-value would not be stored as floats.
pairwise_records = []
for model_1, model_2 in combinations(aucs.columns[1:], 2):
    statistic, p_value = stats.wilcoxon(aucs[model_1], aucs[model_2])
    pairwise_records.append({
        'Model 1': model_1,
        'Model 2': model_2,
        'Wilcoxon stat': statistic,
        'p-value': p_value,
    })

# Passing `columns` fixes the column order regardless of dict ordering.
w_stat = pd.DataFrame(
    pairwise_records,
    columns=['Model 1', 'Model 2', 'Wilcoxon stat', 'p-value'],
)

In [7]:
# Pairwise test results: one row per pair of classifiers.
w_stat

Unnamed: 0,Model 1,Model 2,Wilcoxon stat,p-value
0,C4.5,C4.5+m,6.5,0.0107571
1,C4.5,C4.5+cf,43.0,0.861262
2,C4.5,C4.5+m+cf,11.0,0.0159064
3,C4.5+m,C4.5+cf,17.0,0.0463327
4,C4.5+m,C4.5+m+cf,22.0,0.327826
5,C4.5+cf,C4.5+m+cf,10.0,0.0229091


In [23]:
# Take the first row *by position* after sorting by p-value. `.loc[0, ...]`
# would select the row whose index label is 0 no matter where sorting put it,
# which is only correct when the smallest p-value happens to sit in row 0.
most_significant = w_stat.sort_values('p-value').iloc[0]
print('Two classifiers with the highest significance difference:')
print(most_significant[['Model 1', 'Model 2']])

Two classifiers with the highest significance difference:
Model 1      C4.5
Model 2    C4.5+m
Name: 0, dtype: object


How many statistically significant differences did we find at the 0.05 significance level?

In [24]:
# Count the pairs whose uncorrected p-value is at or below 0.05.
is_significant = w_stat['p-value'] <= 0.05
diff_models_cnt = int(is_significant.sum())
print('Number of p-value <= 0.05: %d' % diff_models_cnt)

Number of p-value <= 0.05: 4


Comparing 4 classifiers with each other, we tested 6 hypotheses. Let's apply a correction for multiple testing, starting with the Holm method. How many hypotheses can be rejected at a significance level of 0.05 after correction by this method?

In [0]:
# Holm correction of the six pairwise p-values at alpha = 0.05.
# Only `reject` and `p_corrected` are used afterwards; a1 and a2 are the
# remaining two values returned by multipletests.
reject, p_corrected, a1, a2 = multipletests(
    w_stat['p-value'],
    alpha=0.05,
    method='holm',
)

In [0]:
# Attach the Holm-adjusted p-values and the rejection flags to the results table.
w_stat = w_stat.assign(p_corrected=p_corrected, reject=reject)

In [27]:
# Results table with the Holm-adjusted p-values and rejection flags.
w_stat

Unnamed: 0,Model 1,Model 2,Wilcoxon stat,p-value,p_corrected,reject
0,C4.5,C4.5+m,6.5,0.010757,0.064543,False
1,C4.5,C4.5+cf,43.0,0.861262,0.861262,False
2,C4.5,C4.5+m+cf,11.0,0.015906,0.079532,False
3,C4.5+m,C4.5+cf,17.0,0.046333,0.138998,False
4,C4.5+m,C4.5+m+cf,22.0,0.327826,0.655651,False
5,C4.5+cf,C4.5+m+cf,10.0,0.022909,0.091636,False


How many hypotheses can be rejected at a significance level of 0.05 after correction by the Benjamini-Hochberg method?

In [0]:
# Benjamini-Hochberg (FDR) correction of the six pairwise p-values at alpha = 0.05.
# Only `reject` and `p_corrected` are used afterwards; a1 and a2 are the
# remaining two values returned by multipletests.
reject, p_corrected, a1, a2 = multipletests(
    w_stat['p-value'],
    alpha=0.05,
    method='fdr_bh',
)

In [0]:
# Store the Benjamini-Hochberg-adjusted p-values and rejection flags.
# This overwrites the 'p_corrected' / 'reject' columns written for the Holm method.
w_stat = w_stat.assign(p_corrected=p_corrected, reject=reject)

In [30]:
# Results table with the Benjamini-Hochberg-adjusted p-values and rejection flags.
w_stat

Unnamed: 0,Model 1,Model 2,Wilcoxon stat,p-value,p_corrected,reject
0,C4.5,C4.5+m,6.5,0.010757,0.045818,True
1,C4.5,C4.5+cf,43.0,0.861262,0.861262,False
2,C4.5,C4.5+m+cf,11.0,0.015906,0.045818,True
3,C4.5+m,C4.5+cf,17.0,0.046333,0.069499,False
4,C4.5+m,C4.5+m+cf,22.0,0.327826,0.393391,False
5,C4.5+cf,C4.5+m+cf,10.0,0.022909,0.045818,True


In [31]:
# Count the pairs whose corrected p-value is at or below 0.05.
is_significant = w_stat['p_corrected'] <= 0.05
diff_models_cnt = int(is_significant.sum())
print('Number of p-value <= 0.05: %d' % diff_models_cnt)

Number of p-value <= 0.05: 3
