In [1]:
import numpy as np
import pandas as pd

from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests 

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import combinations

### Вопрос 2

Классификатор C4.5 и три его модификации: с оптимизацией гиперпараметра m, гиперпараметра cf и с одновременной оптимизацией обоих гиперпараметров. Эти четыре классификатора сравнивались на 14 наборах данных. На каждом датасете был посчитан AUC каждого классификатора.

Используя критерий знаковых рангов, проведите попарное сравнение каждого классификатора с каждым. Выберите два классификатора, различие между которыми наиболее статистически значимо.

In [3]:
# Load the tab-separated table of AUC scores.
aucs = pd.read_csv('AUCs.txt', sep='\t')

In [7]:
# Display the loaded table: one row per dataset, one AUC column per classifier.
aucs

Unnamed: 0.1,Unnamed: 0,C4.5,C4.5+m,C4.5+cf,C4.5+m+cf
0,adult (sample),0.763,0.768,0.771,0.798
1,breast cancer,0.599,0.591,0.59,0.569
2,breast cancer wisconsin,0.954,0.971,0.968,0.967
3,cmc,0.628,0.661,0.654,0.657
4,ionosphere,0.882,0.888,0.886,0.898
5,iris,0.936,0.931,0.916,0.931
6,liver disorders,0.661,0.668,0.609,0.685
7,lung cancer,0.583,0.583,0.563,0.625
8,lymphography,0.775,0.838,0.866,0.875
9,mushroom,1.0,1.0,1.0,1.0


In [14]:
# Sanity check on a single pair before testing all of them:
# Wilcoxon signed-rank test on the per-dataset AUCs of C4.5 vs C4.5+m.
# Kept as live code so the displayed result is reproduced on a fresh run.
stats.wilcoxon(aucs['C4.5'], aucs['C4.5+m'])

WilcoxonResult(statistic=6.5, pvalue=0.01075713311978963)

In [29]:
%%time 
w_stat = []
stat
for i in range(1,5,1):
    for j in range(1,5,1):
        if i == j: 
            continue
            
        stat, p = stats.wilcoxon(aucs.iloc[:,i], aucs.iloc[:,j])
#         print(stats.wilcoxon(aucs.iloc[:,i], aucs.iloc[:,j])[1])
        w_stat.append([aucs.columns.values[i], aucs.columns.values[j], stat, p])

Wall time: 8 ms


In [43]:
# Inspect the collected [model 1, model 2, statistic, p-value] records.
w_stat

[['C4.5', 'C4.5+m', 6.5, 0.01075713311978963],
 ['C4.5', 'C4.5+cf', 43.0, 0.861262330095348],
 ['C4.5', 'C4.5+m+cf', 11.0, 0.015906444101703374],
 ['C4.5+m', 'C4.5', 6.5, 0.01075713311978963],
 ['C4.5+m', 'C4.5+cf', 17.0, 0.046332729793395394],
 ['C4.5+m', 'C4.5+m+cf', 22.0, 0.3278256758446406],
 ['C4.5+cf', 'C4.5', 43.0, 0.861262330095348],
 ['C4.5+cf', 'C4.5+m', 17.0, 0.046332729793395394],
 ['C4.5+cf', 'C4.5+m+cf', 10.0, 0.022909099354356588],
 ['C4.5+m+cf', 'C4.5', 11.0, 0.015906444101703374],
 ['C4.5+m+cf', 'C4.5+m', 22.0, 0.3278256758446406],
 ['C4.5+m+cf', 'C4.5+cf', 10.0, 0.022909099354356588]]

In [57]:
# Turn the list of test records into a table with named columns.
aucs_stat = pd.DataFrame.from_records(
    w_stat,
    columns=['Model 1', 'Model 2', 'stat', 'p-value'],
)

In [58]:
# Preview the first rows of the pairwise-test table.
aucs_stat.head()

Unnamed: 0,Model 1,Model 2,stat,p-value
0,C4.5,C4.5+m,6.5,0.010757
1,C4.5,C4.5+cf,43.0,0.861262
2,C4.5,C4.5+m+cf,11.0,0.015906
3,C4.5+m,C4.5,6.5,0.010757
4,C4.5+m,C4.5+cf,17.0,0.046333


In [67]:
# The smallest raw p-value marks the most statistically significant difference.
top_diff_idx = aucs_stat['p-value'].idxmin()
top_pair = tuple(aucs_stat.loc[top_diff_idx, ['Model 1', 'Model 2']])
print('Two classifiers with the highest significance difference: %s & %s' % top_pair)

Two classifiers with the highest significance difference: C4.5 & C4.5+m


### Вопрос 3

Сколько статистически значимых на уровне 0.05 различий мы обнаружили?

In [69]:
# Count the comparisons whose raw (uncorrected) p-value is at or below 0.05.
is_significant = aucs_stat['p-value'] <= 0.05
sign_diff_models = int(is_significant.sum())
print('Number of p-value <= 0.05: %d' % sign_diff_models)

Number of p-value <= 0.05: 8


### Вопрос 5

Сравнивая 4 классификатора между собой, мы проверили 6 гипотез. Давайте сделаем поправку на множественную проверку. Начнём с метода Холма. Сколько гипотез можно отвергнуть на уровне значимости 0.05 после поправки этим методом?

In [72]:
# Holm correction of the raw p-values at a significance level of 0.05.
# `reject` flags the hypotheses rejected after correction and `p_corrected`
# holds the adjusted p-values; the remaining two return values are not used here.
reject, p_corrected, a1, a2 = multipletests(
    aucs_stat['p-value'], alpha=0.05, method='holm'
)

In [73]:
# Attach the adjusted p-values and the rejection flags to the results table.
aucs_stat['p_corrected'], aucs_stat['reject'] = p_corrected, reject

In [78]:
# Preview the table together with the 'p_corrected' and 'reject' columns.
aucs_stat.head()

Unnamed: 0,Model 1,Model 2,stat,p-value,p_corrected,reject
0,C4.5,C4.5+m,6.5,0.010757,0.129086,False
1,C4.5,C4.5+cf,43.0,0.861262,1.0,False
2,C4.5,C4.5+m+cf,11.0,0.015906,0.159064,False
3,C4.5+m,C4.5,6.5,0.010757,0.129086,False
4,C4.5+m,C4.5+cf,17.0,0.046333,0.277996,False


In [77]:
# Tally how many hypotheses are rejected vs. retained after the correction.
aucs_stat['reject'].value_counts()

False    12
Name: reject, dtype: int64

### Вопрос 6

Сколько гипотез можно отвергнуть на уровне значимости 0.05 после поправки методом Бенджамини-Хохберга?

In [79]:
# Benjamini-Hochberg (FDR) correction of the raw p-values at level 0.05.
# `reject` flags the hypotheses rejected after correction and `p_corrected`
# holds the adjusted p-values; the remaining two return values are not used here.
reject, p_corrected, a1, a2 = multipletests(
    aucs_stat['p-value'], alpha=0.05, method='fdr_bh'
)

In [81]:
# Overwrite the correction columns with the most recently computed values.
aucs_stat['p_corrected'], aucs_stat['reject'] = p_corrected, reject

In [87]:
# Preview the table with the current 'p_corrected' and 'reject' columns.
aucs_stat.head()

Unnamed: 0,Model 1,Model 2,stat,p-value,p_corrected,reject
0,C4.5,C4.5+m,6.5,0.010757,0.045818,True
1,C4.5,C4.5+cf,43.0,0.861262,0.861262,False
2,C4.5,C4.5+m+cf,11.0,0.015906,0.045818,True
3,C4.5+m,C4.5,6.5,0.010757,0.045818,True
4,C4.5+m,C4.5+cf,17.0,0.046333,0.069499,False


In [83]:
# Tally how many hypotheses are rejected vs. retained after the correction.
aucs_stat['reject'].value_counts()

True     6
False    6
Name: reject, dtype: int64

In [84]:
# Show the rejected hypotheses, largest adjusted p-value first.
rejected_rows = aucs_stat[aucs_stat['reject']]
rejected_rows.sort_values('p_corrected', ascending=False).head()

Unnamed: 0,Model 1,Model 2,stat,p-value,p_corrected,reject
0,C4.5,C4.5+m,6.5,0.010757,0.045818,True
2,C4.5,C4.5+m+cf,11.0,0.015906,0.045818,True
3,C4.5+m,C4.5,6.5,0.010757,0.045818,True
8,C4.5+cf,C4.5+m+cf,10.0,0.022909,0.045818,True
9,C4.5+m+cf,C4.5,11.0,0.015906,0.045818,True
