In [36]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr, wilcoxon
from statsmodels.sandbox.stats.multicomp import multipletests

Рассматриваются классификатор C4.5 и три его модификации: с оптимизацией гиперпараметра m, с оптимизацией гиперпараметра cf и с одновременной оптимизацией обоих гиперпараметров. Эти четыре классификатора сравнивались на 14 наборах данных. На каждом наборе данных был посчитан AUC каждого классификатора. Данные записаны в файле AUCs.txt:

In [37]:
# Tab-separated table of AUC scores; the first line of the file is the header row.
auc_file = 'AUCs.txt'
data = pd.read_csv(auc_file, sep='\t', header=0)
data.head()

Unnamed: 0.1,Unnamed: 0,C4.5,C4.5+m,C4.5+cf,C4.5+m+cf
0,adult (sample),0.763,0.768,0.771,0.798
1,breast cancer,0.599,0.591,0.59,0.569
2,breast cancer wisconsin,0.954,0.971,0.968,0.967
3,cmc,0.628,0.661,0.654,0.657
4,ionosphere,0.882,0.888,0.886,0.898


Используя критерий знаковых рангов Уилкоксона, проведите попарное сравнение каждого классификатора с каждым. Выберите два классификатора, различие между которыми наиболее статистически значимо.

In [38]:
from itertools import combinations


def _signed_rank_row(frame, first, second):
    """Return [first, second, statistic, p-value] of the Wilcoxon signed-rank test on two columns of `frame`."""
    statistic, p_value = wilcoxon(frame[first], frame[second])
    return [first, second, statistic, p_value]


# Compare every unordered pair of columns after the first one
# (the first column of `data` is skipped by the [1:] slice).
stats_data = [
    _signed_rank_row(data, first, second)
    for first, second in combinations(data.columns[1:], 2)
]

statistics_df = pd.DataFrame.from_records(
    stats_data,
    columns=['Classifier A', 'Classifier B', 'statistics', 'p-value'],
)

In [39]:
# Show the pairwise comparison table (one row per pair of compared columns).
statistics_df

Unnamed: 0,Classifier A,Classifier B,statistics,p-value
0,C4.5,C4.5+m,6.5,0.010757
1,C4.5,C4.5+cf,43.0,0.861262
2,C4.5,C4.5+m+cf,11.0,0.015906
3,C4.5+m,C4.5+cf,17.0,0.046333
4,C4.5+m,C4.5+m+cf,22.0,0.327826
5,C4.5+cf,C4.5+m+cf,10.0,0.022909


Сравнивая 4 классификатора между собой, мы проверили 6 гипотез. Давайте сделаем поправку на множественную проверку. Начнём с метода Холма. Сколько гипотез можно отвергнуть на уровне значимости 0.05 после поправки этим методом?

In [40]:
# Holm correction of the six pairwise p-values at the 0.05 level.
holm_outcome = multipletests(
    pvals=statistics_df['p-value'],
    method='holm',
    alpha=0.05,
)
# Only `reject` and `p_corrected` are used by the following cell; `a1` and `a2`
# hold the remaining two return values and are kept for compatibility.
reject, p_corrected, a1, a2 = holm_outcome

In [41]:
# Attach the Holm rejection flags and adjusted p-values to the comparison table
# (in place, so later cells see the added columns).
for column_name, column_values in (
    ('holm reject', reject),
    ('holm p-value', p_corrected),
):
    statistics_df[column_name] = column_values

statistics_df

Unnamed: 0,Classifier A,Classifier B,statistics,p-value,holm reject,holm p-value
0,C4.5,C4.5+m,6.5,0.010757,False,0.064543
1,C4.5,C4.5+cf,43.0,0.861262,False,0.861262
2,C4.5,C4.5+m+cf,11.0,0.015906,False,0.079532
3,C4.5+m,C4.5+cf,17.0,0.046333,False,0.138998
4,C4.5+m,C4.5+m+cf,22.0,0.327826,False,0.655651
5,C4.5+cf,C4.5+m+cf,10.0,0.022909,False,0.091636


Сколько гипотез можно отвергнуть на уровне значимости 0.05 после поправки методом Бенджамини-Хохберга?

In [42]:
# Benjamini-Hochberg (method='fdr_bh') correction of the pairwise p-values at the 0.05 level.
bh_outcome = multipletests(
    pvals=statistics_df['p-value'],
    method='fdr_bh',
    alpha=0.05,
)
# Only `reject` and `p_corrected` are used by the following cell; `a1` and `a2`
# hold the remaining two return values and are kept for compatibility.
reject, p_corrected, a1, a2 = bh_outcome

In [43]:
# Attach the Benjamini-Hochberg rejection flags and adjusted p-values to the
# comparison table (in place, next to any columns added earlier).
for column_name, column_values in (
    ('bh reject', reject),
    ('bh p-value', p_corrected),
):
    statistics_df[column_name] = column_values

statistics_df

Unnamed: 0,Classifier A,Classifier B,statistics,p-value,holm reject,holm p-value,bh reject,bh p-value
0,C4.5,C4.5+m,6.5,0.010757,False,0.064543,True,0.045818
1,C4.5,C4.5+cf,43.0,0.861262,False,0.861262,False,0.861262
2,C4.5,C4.5+m+cf,11.0,0.015906,False,0.079532,True,0.045818
3,C4.5+m,C4.5+cf,17.0,0.046333,False,0.138998,False,0.069499
4,C4.5+m,C4.5+m+cf,22.0,0.327826,False,0.655651,False,0.393391
5,C4.5+cf,C4.5+m+cf,10.0,0.022909,False,0.091636,True,0.045818
