# Множественная проверка гипотез

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations

import warnings
warnings.filterwarnings('ignore')

Имеются классификатор C4.5 и три его модификации: с оптимизацией гиперпараметра m, с оптимизацией гиперпараметра cf и с одновременной оптимизацией обоих гиперпараметров. Эти четыре классификатора сравнивались на 14 наборах данных; на каждом наборе данных был посчитан AUC каждого классификатора.

Используя критерий знаковых рангов Уилкоксона, проведите попарное сравнение каждого классификатора с каждым. Выберите два классификатора, различие между которыми наиболее статистически значимо.

In [2]:
# Load the tab-separated AUC table (one row per dataset).
aucs = pd.read_csv('AUCs.txt', sep='\t')

In [3]:
# Preview the first rows of the loaded table.
aucs.head()

Unnamed: 0.1,Unnamed: 0,C4.5,C4.5+m,C4.5+cf,C4.5+m+cf
0,adult (sample),0.763,0.768,0.771,0.798
1,breast cancer,0.599,0.591,0.59,0.569
2,breast cancer wisconsin,0.954,0.971,0.968,0.967
3,cmc,0.628,0.661,0.654,0.657
4,ionosphere,0.882,0.888,0.886,0.898


In [4]:
# Summarize column dtypes and non-null counts.
aucs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
Unnamed: 0    14 non-null object
C4.5          14 non-null float64
C4.5+m        14 non-null float64
C4.5+cf       14 non-null float64
C4.5+m+cf     14 non-null float64
dtypes: float64(4), object(1)
memory usage: 688.0+ bytes


In [5]:
# Label only the dataset-name column (read in as 'Unnamed: 0'). The classifier columns
# keep the headers from the file, so they cannot be silently mislabeled if the file's
# column order ever changes.
aucs = aucs.rename(columns={'Unnamed: 0': 'name'})

In [6]:
# Wilcoxon signed-rank test for every unordered pair of classifier columns.
# combinations() yields each pair exactly once, in column order, so no index
# bookkeeping is needed to skip self-pairs or mirrored duplicates.
data = []

# The first column holds the dataset name, so it is excluded from the comparison.
for lhs_column, rhs_column in combinations(aucs.columns[1:], 2):
    # Only the p-value is kept; the test statistic is discarded.
    _, p = stats.wilcoxon(aucs[lhs_column], aucs[rhs_column])
    data.append([lhs_column, rhs_column, p])

In [7]:
# Show the collected [model, model, p-value] rows.
data

[['C4.5', 'C4.5+m', 0.01075713311978963],
 ['C4.5', 'C4.5+cf', 0.861262330095348],
 ['C4.5', 'C4.5+m+cf', 0.015906444101703374],
 ['C4.5+m', 'C4.5+cf', 0.046332729793395394],
 ['C4.5+m', 'C4.5+m+cf', 0.3278256758446406],
 ['C4.5+cf', 'C4.5+m+cf', 0.022909099354356588]]

In [8]:
# Tabulate the pairwise results, naming the columns at construction time.
df = pd.DataFrame(data, columns=['Model 1', 'Model 2', 'p-value'])

In [9]:
# Display the table of uncorrected pairwise p-values.
df

Unnamed: 0,Model 1,Model 2,p-value
0,C4.5,C4.5+m,0.010757
1,C4.5,C4.5+cf,0.861262
2,C4.5,C4.5+m+cf,0.015906
3,C4.5+m,C4.5+cf,0.046333
4,C4.5+m,C4.5+m+cf,0.327826
5,C4.5+cf,C4.5+m+cf,0.022909


Сравнивая 4 классификатора между собой, мы проверили 6 гипотез. Давайте сделаем поправку на множественную проверку. Начнём с метода Холма. Сколько гипотез можно отвергнуть на уровне значимости 0.05 после поправки этим методом?

In [10]:
# Holm correction of the pairwise p-values at the 0.05 level.
# Only `reject` and `p_corrected` are used in the following cells; a1 and a2 receive
# the remaining two return values.
reject, p_corrected, a1, a2 = multipletests(pvals=df['p-value'], alpha=0.05, method='holm')

In [11]:
# Attach the Holm-adjusted p-values and the rejection flags to the results table.
df = df.assign(p_corrected=p_corrected, reject=reject)

In [12]:
# Display the table with the Holm-corrected p-values and rejection decisions.
df

Unnamed: 0,Model 1,Model 2,p-value,p_corrected,reject
0,C4.5,C4.5+m,0.010757,0.064543,False
1,C4.5,C4.5+cf,0.861262,0.861262,False
2,C4.5,C4.5+m+cf,0.015906,0.079532,False
3,C4.5+m,C4.5+cf,0.046333,0.138998,False
4,C4.5+m,C4.5+m+cf,0.327826,0.655651,False
5,C4.5+cf,C4.5+m+cf,0.022909,0.091636,False


Сколько гипотез можно отвергнуть на уровне значимости 0.05 после поправки методом Бенджамини-Хохберга? 

In [13]:
# Benjamini-Hochberg correction of the same pairwise p-values at the 0.05 level.
# Only `reject` and `p_corrected` are used in the following cells; a1 and a2 receive
# the remaining two return values.
reject, p_corrected, a1, a2 = multipletests(pvals=df['p-value'], alpha=0.05, method='fdr_bh')

In [14]:
# Replace the previously stored correction columns with the Benjamini-Hochberg results.
df = df.assign(p_corrected=p_corrected, reject=reject)

In [15]:
# Display the table with the Benjamini-Hochberg-corrected p-values and rejection decisions.
df

Unnamed: 0,Model 1,Model 2,p-value,p_corrected,reject
0,C4.5,C4.5+m,0.010757,0.045818,True
1,C4.5,C4.5+cf,0.861262,0.861262,False
2,C4.5,C4.5+m+cf,0.015906,0.045818,True
3,C4.5+m,C4.5+cf,0.046333,0.069499,False
4,C4.5+m,C4.5+m+cf,0.327826,0.393391,False
5,C4.5+cf,C4.5+m+cf,0.022909,0.045818,True
