## Bengio's t-test (corrected t-test for repeated k-fold cross-validation)

Reference: R. R. Bouckaert and E. Frank, "Evaluating the Replicability of Significance Tests for Comparing Learning Algorithms" (PAKDD 2004).

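\begin{equation*}
t = \frac{\frac{1}{k \cdot r} \sum_{i=1}^{k} \sum_{j=1}^{r} x_{ij}}{\sqrt{\left(\frac{1}{k \cdot r}+\frac{n_2}{n_1}\right) \hat{\sigma}^2}}
\end{equation*}

where $x_{ij}$ is the difference between the two models' scores on fold $i$ of repetition $j$, $k$ is the number of folds, $r$ is the number of repetitions, $n_2/n_1$ is the ratio of test-set size to training-set size ($1/(k-1)$ for $k$-fold cross-validation), and $\hat{\sigma}^2$ is the sample variance of the $x_{ij}$. The statistic is compared against a Student $t$ distribution with $k \cdot r - 1$ degrees of freedom.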

In [1]:
import pandas as pd
import csv
import numpy as np
import math
from scipy import stats

In [2]:
# Locations of the two detailed score files that will be compared.
# NOTE(review): judging by the paths, input1 is an earlier ("history") run and
# input2 the current run of the same model, and the values are AUROC scores —
# confirm against whatever writes these files.
input1_path = "../tests/model_comparison/history/_sgd_ridge_full.txtauroc_detailed"
input2_path = "../tests/model_comparison/_sgd_ridge_full.txtauroc_detailed"

In [3]:
# Read each score file into a list of raw text lines (trailing newlines are
# kept); the lines are parsed into floats in a later cell.
with open(input1_path, 'r') as handle:
    input1 = list(handle)
with open(input2_path, 'r') as handle:
    input2 = list(handle)

In [4]:
# Inspect the raw lines read from input1_path: a leading blank line followed
# by rows of comma-separated scores (parsed into floats in a later cell).
input1

['\n',
 '0.9066688334201541,0.9164008971844152,0.9046148253495776,0.8991174800841989,0.9036605818984468,0.9123973875424538,0.9030254130192741,0.9173632005968997,0.9142877559206835,0.9056298723176838\n',
 '0.906931906059869,0.9052131184164419,0.9000543196283854,0.8992682826671456,0.8921514447699461,0.919441840199839,0.915634074980436,0.9074656790721513,0.9133532439143618,0.9168963554192373\n',
 '0.9075276472757309,0.9080884745301492,0.9001910756434753,0.9140671201416338,0.9019584847449268,0.911992308604262,0.9136520650325699,0.9178926336650294,0.9089306681644842,0.9071711221869809\n',
 '0.9185038543969495,0.9062386145397743,0.9104839666621545,0.9119746763022556,0.9142589874279368,0.9086578314913378,0.9036988625541179,0.9024661094395227,0.8990699192695772,0.9114175176884737\n',
 '0.9083117614928697,0.9054342015078424,0.8980499566591306,0.9084181713864395,0.9127585017276176,0.9085790661422448,0.90601437821427,0.9186749510645618,0.9113646218532431,0.9060632254016939\n',
 '0.901784358614869

In [5]:
# Inspect the raw lines read from input2_path: a leading blank line followed
# by rows of comma-separated scores (parsed into floats in a later cell).
input2

['\n',
 '0.924909137077753,0.92220100162241,0.8958646719050367,0.9126491753884468,0.9059185306962729,0.9058941036524678,0.9113668954214893,0.921652919605479,0.9168081209030022,0.9013695843159015\n',
 '0.9177795646759951,0.9071535165853947,0.9197048985880033,0.902947378466936,0.908929305862945,0.9087229541498707,0.9176250753980449,0.9088209538646492,0.9083548698030595,0.9115648174824206\n',
 '0.9092971316969863,0.9166408667851283,0.9119217607467631,0.9096040319392119,0.904271409451789,0.911693396290725,0.9208273583939361,0.9140636881960228,0.9148212882045592,0.9079539464558191\n',
 '0.9151945585635535,0.9116784844325883,0.9216496889244374,0.913749670518944,0.9177122482878346,0.9105484496212105,0.9018264820694913,0.907398799933132,0.9080861174244445,0.9055797062788324\n',
 '0.9133704832691791,0.9086963968406174,0.9049530943951862,0.921063013820026,0.9128178924119377,0.908717273442009,0.9009685092241683,0.9146451550945934,0.9184438255599303,0.911375879865268\n',
 '0.9120031368868812,0.914

In [6]:
# Parse the scores of the repeated cross-validation runs: every line after the
# first becomes one list of floats.
# The splitting seed for each cross-validation is the same in both files, so
# row i of data1 is paired with row i of data2.
# The number of rows is taken from input1; input2 is indexed with the same
# row numbers.

data1, data2 = [], []
for row_idx in range(1, len(input1)):
    data1.append(list(map(float, input1[row_idx].split(','))))
    data2.append(list(map(float, input2[row_idx].split(','))))

In [7]:
def bengio_ttest(data1, data2):
    """Corrected paired t-test for repeated k-fold cross-validation (two-sided).

    Works on the element-wise differences ``x = data1 - data2`` and computes

        t = mean(x) / sqrt((1/(k*r) + 1/(k-1)) * var(x))

    where ``var`` is the sample variance (denominator ``k*r - 1``) and
    ``1/(k-1)`` is the test/train size ratio n2/n1 of k-fold cross-validation.
    The two-sided p-value is taken from a Student t distribution with
    ``k*r - 1`` degrees of freedom.

    Parameters
    ----------
    data1, data2 : 2-D array-like of float
        Scores of the two models, one row per cross-validation repetition
        (``r`` rows) and one column per fold (``k`` columns).  Both inputs
        must have the same shape, and entry ``[i][j]`` of each is treated as
        a matched pair.

    Returns
    -------
    tuple of float
        ``(t, pval)``.  The same values are also printed, formatted to eight
        decimals.

    Raises
    ------
    ValueError
        If the inputs cannot be converted to 2-D float arrays, if their
        shapes differ, or if there are fewer than two folds (the correction
        term ``1/(k-1)`` is undefined for ``k < 2``).

    Notes
    -----
    When the sample variance of the differences is zero the statistic is not
    finite (``inf`` or ``nan``); numpy emits a RuntimeWarning in that case.
    """
    scores1 = np.asarray(data1, dtype=float)
    scores2 = np.asarray(data2, dtype=float)

    if scores1.ndim != 2 or scores2.ndim != 2:
        raise ValueError(
            "data1 and data2 must be 2-D (repetitions x folds); got shapes "
            "%s and %s" % (scores1.shape, scores2.shape))
    # Compare shapes explicitly: a plain subtraction would silently broadcast
    # e.g. a (1, k) input against an (r, k) one and yield a meaningless test.
    if scores1.shape != scores2.shape:
        raise ValueError(
            "data1 and data2 must have the same shape; got %s and %s"
            % (scores1.shape, scores2.shape))

    # r is the number of cross-validation repetitions, k the number of folds.
    r, k = scores1.shape
    if r < 1 or k < 2:
        raise ValueError(
            "need at least one repetition and two folds; got r=%d, k=%d"
            % (r, k))

    diffs = scores1 - scores2
    n = k * r
    mean_diff = diffs.mean()
    var_diff = diffs.var(ddof=1)  # sample variance, denominator n - 1

    # 1/(k-1) is the variance correction for overlapping training sets.
    # Float literals keep the division exact even under integer-division
    # semantics.
    t = mean_diff / np.sqrt((1.0 / n + 1.0 / (k - 1)) * var_diff)

    # degrees of freedom
    df = n - 1

    pval = stats.t.sf(np.abs(t), df) * 2  # two-sided pvalue = Prob(abs(t)>tt)
    print('t-statistic = %6.8f pvalue = %6.8f' % (t, pval))
    return float(t), float(pval)

In [8]:
# Compare the two runs; the t-statistic and two-sided p-value are printed.
bengio_ttest(data1, data2)

t-statistic = -1.19923858 pvalue = 0.23329763
