This notebook should help to compute the Student T test for different settings.

In [57]:
model = "bert"
dataset1 = "ms_marco"
dataset2 = "threshold=3"
cutoff = 10
metric = "precision"

file1 = "output/%s/%s_scores_%s_%s_N%s.txt"%(metric,metric,model,dataset1,cutoff)
file2 = "output/%s/%s_scores_%s_%s_N%s.txt"%(metric,metric,model,dataset2,cutoff)

output_file = "output/student_t_test_output/student_t_test_results_experiment1_%s_%s_vs_%s_%s_N%s.txt"%(model,dataset1,dataset2,metric,cutoff)

In [58]:
data1 = []
with open (file1,'r') as infile:
    for line in infile:
        data1.append(float(line.rstrip().split()[1]))
        
data2 = []
with open (file2,'r') as infile:
    for line in infile:
        data2.append(float(line.rstrip().split()[1]))

In [59]:
# t-test for dependent samples
from math import sqrt
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from scipy.stats import t

In [60]:
# function for calculating the t-test for two dependent samples
def dependent_ttest(data1, data2, alpha):
    # calculate means
    mean1, mean2 = mean(data1), mean(data2)
    
    # number of paired samples
    n = len(data1)
    
    # sum squared difference between observations
    d1 = sum([(data1[i]-data2[i])**2 for i in range(n)])
    
    # sum difference between observations
    d2 = sum([data1[i]-data2[i] for i in range(n)])
    
    # standard deviation of the difference between means
    sd = sqrt((d1 - (d2**2 / n)) / (n - 1))
    
    # standard error of the difference between the means
    sed = sd / sqrt(n)
    
    # calculate the t statistic
    t_stat = (mean1 - mean2) / sed
    
    # degrees of freedom
    df = n - 1
    
    # calculate the critical value
    cv = t.ppf(1.0 - alpha, df)
    
    # calculate the p-value
    p = (1.0 - t.cdf(abs(t_stat), df)) * 2.0
    
    # return everything
    return t_stat, df, cv, p

In [61]:
# calculate the t test
alpha = 0.05
t_stat, df, cv, p = dependent_ttest(data1, data2, alpha)

In [62]:
print('t=%.3f, df=%d, cv=%.3f, p=%.10f' % (t_stat, df, cv, p))

t=-9.970, df=42, cv=1.682, p=0.0000000000


In [63]:
if abs(t_stat) <= cv:
	print('Accept null hypothesis that the means are equal.')
else:
	print('Reject the null hypothesis that the means are equal.')
# interpret via p-value
if p > alpha:
	print('Accept null hypothesis that the means are equal.')
else:
	print('Reject the null hypothesis that the means are equal.')

Reject the null hypothesis that the means are equal.
Reject the null hypothesis that the means are equal.


In [64]:
with open(output_file,'w') as outfile:
    outfile.write("model: %s\n"%(model))
    outfile.write("dataset1: %s\n"%(dataset1))
    outfile.write("dataset2: %s\n"%(dataset2))
    outfile.write("metric: %s@%s\n"%(metric,cutoff))
    outfile.write("alpha: %s\n\n"%(alpha))
    outfile.write('t=%.3f, df=%d, cv=%.3f, p=%.10f\n\n' % (t_stat, df, cv, p))
    outfile.write("interpret via critical value:\n")
    outfile.write("t <= cv\n")
    if abs(t_stat) <= cv:
        outfile.write('Accept null hypothesis that the means are equal.\n\n')
    else:
        outfile.write('Reject the null hypothesis that the means are equal.\n\n')
    outfile.write("interpret via p-value:\n")
    outfile.write("p > alpha\n")
    if p > alpha:
        outfile.write('Accept null hypothesis that the means are equal.\n')
    else:
        outfile.write('Reject the null hypothesis that the means are equal.\n')