In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from pathlib import Path
from scipy.stats import ttest_rel

In [2]:
results_file = Path(r"../DVlog/trained_models/test_metrics.csv")

# load in the file with the results on the test set
df_results = pd.read_csv(results_file, sep=";")
df_results.head()

Unnamed: 0.1,Unnamed: 0,name,seed,precision,recall,f1,f1_m,f1_f,Eq accuracy,eq opportunity,pred equality,unpriv_TPR,unpriv_FPR,priv_TPR,priv_FPR
0,0,unimodal_bias_mf-group_mpnet_sent,0,0.929159,0.927273,0.926964,0.877117,0.953358,0.92003,0.9,1.408046,0.9,0.172414,1.0,0.122449
1,1,unimodal_bias_mf-group_mpnet_sent,1,0.927386,0.927273,0.927192,0.877193,0.953522,0.919951,0.881111,1.351724,0.866667,0.137931,0.983607,0.102041
2,2,unimodal_bias_mf-group_mpnet_sent,1123,0.918511,0.915152,0.914619,0.859389,0.943915,0.910452,0.9,1.448276,0.9,0.206897,1.0,0.142857
3,3,unimodal_bias_mf-group_mpnet_sent,3407,0.933333,0.933333,0.933299,0.894672,0.953522,0.938282,0.881111,1.013793,0.866667,0.103448,0.983607,0.102041
4,4,unimodal_bias_mf-group_mpnet_sent,42,0.934632,0.933333,0.93311,0.876737,0.962873,0.910543,0.948889,2.534483,0.933333,0.206897,0.983607,0.081633


In [3]:
eqodds_file = Path(r"../DVlog/trained_models/eqodds_metrics.csv")

# load in the file with the eq odds results on the test set
df_eqodds = pd.read_csv(eqodds_file, sep=";")

# setup the dataframe
df_eqodds = df_eqodds[df_eqodds["dataset"] == "test"]
df_eqodds.drop("dataset", axis=1, inplace=True)
df_eqodds.head()

Unnamed: 0.1,Unnamed: 0,name,seed,precision,recall,f1,f1_m,f1_f,eq opportunity,Eq accuracy,pred equality
2,2,eqodds,0,0.934,0.933,0.933,0.877,0.963,0.86,0.91,1.69
5,5,eqodds,1,0.928,0.927,0.927,0.894,0.944,0.86,0.95,0.68
8,8,eqodds,42,0.906,0.903,0.902,0.842,0.935,0.92,0.9,1.69
11,11,eqodds,1123,0.934,0.933,0.933,0.895,0.954,0.91,0.94,1.69
14,14,eqodds,3407,0.933,0.933,0.933,0.895,0.954,0.93,0.94,1.69


In [4]:
df = pd.concat([df_results, df_eqodds], ignore_index=True)
df.head()

Unnamed: 0.1,Unnamed: 0,name,seed,precision,recall,f1,f1_m,f1_f,Eq accuracy,eq opportunity,pred equality,unpriv_TPR,unpriv_FPR,priv_TPR,priv_FPR
0,0,unimodal_bias_mf-group_mpnet_sent,0,0.929159,0.927273,0.926964,0.877117,0.953358,0.92003,0.9,1.408046,0.9,0.172414,1.0,0.122449
1,1,unimodal_bias_mf-group_mpnet_sent,1,0.927386,0.927273,0.927192,0.877193,0.953522,0.919951,0.881111,1.351724,0.866667,0.137931,0.983607,0.102041
2,2,unimodal_bias_mf-group_mpnet_sent,1123,0.918511,0.915152,0.914619,0.859389,0.943915,0.910452,0.9,1.448276,0.9,0.206897,1.0,0.142857
3,3,unimodal_bias_mf-group_mpnet_sent,3407,0.933333,0.933333,0.933299,0.894672,0.953522,0.938282,0.881111,1.013793,0.866667,0.103448,0.983607,0.102041
4,4,unimodal_bias_mf-group_mpnet_sent,42,0.934632,0.933333,0.93311,0.876737,0.962873,0.910543,0.948889,2.534483,0.933333,0.206897,0.983607,0.081633


In [5]:
# Performance metrics of the two models on the test set (e.g., F1-scores for each test sample)
base_model = "unimodal_mpnet_sent_keyw"
comparison_models = ["bimodal_mpnet_aud_sync_concat", "trimodal_mpnet_av_sync_keyw_concat"]
columns = ["f1", "f1_m", "f1_f", "Eq accuracy", "eq opportunity", "pred equality"]

# go over each comparison model and compute the t-test
base_info = df_results[df_results["name"] == base_model]
base_info

for comp_name in comparison_models:
    print(comp_name)
    comp_info = df_results[df_results["name"] == comp_name]
    
    # for each column perform the paired t-test
    for column in columns:
        model1_scores = base_info[column].values
        model2_scores = comp_info[column].values
        
        t_statistic, p_value = ttest_rel(model1_scores, model2_scores, alternative="two-sided")
        print(f"t-test: t-statistic = {t_statistic}, p-value = {p_value};  Column:{column}")

bimodal_mpnet_aud_sync_concat
t-test: t-statistic = -2.008846135755238, p-value = 0.11495014114274467;  Column:f1
t-test: t-statistic = -0.0055854988614234035, p-value = 0.9958109030810942;  Column:f1_m
t-test: t-statistic = -2.7356197733675227, p-value = 0.052140147849367685;  Column:f1_f
t-test: t-statistic = 0.8095135052154954, p-value = 0.4636240204258184;  Column:Eq accuracy
t-test: t-statistic = -0.5608502723790354, p-value = 0.6048283018503621;  Column:eq opportunity
t-test: t-statistic = -2.637501010186526, p-value = 0.057731087854918685;  Column:pred equality
trimodal_mpnet_av_sync_keyw_concat
t-test: t-statistic = 0.29794948979633096, p-value = 0.7805766456143577;  Column:f1
t-test: t-statistic = -0.00997253595630603, p-value = 0.9925207529948804;  Column:f1_m
t-test: t-statistic = 0.467982520118043, p-value = 0.6641578090529878;  Column:f1_f
t-test: t-statistic = -0.2118887140368696, p-value = 0.8425525812002301;  Column:Eq accuracy
t-test: t-statistic = -0.45415306374438447

In [6]:
base_model = "unimodal_mpnet_sent_keyw"
bias_models = ['unimodal_bias_oversample_mpnet_sent_keyw', 'unimodal_bias_mf-group_mpnet_sent', 'unimodal_bias_mf-mixgender_mpnet_sent', 'unimodal_bias_mf-subgroup_mpnet_sent',
                'unimodal_bias_mf-synthetic_mpnet_sent', 'unimodal_bias_mf-synthetic-mixg_mpnet_sent', 'unimodal_bias_reweighing_mpnet_sent_keyw', "eqodds"]
columns = ["f1", "f1_m", "f1_f", "Eq accuracy", "eq opportunity", "pred equality"]

# go over each comparison model and compute the t-test
base_info = df[df["name"] == base_model]

# get all the t-test statistics
pstats_info, tstats_info = [], []
for comp_name in bias_models:
    pstats, tstats = [], []
    comp_info = df[df["name"] == comp_name]
    
    # for each column perform the paired t-test
    for column in columns:
        model1_scores = base_info[column].values
        model2_scores = comp_info[column].values
        
        t_statistic, p_value = ttest_rel(model1_scores, model2_scores, alternative="two-sided")
        pstats.append(p_value)
        tstats.append(t_statistic)
        
    pstats_info.append((comp_name, *pstats))
    tstats_info.append((comp_name, *tstats))

# make the dataframes
df_ptstats = pd.DataFrame(pstats_info, columns=['name', *columns])
df_ttstats = pd.DataFrame(tstats_info, columns=['name', *columns])

df_ptstats

Unnamed: 0,name,f1,f1_m,f1_f,Eq accuracy,eq opportunity,pred equality
0,unimodal_bias_oversample_mpnet_sent_keyw,0.642809,0.408715,0.757998,0.320716,0.513445,0.112155
1,unimodal_bias_mf-group_mpnet_sent,0.869067,0.427329,0.641437,0.216443,0.725241,0.517774
2,unimodal_bias_mf-mixgender_mpnet_sent,0.742757,0.37787,0.587845,0.400681,0.006968,0.242857
3,unimodal_bias_mf-subgroup_mpnet_sent,0.974011,0.817775,0.85609,0.708904,0.447953,0.616341
4,unimodal_bias_mf-synthetic_mpnet_sent,0.070468,0.067356,0.140766,0.093027,0.568631,0.057731
5,unimodal_bias_mf-synthetic-mixg_mpnet_sent,0.157157,0.200379,0.722802,0.452426,0.059158,0.142469
6,unimodal_bias_reweighing_mpnet_sent_keyw,0.439911,0.999404,0.091018,0.544897,0.727785,0.513096
7,eqodds,0.789445,0.709026,0.98434,0.651755,0.995844,0.216911


In [7]:
df_ttstats

Unnamed: 0,name,f1,f1_m,f1_f,Eq accuracy,eq opportunity,pred equality
0,unimodal_bias_oversample_mpnet_sent_keyw,0.500809,0.921983,-0.329944,1.132482,-0.71622,-2.030449
1,unimodal_bias_mf-group_mpnet_sent,0.175698,0.882546,-0.50294,1.466332,-0.377117,-0.708441
2,unimodal_bias_mf-mixgender_mpnet_sent,0.351774,0.990797,-0.588463,0.939468,-5.102947,-1.368933
3,unimodal_bias_mf-subgroup_mpnet_sent,0.03466,0.246021,-0.193372,0.40102,-0.840462,-0.542412
4,unimodal_bias_mf-synthetic_mpnet_sent,2.449707,2.491784,1.832818,2.196388,-0.620337,-2.637501
5,unimodal_bias_mf-synthetic-mixg_mpnet_sent,1.73826,1.531601,0.380668,0.831541,-2.614208,-1.822464
6,unimodal_bias_reweighing_mpnet_sent_keyw,-0.856681,0.000794,-2.215986,0.660733,-0.373418,-0.71685
7,eqodds,0.285486,0.40084,-0.020881,0.486978,0.005541,-1.464506
