In [1]:
import glob
import pickle
import re

import numpy as np
import pandas as pd
from scipy import stats

In [2]:
def organize_p_tests(res, alpha = 0.05):
    """Group per-experiment p-values by gamma setting and flag significance.

    Parameters
    ----------
    res : pandas.DataFrame
        Must contain the columns ``exp`` (experiment name holding a
        ``G_<digits>.<digits>`` token), ``norm_diversity_p`` and ``perc_LT_p``.
        The frame is modified in place: ``version`` and ``param`` columns are
        added to it.
    alpha : float, optional
        Not used by this function; kept so existing callers keep working.
        NOTE(review): presumably meant to be forwarded to
        ``bonferoni_significant`` - confirm against that helper's signature.

    Returns
    -------
    pandas.DataFrame
        One row per ``param`` value with the collected ``norm_diversity_p`` and
        ``perc_LT_p`` lists, the number of tests (``#tests``) and the value
        returned by ``bonferoni_significant`` for that row (``significant``).

    Raises
    ------
    IndexError
        If an ``exp`` value contains no ``G_<digits>.<digits>`` token.

    Notes
    -----
    Relies on ``re`` being imported and on a ``bonferoni_significant(row, column)``
    helper being in scope; the helper is not defined in the visible part of
    this notebook.
    """
    version_pattern = r'v?_'
    # Raw string: '\d' inside a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    gamma_pattern = r'G_\d+\.\d+'

    res['version'] = res['exp'].apply(lambda name: re.split(version_pattern, name)[0])
    res['param'] = res['exp'].apply(lambda name: re.findall(gamma_pattern, name)[0])

    # Collapse to one row per gamma setting, keeping every p-value in a list.
    grouped = (
        res.groupby('param')
        .agg({'norm_diversity_p': list, 'perc_LT_p': list})
        .reset_index()
    )
    grouped['#tests'] = grouped['perc_LT_p'].apply(len)
    grouped['significant'] = grouped[['#tests', 'perc_LT_p']].apply(
        lambda row: bonferoni_significant(row, 'perc_LT_p'), axis=1
    )

    return grouped
   
def run_p_tests(df1, df2, metric):
    """Return the p-value of ``scipy.stats.ttest_ind`` for one metric column.

    The ``metric`` column of ``df1`` and of ``df2`` are passed as the two
    samples, using the test's default settings. Only the p-value is returned;
    the test statistic is discarded.
    """
    sample_a = df1[metric].values
    sample_b = df2[metric].values
    outcome = stats.ttest_ind(sample_a, sample_b)
    return outcome[1]

def run_wilcox(df1, df2, metric):
    """Return the p-value of ``scipy.stats.wilcoxon`` for one metric column.

    The ``metric`` column of ``df1`` and of ``df2`` are passed as the two
    samples, using the test's default settings. Only the p-value is returned;
    the test statistic is discarded.
    """
    sample_a = df1[metric].values
    sample_b = df2[metric].values
    outcome = stats.wilcoxon(sample_a, sample_b)
    return outcome[1]



In [3]:
# Default root of the experiment outputs (previously hard-coded inside run_t_test).
_FULL_RUNS_DIR = '/home/mila/r/rebecca.salganik/scratch/PinSAGE_experiments/FULL_RUNS'

# Row labels of the returned tables, in output order.
_COMPARISON_LABELS = ['redress_vs_ps', 'boost_vs_ps', 'boost_vs_redress']


def _load_breakdown(path):
    """Unpickle one per-pid breakdown frame and cast every column to float."""
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at result files you produced yourself.
    with open(path, "rb") as handle:  # closes the file even if unpickling fails
        return pickle.load(handle).astype(float)


def _p_value_table(test_fn, pairs, metrics):
    """Apply ``test_fn(first, second, metric)`` to every pair/metric combination."""
    rows = [
        [float(test_fn(first, second, metric)) for metric in metrics]
        for first, second in pairs
    ]
    return pd.DataFrame(rows, columns=metrics, index=_COMPARISON_LABELS)


def run_t_test(dataset, gamma_r, gamma_b, mode, base_dir=_FULL_RUNS_DIR):
    """Compare the REDRESS, BOOST and utility runs metric by metric.

    Three pickled per-pid breakdown frames are loaded:

    * ``{base_dir}/{dataset}/REDRESS/v1_G_{gamma_r}_A_0.01_B_0.0/redress/``
    * ``{base_dir}/{dataset}/REDRESS/v1_G_{gamma_r}_A_0.01_B_0.0/utility/`` ("ps")
    * ``{base_dir}/{dataset}/BOOST/boost2/v1_G_{gamma_b}_A_0.01/redress/``

    Every column of the REDRESS frame whose name does not contain ``'pid'`` is
    treated as a metric and tested with ``run_p_tests`` and ``run_wilcox``.

    Parameters
    ----------
    dataset : str
        Dataset directory name under ``base_dir``.
    gamma_r, gamma_b : str
        Gamma values embedded in the REDRESS and BOOST run directory names.
    mode : str
        ``'perf'`` reads ``performance_breakdown_by_pid.pkl``; any other value
        reads ``log10_popcat_fairness_breakdown_by_pid.pkl``.
    base_dir : str, optional
        Root directory of the experiment outputs. Defaults to the location the
        notebook was originally written against.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(t_test, wilcox_test)``: p-values indexed by ``redress_vs_ps``,
        ``boost_vs_ps`` and ``boost_vs_redress``, with one column per metric.

    Raises
    ------
    FileNotFoundError
        If one of the three pickle files does not exist.
    """
    suffix = 'log10_popcat_fairness_breakdown_by_pid.pkl'
    if mode == 'perf':
        suffix = 'performance_breakdown_by_pid.pkl'

    prefix = f"{str(base_dir).rstrip('/')}/{dataset}/"
    r_p = f'{prefix}REDRESS/v1_G_{gamma_r}_A_0.01_B_0.0/redress/{suffix}'
    ps_p = f'{prefix}REDRESS/v1_G_{gamma_r}_A_0.01_B_0.0/utility/{suffix}'
    b_p = f'{prefix}BOOST/boost2/v1_G_{gamma_b}_A_0.01/redress/{suffix}'

    r_df = _load_breakdown(r_p)
    b_df = _load_breakdown(b_p)
    ps_df = _load_breakdown(ps_p)

    # The metric list comes from the REDRESS frame only; the other two frames
    # are expected to carry the same columns.
    metrics = [c for c in r_df.columns if 'pid' not in c]

    # Same order as _COMPARISON_LABELS.
    pairs = [(r_df, ps_df), (b_df, ps_df), (b_df, r_df)]

    t_test = _p_value_table(run_p_tests, pairs, metrics)
    wilcox_test = _p_value_table(run_wilcox, pairs, metrics)

    return t_test, wilcox_test



In [18]:
# MPD_Subset, gamma 0.5 for both the REDRESS and the BOOST run.
fair_t_df, fair_w_df = run_t_test('MPD_Subset', '0.5', '0.5', 'fair')
perf_t_df, perf_w_df = run_t_test('MPD_Subset', '0.5', '0.5', 'perf')

# Performance columns first, fairness columns after (joined on the shared index).
w_df = perf_w_df.join(fair_w_df)
t_df = perf_t_df.join(fair_t_df)

# Only the last expression of a cell is rendered, so w_df is evaluated here
# but not displayed; the table shown below the cell is t_df.
w_df

t_df

Unnamed: 0,r_precision,competition_ndcg,artist_prec,norm_diversity,sound_homogeneity,perc_LT
redress_vs_ps,0.0002241879,0.0001733327,0.14166,1.837484e-12,4.423948e-42,0.047167
boost_vs_ps,4.408083e-16,1.768725e-19,0.727897,1.168816e-29,3.751961e-61,0.000596
boost_vs_redress,5.505001e-09,1.794279e-09,0.094062,8.24373e-11,1.126384e-12,0.196477


In [17]:
# LFM_Subset, gamma 0.2 for the REDRESS run and 0.6 for the BOOST run.
fair_t_df, fair_w_df = run_t_test('LFM_Subset', '0.2', '0.6', 'fair')
perf_t_df, perf_w_df = run_t_test('LFM_Subset', '0.2', '0.6', 'perf')

# Performance columns first, fairness columns after (joined on the shared index).
w_df = perf_w_df.join(fair_w_df)
t_df = perf_t_df.join(fair_t_df)

# Only the last expression of a cell is rendered, so w_df is evaluated here
# but not displayed; the table shown below the cell is t_df.
w_df

t_df

Unnamed: 0,r_precision,competition_ndcg,artist_prec,norm_diversity,sound_homogeneity,perc_LT
redress_vs_ps,0.05960803,0.001989518,0.1230002,1.766603e-10,0.314695,0.1618156
boost_vs_ps,5.696989e-08,1.179627e-15,1.914129e-07,1.112495e-34,0.001408,2.4777e-11
boost_vs_redress,2.824536e-05,2.549566e-08,0.0007331554,1.887246e-13,0.03467,5.192154e-08


In [8]:
# Spot check: recompute the redress-vs-utility t-test p-value for r_precision
# directly from the two MPD_Subset performance breakdowns.
# NOTE(security): pickle.load can execute arbitrary code; only load result
# files you produced yourself.
with open('/home/mila/r/rebecca.salganik/scratch/PinSAGE_experiments/FULL_RUNS/MPD_Subset/REDRESS/v1_G_0.5_A_0.01_B_0.0/redress/performance_breakdown_by_pid.pkl', "rb") as _fh:
    r_df = pickle.load(_fh)

with open('/home/mila/r/rebecca.salganik/scratch/PinSAGE_experiments/FULL_RUNS/MPD_Subset/REDRESS/v1_G_0.5_A_0.01_B_0.0/utility/performance_breakdown_by_pid.pkl', "rb") as _fh:
    u_df = pickle.load(_fh)

# Every column except the playlist-id column(s) is treated as a metric.
metrics = [c for c in r_df.columns if 'pid' not in c]

# Not the last expression of the cell, so this list is evaluated but not shown.
metrics

run_p_tests(r_df, u_df, 'r_precision')

0.00022418794101332552

In [9]:
# NOTE(review): both names are only assigned as local variables inside
# run_t_test, so at notebook scope this cell raises NameError (see the
# recorded output). TODO: remove this cell or recompute the values here.
p_vals_redress_vs_ps, wil_redress_vs_ps



NameError: name 'p_vals_redress_vs_ps' is not defined

In [10]:
# NOTE(review): both names are only assigned as local variables inside
# run_t_test, so at notebook scope this cell raises NameError (see the
# recorded output). TODO: remove this cell or recompute the values here.
p_vals_boost_vs_ps, wil_boost_vs_ps

NameError: name 'p_vals_boost_vs_ps' is not defined

In [31]:
# NOTE(review): in the visible notebook these names are only assigned as local
# variables inside run_t_test. The recorded output below presumably comes from
# an earlier kernel state in which they were notebook-level dicts - TODO
# confirm, and define them in a cell above if this output is still needed.
p_vals_boost_vs_redress, wil_boost_vs_redress

({'norm_diversity': 8.243730183460495e-11,
  'sound_homogeneity': 1.1263840419174975e-12,
  'perc_LT': 0.19647656540206895},
 {'norm_diversity': 2.0534705827850006e-15,
  'sound_homogeneity': 8.267798584515514e-18,
  'perc_LT': 0.0045003864973730216})

Unnamed: 0,norm_diversity,sound_homogeneity,perc_LT
boost vs redress,2.053471e-15,8.267799e-18,0.0045
