## Fairness Metrics

- False Discovery Rate
- False Omission Rate
- Pinned AUC

## Fairness subgroups


- age (depends on categorisation in datasets but likely to follow the Department for Education statistical return categories: under 1 year, 1-4 years, 5-9 years, 10-15 years, 16+ years),
- gender (likely to follow the Department for Education categorisation: male, female, unknown),
- disability (recorded or not recorded),
- ethnicity (depends on categorisation in datasets but likely to follow the Department for Education statistical return categories: white, mixed, Asian or Asian British, black or black British, other, unknown).


In [None]:
# Run configuration. These settings are combined below into the labels used
# to name the files this notebook reads and writes.

rq = 'rq2'  # Options: 'rq1', 'rq2'
cv = 'ts'  # Options: 'ts' (time series split, ignore siblings), 'ss' (stratified shuffle, ignore siblings)
data_type = 'all'  # Options: 'str' (just structured data), 'all' (structured data and list of strings)
algorithm_names = ['decision_tree', 'logistic_regression', 'gradient_boosting']
resampling_name = 'ada'  # anything other than 'ada' does 'smote'
select_features_alpha = 0.001  # Should be highest 0.001 as otherwise all dropped
rcv_n_iter = 50  # The more iterations, the more the randomised search searches for an optimal solution
parameters = 2  # NOTE(review): meaning is not documented; in this cell it only feeds the `levers` label - confirm

# Don't change: derived labels
# The outcome file always uses the 'str' stub, whatever data_type is.
file_stub_y_siblings = '_'.join([rq, cv, 'str'])
# File stub for saving, in the format e.g. rq1_ss_str
file_stub = '_'.join([rq, cv, data_type])
levers = '_'.join(
    str(setting)
    for setting in (resampling_name, select_features_alpha, rcv_n_iter, parameters)
)
print('_'.join([file_stub, levers]))


In [None]:
# Set up directories
# local_dir: prefix used by later cells for everything under "<local_dir>/Models/..."
# (pickled predictions are read from there and result CSVs are written there).
# Replace the [username] placeholder before running.
local_dir = '/Users/[username]/Documents/Final transfer out data and code to WWC Jan 2020' # insert [username]
# hard_drive_dir: becomes the working directory, so the test-set CSVs that are
# read with relative paths are looked up here.
hard_drive_dir = '/Volumes/diskAshur2/Final transfer out data and code to WWC Jan 2020/Data for model/Use'

In [None]:
# Load user-written functions

%load_ext autoreload
%autoreload 2

#import text_functions, analysis_functions
import analysis_functions

In [None]:
# Set working directory
# The test-set CSVs are read with relative file names, so they are resolved
# against this directory.
import os

os.chdir(hard_drive_dir)

In [None]:
import pandas as pd

# Test-set features including the sensitive characteristics. The first CSV
# column is read as the index and then discarded in favour of a 0..n-1 index.
X_test_w_sensitive = pd.read_csv('X_test_sensitive_{}.csv'.format(file_stub), index_col=0)
print(X_test_w_sensitive.shape)
X_test_w_sensitive = X_test_w_sensitive.reset_index(drop=True)
print(X_test_w_sensitive.index)

# Test-set outcome. The file has no header row, so after the first column is
# taken as the index the outcome is the column labelled 1.
y_test_raw = pd.read_csv("y_test_{}.csv".format(file_stub_y_siblings), index_col=0, header=None)
print(y_test_raw.shape)
y_test = pd.Series(y_test_raw.reset_index(drop=True)[1])
print(y_test.index)

In [None]:
# Recategorise groups for estimating fairness metrics
import numpy as np
import pandas as pd

# Age: band into the reporting categories. Ages that fall outside the bins
# (or are missing) come back as NaN from pd.cut and are labelled 'missing'.
age_bin_edges = [-1, 0, 4, 9, 15, 25]
age_bin_labels = ['under 1 year', '1-4 years', '5-9 years', '10-15 years', '16+ years']
age_band = pd.cut(X_test_w_sensitive['AgeAtReferralDate'],
                  bins=age_bin_edges,
                  labels=age_bin_labels,
                  include_lowest=True)
X_test_w_sensitive['Age_cut'] = np.where(age_band.isnull(), 'missing', age_band)
print(pd.crosstab(X_test_w_sensitive['AgeAtReferralDate'], X_test_w_sensitive['Age_cut']))
print(X_test_w_sensitive['Age_cut'].value_counts())

# Gender
# No recoding needed
print(X_test_w_sensitive['Gender'].value_counts(dropna=False))

# Disabled: True/False become labels, anything else is treated as missing
print(X_test_w_sensitive['hasdisability'].value_counts(dropna=False))
disability_flag = X_test_w_sensitive['hasdisability']
X_test_w_sensitive['hasdisability'] = np.select(
    [disability_flag == True, disability_flag == False],
    ['Disabled', 'Not Disabled'],
    default='Disability Missing',
)
print(X_test_w_sensitive['hasdisability'].value_counts(dropna=False))

# Ethnicity
# white, mixed, Asian or Asian British, black or black British, other, unknown, Arab
ethnicity = X_test_w_sensitive['ethnicity_highlevel']
is_not_known = (ethnicity == 'Declined/Missing') | (ethnicity == 'Ethnicity Not Given')
is_other = (ethnicity == 'Arab') | (ethnicity == 'Other')
is_mixed = ethnicity == 'Mixed'

# Apply the recodes from lowest to highest priority; all masks were taken
# from the original values, so a later step never sees an earlier recode.
ethnicity_recoded = np.where(is_mixed, 'Mixed Ethnicity', ethnicity)
ethnicity_recoded = np.where(is_other, 'Other Ethnicity', ethnicity_recoded)
ethnicity_recoded = np.where(is_not_known, 'Ethnicity Not Known', ethnicity_recoded)
X_test_w_sensitive['ethnicity_highlevel'] = ethnicity_recoded
X_test_w_sensitive['ethnicity_highlevel'].value_counts(dropna=False)

In [None]:
import pickle

# Load the pickled binary predictions and predicted probabilities.
# The files are opened in `with` blocks so the handles are closed as soon as
# loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project's own pipeline.
with open("{}/Models/Prediction Intervals/preds_{}.pkl".format(local_dir, file_stub), 'rb') as handle:
    preds = pickle.load(handle)

with open("{}/Models/Prediction Intervals/preds_proba_{}.pkl".format(local_dir, file_stub), 'rb') as handle:
    preds_proba = pickle.load(handle)

In [None]:
# Make bootstrapped predictions into dataframes
# Sanity check: the binary and probability predictions have matching lengths.
assert (len(preds[0]) == len(preds_proba[0]))

# After the transpose each column is one set of predictions, named
# pred_binary_<i> / pred_proba_<i>.
preds_df = pd.DataFrame(preds).transpose()
preds_df = preds_df.add_prefix('pred_binary_')

preds_proba_df = pd.DataFrame(preds_proba).transpose()
preds_proba_df = preds_proba_df.add_prefix('pred_proba_')

# num_predictions is used later to loop over the pred_binary_<i> columns, so
# it is taken from the number of columns actually created. This equals
# len(preds[0]) whenever that matched the column count, and stays correct
# whichever way round the pickled container is laid out.
# NOTE(review): the layout of `preds` is not visible here - confirm.
num_predictions = preds_df.shape[1]

In [None]:
# Combine predictions, actual, input data and sensitive
# All four pieces are joined column-wise on their index; the outcome series
# arrives under the label 1 and is renamed to 'y_test'.
pieces = [preds_df, preds_proba_df, y_test, X_test_w_sensitive]
df_fairness = pd.concat(pieces, axis=1).rename(columns={1: 'y_test'})
print(df_fairness.shape)
df_fairness.head()

In [None]:
# Make as categories for sorting
# The dtype is printed before and after conversion so the change is visible.
sensitive_columns = ('Age_cut', 'Gender', 'hasdisability', 'ethnicity_highlevel')
for col in sensitive_columns:
    dtype_before = df_fairness[col].dtype
    print(dtype_before)
    df_fairness[col] = df_fairness[col].astype('category')
    dtype_after = df_fairness[col].dtype
    print(dtype_after)

In [None]:
# Get N
# Count the rows in each subgroup of each sensitive variable and save the
# table to '<local_dir>/Models/Fairness N <file_stub>.csv'.

variable_subgroup_dict = ({'Age_cut': df_fairness['Age_cut'].unique(),
                        'Gender': df_fairness['Gender'].unique(),  
                        'hasdisability': df_fairness['hasdisability'].unique(),
                          'ethnicity_highlevel': df_fairness['ethnicity_highlevel'].unique()})

# Convert to strings once, and only the columns needed, instead of converting
# the whole frame again for every variable. As strings, missing values form
# their own group and every row is counted.
df_fairness_str = df_fairness[[*variable_subgroup_dict, 'pred_binary_0']].astype(str)

counts_list = []
for variable in variable_subgroup_dict.keys():
    counts = df_fairness_str.groupby([variable])['pred_binary_0'].count()
    counts = counts.reset_index()
    counts.columns = ['Characteristic', 'N']
    counts_list.append(counts)
    print(counts)
counts_df = pd.concat(counts_list, axis = 0)
counts_df.to_csv('{}/Models/Fairness N {}.csv'.format(local_dir, file_stub))

In [None]:
# NOTE(review): always-false assertion; presumably a deliberate stop so that
# running all cells halts here before the metric calculation - confirm.
assert 1==2

In [None]:
# Calculate metrics
# For every sensitive variable and every set of predictions, compute each
# metric per subgroup with `pinned_metrics`, and attach the subgroup size N
# (so that it gives context to the proportion).
# Result: df_dict_variables_metrics[variable][p] is a dataframe with one row
# per subgroup, one column per metric, plus N.

from functools import reduce

from analysis_functions import pinned_metrics

# Drop nas from prediction and true columns as metrics don't like it
pred_cols = [col for col in df_fairness.columns if 'pred_binary' in col]
pred_proba_cols = [col for col in df_fairness.columns if 'pred_proba' in col]

df_fairness.dropna(subset = ['y_test', *pred_cols, *pred_proba_cols], inplace = True)

metrics = ['average_precision_score', 'false_discovery_rate', 'false_omission_rate']

variable_subgroup_dict = ({'Age_cut': df_fairness['Age_cut'].unique(),
                        'Gender': df_fairness['Gender'].unique(),  
                        'hasdisability': df_fairness['hasdisability'].unique(),
                         'ethnicity_highlevel': df_fairness['ethnicity_highlevel'].unique()})

# String copy of just the columns needed for counting, built once after the
# dropna above rather than once per set of predictions.
df_fairness_str = df_fairness[[*variable_subgroup_dict, 'pred_binary_0']].astype(str)

df_dict_variables_metrics = {}
for variable, subgroup in variable_subgroup_dict.items():
    print(variable)

    # Subgroup sizes depend only on the variable, so compute them once here
    # and reuse them for every set of predictions.
    counts = df_fairness_str.groupby([variable])['pred_binary_0'].count()
    counts = counts.rename('N').astype('float64')

    df_metrics_bootstrapped = {}
    for p in range(num_predictions):
        if p % 100 == 0:
            print(p)  # progress marker
        df_list_metrics = []
        for metric in metrics:
            # pinned_metrics returns a list of metric values, one per subgroup
            subgroup_metrics = pinned_metrics(df = df_fairness,
                                              group = variable,
                                              subgroups = subgroup,
                                              y_true = 'y_test',
                                              y_scores = 'pred_binary_{}'.format(p),
                                              y_proba_scores = 'pred_proba_{}'.format(p),
                                              metric = metric,
                                              upsampling = True)
            df_fairness_metric = pd.DataFrame({'{}'.format(variable): subgroup, '{}'.format(metric): subgroup_metrics})
            df_list_metrics.append(df_fairness_metric)
        # One row per subgroup with a column for each metric
        df_metrics_merged = reduce(lambda  left,right: pd.merge(left,right,on=[variable], how='outer'), df_list_metrics)
        df_metrics_merged = df_metrics_merged.sort_values(by = variable)

        df_metrics_merged[variable] = df_metrics_merged[variable].astype('category')
        df_metrics_merged = df_metrics_merged.merge(counts, on=[variable], how='left')
        df_metrics_bootstrapped[p] = df_metrics_merged
    print(counts)
    df_dict_variables_metrics[variable] = df_metrics_bootstrapped

In [None]:
### Calculate the 95% CI for the pinned metrics
# For each metric and each subgroup, take the median and the 2.5% / 97.5%
# quantiles of the metric across all sets of predictions. The result is one
# wide table indexed by subgroup with three columns per metric plus 'Group'.

metric_name = {'average_precision_score': 'Average precision score',
    'false_discovery_rate': 'False discovery rate',
    'false_omission_rate': 'False omission rate'} 

# Stack the per-prediction frames once per variable; the stacked frame is
# the same for every metric and every quantile.
stacked_by_variable = {variable: pd.concat(df_dict_variables_metrics[variable])
                       for variable in variable_subgroup_dict.keys()}

# For each metric
fairness_metrics_CI_metric_list = []
for metric in metrics:
    label = metric_name[metric]
    fairness_metrics_CI_var_list = []
    # For each group
    for variable, stacked in stacked_by_variable.items():
        scores_by_subgroup = stacked.groupby(variable)[metric]
        # Find median, upper and lower bounds
        median = round(scores_by_subgroup.quantile(0.5), 4)
        lower = round(scores_by_subgroup.quantile(0.025), 4)
        upper = round(scores_by_subgroup.quantile(0.975), 4)

        fairness_metrics_CI = pd.concat([median, lower, upper], axis=1)
        fairness_metrics_CI.columns = (['{}'.format(label),
                                        '{} 95% CI (LL)'.format(label),
                                        '{} 95% CI (UL)'.format(label)])
        fairness_metrics_CI['Group'] = '{}'.format(variable)
        fairness_metrics_CI.index.rename('', inplace = True)
        fairness_metrics_CI_var_list.append(fairness_metrics_CI)
    fairness_metrics_CI_var_all = pd.concat(fairness_metrics_CI_var_list, axis=0)
    fairness_metrics_CI_metric_list.append(fairness_metrics_CI_var_all)

# To handle duplicate Group columns: keep 'Group' only on the first metric's
# table before joining the tables side by side.
for later_table in fairness_metrics_CI_metric_list[1:]:
    later_table.drop(columns = 'Group', inplace = True)
fairness_metrics_CI = pd.concat(fairness_metrics_CI_metric_list, axis=1)

In [None]:
# Standardise names
# Copy the subgroup labels out of the index and map them to report labels.
fairness_metrics_CI['Characteristic'] = fairness_metrics_CI.index

report_labels = {'under 1 year': 'Under 1 Year',
                 '1-4 years': '1-4 Years',
                 '10-15 years': '10-15 Years',
                 '16+ years': '16+ Years',
                 '5-9 years': '5-9 Years',
                 'Asian/Asian British': 'Asian British',
                 'Black/Black British': 'Black / African / Caribbean / Black British',
                 'Disability Missing': 'Missing Disability'}

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: with pandas Copy-on-Write the selected column is a
# temporary object, so an in-place replace would leave the dataframe
# unchanged.
fairness_metrics_CI['Characteristic'] = fairness_metrics_CI['Characteristic'].replace(report_labels)


fairness_metrics_CI

In [None]:
# Produce report ready version of table
# Rows are put in the fixed report order below; any label not in this list
# becomes a missing category value.
report_order = ['Under 1 Year', '1-4 Years', '5-9 Years', '10-15 Years',
                '16+ Years', 'Female', 'Male', 'Unknown, Unborn or Indeterminate',
                'Disabled', 'Not Disabled', 'Missing Disability', 'Asian British',
                'Black / African / Caribbean / Black British', 'Mixed Ethnicity',
                'Other Ethnicity', 'Ethnicity Not Known', 'White']
fairness_metrics_CI['Characteristic'] = pd.Categorical(fairness_metrics_CI['Characteristic'],
                                                       categories=report_order,
                                                       ordered=True)
fairness_metrics_CI = (fairness_metrics_CI
                       .reset_index(drop=True)
                       .sort_values(by='Characteristic'))
fairness_metrics_CI.to_csv('{}/Models/Fairness_metrics_CI_{}.csv'.format(local_dir, file_stub))

In [None]:
# Reorder dictionaries to allow for all the predictions for a subgroup to be in one list
# Target layout: metric_scores_dict[metric][variable][subgroup] is the list of
# that subgroup's scores, one entry per set of predictions.
from collections import defaultdict

metric_scores_dict = {}
for m in metrics:
    print(m)
    var_scores_dict = {}
    for var, bootstrapped_frames in df_dict_variables_metrics.items():
        print(var)
        subgroup_dict = defaultdict(list)
        for p, df_temp in bootstrapped_frames.items():
            print(p)
            for subgroup in df_temp[var].unique():
                print(subgroup)
                in_subgroup = df_temp[var] == subgroup
                # First (expected to be the only) row for this subgroup
                subgroup_score = df_temp.loc[in_subgroup, m].values[0]
                print(subgroup_score)
                subgroup_dict[subgroup].append(subgroup_score)
                var_scores_dict[var] = subgroup_dict
            metric_scores_dict[m] = var_scores_dict

In [None]:
# Pickle metrics scores
# Saved so the analysis can be resumed later without recomputing the metrics.
import pickle

metric_scores_path = "{}/Models/Prediction Intervals/metric_scores_dict_{}.pkl".format(local_dir, file_stub)
with open(metric_scores_path, "wb") as handle:
    pickle.dump(metric_scores_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Restart here

In [None]:
import pickle

# Reload the saved metric scores and the confidence-interval table.
# The pickle is opened in a `with` block so the handle is closed after
# loading.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project's own pipeline.
with open("{}/Models/Prediction Intervals/metric_scores_dict_{}.pkl".format(local_dir, file_stub), 'rb') as handle:
    metric_scores_dict = pickle.load(handle)

fairness_metrics_CI = pd.read_csv('{}/Models/Fairness_metrics_CI_{}.csv'.format(local_dir, file_stub), index_col = 0)

In [None]:
### Set up test for difference between the scores on different subgroups
# For every sensitive variable build two parallel lists in the same order:
#   *_list  - the average precision scores of each subgroup
#   *_names - the report label of each subgroup
# The order must match because the post-hoc results are labelled by position.

from scipy.stats import friedmanchisquare
import numpy as np
#!pip install scikit_posthocs
import scikit_posthocs as sp

# All tests below are run on the average precision scores
ap_scores = metric_scores_dict['average_precision_score']

# Age
under_one = ap_scores['Age_cut']['under 1 year']
one_to_four = ap_scores['Age_cut']['1-4 years']
five_to_nine = ap_scores['Age_cut']['5-9 years']
ten_to_fifteen = ap_scores['Age_cut']['10-15 years']
sixteen_plus = ap_scores['Age_cut']['16+ years']

age_list = [under_one, one_to_four, five_to_nine, ten_to_fifteen, sixteen_plus]
age_names = ['Under 1 Year', '1-4 Years', '5-9 Years', '10-15 Years', '16+ Years']

# Gender
female = ap_scores['Gender']['Female']
male = ap_scores['Gender']['Male']
unknown = ap_scores['Gender']['Unknown, Unborn or Indeterminate']

gender_list = [female, male, unknown]
# Spelled out in the same order as gender_list. Deriving the names from the
# sorted dictionary keys only lines up while the dictionary holds exactly
# these three keys.
gender_names = ['Female', 'Male', 'Unknown, Unborn or Indeterminate']

# Disability
disabled = ap_scores['hasdisability']['Disabled']
not_disabled = ap_scores['hasdisability']['Not Disabled']
disabled_missing = ap_scores['hasdisability']['Disability Missing']

disability_list = [disabled, not_disabled, disabled_missing]
disability_names = ['Disabled', 'Not Disabled', 'Missing Disability']

# Ethnicity
asian = ap_scores['ethnicity_highlevel']['Asian/Asian British']
black = ap_scores['ethnicity_highlevel']['Black/Black British']
unknown_ethnicity = ap_scores['ethnicity_highlevel']['Ethnicity Not Known']
mixed_ethnicity = ap_scores['ethnicity_highlevel']['Mixed Ethnicity']
other_ethnicity = ap_scores['ethnicity_highlevel']['Other Ethnicity']
white = ap_scores['ethnicity_highlevel']['White']

ethnicity_list = [asian, black, unknown_ethnicity, mixed_ethnicity, other_ethnicity, white]
ethnicity_names = (['Asian British', 'Black / African / Caribbean / Black British',
                    'Ethnicity Not Known', 'Mixed Ethnicity', 'Other Ethnicity',
                     'White'])

group_dict = {'Age_cut': age_list, 'Gender': gender_list, 
              'hasdisability': disability_list, 
              'ethnicity_highlevel': ethnicity_list}

group_names_dict = {'Age_cut': age_names, 'Gender': gender_names,  
                    'hasdisability': disability_names, 'ethnicity_highlevel': ethnicity_names}

In [None]:
# Calculate ratio of sds
# For each sensitive variable, compare the largest and smallest standard
# deviation of the subgroup scores and save the ratio (largest / smallest).

import statistics
max_sd_dict, min_sd_dict = {}, {}
for k in group_dict.keys():
    print(k)
    sds = [statistics.stdev(g) for g in group_dict[k]]
    # Use max()/min() directly so every variable always gets both entries.
    # Starting from sentinel values (0 and 1) could leave an entry out when no
    # standard deviation beats the sentinel, which would pair ratios with the
    # wrong variables further down.
    max_sd_dict[k] = max(sds)
    min_sd_dict[k] = min(sds)

min_sds = list(min_sd_dict.values())
max_sds = list(max_sd_dict.values())
# Ratios are computed per key, so each one stays attached to its variable.
# A smallest standard deviation of exactly 0 still raises ZeroDivisionError.
sds_ratios_dict = {k: max_sd_dict[k] / min_sd_dict[k] for k in group_dict.keys()}
sds_ratios = list(sds_ratios_dict.values())
pd.DataFrame([sds_ratios_dict]).to_csv('{}/Models/Fairness_metrics_sds_{}.csv'.format(local_dir, file_stub))

In [None]:
## Test for differences between subgroups and, if statistically significantly different, find which subgroups
# NB This can give a different answer from checking whether the CIs overlap. The CI is the CI around the median
# whilst the Friedman test looks for differences in the ranking.
# Looking at CIs leads to more Type 2 errors (failure to identify differences)
# https://statisticsbyjim.com/hypothesis-testing/confidence-intervals-compare-means/
#
# Outputs of this cell:
#   friedman_dict        - one summary sentence of the Friedman test per variable
#   subgroups_bias_dict  - for variables with P < 0.05, one sentence per pair of
#                          subgroups whose post-hoc p-value is below 0.05
#   fairness_comparisons - table of all pairwise post-hoc p-values with the
#                          step-up threshold and a CI-overlap check

AP_COL = 'Average precision score'
LL_COL = 'Average precision score 95% CI (LL)'
UL_COL = 'Average precision score 95% CI (UL)'

bias_template = """The model makes fewer errors for {} than {}. The average precision for {} is {} (95% CI [{}, {}]) whilst the average precision for {} is {} (95% CI [{}, {}])"""


def _ap_summary(characteristic):
    """Return (average precision, CI lower, CI upper) for one subgroup label.

    Values are read from the first row of `fairness_metrics_CI` whose
    'Characteristic' equals `characteristic`; IndexError if there is none.
    """
    rows = fairness_metrics_CI.loc[fairness_metrics_CI['Characteristic'] == characteristic]
    return tuple(rows[col].values[0] for col in (AP_COL, LL_COL, UL_COL))


post_test_melted_list = []
friedman_dict, subgroups_bias_dict = {}, {}
for group, group_list in group_dict.items():
    names = group_names_dict[group]
    N = len(group_list[0])
    # Are the scores different?
    friedman = friedmanchisquare(*group_list)
    friedman_dict[group] = "Friedman chi squared statistic ={}, n for each group = {}, P={}, two-tailed".format(round(friedman.statistic, 1), N, round(friedman.pvalue, 2))
    print(friedman_dict[group])

    # Post test: if the scores are different, for which subgroups?
    # Written as `not (p < 0.05)` so that a NaN p-value is skipped as well.
    if not (friedman.pvalue < 0.05):
        continue

    post_test = sp.posthoc_nemenyi_friedman(np.array(group_list).T)
    post_test.columns = names
    post_test.index = names

    # Keep only significant cells; -1 is excluded as a non-p-value marker
    significant = post_test[(post_test < 0.05) & (post_test != -1)]
    print(significant)

    # Collect every significant pair once, whichever way round it appears.
    # Keying a dict by column name would keep only the last partner of each
    # subgroup and so drop pairs when a subgroup differs from several others.
    bias_pairs = []
    seen_pairs = set()
    for idrow, row_name in enumerate(post_test.index):
        for idcol, col_name in enumerate(post_test.columns):
            if idrow == idcol or pd.isnull(significant.iat[idrow, idcol]):
                continue
            pair = frozenset((row_name, col_name))
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            bias_pairs.append((row_name, col_name))
            print(row_name, 'vs', col_name)

    # Describe each pair, naming the subgroup with the higher average
    # precision first.
    bias_list = []
    for better, worse in bias_pairs:
        better_stats = _ap_summary(better)
        worse_stats = _ap_summary(worse)
        if not (better_stats[0] > worse_stats[0]):
            better, worse = worse, better
            better_stats, worse_stats = worse_stats, better_stats
        bias_list.append(bias_template.format(better, worse,
                                              better, *better_stats,
                                              worse, *worse_stats))
    subgroups_bias_dict[group] = bias_list

    # Create dataframe for the appendix: one row per pair of subgroups
    post_test_long = post_test.reset_index()
    post_test_melted = pd.melt(post_test_long, id_vars=['index'], value_vars=names)
    post_test_melted.rename(columns = {'index': 'Subgroup 1', 'variable': 'Subgroup 2', 'value': 'P-value'}, inplace = True)
    # .copy() so the column assignments below act on an independent frame
    post_test_melted = post_test_melted.loc[post_test_melted['Subgroup 1'] != post_test_melted['Subgroup 2']].copy()

    # Only keep unique permutations: (A, B) and (B, A) share the same sorted key
    post_test_melted['Duplicated Subgroups'] = [str(sorted([a, b])) for a, b in zip(post_test_melted['Subgroup 1'], post_test_melted['Subgroup 2'])]
    post_test_melted.drop_duplicates(subset=['Duplicated Subgroups'], inplace=True)
    post_test_melted.drop(columns = 'Duplicated Subgroups', inplace = True)
    post_test_melted['Subgroup 1'] = pd.Categorical(post_test_melted['Subgroup 1'], categories = names, ordered = True)
    post_test_melted['Subgroup 2'] = pd.Categorical(post_test_melted['Subgroup 2'], categories = names, ordered = True)
    post_test_melted.sort_values(by=['P-value', 'Subgroup 1', 'Subgroup 2'], inplace = True)

    # Compare to thresholds determined by Hochberg's step up procedure.
    # Rows are sorted by ascending p-value, so row r gets 0.01 / k * (r + 1).
    # NOTE(review): thresholds proportional to rank are the form used by the
    # Benjamini-Hochberg procedure, and 0.01 differs from the 0.05 used
    # above - confirm the intended procedure and alpha.
    k = post_test_melted.shape[0]
    sig_threshold = [0.01/k*(r+1) for r in range(k)]
    post_test_melted["Hochberg's significance threshold"] = sig_threshold
    post_test_melted["P-value lower than the significance threshold according to Hochberg's step up procedure"] = post_test_melted['P-value'] <= post_test_melted["Hochberg's significance threshold"]
    post_test_melted['P-value'] = round(post_test_melted['P-value'], 4)
    post_test_melted["Hochberg's significance threshold"] = round(post_test_melted["Hochberg's significance threshold"], 4)

    # See whether CIs overlap: attach each subgroup's CI bounds
    for side in ('Subgroup 1', 'Subgroup 2'):
        post_test_melted = post_test_melted.merge(fairness_metrics_CI[[LL_COL, UL_COL, 'Characteristic']],
                                                  how = 'left', left_on = side, right_on = 'Characteristic')
        post_test_melted.drop(columns = 'Characteristic', inplace = True)
        post_test_melted.rename(columns = {LL_COL: '{}: {}'.format(side, LL_COL),
                                           UL_COL: '{}: {}'.format(side, UL_COL)},
                                inplace = True)

    # The intervals do not overlap when one lower limit lies above the other
    # upper limit.
    post_test_melted['Subgroups are significantly different according to a comparison of confidence intervals'] = (
        (post_test_melted['Subgroup 1: Average precision score 95% CI (LL)'] >
         post_test_melted['Subgroup 2: Average precision score 95% CI (UL)']) |
        (post_test_melted['Subgroup 2: Average precision score 95% CI (LL)'] >
         post_test_melted['Subgroup 1: Average precision score 95% CI (UL)']))
    post_test_melted_list.append(post_test_melted)

# pd.concat raises on an empty list, so fall back to an empty table when no
# variable reached significance.
if post_test_melted_list:
    fairness_comparisons = pd.concat(post_test_melted_list, axis = 0)
else:
    fairness_comparisons = pd.DataFrame()



In [None]:
# Save biases
# Three CSVs are written under '<local_dir>/Models': the pairwise comparison
# table (without its index), the Friedman summaries and the bias sentences.
comparisons_path = '{}/Models/Fairness_metrics_comparing_subgroups_{}.csv'.format(local_dir, file_stub)
friedman_path = '{}/Models/Friedman tests_{}.csv'.format(local_dir, file_stub)
bias_path = '{}/Models/Bias in model_{}.csv'.format(local_dir, file_stub)

fairness_comparisons.to_csv(comparisons_path, index=False)
pd.Series(friedman_dict).to_csv(friedman_path)
pd.Series(subgroups_bias_dict).to_csv(bias_path)
