Consolidates fairness measures for summary graph


In [None]:
import os

# Show the current working directory: every path used further down
# (the "*/*.csv" glob, 'Output/...', 'Fairness unaggregated with N.csv')
# is relative and is therefore resolved against this directory.
print(os.getcwd())

In [None]:
# Pull in list of model performance metrics
import glob
import pandas as pd
file_list = glob.glob("*/*.csv")

file_list = [f for f in file_list if 'Fairness' in f]
file_list = [f for f in file_list if 'Average Fairness Metrics' not in f]
file_list = [f for f in file_list if 'Fairness grouped by characteristic' not in f]
file_list = [f for f in file_list if 'Fairness_metrics_comparing_subgroups' not in f]
file_list = [f for f in file_list if 'Fairness unaggregated' not in f]
file_list = [f for f in file_list if 'Fairness unaggregated with N' not in f]


file_list

In [None]:
# Read every per-model fairness file, tag it with the local authority (the
# directory name) and the model id (the file name without its 'Fairness'
# prefix and '.csv' extension), harmonise column names, and stack everything
# into a single `results` frame.
file_dict = {}
for csv_path in file_list:

    model_metrics = pd.read_csv(csv_path, index_col = 0)

    # Split with os.path rather than on a literal '/': glob returns
    # backslash-separated paths on Windows, where split('/') would not
    # produce the two parts expected here.
    directory, base_name = os.path.split(os.path.normpath(csv_path))
    LA = os.path.basename(directory)
    model_id = (base_name
                .replace("Fairness ", "")
                .replace("Fairness_metrics_CI_", "")
                .replace(".csv", ""))
    model_metrics['LA'] = LA
    model_metrics['model_id'] = model_id

    # rename() ignores labels that are absent, so the harmonisation can be
    # applied unconditionally; each column is renamed whenever it is present.
    model_metrics = model_metrics.rename(
        columns = {'False Discovery Rate': 'False discovery rate',
                   'False Omission Rate': 'False omission rate',
                   'Average precision score': 'Pinned average precision'})

    file_dict[f"{LA}/{model_id}"] = model_metrics

print(file_dict.keys())

results = pd.concat(file_dict.values(), axis = 0, ignore_index = True)

In [None]:
# Check for categories to be combined: count how often each raw
# 'Characteristic' label occurs across the concatenated files so that
# different spellings of the same subgroup can be spotted.
# Expect 16 for ethnicity as not available for LA3
results['Characteristic'].value_counts()


In [None]:
# Evaluated for its side effect only (fails early if the column is missing);
# it is not the last expression of the cell, so nothing is displayed.
results['Characteristic'].unique()

# Canonical subgroup labels, one list per protected characteristic.
age = [
    'Under 1 Year',
    '1-4 Years',
    '5-9 Years',
    '10-15 Years',
    '16+ Years',
    'Missing age',
]

gender = [
    'Female',
    'Male',
    'Unknown, Unborn or Indeterminate',
]

disability = [
    'Disabled',
    'Not Disabled',
    'Missing Disability',
]

ethnicity = [
    'Asian / Asian British',
    'Black / African / Caribbean / Black British',
    'Mixed Ethnicity',
    'Other Ethnicity',
    'Ethnicity Not Known',
    'White',
]

In [None]:
import numpy as np

# Map the spelling variants found in the individual files onto one canonical
# label per subgroup. Labels that are not listed are left untouched.
characteristic_aliases = {
    # Ethnicity not known
    'Declined / Missing': 'Ethnicity Not Known',
    # Black / African / Caribbean / Black British
    'Black / British Black': 'Black / African / Caribbean / Black British',
    'Black / African / Caribbean /Black British': 'Black / African / Caribbean / Black British',
    'Black/Black British': 'Black / African / Caribbean / Black British',
    # Gender
    'Unborn': 'Unknown, Unborn or Indeterminate',
    # Asian / Asian British
    'Asian British': 'Asian / Asian British',
    'Asian/Asian British': 'Asian / Asian British',
    # Disabled mappings, kept for reference. They were previously parked in a
    # bare triple-quoted string, which made the cell echo that string as its
    # output; as comments they stay inactive without producing output.
    # 'Mixed / Multiple Ethnic Groups   ': 'Mixed Ethnicity',
    # 'Other Ethnic Groups': 'Other Ethnicity',
    # 'Arab': 'Other Ethnicity',
}

results['Characteristic'] = results['Characteristic'].replace(characteristic_aliases)

In [None]:
# Check recategorising worked: each subgroup should now appear under a single
# canonical label.
results['Characteristic'].value_counts()

In [None]:
# Tag every row with the protected characteristic its subgroup label belongs
# to. Rows whose label is in none of the lists keep their existing 'Group'.
for group_label, group_members in (('Age', age),
                                   ('Gender', gender),
                                   ('Disability', disability),
                                   ('Ethnicity', ethnicity)):
    belongs_to_group = results['Characteristic'].isin(group_members)
    results['Group'] = np.where(belongs_to_group, group_label, results['Group'])

results['Group'].value_counts()

In [None]:
# NOTE: this is not the last expression of the cell, so the counts are
# computed but not displayed.
results['N'].value_counts(dropna = False)
# Discard the N column read from the per-model files; N is re-attached
# further down from the separate table loaded on the next line.
# NOTE: the in-place drop makes this cell fail with a KeyError if it is run a
# second time on the same kernel.
results.drop(columns = 'N', inplace = True)
fairness_w_N = pd.read_csv('Fairness unaggregated with N.csv', index_col = 0)


In [None]:
# Display every row whose subgroup label is 'Missing age'.
results.loc[results['Characteristic'] == 'Missing age',]

In [None]:
# Compare subgroup labels between the two tables before aligning them:
# first line = labels only in `results`, second line = labels only in
# `fairness_w_N`. Labels present on one side only cannot match in the merge
# on 'Characteristic' performed later.
print(set(results['Characteristic']).difference(set(fairness_w_N['Characteristic'])))
print(set(fairness_w_N['Characteristic']).difference(set(results['Characteristic'])))

In [None]:
# Bring the subgroup spellings of the N table in line with the canonical
# labels used in `results`, so that both tables can be joined on
# 'Characteristic'.
n_table_label_fixes = {
    'Missing': 'Missing Disability',
    'Asian British': 'Asian / Asian British',
    'Black / African / Caribbean /Black British': 'Black / African / Caribbean / Black British',
}
fairness_w_N['Characteristic'] = fairness_w_N['Characteristic'].replace(n_table_label_fixes)
                                                                

In [None]:
# Repeat the label comparison after the relabelling of `fairness_w_N`:
# first line = labels only in `results`, second line = labels only in
# `fairness_w_N`. Anything still printed here will not match in the merge.
print(set(results['Characteristic']).difference(set(fairness_w_N['Characteristic'])))
print(set(fairness_w_N['Characteristic']).difference(set(results['Characteristic'])))

In [None]:
# Re-attach N to the results. The left join keeps every row of `results`;
# rows without a match on (Characteristic, LA, model_id) get a missing N.
# The shapes printed before and after show whether the join added rows,
# which happens when a key occurs more than once in `fairness_w_N`.
print(results.shape)
results_with_N = results.merge(fairness_w_N[['Characteristic', 'N', 'LA', 'model_id']], on = ['Characteristic', 'LA', 'model_id'], how = 'left')
print(results_with_N.shape)

In [None]:
# Remove rows that are exact duplicates across all columns; the shapes
# printed before and after show how many rows were removed.
print(results_with_N.shape)
results_with_N.drop_duplicates(inplace = True)
print(results_with_N.shape)

In [None]:
# Save unaggregated version
# This is the first write into 'Output/': create the directory if it is
# missing so that a run from a clean checkout does not fail here.
os.makedirs('Output', exist_ok = True)
results_with_N.to_csv('Output/Fairness unaggregated.csv')

In [None]:
# Create version for graph: keep only the group, the subgroup label, the
# point estimate and the bounds of its 95% confidence interval.
# NOTE: the point estimate column was renamed to 'Pinned average precision'
# when the files were loaded, while the CI columns keep their original
# 'Average precision score ...' names.
results_for_graph = results[['Group', 'Characteristic', 'Pinned average precision', 
                            'Average precision score 95% CI (LL)', 'Average precision score 95% CI (UL)']]

# Displays the whole frame as the cell output.
results_for_graph

In [None]:
# Write the graph input table; the DataFrame index is written as the first
# CSV column (default to_csv behaviour).
results_for_graph.to_csv('Output/Average Fairness Metrics.csv') 

In [None]:
from scipy.stats import mannwhitneyu

# Two-sided Mann-Whitney U tests on 'Pinned average precision' for the two
# binary comparisons: Disabled vs Not Disabled and Male vs Female.
# NOTE(review): Mann-Whitney treats the two samples as independent, whereas
# the Friedman cell further down describes the data as paired - confirm that
# an unpaired test is intended here.
disabled, not_disabled, male, female = (
    results.loc[results['Characteristic'] == subgroup_label, 'Pinned average precision']
    for subgroup_label in ('Disabled', 'Not Disabled', 'Male', 'Female')
)

# Disability
print(len(disabled))
mw_disability = mannwhitneyu(disabled, not_disabled, alternative='two-sided')
print(mw_disability)

# Gender
print(len(male))
mw_gender = mannwhitneyu(male, female, alternative='two-sided')
print(mw_gender)


In [None]:
 results['Characteristic'].value_counts()

In [None]:
# Friedman test: 3+ samples, paired, not drawn from a normal distribution
# p.32 https://www.biochemia-medica.com/assets/images/upload/xml_tif/Marusteri_M_-_Comparing_groups_for_statistical_differences.pdf
# https://en.wikipedia.org/wiki/Friedman_test
# NOTE(review): the samples are matched by position, so this presumably
# relies on every LA/model file contributing each subgroup exactly once and
# in the same order - confirm.

from scipy.stats import friedmanchisquare

# Age ('Missing age' is not part of the test)
(under_one,
 one_to_four,
 five_to_nine,
 ten_to_fifteen,
 sixteen_plus) = (
    results.loc[results['Characteristic'] == age_label, 'Pinned average precision']
    for age_label in ('Under 1 Year', '1-4 Years', '5-9 Years', '10-15 Years', '16+ Years')
)

friedman_age = friedmanchisquare(under_one, one_to_four, five_to_nine, ten_to_fifteen, sixteen_plus)
print(friedman_age)

# Ethnicity
# NOTE(review): 'Asian / Asian British' is neither extracted nor tested here
# - confirm this omission is intended.
(black_ethnicity,
 unknown_ethnicity,
 mixed_ethnicity,
 other_ethnicity,
 white_ethnicity) = (
    results.loc[results['Characteristic'] == ethnicity_label, 'Pinned average precision']
    for ethnicity_label in ('Black / African / Caribbean / Black British',
                            'Ethnicity Not Known',
                            'Mixed Ethnicity',
                            'Other Ethnicity',
                            'White')
)

# Sample sizes, printed in the order black, unknown, mixed, other, white.
# 'Ethnicity Not Known' is excluded from the test as not available for all LAs.
for _ethnicity_sample in (black_ethnicity, unknown_ethnicity, mixed_ethnicity,
                          other_ethnicity, white_ethnicity):
    print(len(_ethnicity_sample))

friedman_ethnicity = friedmanchisquare(black_ethnicity, mixed_ethnicity, other_ethnicity, white_ethnicity)
print(friedman_ethnicity)


In [None]:
#!pip install scikit_posthocs
# Age: pairwise Nemenyi post-hoc comparisons following the Friedman test
import numpy as np
import scikit_posthocs as sp

age_names = ['Under 1 Year', '1-4 Years', '5-9 Years', '10-15 Years', '16+ Years']
age_samples = [under_one, one_to_four, five_to_nine, ten_to_fifteen, sixteen_plus]

# Transposed so that each column holds one age group and each row one
# matched observation.
pw_age = sp.posthoc_nemenyi_friedman(np.array(age_samples).T)

# Label both axes of the p-value matrix with the age groups.
pw_age.index = age_names
pw_age.columns = age_names

# Show only p-values below 0.05; every other cell is printed as NaN.
# NOTE(review): -1 is presumably the placeholder some scikit-posthocs
# versions put on the diagonal - confirm.
age_is_significant = (pw_age != -1) & (pw_age < 0.05)
print(pw_age[age_is_significant])


In [None]:
# Ethnicity: pairwise Nemenyi post-hoc comparisons following the Friedman test
ethnicity_names = [
    'Black / African / Caribbean / Black British',
    'Mixed Ethnicity',
    'Other Ethnicity',
    'White',
]
ethnicity_samples = [black_ethnicity, mixed_ethnicity, other_ethnicity, white_ethnicity]

# Transposed so that each column holds one ethnic group and each row one
# matched observation.
pw_ethnicity = sp.posthoc_nemenyi_friedman(np.array(ethnicity_samples).T)

# Label both axes of the p-value matrix with the ethnic groups.
pw_ethnicity.index = ethnicity_names
pw_ethnicity.columns = ethnicity_names

# Show only p-values below 0.05; every other cell is printed as NaN.
ethnicity_is_significant = (pw_ethnicity != -1) & (pw_ethnicity < 0.05)
print(pw_ethnicity[ethnicity_is_significant])




In [None]:
# Overall mean and standard deviation of the three fairness metrics across
# all rows of `results`, rounded to two decimals.
# Keys are the row labels written to the CSV, values the source columns.
summary_row_sources = {
    'Pinned average precision': 'Pinned average precision',
    'False discovery Rate': 'False discovery rate',
    'False omission Rate': 'False omission rate',
}

fairness_metrics_summarised = pd.DataFrame(
    {
        'Mean': [results[source].mean() for source in summary_row_sources.values()],
        'Standard Deviation': [results[source].std() for source in summary_row_sources.values()],
    },
    index = list(summary_row_sources),
).round(2)

fairness_metrics_summarised.to_csv('Output/Fairness Metrics summarised.csv') # Data for fairness graph


In [None]:
# Create version for report: mean of each metric per subgroup
subgroup_metric_columns = ['Pinned average precision', 'False discovery rate', 'False omission rate']

# Order in which the subgroups are listed in the report.
subgroup_report_order = [
    'Under 1 Year', '1-4 Years', '5-9 Years', '10-15 Years', '16+ Years', 'Missing age',
    'Female', 'Male', 'Unknown, Unborn or Indeterminate',
    'Disabled', 'Not Disabled', 'Missing Disability',
    'Asian / Asian British', 'Black / African / Caribbean / Black British',
    'Mixed Ethnicity', 'Other Ethnicity', 'Ethnicity Not Known', 'White',
]

fairness_metrics_by_subgroup = (
    results.groupby('Characteristic')[subgroup_metric_columns].mean().reset_index()
)

# Sort the metrics: an ordered categorical makes sort_values follow the
# report order instead of alphabetical order.
fairness_metrics_by_subgroup['Characteristic'] = pd.Categorical(
    fairness_metrics_by_subgroup['Characteristic'],
    categories = subgroup_report_order,
    ordered = True,
)
fairness_metrics_by_subgroup = fairness_metrics_by_subgroup.sort_values(by = 'Characteristic')

# Round the metrics
fairness_metrics_by_subgroup[subgroup_metric_columns] = (
    fairness_metrics_by_subgroup[subgroup_metric_columns].round(2)
)



In [None]:
# Create table for report: label each subgroup with the type of
# characteristic it belongs to ('NA' when it is in none of the lists).
characteristic_type_rules = [
    ('Age Group', ['Under 1 Year', '1-4 Years', '5-9 Years', '10-15 Years',
                   '16+ Years', 'Missing age']),
    ('Gender', ['Female', 'Male', 'Unknown, Unborn or Indeterminate']),
    ('Disability', ['Disabled', 'Not Disabled', 'Missing Disability']),
    ('Ethnicity', ['Asian / Asian British',
                   'Black / African / Caribbean / Black British', 'Mixed Ethnicity',
                   'Other Ethnicity', 'Ethnicity Not Known', 'White']),
]

fairness_metrics_by_subgroup['Characteristic Type'] = np.select(
    [fairness_metrics_by_subgroup['Characteristic'].isin(type_members)
     for _, type_members in characteristic_type_rules],
    [type_name for type_name, _ in characteristic_type_rules],
    default = 'NA',
)

# Put the type first, then the subgroup, then the three metrics.
report_columns = ['Characteristic Type', 'Characteristic',
                  'Pinned average precision', 'False discovery rate',
                  'False omission rate']
fairness_metrics_by_subgroup = fairness_metrics_by_subgroup[report_columns]

fairness_metrics_by_subgroup.to_csv('Output/Fairness grouped by characteristic.csv', index = False)
                                              

In [None]:
# NOTE: this cell repeats, with identical values, the subgroup label lists
# already defined earlier in the notebook. The lists feed `group_names_dict`
# in the confidence-interval comparison below.
# The expression on the next line is not the last one in the cell, so its
# result is not displayed.
results['Characteristic'].unique()

age = (['Under 1 Year', '1-4 Years', '5-9 Years', '10-15 Years',
       '16+ Years', 'Missing age'])
    
gender = (['Female', 'Male', 'Unknown, Unborn or Indeterminate'])
        
disability = (['Disabled', 'Not Disabled', 'Missing Disability'])
    
ethnicity = (['Asian / Asian British',
       'Black / African / Caribbean / Black British', 'Mixed Ethnicity',
       'Other Ethnicity', 'Ethnicity Not Known', 'White'])

In [None]:
# Rows for local authority 'LA3' only, reduced to the model id, the subgroup
# label, its group and the bounds of the 95% confidence interval.
results_with_N_LA3 = results_with_N.loc[results_with_N['LA'] == 'LA3', [ 'model_id', 'Characteristic',
                                                                        'Group', 'Average precision score 95% CI (LL)',
                                                                       'Average precision score 95% CI (UL)', ]]
# Displays the whole frame as the cell output.
results_with_N_LA3

In [None]:

group_names_dict = {'Age': age, 'Gender': gender, 'Disabled': disability}

post_test_melted_list = []
for model in results_with_N_LA3['model_id'].unique():
    for group, names in group_names_dict.items():
        post_test = pd.DataFrame(columns = group_names_dict[group], index = group_names_dict[group])
        post_test.reset_index(inplace = True)
        post_test_melted = pd.melt(post_test, id_vars=['index'], value_vars=group_names_dict[group])
        post_test_melted.rename(columns = {'index': 'Subgroup 1', 'variable': 'Subgroup 2'}, inplace = True)
        post_test_melted = post_test_melted.loc[post_test_melted['Subgroup 1'] != post_test_melted['Subgroup 2'],]


        # Only keep unique permutations
        post_test_melted['Duplicated Subgroups'] = [sorted([a,b]) for a,b in zip(post_test_melted['Subgroup 1'], post_test_melted['Subgroup 2'])]
        post_test_melted['Duplicated Subgroups'] = post_test_melted['Duplicated Subgroups'].astype(str)
        post_test_melted.drop_duplicates(subset=['Duplicated Subgroups'], inplace=True)
        post_test_melted.drop(columns = 'Duplicated Subgroups', inplace = True)
        post_test_melted['Subgroup 1'] = pd.Categorical(post_test_melted['Subgroup 1'], categories = group_names_dict[group], ordered = True)
        post_test_melted['Subgroup 2'] = pd.Categorical(post_test_melted['Subgroup 2'], categories = group_names_dict[group], ordered = True)
        post_test_melted.sort_values(by=['Subgroup 1', 'Subgroup 2'], inplace = True)
        post_test_melted = post_test_melted.merge(results_with_N_LA3[['Average precision score 95% CI (LL)', 'Group', 'Characteristic', 'model_id']],
                                                  how = 'left', left_on = 'Subgroup 1', right_on = 'Characteristic')
        post_test_melted = post_test_melted.merge(results_with_N_LA3[['Average precision score 95% CI (UL)', 'Characteristic', 'model_id']],
                                                  how = 'left', left_on = ['Subgroup 1', 'model_id'], right_on = ['Characteristic', 'model_id'])
        post_test_melted.rename(columns = {'Average precision score 95% CI (LL)': 'Subgroup 1: Average precision score 95% CI (LL)',
                                            'Average precision score 95% CI (UL)': 'Subgroup 1: Average precision score 95% CI (UL)'},
                                 inplace = True)

        post_test_melted.drop(columns = ['Characteristic_x', 'Characteristic_y'], inplace = True)
        post_test_melted = post_test_melted.merge(results_with_N_LA3[['Average precision score 95% CI (LL)', 'Characteristic', 'model_id']],
                                              how = 'left', left_on = ['Subgroup 2', 'model_id'], right_on = ['Characteristic', 'model_id'])
        post_test_melted = post_test_melted.merge(results_with_N_LA3[['Average precision score 95% CI (UL)', 'Characteristic', 'model_id']],
                                              how = 'left', left_on = ['Subgroup 2', 'model_id'], right_on = ['Characteristic', 'model_id'])

        post_test_melted.drop(columns = ['Characteristic_x', 'Characteristic_y'], inplace = True)
        post_test_melted.rename(columns = {'Average precision score 95% CI (LL)': 'Subgroup 2: Average precision score 95% CI (LL)',
                                        'Average precision score 95% CI (UL)': 'Subgroup 2: Average precision score 95% CI (UL)'},
                             inplace = True)

        post_test_melted['Subgroups are significantly different according to a comparison of confidence intervals'] = np.where((post_test_melted['Subgroup 1: Average precision score 95% CI (LL)'] >
                                                                                        post_test_melted['Subgroup 2: Average precision score 95% CI (UL)']) |
                                                                                                                               (post_test_melted['Subgroup 2: Average precision score 95% CI (LL)'] >
                                                                                        post_test_melted['Subgroup 1: Average precision score 95% CI (UL)']), True, False)
        post_test_melted_list.append(post_test_melted)

In [None]:
# Stack the per-group comparison tables; the original row index of each
# table is kept (no ignore_index).
post_test_melted_LA3 = pd.concat(post_test_melted_list, axis = 0)

In [None]:
# Remove rows that are exact duplicates across all columns; the shapes
# printed before and after show how many rows were removed.
print(post_test_melted_LA3.shape)
post_test_melted_LA3.drop_duplicates(inplace = True)
print(post_test_melted_LA3.shape)

In [None]:
# Spot check: display the age comparisons of model 'rq2_ss_str'.
post_test_melted_LA3.loc[(post_test_melted_LA3['Group'] == 'Age') &
                         (post_test_melted_LA3['model_id'] == 'rq2_ss_str'),]

In [None]:
# NOTE: same de-duplication as two cells above; if that cell has already
# run, both printed shapes are identical and nothing is removed here.
print(post_test_melted_LA3.shape)
post_test_melted_LA3.drop_duplicates(inplace = True)
print(post_test_melted_LA3.shape)

In [None]:
# Write the pairwise comparison table for LA3; the DataFrame index is written
# as the first CSV column (default to_csv behaviour).
post_test_melted_LA3.to_csv('Output/Fairness comparison LA3.csv')

In [None]:
# One value per (Group, model_id): the max of the boolean flag, i.e. True as
# soon as at least one pair of subgroups in that group has non-overlapping
# confidence intervals for that model.
fairness_final_LA3 = post_test_melted_LA3.groupby(['Group', 'model_id'])['Subgroups are significantly different according to a comparison of confidence intervals'].max()

In [None]:
# NOTE(review): unlike the other results, this file is written to the working
# directory rather than to 'Output/' - confirm this is intended.
fairness_final_LA3.to_csv('Bias metrics LA3.csv')

In [None]:
# Gender rows of model 'rq2_ts_all' for local authority 'LA4' (all columns
# are kept).
results_with_N_LA4_gender = results_with_N.loc[(results_with_N['LA'] == 'LA4') &
                  (results_with_N['model_id'] == 'rq2_ts_all') & 
                  (results_with_N['Group'] == 'Gender'),]



In [None]:
# Gender for LA4: pairwise comparison of the 95% confidence intervals of the
# gender subgroups, using the rows in `results_with_N_LA4_gender`.
# NOTE: this cell repeats the body of the LA3 comparison loop with a
# different input frame and a fixed group.
group = 'Gender'
# Empty square table (subgroup x subgroup), used only to enumerate every
# combination of two subgroup labels via melt.
post_test = pd.DataFrame(columns = group_names_dict[group], index = group_names_dict[group])
post_test.reset_index(inplace = True)
post_test_melted = pd.melt(post_test, id_vars=['index'], value_vars=group_names_dict[group])
post_test_melted.rename(columns = {'index': 'Subgroup 1', 'variable': 'Subgroup 2'}, inplace = True)
# A subgroup is never compared with itself.
post_test_melted = post_test_melted.loc[post_test_melted['Subgroup 1'] != post_test_melted['Subgroup 2'],]


# Only keep unique permutations
# (A, B) and (B, A) get the same sorted key, so only the first of the two is kept.
post_test_melted['Duplicated Subgroups'] = [sorted([a,b]) for a,b in zip(post_test_melted['Subgroup 1'], post_test_melted['Subgroup 2'])]
post_test_melted['Duplicated Subgroups'] = post_test_melted['Duplicated Subgroups'].astype(str)
post_test_melted.drop_duplicates(subset=['Duplicated Subgroups'], inplace=True)
post_test_melted.drop(columns = 'Duplicated Subgroups', inplace = True)
# Ordered categoricals make the sort follow the order of the label list.
post_test_melted['Subgroup 1'] = pd.Categorical(post_test_melted['Subgroup 1'], categories = group_names_dict[group], ordered = True)
post_test_melted['Subgroup 2'] = pd.Categorical(post_test_melted['Subgroup 2'], categories = group_names_dict[group], ordered = True)
post_test_melted.sort_values(by=['Subgroup 1', 'Subgroup 2'], inplace = True)
# Attach the bounds of 'Subgroup 1'. The first join is on the subgroup label
# only and brings in 'Group' and 'model_id'; the later joins also match on
# 'model_id'.
# NOTE(review): lower and upper limits are joined separately, so if a
# (Characteristic, model_id) combination occurred more than once the bounds
# of different source rows would be combined - confirm the keys are unique.
post_test_melted = post_test_melted.merge(results_with_N_LA4_gender[['Average precision score 95% CI (LL)', 'Group', 'Characteristic', 'model_id']],
                                          how = 'left', left_on = 'Subgroup 1', right_on = 'Characteristic')
post_test_melted = post_test_melted.merge(results_with_N_LA4_gender[['Average precision score 95% CI (UL)', 'Characteristic', 'model_id']],
                                          how = 'left', left_on = ['Subgroup 1', 'model_id'], right_on = ['Characteristic', 'model_id'])
post_test_melted.rename(columns = {'Average precision score 95% CI (LL)': 'Subgroup 1: Average precision score 95% CI (LL)',
                                    'Average precision score 95% CI (UL)': 'Subgroup 1: Average precision score 95% CI (UL)'},
                         inplace = True)

# The join keys coming from the right-hand frames are no longer needed.
post_test_melted.drop(columns = ['Characteristic_x', 'Characteristic_y'], inplace = True)
# Attach the bounds of 'Subgroup 2' for the same model.
post_test_melted = post_test_melted.merge(results_with_N_LA4_gender[['Average precision score 95% CI (LL)', 'Characteristic', 'model_id']],
                                      how = 'left', left_on = ['Subgroup 2', 'model_id'], right_on = ['Characteristic', 'model_id'])
post_test_melted = post_test_melted.merge(results_with_N_LA4_gender[['Average precision score 95% CI (UL)', 'Characteristic', 'model_id']],
                                      how = 'left', left_on = ['Subgroup 2', 'model_id'], right_on = ['Characteristic', 'model_id'])

post_test_melted.drop(columns = ['Characteristic_x', 'Characteristic_y'], inplace = True)
post_test_melted.rename(columns = {'Average precision score 95% CI (LL)': 'Subgroup 2: Average precision score 95% CI (LL)',
                                'Average precision score 95% CI (UL)': 'Subgroup 2: Average precision score 95% CI (UL)'},
                     inplace = True)

# True when the two intervals do not overlap, i.e. the lower limit of one
# subgroup lies above the upper limit of the other. Comparisons involving a
# missing bound evaluate to False.
post_test_melted['Subgroups are significantly different according to a comparison of confidence intervals'] = np.where((post_test_melted['Subgroup 1: Average precision score 95% CI (LL)'] >
                                                                                post_test_melted['Subgroup 2: Average precision score 95% CI (UL)']) |
                                                                                                                       (post_test_melted['Subgroup 2: Average precision score 95% CI (LL)'] >
                                                                                post_test_melted['Subgroup 1: Average precision score 95% CI (UL)']), True, False)


In [None]:
# Write the pairwise gender comparison for LA4; `post_test_melted` is the
# table built in the previous cell.
post_test_melted.to_csv('Output/Fairness comparison LA4.csv')

In [None]:
# One value per (Group, model_id): True as soon as at least one pair of
# gender subgroups has non-overlapping confidence intervals.
fairness_final_LA4_gender = post_test_melted.groupby(['Group', 'model_id'])['Subgroups are significantly different according to a comparison of confidence intervals'].max()

In [None]:
# NOTE(review): unlike the other results, this file is written to the working
# directory rather than to 'Output/' - confirm this is intended.
fairness_final_LA4_gender.to_csv('Bias metrics LA4 gender.csv')