# Master Results

This notebook combines the results from all the local authorities (LAs) to give an overall picture of what the results tell us.

Illustrated with LA1 and LA2.

In [None]:
import os
# Change the working directory; relative paths passed to later calls are resolved against it.
# Replace [username] with your own user name before running this cell.
os.chdir('/Users/[username]/Documents') # insert [username]

In [None]:
import glob
import os
import pandas as pd
import re

def aggregate_results(file_path, result_name, col_names, la_name, pivot):
    '''Combine every "<result_name>*.csv" file found in file_path into one DataFrame.

    Each file contributes a 'model' label: its file name with the result_name
    prefix and the extension removed, e.g. 'scores_rq1_ss_str.csv' becomes
    'rq1_ss_str' when result_name = 'scores_'. The label is printed as each
    file is read.

    Parameters
    - file_path: directory that holds the result files
    - result_name: file-name prefix shared by the files to load
    - col_names: values in the first column that become columns upon pivoting.
      When non-empty, files are read without a header row; when empty, files
      are read with a header row and their first column as the index.
    - la_name: name or number of the LA (stored in an 'LA' column when pivot is True)
    - pivot: True when each file is structured as label / value pairs:

        Average precision    0.06
        AUC                  0.55
        F score (beta = 0.1) 0.15
        Precision            0.15
        Recall               0.14

      False when each file is already a table with one row per subgroup:

        Age_at_Referral_Start_cut  average_precision_score  false_discovery_rate  false_omission_rate
        10-15 years                0.213166898              0.366666667           0.042288557
        5-9 years                  0.343099185              0.255319149           0.041564792

    Returns
    - pivot is True: one row per file with the columns 'model', the col_names
      and 'LA'.
    - pivot is False: the rows of all files stacked, with a 'model' column
      added (no 'LA' column is added in this case).
    If no file matches, the returned DataFrame has no rows.
    '''
    prefix = os.path.basename(result_name)
    frames = []
    # Sorted so that the row order does not depend on the file system's listing order
    for file in sorted(glob.glob("{}/{}*.csv".format(file_path, result_name))):
        if col_names:
            df = pd.read_csv(file, header = None)
        else:
            df = pd.read_csv(file, index_col = 0)
        # Plain string handling rather than regular expressions: directory or
        # prefix names containing characters such as '(', '+' or '.' (or Windows
        # path separators) would otherwise leave the whole path in the label.
        file_n = os.path.splitext(os.path.basename(file))[0]
        if file_n.startswith(prefix):
            file_n = file_n[len(prefix):]
        print(file_n)
        df['model'] = file_n
        if pivot:
            # Turn the label / value rows into a single row indexed by the model label
            df = pd.pivot_table(df, index = 'model', columns = 0)
            df.columns = [col[1] for col in df.columns]
            df.index.name = None
        frames.append(df)

    if frames:
        # One concat at the end instead of growing a frame inside the loop
        df_all = pd.concat(frames, axis = 0)
        # Requested columns first, then any others in order of appearance
        ordered = list(col_names) + [col for col in df_all.columns if col not in col_names]
        df_all = df_all.reindex(columns = ordered)
    else:
        df_all = pd.DataFrame(columns = col_names)

    if not pivot:
        return df_all

    # After pivoting, the model label is the index: move it into a column
    df_all.reset_index(inplace = True)
    df_all.rename(columns = {'index': 'model'}, inplace = True)
    df_all['LA'] = la_name
    return df_all

## Model scores

In [None]:
# Metric labels found in the first column of each 'scores_*.csv' file
model_score_columns = [
    'AUC',
    'Average precision',
    'F score (beta = 0.1)',
    'Precision',
    'Recall',
]

# One row per model for each LA
model_score_la1 = aggregate_results(
    file_path='LA 1 Results',
    result_name='scores_',
    col_names=model_score_columns,
    la_name=1,
    pivot=True,
)
model_score_la2 = aggregate_results(
    file_path='Models/Scores',
    result_name='scores_',
    col_names=model_score_columns,
    la_name=2,
    pivot=True,
)

# Stack the model scores of both LAs into one table with a fresh 0..n-1 index
model_score_all = pd.concat([model_score_la1, model_score_la2], ignore_index=True)


In [None]:
# Extract information about the model
import numpy as np

# The model label is split on '_' into at most four parts, one column per part
model_split = (
    model_score_all['model']
    .str.split('_', n=3, expand=True)
    .rename(columns={
        0: 'Research Question Number',
        1: 'Cross-validation Method',
        2: 'Data Included',
        3: 'Test or Holdout',
    })
)
model_score_all = model_score_all.merge(model_split, left_index=True, right_index=True)

# At LA1, we used test and holdout data, so LA1 rows keep the value parsed from
# the model label; every LA2 row is labelled 'holdout'.
model_score_all['Test or Holdout'] = np.where(
    model_score_all['LA'] == 2,
    'holdout',
    model_score_all['Test or Holdout'],
)

In [None]:
# Remove the raw 'model' label if it is present, save the table, then display it
model_score_all = model_score_all.drop(columns=['model'], errors='ignore')
model_score_all.to_csv('All model scores LA1 and LA2.csv')
model_score_all

In [None]:
### Comparison of means

def _mean_average_precision(column, value):
    '''Mean 'Average precision' (2 d.p.) over the rows of model_score_all where column equals value.'''
    selected = model_score_all.loc[model_score_all[column] == value, 'Average precision']
    return round(selected.mean(), 2)

# LA
average_precision_la_1 = _mean_average_precision('LA', 1)
average_precision_la_2 = _mean_average_precision('LA', 2)
print(average_precision_la_1)
print(average_precision_la_2)

# Cross-validation method
average_precision_cross_val_ss = _mean_average_precision('Cross-validation Method', 'ss')
average_precision_cross_val_ts = _mean_average_precision('Cross-validation Method', 'ts')
print(average_precision_cross_val_ss)
print(average_precision_cross_val_ts)

# Data included
average_precision_data_included_str = _mean_average_precision('Data Included', 'str')
average_precision_data_included_all = _mean_average_precision('Data Included', 'all')
print(average_precision_data_included_str)
print(average_precision_data_included_all)

# Single-row summary table: one column per grouping
aggregate_average_precision = pd.DataFrame(
    data={
        'Local authority 1': average_precision_la_1,
        'Local authority 2': average_precision_la_2,
        'Contemporary cross-validation': average_precision_cross_val_ss,
        'Cross-validation over time': average_precision_cross_val_ts,
        'Using just structured data': average_precision_data_included_str,
        'Using structured and text data': average_precision_data_included_all,
    },
    index=[0],
)

aggregate_average_precision.to_csv('Aggregate_model_scores_LAs1and2.csv')

In [None]:
# Display the mean average precision by LA, cross-validation method and data included
aggregate_average_precision

In [None]:
### Does including all data strictly dominate using structured data?
# For every LA / cross-validation method / research question, compare the model
# using structured + text data ('all') with the structured-only model ('str'),
# metric by metric. True means 'all' scores at least as well (>=) as 'str'.

scores = ['AUC', 'Average precision', 'F score (beta = 0.1)',
           'Precision', 'Recall']
data_included_comparisons = []
for la in [1,2]:
    for cv in ['ss', 'ts']:
        for rq in ['rq1', 'rq2']:
            same_setup = ((model_score_all['LA'] == la) &
                          (model_score_all['Cross-validation Method'] == cv) &
                          (model_score_all['Research Question Number'] == rq))

            # Sort by 'Test or Holdout' before pairing rows by position, so that a
            # test score is never compared with a holdout score just because the
            # result files happened to be read in a different order.
            model_str = model_score_all.loc[same_setup & (model_score_all['Data Included'] == 'str')]
            model_str = model_str.sort_values(by = 'Test or Holdout', kind = 'mergesort')[scores]
            model_str = model_str.reset_index(drop = True)

            model_all = model_score_all.loc[same_setup & (model_score_all['Data Included'] == 'all')]
            model_all = model_all.sort_values(by = 'Test or Holdout', kind = 'mergesort')[scores]
            model_all = model_all.reset_index(drop = True)

            # Nothing to compare when either side was not modelled for this combination
            if model_str.empty or model_all.empty:
                continue
            # Rows can only be paired when both sides have the same number of results
            if len(model_str) != len(model_all):
                print('Skipping LA {} {} {}: {} str row(s) vs {} all row(s)'.format(
                    la, cv, rq, len(model_str), len(model_all)))
                continue

            # Check whether all dominates str
            data_included_comparison = model_all >= model_str
            data_included_comparison['LA'] = la
            data_included_comparison['Cross-validation Method'] = cv
            data_included_comparison['Research Question Number'] = rq
            data_included_comparisons.append(data_included_comparison)

# Concatenate once at the end; keep an empty table when there was nothing to compare
if data_included_comparisons:
    data_included_comparison_all = pd.concat(data_included_comparisons, axis = 0, ignore_index = True)
else:
    data_included_comparison_all = pd.DataFrame()
data_included_comparison_all.to_csv('Comparison of data included LA1 and LA2.csv')

In [None]:
# True where the model using structured and text data scores at least as well (>=)
# as the model using just the structured data; a tie therefore also shows as True
data_included_comparison_all

In [None]:
# Comparison of cross-validation methods
# For every LA / data included / research question, compare the model
# cross-validated contemporaneously ('ss') with the one cross-validated over
# time ('ts'), metric by metric. True means 'ss' scores at least as well (>=) as 'ts'.
scores = ['AUC', 'Average precision', 'F score (beta = 0.1)',
           'Precision', 'Recall']
cross_validation_comparisons = []
for la in [1,2]:
    for data in ['str', 'all']:
        for rq in ['rq1', 'rq2']:
            same_setup = ((model_score_all['LA'] == la) &
                          (model_score_all['Research Question Number'] == rq) &
                          (model_score_all['Data Included'] == data))

            # Sort by 'Test or Holdout' before pairing rows by position, so that a
            # test score is never compared with a holdout score just because the
            # result files happened to be read in a different order.
            model_ss = model_score_all.loc[same_setup & (model_score_all['Cross-validation Method'] == 'ss')]
            model_ss = model_ss.sort_values(by = 'Test or Holdout', kind = 'mergesort')[scores]
            model_ss = model_ss.reset_index(drop = True)

            model_ts = model_score_all.loc[same_setup & (model_score_all['Cross-validation Method'] == 'ts')]
            model_ts = model_ts.sort_values(by = 'Test or Holdout', kind = 'mergesort')[scores]
            model_ts = model_ts.reset_index(drop = True)

            # Nothing to compare when either side was not modelled for this combination
            if model_ss.empty or model_ts.empty:
                continue
            # Rows can only be paired when both sides have the same number of results
            if len(model_ss) != len(model_ts):
                print('Skipping LA {} {} {}: {} ss row(s) vs {} ts row(s)'.format(
                    la, data, rq, len(model_ss), len(model_ts)))
                continue

            # Check whether ss is at least as good as ts
            cross_validation_comparison = model_ss >= model_ts
            cross_validation_comparison['LA'] = la
            cross_validation_comparison['Data Included'] = data
            cross_validation_comparison['Research Question Number'] = rq
            cross_validation_comparisons.append(cross_validation_comparison)

# Concatenate once at the end; keep an empty table when there was nothing to compare
if cross_validation_comparisons:
    cross_validation_comparison_all = pd.concat(cross_validation_comparisons, axis = 0, ignore_index = True)
else:
    cross_validation_comparison_all = pd.DataFrame()

cross_validation_comparison_all.to_csv('Comparison of cross-validation methods LA1 and LA2.csv')

In [None]:
# True = cross-validating contemporaneously ('ss') scores at least as well (>=) as
# cross-validating over time ('ts'); a tie therefore also shows as True
cross_validation_comparison_all

## Intuitive Results

In [None]:
# Metric labels found in the first column of each 'Intuitive metrics *.csv' file
intuitive_metrics_columns = [
    '% of risky cases in top 10%',
    '% of safe cases in bottom 10%',
    'Number of true positives in 1000 cases',
    'Number of true negatives in 1000 cases',
    'Number of false positives in 1000 cases',
    'Number of false negatives in 1000 cases',
]

intuitive_results_la1 = aggregate_results(
    file_path='LA 1 Results',
    result_name='Intuitive metrics ',
    col_names=intuitive_metrics_columns,
    la_name=1,
    pivot=True,
)
# NOTE(review): this path starts with '/', whereas the LA2 model scores are read
# from the relative path 'Models/Scores' - confirm which location is intended.
intuitive_results_la2 = aggregate_results(
    file_path='/Models/Scores',
    result_name='Intuitive metrics ',
    col_names=intuitive_metrics_columns,
    la_name=2,
    pivot=True,
)

# The LA1 table is loaded but not used below: the concatenation is commented
# out, so only the LA2 results feed the following cells.
#intuitive_results_all = pd.concat([intuitive_results_la1, intuitive_results_la2], axis = 0, ignore_index = True)
intuitive_results_all = intuitive_results_la2

In [None]:
# Extract information about the model
import numpy as np

# Split the model label on '_' and name the first three parts
model_split = (
    intuitive_results_all['model']
    .str.split('_', n=3, expand=True)
    .rename(columns={
        0: 'Research Question Number',
        1: 'Cross-validation Method',
        2: 'Data Included',
    })
)
intuitive_results_all = intuitive_results_all.merge(model_split, left_index=True, right_index=True)

# Only the counts per 1000 cases are kept; the two percentage metrics are removed
intuitive_results_all = intuitive_results_all.drop(
    columns=['% of risky cases in top 10%', '% of safe cases in bottom 10%'],
)

In [None]:
# Number of false / true positives and negatives in 1000 cases, one row per model
# (the two percentage metrics have been dropped from this table)
intuitive_results_all

In [None]:
# Look at ratios of False Positives to True Positives
# One output row per LA / data included / cross-validation method / research
# question for which exactly one model result exists.
ratio_columns = ['LA', 'Research Question Number', 'Cross-validation Method',
                 'Data Included', 'Ratio FP / TP']
comparison_rows = []

for la in [1,2]:
    for data in ['str', 'all']:
        for cv in ['ss', 'ts']:
            for rq in ['rq1', 'rq2']:
                mask = ((intuitive_results_all['LA'] == la) &
                        (intuitive_results_all['Cross-validation Method'] == cv) &
                        (intuitive_results_all['Research Question Number'] == rq) &
                        (intuitive_results_all['Data Included'] == data))
                matched = intuitive_results_all.loc[mask]

                # Skip combinations that were not modelled, or that match more
                # than one row and so have no single ratio.
                if len(matched) != 1:
                    continue

                false_positives = matched['Number of false positives in 1000 cases'].iloc[0]
                true_positives = matched['Number of true positives in 1000 cases'].iloc[0]
                # Skip rows where either count is missing
                if pd.isna(false_positives) or pd.isna(true_positives):
                    continue

                # Counts are truncated to whole cases before taking the ratio
                false_positives = int(false_positives)
                true_positives = int(true_positives)
                # The ratio is undefined without any true positives
                if true_positives == 0:
                    print('Skipping LA {} {} {} {}: no true positives'.format(la, rq, cv, data))
                    continue

                comparison_rows.append({'LA': la,
                                        'Research Question Number': rq,
                                        'Cross-validation Method': cv,
                                        'Data Included': data,
                                        'Ratio FP / TP': round(false_positives / true_positives, 2)})

# Build the table once; the column list keeps the layout even when there are no rows
comparison_FP_TP_all = pd.DataFrame(comparison_rows, columns = ratio_columns)
comparison_FP_TP_all.sort_values(by = 'Ratio FP / TP', inplace = True)
# NOTE(review): the file name says "False Negatives" but the table holds the
# FP / TP ratio; the name is kept so existing outputs are not renamed.
comparison_FP_TP_all.to_csv('Comparison of False Positives to False Negatives LA2.csv')

In [None]:
# Rows are sorted by 'Ratio FP / TP' in ascending order
# Want the ratio to be as close to 0 as possible
# Where Ratio FP / TP > 1, a case identified as likely to escalate is more likely to have been falsely identified
comparison_FP_TP_all

## Fairness

In [None]:
## LA1
fairness_gender_la1 = aggregate_results(
    file_path='LA 1 Results',
    result_name='Fairness_metrics_Gender_',
    col_names=[],
    la_name=1,
    pivot=False,
)
fairness_age_la1 = aggregate_results(
    file_path='LA 1 Results',
    result_name='Fairness_metrics_Age_at_Referral_Start_cut_',
    col_names=[],
    la_name=1,
    pivot=False,
)
fairness_disabled_la1 = aggregate_results(
    file_path='LA 1 Results',
    result_name='Fairness_metrics_Disabled_',
    col_names=[],
    la_name=1,
    pivot=False,
)
fairness_ethnicity_la1 = aggregate_results(
    file_path='LA 1 Results',
    result_name='Fairness_metrics_Ethnicity_grouped_',
    col_names=[],
    la_name=1,
    pivot=False,
)

# With pivot=False no 'LA' column is added by aggregate_results, so add it here,
# then drop the rows that have NA in any of the three metrics
for _la1_frame in (fairness_gender_la1, fairness_age_la1,
                   fairness_disabled_la1, fairness_ethnicity_la1):
    _la1_frame['LA'] = 1
    _la1_frame.dropna(
        subset=['average_precision_score', 'false_discovery_rate', 'false_omission_rate'],
        inplace=True,
    )

# Rename so names are same for all LAs
fairness_age_la1.rename(columns={'Age_at_Referral_Start_cut': 'Age_cut'}, inplace=True)



In [None]:
## LA2
# NB ethnicity not available
fairness_gender_la2 = aggregate_results(
    file_path='/Models',
    result_name='Fairness_metrics_Gender_',
    col_names=[],
    la_name=2,
    pivot=False,
)
fairness_age_la2 = aggregate_results(
    file_path='/Models',
    result_name='Fairness_metrics_Age_cut_',
    col_names=[],
    la_name=2,
    pivot=False,
)
fairness_disabled_la2 = aggregate_results(
    file_path='/Models',
    result_name='Fairness_metrics_Disabled_',
    col_names=[],
    la_name=2,
    pivot=False,
)

# Add the LA number, then drop rows with NA in the subgroup column or in any metric
for _la2_frame, _group_column in ((fairness_gender_la2, 'Gender'),
                                  (fairness_age_la2, 'Age_cut'),
                                  (fairness_disabled_la2, 'Disabled')):
    _la2_frame['LA'] = 2
    _la2_frame.dropna(
        subset=[_group_column, 'average_precision_score',
                'false_discovery_rate', 'false_omission_rate'],
        inplace=True,
    )

# Rows whose age band is the literal value 'missing' are excluded as well
fairness_age_la2 = fairness_age_la2.loc[fairness_age_la2['Age_cut'] != 'missing']


In [None]:
# Bring together for LA1 and LA2: stack the two tables and renumber the rows from 0
fairness_gender = pd.concat([fairness_gender_la1, fairness_gender_la2], ignore_index=True)

fairness_age = pd.concat([fairness_age_la1, fairness_age_la2], ignore_index=True)

fairness_disabled = pd.concat([fairness_disabled_la1, fairness_disabled_la2], ignore_index=True)

# Ethnicity not available for LA2, so the LA1 table is used as it is
fairness_ethnicity = fairness_ethnicity_la1

In [None]:
def add_fairness_rankings(df_all, subgroup_name, las = (1, 2),
                          models = ('rq1_ss_str', 'rq1_ts_str', 'rq2_ts_str', 'rq2_ss_str')):
    '''
    Subsets the data by LA and model and then ranks the performance metric of each subgroup.
    Finds the median ranking for each subgroup when all models are considered together.
    Low ranks are good (rank 1 is the best-performing subgroup for a metric).

    Parameters
    - df_all takes in the output of aggregate_results where result_name = 'Fairness_metrics_*'.
      Needs the columns 'model', 'LA', 'average_precision_score',
      'false_discovery_rate' and 'false_omission_rate'. It is not modified.
    - subgroup_name: name of the group e.g. Gender. Needs to match the column name in df_all
    - las: LA values to include (defaults to LA 1 and LA 2)
    - models: model labels to include (defaults to the four structured-data models)

    Returns
    - DataFrame with one row per subgroup and the columns subgroup_name,
      'median_average_precision_score_rank', 'median_false_discovery_rate_rank'
      and 'median_false_omission_rate_rank', sorted by the first median
      (best first). It has no rows when nothing matches las / models.
    '''
    # metric -> whether a lower value should receive the lower (i.e. better) rank
    rank_ascending = {
        # Average Precision: higher is better => descending
        'average_precision_score': False,
        # False Discovery Rate: lower is better => ascending
        'false_discovery_rate': True,
        # False Omission Rate: lower is better => ascending
        'false_omission_rate': True,
    }
    rank_columns = [metric + '_rank' for metric in rank_ascending]

    ranked_frames = []
    for la in las:
        for model in models:
            # Copy so the rank columns are added to an independent frame rather
            # than to a slice of df_all
            df_fair = df_all.loc[(df_all['model'] == model) & (df_all['LA'] == la)].copy()
            if df_fair.empty:
                continue
            for metric, ascending in rank_ascending.items():
                df_fair[metric + '_rank'] = df_fair[metric].rank(ascending = ascending)
            ranked_frames.append(df_fair)

    if not ranked_frames:
        return pd.DataFrame(columns = [subgroup_name] + ['median_' + col for col in rank_columns])

    # The medians are computed once, after every LA / model subset has been ranked
    fairness_rankings_all = pd.concat(ranked_frames, axis = 0)
    fairness_rankings_all_median = fairness_rankings_all.groupby([subgroup_name])[rank_columns].median()
    fairness_rankings_all_median = fairness_rankings_all_median.add_prefix('median_')
    fairness_rankings_all_median.reset_index(inplace = True)
    fairness_rankings_all_median.sort_values(by = 'median_average_precision_score_rank', inplace = True)
    return fairness_rankings_all_median

In [None]:
# Median rank of each subgroup across the models; assumes low ranks are good
fairness_gender_w_rankings = add_fairness_rankings(fairness_gender, 'Gender')
fairness_age_w_rankings = add_fairness_rankings(fairness_age, 'Age_cut')
fairness_disabled_w_rankings = add_fairness_rankings(fairness_disabled, 'Disabled')
fairness_ethnicity_w_rankings = add_fairness_rankings(fairness_ethnicity, 'Ethnicity_grouped')

# Save each ranking table to its own CSV file
for _rankings, _csv_name in (
    (fairness_gender_w_rankings, 'Fairness - median ranking by subgroup Gender.csv'),
    (fairness_age_w_rankings, 'Fairness - median ranking by subgroup Age.csv'),
    (fairness_disabled_w_rankings, 'Fairness - median ranking by subgroup Disabled.csv'),
    (fairness_ethnicity_w_rankings, 'Fairness - median ranking by subgroup Ethnicity.csv'),
):
    _rankings.to_csv(_csv_name)

## Prediction Intervals

In [None]:
# Stack every LA2 prediction-interval summary file into one table (files are read as they are, without pivoting)
prediction_intervals_la2 = aggregate_results(
    file_path='/Models/Prediction Intervals',
    result_name='Prediction intervals - max, ave',
    col_names=[],
    la_name=2,
    pivot=False,
)


In [None]:
# Mean, over all models, of (a) the average width of the prediction interval and
# (b) the width of the prediction interval at the threshold value, to 4 d.p.
# Gives an indication of how precise the prediction probabilities are
average_prediction_intervals_la2 = pd.DataFrame()
for _source_column, _summary_column in (
    ('Average width of prediction interval',
     'Average average width of prediction interval'),
    ('Width of prediction interval at threshold value',
     'Average width of prediction interval at threshold value'),
):
    average_prediction_intervals_la2.loc[0, _summary_column] = round(
        prediction_intervals_la2[_source_column].mean(), 4)
average_prediction_intervals_la2.to_csv('Average prediction intervals LA2.csv')