# Creating Illustrations of the Comparisons between performance of different approaches to predictions on SPOT Dataset
## We create two sets of analyses: 1) For predictions with the Public Health budget item 2) For predictions without the Public Health budget items

In [57]:
# Imports
import pandas as pd
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
import glob
import re

In [67]:
# Define which experiments to analyse
dir_expirements = './output/spot'

# NOTE: Change this to decide which experiments to analyse
# get names of folders in a directory that match a pattern
experiment_names = sorted(glob.glob(dir_expirements+'/*/*'))
experiment_names = [name for name in experiment_names if ('summary' not in name) and ('figures' not in name) ]
experiment_names_WPH = [name for name in experiment_names if 'RPH' not in name] 
experiment_names_RPH = [name for name in experiment_names if 'RPH' in name]

In [68]:
experiment_names_RPH

['./output/spot/EleutherAI_gpt-neox-20b/_PSo_AGmv_POlmp_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSo_K2_AGmv_POlmp_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSpo_AGmv_POlmg_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSpo_AGmv_POlmp_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSpo_K2_AGmv_POlmg_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSpo_K2_AGmv_POlmp_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSyn_AGmv_POlmg_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSyn_AGmv_POlmp_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSyn_AGmv_POrb_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSyn_K2_AGmv_POlmg_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSyn_K2_AGmv_POlmp_DId_RPH',
 './output/spot/EleutherAI_gpt-neox-20b/_PSyn_K2_AGmv_POrb_DId_RPH',
 './output/spot/gpt35turbo/directly_RPH',
 './output/spot/gpt35turbo/indirectly_RPH',
 './output/spot/gpt35turbo/neutral_RPH']

In [69]:
# Define the column name for the label and column name for the prediction
target_column = 'label'
prediction_column = 'preds_aggregate'

## Testing Loading of predictions for one experiment
df_preds = pd.read_csv(f'{experiment_names_WPH[0]}/predictions.csv', keep_default_na=False)
targets = df_preds[target_column].tolist()
preds = df_preds[prediction_column].tolist()

In [70]:
len( df_preds['indicator'].unique() )

279

In [71]:
df_preds.columns

Index(['budget_item', 'id', 'indicator', 'type', 'label', 'preds_aggregate',
       'preds_ensemble_parsed', 'preds_ensemble', 'preds_prompts'],
      dtype='object')

In [72]:
# Silence UndefinedMetricWarning
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

def calc_metrics_for_experiment_group(experiment_names, budget_item=None ):
    li_exp_metrics = []

    # For each experiment name calculate the metrics: precision, recall, f1, accuracy
    for exp_name in experiment_names:

        # Get experiment predictions
        df_preds = pd.read_csv(f'{exp_name}/predictions.csv', keep_default_na=False)

        if budget_item is not None:
            df_preds = df_preds[df_preds['budget_item'] == budget_item]
        
        targets = df_preds[target_column].tolist()
        preds = df_preds[prediction_column].tolist()

        # formatting preds
        preds = [ p.strip('. ') for p in preds ]
        
        # Calculate metrics
        (prec_yes, prec_no), (recall_yes, recall_no), (f1_yes, f1_no), _ = precision_recall_fscore_support(targets, preds, labels=['Yes','No'], average=None)
        prec = (prec_yes + prec_no  ) /2 
        recall = (recall_yes + recall_no ) /2
        f1 = (f1_yes + f1_no ) /2

        
        acc = accuracy_score(targets, preds)

        # Store metrics
        li_exp_metrics.append( 
            {'exp_name':exp_name, 
                'precision_yes':prec_yes, 'precision':prec, 'precision_no':prec_no,
                'recall_yes':recall_yes, 'recall':recall, 'recall_no':recall_no,
                'f1_yes':f1_yes, 'f1_no':f1_no, 'f1':f1, 
                'accuracy':acc} )

    df_metrics = pd.DataFrame.from_records(li_exp_metrics)
    df_metrics = df_metrics.set_index('exp_name')
    df_metrics = df_metrics.sort_values('accuracy', ascending=False)

    return df_metrics

df_metrics_WPH = calc_metrics_for_experiment_group(experiment_names_WPH)
df_metrics_RPH = calc_metrics_for_experiment_group(experiment_names_RPH)


In [73]:
df_metrics_RPH.head()

Unnamed: 0_level_0,precision_yes,precision,precision_no,recall_yes,recall,recall_no,f1_yes,f1_no,f1,accuracy
exp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
./output/spot/gpt35turbo/directly_RPH,0.793103,0.795029,0.796954,0.781553,0.771845,0.762136,0.787286,0.779156,0.783221,0.771845
./output/spot/gpt35turbo/neutral_RPH,0.721774,0.788519,0.855263,0.868932,0.75,0.631068,0.788546,0.726257,0.757402,0.75
./output/spot/gpt35turbo/indirectly_RPH,0.659649,0.761276,0.862903,0.912621,0.716019,0.519417,0.765784,0.648485,0.707134,0.716019
./output/spot/EleutherAI_gpt-neox-20b/_PSyn_K2_AGmv_POrb_DId_RPH,0.521531,0.521849,0.522167,0.529126,0.521845,0.514563,0.525301,0.518337,0.521819,0.521845
./output/spot/EleutherAI_gpt-neox-20b/_PSyn_K2_AGmv_POlmg_DId_RPH,0.511962,0.512138,0.512315,0.519417,0.512136,0.504854,0.515663,0.508557,0.51211,0.512136


In [74]:
# Saving to file
def saving_to_file(dir_expirements, df_metrics, exp_name):
    os.makedirs(os.path.join(dir_expirements, 'summary'), exist_ok=True)
    df_metrics.to_csv( os.path.join(dir_expirements, 'summary', f'{exp_name}.csv') )

saving_to_file(dir_expirements, df_metrics_WPH, 'WPH')
saving_to_file(dir_expirements, df_metrics_RPH, 'RPH')

In [75]:
# Comparing the Performance with split by label
metric_names = [ 'accuracy','f1', 'precision', 'recall', 'f1_yes', 'precision_yes', 'recall_yes', 'f1_no', 'precision_no', 'recall_no' ]
metric_names_fmt = ['Accuracy', 'F1 Score', 'Recall', 'F1 Score Yes', 'Precision Yes', 'Recall - Yes', 'F1 Score - No', 'Precision - No', 'Recall - No' ]

os.makedirs( os.path.join(dir_expirements, 'figures') )
def image_top_n_metrics(df_metrics, dir_expirements, metric_names, metric_names_fmt=None, top_n=5, RPH=False):

    if metric_names_fmt is None:
        metric_names_fmt = metric_names
        
    df_metrics_top = df_metrics.sort_values('accuracy', ascending=False).head( top_n )

    x_axis_labels = {
        0: '1st',
        1: '2nd',
        2: '3rd',
        3: '4th',
        4: '5th',
    }

    for metric_name, metric_name_fmt in zip(metric_names, metric_names_fmt):
        values = df_metrics_top[metric_name]
        
        fig, ax = plt.subplots()
        ax.bar(range(len(values)), values)
        ax.set_xticks(range(len(values)))
        ax.set_xticklabels([x_axis_labels[val] for val in values])
        ax.set_title(metric_name)
        ax.set_xlabel('Values')
        ax.set_ylabel(metric_name)
        for i, v in enumerate(values):
            ax.text(i, v, f"{v:.2f}", ha='center')
        
        plt.savefig( os.path.join(dir_expirements, 'figures', f'{metric_name_fmt}_top_{top_n}'+ 'RPH'*RPH +'.png' ) )
        
        plt.show()

# image_top_n_metrics(df_metrics_WPH, dir_expirements, metric_names, metric_names_fmt, top_n=5, RPH=False)
image_top_n_metrics(df_metrics_RPH, dir_expirements, metric_names, metric_names_fmt, top_n=5, RPH=True)

FileExistsError: [Errno 17] File exists: './output/spot/figures'

In [None]:
# Calculating Performance metric with a analyses performed on data split by budget item value

WPH_budget_items = ['Public Health', 'Child Health', 'Healthcare', 'Mental Health',
       'Health Protection', 'Highways', 'Social Care - Adults',
       'Social Care - Child', 'Health Improvement', 'Education',
       'Drugs and Alcohol', 'Housing', 'Sexual Health', 'Tobacco Control',
       'Planning', 'Env & Reg']

RPH_budget_items = ['Child Health', 'Healthcare', 'Mental Health',
       'Health Protection', 'Highways', 'Social Care - Adults',
       'Social Care - Child', 'Health Improvement', 'Education',
       'Drugs and Alcohol', 'Housing', 'Sexual Health', 'Tobacco Control',
       'Planning', 'Env & Reg']

dict_budgetitem_metrics_WPH = {}
dict_budgetitem_metrics_RPH = {}

#Creating Metrics for each budget item
for budget_item in WPH_budget_items:
       dict_budgetitem_metrics_WPH[budget_item] = calc_metrics_for_experiment_group(experiment_names_WPH, budget_item=budget_item)

for budget_item in RPH_budget_items:
       dict_budgetitem_metrics_RPH[budget_item] = calc_metrics_for_experiment_group(experiment_names_RPH, budget_item=budget_item)

# Saving each metric table to file
for budget_item, metrics in dict_budgetitem_metrics_WPH.items():
       saving_to_file(dir_expirements, metrics, f'WPH_{budget_item}')

for budget_item, metrics in dict_budgetitem_metrics_RPH.items():
       saving_to_file(dir_expirements, metrics, f'RPH_{budget_item}')


