# Analyses for the SPOT Alignment Experiments

In [3]:
# Imports
import pandas as pd
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
import glob
import re
import yaml
# Root folder scanned by load_data(): every entry directly below it is treated
# as one experiment run. Relative path, so it is resolved against the current
# working directory of the notebook kernel.
dir_expirements = '../output/spot/exp_runs'
# Output locations for figures and tables. Not referenced by any code in the
# visible part of this notebook -- presumably used when exporting results.
dir_figures = '../output/spot/figures'
dir_tables = '../output/spot/tables'

#### Factors to Compare Models on
- All models and variants comparison
- Comparing effect of 'prompt_style' and 'parse_style' : ['yes_no','open','categorise','cot_categorise']  and ['rules','categories_rules', 'categories_perplexity']
- Effect Type : ['indirectly','directly','arbitrary']
- 'model parameter count' : judged by model name
- finetuned : [True, False]
- unbias predictions

#### TODOs Diagrams/Tables to Produce
- 1) Tables: Accuracy, F1, Precision and Recall (of non-finetuned and finetuned models with varied parse_style_prompt_style but effect_type fixed to 'directly')
- 2) [Probabilistic predictions]: AUC-ROC, Brier Score (of non-finetuned and finetuned models with varied parse_style_prompt_style but effect_type fixed to 'indirectly' )
- 3) Performance cross sectioned by the budget item category
- 4) Performance cross sectioned by the budget item pre and post finetuning - in relation to the dataset distribution, e.g. there are more articles on 'health' than 'education', so we expect a larger performance increase on 'health' than on 'education'. Also the possible effects of this on the downstream model, e.g. are effects with more academic research better modelled? If there are correlated factors, we will essentially learn the better-studied factors more
- 5) Performance change in top performing models when effect type is rotated between effect types
- 6) Table showing performance change w/ and w/o the unbiased-predictions flag (`unbias_categorisations`) for top models


### Helper Functions

In [60]:
import json
import pandas as pd
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# import roc curve printing, roc_auc_score, brier_score_loss
from sklearn.metrics import roc_curve, auc, roc_auc_score, brier_score_loss
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
import matplotlib.pyplot as plt
import operator
import copy
import numpy as np

def load_data() -> list[dict]:
    """Load every experiment run found directly under ``dir_expirements``.

    Each entry below ``dir_expirements`` is expected to contain:

    * ``config.yaml`` -- the run configuration, parsed with ``yaml.safe_load``
    * ``predictions_b2i.csv`` -- the predictions, read with ``pd.read_csv``.
      The original notes describe its columns as
      budget_item,indicator,label,pred_aggregated,prompts,predictions,discourse

    Returns:
        One dictionary per run, in sorted path order, of the form
        ``{'config': <parsed yaml>, 'results': <DataFrame>}``.
    """

    def _read_run(run_dir):
        # Config is read first, then the predictions, one run at a time.
        with open(os.path.join(run_dir, 'config.yaml')) as config_file:
            run_config = yaml.safe_load(config_file)
        predictions = pd.read_csv(os.path.join(run_dir, 'predictions_b2i.csv'))
        return {'config': run_config, 'results': predictions}

    run_dirs = glob.glob(dir_expirements + '/*')
    run_dirs.sort()
    return [_read_run(run_dir) for run_dir in run_dirs]

def filter_exp( li_exp, **filter_kwargs ) -> list[dict]:
    """Return the experiments whose config matches every given filter.

    Args:
        li_exp: list of experiment dicts, each holding a ``'config'`` mapping.
        **filter_kwargs:
            llm_names: optional list of name fragments (a single string is
                also accepted). An experiment passes when ANY fragment is a
                substring of ``config['llm_name']``; the comparison is
                case-insensitive on both sides. ``None`` means "no name filter".
            <config key>=<allowed values>: for every other keyword the
                experiment passes only when ``config.get(key)`` is contained
                in the given collection. A key missing from the config is
                compared as ``None``, so e.g. ``unbias_categorisations=[False]``
                drops runs whose config never set that key.

    Returns:
        A new list holding the matching experiment dicts (same objects, same
        order as ``li_exp``). ``li_exp`` itself is not modified.
    """
    name_fragments = filter_kwargs.pop('llm_names', None)
    if isinstance(name_fragments, str):
        # A bare string would otherwise be iterated character by character.
        name_fragments = [name_fragments]
    if name_fragments is not None:
        # The config name is lower-cased below, so the fragments must be too;
        # otherwise llm_names=['GPT'] could never match anything.
        name_fragments = [str(fragment).lower() for fragment in name_fragments]

    def _matches(exp) -> bool:
        config = exp['config']
        if name_fragments is not None:
            llm_name = config['llm_name'].lower()
            if not any(fragment in llm_name for fragment in name_fragments):
                return False
        # Every remaining filter must accept the config value.
        return all(config.get(key) in allowed for key, allowed in filter_kwargs.items())

    return [exp for exp in li_exp if _matches(exp)]

# Both spellings occur in this notebook; either one requests the ROC-AUC,
# which is always stored under the key 'roc_auc'.
_ROC_AUC_ALIASES = ('roc_auc', 'auc_roc')
# Metrics that need the per-row probability columns.
_PROBABILISTIC_METRICS = _ROC_AUC_ALIASES + ('brier_score', 'calibration_error')


def _add_prediction_columns(results, tgt_col, pred_col_dict, pred_col_label, with_probabilities):
    """Add the hard-label and (optionally) probability columns to *results* in place."""
    # NOTE(security): eval() executes whatever text is stored in the CSV. That
    # is acceptable for locally generated experiment output only; do not point
    # this at files from an untrusted source.
    parsed = [eval(text) for text in results[pred_col_dict]]

    # Hard prediction = label with the highest score (first one wins on ties).
    results[pred_col_label] = [max(scores, key=scores.get) for scores in parsed]

    if not with_probabilities:
        return

    results['pred_yes_prob'] = [scores['Yes'] for scores in parsed]
    results['pred_no_prob'] = [scores['No'] for scores in parsed]
    # One-hot encoding of the target so it can be compared with probabilities.
    results[tgt_col + '_yes_prob'] = (results[tgt_col] == 'Yes').astype(float)
    results[tgt_col + '_no_prob'] = (results[tgt_col] == 'No').astype(float)


def _compute_metrics(results, metrics, average_type, tgt_col, pred_col_label):
    """Compute the requested metrics for one frame and return them as a dict."""
    computed = {}
    y_true = results[tgt_col]
    y_pred = results[pred_col_label]

    if 'accuracy' in metrics:
        computed['accuracy'] = accuracy_score(y_true, y_pred)

    fpr_metrics = [name for name in ('f1', 'precision', 'recall') if name in metrics]
    if fpr_metrics:
        precision, recall, f1, _support = precision_recall_fscore_support(
            y_true,
            y_pred,
            labels=['Yes'],
            # pos_label is only consulted for average='binary'; 1 is sklearn's
            # default and keeps it quiet for every other averaging mode.
            pos_label='Yes' if average_type == 'binary' else 1,
            average=average_type,
            zero_division=np.nan,
        )
        scores = {'f1': f1, 'precision': precision, 'recall': recall}
        computed.update({name: scores[name] for name in fpr_metrics})

    if any(alias in metrics for alias in _ROC_AUC_ALIASES):
        y_true_prob = results[tgt_col + '_yes_prob']
        if y_true_prob.nunique() < 2:
            # ROC-AUC is undefined when only one class is present.
            computed['roc_auc'] = np.nan
        else:
            computed['roc_auc'] = roc_auc_score(y_true_prob, results['pred_yes_prob'])

    if 'brier_score' in metrics:
        computed['brier_score'] = brier_score_loss(results[tgt_col + '_yes_prob'],
                                                   results['pred_yes_prob'])

    if 'calibration_error' in metrics:
        # Mean absolute gap between the 0/1 target and the predicted P('Yes').
        computed['calibration_error'] = np.mean(
            np.abs(results[tgt_col + '_yes_prob'] - results['pred_yes_prob']))

    return computed


def calc_eval_metrics( li_exp, metrics:list = ['accuracy','precision','recall','f1','auc_roc','brier_score'], breakdown_by_budgetitem:bool=False, average_type=None ) -> list[dict]:
    """Compute evaluation metrics for every experiment in ``li_exp``.

    The input list is deep-copied first, so the caller's experiments and
    DataFrames are never modified.

    Args:
        li_exp: list of experiment dicts with a ``'results'`` DataFrame that
            has a ``'related'`` target column ('Yes'/'No') and a
            ``'pred_aggregated'`` column holding the string form of a dict
            mapping 'Yes'/'No' to a score.
        metrics: any of 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'
            (alias 'auc_roc'), 'brier_score', 'calibration_error'. The list is
            only read, never modified.
        breakdown_by_budgetitem: when True, metrics are computed separately
            for each value of the ``'budget_item'`` column and stored as
            ``'<metric>_<budget_item>'``; rows with a missing budget item are
            skipped.
        average_type: forwarded as ``average`` to
            ``precision_recall_fscore_support``. With ``None`` the
            precision/recall/f1 values are length-1 arrays (one entry for the
            'Yes' label) rather than scalars.

    Returns:
        The copied experiments. Each has a ``'metrics'`` dict (created if
        absent, updated otherwise) and its ``'results'`` frame carries the
        derived ``'pred_label'`` column plus, for probabilistic metrics,
        ``'pred_yes_prob'``, ``'pred_no_prob'``, ``'related_yes_prob'`` and
        ``'related_no_prob'``.
    """

    li_exp = copy.deepcopy(li_exp)

    tgt_col = 'related'
    pred_col_dict = 'pred_aggregated'
    pred_col_label = 'pred_label' # For deterministic evaluation

    needs_probabilities = any(name in metrics for name in _PROBABILISTIC_METRICS)

    for exp in li_exp:
        results = exp['results']
        _add_prediction_columns(results, tgt_col, pred_col_dict, pred_col_label, needs_probabilities)

        exp_metrics = exp.setdefault('metrics', {})

        if not breakdown_by_budgetitem:
            exp_metrics.update(
                _compute_metrics(results, metrics, average_type, tgt_col, pred_col_label))
            continue

        # sort=False keeps budget items in order of first appearance.
        for budget_item, subset in results.groupby('budget_item', sort=False):
            per_item = _compute_metrics(subset, metrics, average_type, tgt_col, pred_col_label)
            exp_metrics.update({f'{name}_{budget_item}': value for name, value in per_item.items()})

    return li_exp

def convert_li_exp_to_df(li_exp: list) -> pd.DataFrame:
    """Flatten a list of evaluated experiments into one DataFrame.

    Each row combines an experiment's ``'metrics'`` dict with the identifying
    fields of its ``'config'``: llm_name, edge_value, effect_type, finetuned,
    parse_style, prompt_style and ``uc`` (``config['unbias_categorisations']``,
    ``False`` when the key is absent). The ``llm_name`` column is placed first.

    The experiments are only read: ``exp['metrics']`` is copied before the
    config fields are merged in.

    Args:
        li_exp: list of dicts with ``'metrics'`` and ``'config'`` entries.

    Returns:
        One row per experiment; an empty DataFrame when ``li_exp`` is empty.

    Raises:
        KeyError: if an experiment lacks ``'metrics'`` or one of the required
            config fields.
    """
    rows = []
    for exp in li_exp:
        config = exp['config']

        # Copy so the caller's metrics dict is not polluted with config fields.
        row = dict(exp['metrics'])
        row.update({'llm_name': config['llm_name'],
                    'edge_value': config['edge_value'],
                    'effect_type': config['effect_type'],
                    'finetuned': config['finetuned'],
                    'parse_style': config['parse_style'],
                    'prompt_style': config['prompt_style'],
                    'uc': config.get('unbias_categorisations', False)
                    })
        rows.append(row)

    df = pd.DataFrame(rows)

    # No experiments -> no columns to reorder.
    if 'llm_name' not in df.columns:
        return df

    # Put the llm_name column first
    other_columns = [column for column in df.columns if column != 'llm_name']
    return df.reindex(columns=['llm_name'] + other_columns)

def create_diagrams_from_dataframe(df_exp: pd.DataFrame,
                                   columns_to_create_diagrams_for=['accuracy','roc_auc','brier_score','precision','recall','f1'],
                                   save_dir='./prompt_engineering/analysis/spot_output',
                                   exp_name='CompareAll') -> dict:
    """Save one 20-bin histogram PNG per requested column of ``df_exp``.

    Files are written to ``<save_dir>/<exp_name>/<column>_histogram.png``; the
    directory is created when missing. Requested columns that are not present
    in ``df_exp`` are skipped silently.

    Returns:
        Mapping of column name -> path of the PNG written for it.
    """
    output_dir = os.path.join(save_dir, exp_name)
    os.makedirs(output_dir, exist_ok=True)

    present_columns = [name for name in columns_to_create_diagrams_for
                       if name in df_exp.columns]

    histogram_paths = {}
    for metric_name in present_columns:
        target_path = os.path.join(output_dir, f'{metric_name}_histogram.png')

        # Fresh figure per column; the histogram is drawn on its current axes.
        plt.figure(figsize=(10, 6))
        df_exp[metric_name].hist(bins=20)
        plt.title(f'Histogram of {metric_name}')
        plt.xlabel(metric_name)
        plt.ylabel('Frequency')

        plt.savefig(target_path)
        histogram_paths[metric_name] = target_path

        # Release the figure so repeated calls do not accumulate open plots.
        plt.close()

    return histogram_paths

## Analyses

In [5]:
# Load every SPOT experiment run once; the analysis cells below filter this list.
li_all_experiments = load_data()

### 0) Determining the best model for each parameter size

In [31]:
# Best GPT model: evaluate every run whose llm_name contains 'gpt' and rank by accuracy.
li_exps_filtgpt = filter_exp(li_all_experiments, llm_names=['gpt'])
li_exps_filtgpt_w_res = calc_eval_metrics(
    li_exps_filtgpt,
    metrics=['accuracy','precision','recall','f1','calibration_error','roc_auc'],
    average_type='macro',
)
# Highest accuracy first
df_exps_filtgpt = convert_li_exp_to_df(li_exps_filtgpt_w_res).sort_values(by=['accuracy'], ascending=False)
display(df_exps_filtgpt.head(4))

Unnamed: 0,llm_name,accuracy,f1,precision,recall,roc_auc,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
0,gpt-3.5-turbo,0.815534,0.829932,0.778723,0.88835,0.817961,0.182039,binary_weight,directly,False,categories_rules,open,
1,gpt-3.5-turbo,0.764563,0.79476,0.722222,0.883495,0.771845,0.228155,binary_weight,directly,False,rules,yes_no,


In [63]:
# Varying effect type: everything else is pinned (categories_perplexity /
# categorise, not finetuned, no unbiasing), so the rows differ in effect_type.

# 7bn models
li_exps_filt7b = filter_exp(
    li_all_experiments,
    llm_names=['7b'],
    parse_style=['categories_perplexity'],
    prompt_style=['categorise'],
    finetuned=[False],
    unbias_categorisations=[False],
)
li_exps_filt7b_w_res = calc_eval_metrics(
    li_exps_filt7b,
    metrics=['accuracy','precision','recall','f1', 'calibration_error'],
    average_type='binary',
)
# Highest accuracy first
df_exps_filt7b = convert_li_exp_to_df(li_exps_filt7b_w_res).sort_values(by=['accuracy'], ascending=False)
display(df_exps_filt7b.head(5).round(3))
print("\n")

# 13bn models
li_exps_filt13b = filter_exp(
    li_all_experiments,
    llm_names=['13b'],
    parse_style=['categories_perplexity'],
    prompt_style=['categorise'],
    finetuned=[False],
    unbias_categorisations=[False],
)
li_exps_filt13b_w_res = calc_eval_metrics(
    li_exps_filt13b,
    metrics=['accuracy','precision','recall','f1','calibration_error'],
    average_type='binary',
)
# Highest accuracy first
df_exps_filt13b = convert_li_exp_to_df(li_exps_filt13b_w_res).sort_values(by=['accuracy'], ascending=False)
display(df_exps_filt13b.head(5).round(3))
print("\n")


Unnamed: 0,llm_name,accuracy,f1,precision,recall,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
2,stabilityai/StableBeluga-7B,0.697,0.703,0.687,0.72,0.396,distribution,indirectly,False,categories_perplexity,categorise,False
1,stabilityai/StableBeluga-7B,0.688,0.649,0.736,0.58,0.401,distribution,arbitrary,False,categories_perplexity,categorise,False
0,stabilityai/StableBeluga-7B,0.685,0.661,0.711,0.618,0.388,distribution,directly,False,categories_perplexity,categorise,False






Unnamed: 0,llm_name,accuracy,f1,precision,recall,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
0,stabilityai/StableBeluga-13B,0.644,0.722,0.591,0.928,0.446,distribution,directly,False,categories_perplexity,categorise,False
1,stabilityai/StableBeluga-13B,0.596,0.704,0.554,0.966,0.451,distribution,arbitrary,False,categories_perplexity,categorise,False
2,stabilityai/StableBeluga-13B,0.558,0.688,0.53,0.981,0.457,distribution,indirectly,False,categories_perplexity,categorise,False






Unnamed: 0,llm_name,accuracy,f1,precision,recall,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
0,upstage/llama-30b-instruct-2048,0.721,0.698,0.757,0.647,0.318,distribution,directly,False,categories_perplexity,categorise,False


In [69]:
# Varying unbiased categorisations: effect_type is pinned to 'directly' and
# unbias_categorisations is left unfiltered, so both settings appear per model.

# 7bn models
li_exps_filt7b = filter_exp(
    li_all_experiments,
    llm_names=['7b'],
    parse_style=['categories_perplexity'],
    prompt_style=['categorise'],
    finetuned=[False],
    effect_type=['directly'],
)
li_exps_filt7b_w_res = calc_eval_metrics(
    li_exps_filt7b,
    metrics=['accuracy','precision','recall','f1', 'calibration_error'],
    average_type='binary',
)
# Highest accuracy first
df_exps_filt7b = convert_li_exp_to_df(li_exps_filt7b_w_res).sort_values(by=['accuracy'], ascending=False)
display(df_exps_filt7b.head(5).round(3))
print("\n")

# 13bn models
li_exps_filt13b = filter_exp(
    li_all_experiments,
    llm_names=['13b'],
    parse_style=['categories_perplexity'],
    prompt_style=['categorise'],
    finetuned=[False],
    effect_type=['directly'],
)
li_exps_filt13b_w_res = calc_eval_metrics(
    li_exps_filt13b,
    metrics=['accuracy','precision','recall','f1','calibration_error'],
    average_type='binary',
)
# Highest accuracy first
df_exps_filt13b = convert_li_exp_to_df(li_exps_filt13b_w_res).sort_values(by=['accuracy'], ascending=False)
display(df_exps_filt13b.head(5).round(3))
print("\n")

# 30bn and larger models (name contains '30b', '60b' or '70b')
li_exps_filt30b = filter_exp(
    li_all_experiments,
    llm_names=['30b','60b','70b'],
    parse_style=['categories_perplexity'],
    prompt_style=['categorise'],
    finetuned=[False],
    effect_type=['directly'],
)
li_exps_filt30b_w_res = calc_eval_metrics(
    li_exps_filt30b,
    metrics=['accuracy','precision','recall','f1', 'calibration_error'],
    average_type='binary',
)
# Highest accuracy first
df_exps_filt30b = convert_li_exp_to_df(li_exps_filt30b_w_res).sort_values(by=['accuracy'], ascending=False)
display(df_exps_filt30b.head(6).round(3))

Unnamed: 0,llm_name,accuracy,f1,precision,recall,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
1,stabilityai/StableBeluga-7B,0.685,0.661,0.711,0.618,0.388,distribution,directly,False,categories_perplexity,categorise,False
0,stabilityai/StableBeluga-7B,0.538,0.186,0.759,0.106,0.429,distribution,directly,False,categories_perplexity,categorise,True






Unnamed: 0,llm_name,accuracy,f1,precision,recall,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
1,stabilityai/StableBeluga-13B,0.644,0.722,0.591,0.928,0.446,distribution,directly,False,categories_perplexity,categorise,False
0,stabilityai/StableBeluga-13B,0.507,0.038,0.667,0.019,0.463,distribution,directly,False,categories_perplexity,categorise,True






Unnamed: 0,llm_name,accuracy,f1,precision,recall,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
0,upstage/llama-30b-instruct-2048,0.721,0.703,0.749,0.662,0.331,distribution,directly,False,categories_perplexity,categorise,True
1,upstage/llama-30b-instruct-2048,0.721,0.698,0.757,0.647,0.318,distribution,directly,False,categories_perplexity,categorise,False


In [72]:
# Varying finetuned: effect_type is pinned to 'indirectly' with no unbiasing;
# finetuned, parse_style and prompt_style are left unfiltered.

# 7bn models
li_exps_filt7b = filter_exp(
    li_all_experiments,
    llm_names=['7b'],
    effect_type=['indirectly'],
    unbias_categorisations=[False],
)
li_exps_filt7b_w_res = calc_eval_metrics(
    li_exps_filt7b,
    metrics=['accuracy','precision','recall','f1', 'calibration_error'],
    average_type='binary',
)
# Highest accuracy first
df_exps_filt7b = convert_li_exp_to_df(li_exps_filt7b_w_res).sort_values(by=['accuracy'], ascending=False)
display(df_exps_filt7b.head(5).round(3))
print("\n")

# 13bn models
li_exps_filt13b = filter_exp(
    li_all_experiments,
    llm_names=['13b'],
    effect_type=['indirectly'],
    unbias_categorisations=[False],
)
li_exps_filt13b_w_res = calc_eval_metrics(
    li_exps_filt13b,
    metrics=['accuracy','precision','recall','f1','calibration_error'],
    average_type='binary',
)
# Highest accuracy first
df_exps_filt13b = convert_li_exp_to_df(li_exps_filt13b_w_res).sort_values(by=['accuracy'], ascending=False)
display(df_exps_filt13b.head(5).round(3))
print("\n")


Unnamed: 0,llm_name,accuracy,f1,precision,recall,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
2,stabilityai/StableBeluga-7B,0.697,0.703,0.687,0.72,0.396,distribution,indirectly,False,categories_perplexity,categorise,False
6,stabilityai/StableBeluga-7B,0.695,0.705,0.679,0.734,0.41,distribution,indirectly,True,categories_perplexity,categorise,False
4,stabilityai/StableBeluga-7B,0.666,0.734,0.608,0.928,0.334,binary_weight,indirectly,True,rules,yes_no,False
0,stabilityai/StableBeluga-7B,0.661,0.734,0.602,0.942,0.339,binary_weight,indirectly,False,rules,yes_no,False
5,stabilityai/StableBeluga-7B,0.522,0.673,0.51,0.99,0.478,binary_weight,indirectly,True,categories_rules,open,False






Unnamed: 0,llm_name,accuracy,f1,precision,recall,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
0,stabilityai/StableBeluga-13B,0.654,0.732,0.595,0.952,0.346,binary_weight,indirectly,True,rules,yes_no,False
4,stabilityai/StableBeluga-13B,0.651,0.731,0.593,0.952,0.349,binary_weight,indirectly,False,rules,yes_no,False
2,stabilityai/StableBeluga-13B,0.562,0.692,0.533,0.986,0.458,distribution,indirectly,True,categories_perplexity,categorise,False
6,stabilityai/StableBeluga-13B,0.558,0.688,0.53,0.981,0.457,distribution,indirectly,False,categories_perplexity,categorise,False
1,stabilityai/StableBeluga-13B,0.51,0.669,0.504,0.995,0.49,binary_weight,indirectly,True,categories_rules,open,False






### 1) Accuracy, F1, Precision and Recall [ non-probabilistic / non-finetuned and finetuned with varied parse_style_prompt_style but effect type == 'directly' ]

In [99]:
# Section 1 table: deterministic metrics for the 7b runs with effect_type 'directly'.
# NOTE(review): the stored output below lists 30b models, so it appears to
# predate the current llm_names filter -- re-run before relying on it.
li_exps_filt1 = filter_exp(
    li_all_experiments,
    effect_type=['directly'],
    llm_names=['7b'],
)
li_exps_filt1_w_res = calc_eval_metrics(
    li_exps_filt1,
    metrics=['accuracy','precision','recall','f1','calibration_error'],
)
df_exps_filt1 = convert_li_exp_to_df(li_exps_filt1_w_res)
display(df_exps_filt1)

0      1.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
411    0.0
412    0.0
413    1.0
414    1.0
415    1.0
Name: related_prob, Length: 416, dtype: float64 


0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
411    1.0
412    1.0
413    1.0
414    1.0
415    1.0
Name: pred_prob, Length: 416, dtype: float64 


0      0.0
1      1.0
2      1.0
3      1.0
4      1.0
      ... 
411    1.0
412    1.0
413    0.0
414    0.0
415    0.0
Length: 416, dtype: float64 


[0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0.
 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0.
 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1.
 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0.
 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1.
 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0.

Unnamed: 0,llm_name,accuracy,f1,precision,recall,support,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
0,upstage/llama-30b-instruct-2048,0.75,[0.7668161434977578],[0.7154811715481172],[0.8260869565217391],[207],0.502404,binary_weight,directly,False,rules,yes_no,False
1,upstage/llama-30b-instruct-2048,0.721154,[0.7025641025641026],[0.7486338797814208],[0.6618357487922706],[207],0.552457,distribution,directly,False,categories_perplexity,categorise,True
2,upstage/llama-30b-instruct-2048,0.721154,[0.6979166666666666],[0.7570621468926554],[0.6473429951690821],[207],0.550707,distribution,directly,False,categories_perplexity,categorise,False
3,upstage/llama-30b-instruct-2048,0.742788,[0.7784679089026917],[0.6811594202898551],[0.9082125603864735],[207],0.502404,binary_weight,directly,False,categories_rules,open,False
4,upstage/llama-30b-instruct-2048,0.709135,[0.7603960396039603],[0.6442953020134228],[0.927536231884058],[207],0.504517,distribution,directly,False,categories_perplexity,cot_categorise,True
5,upstage/llama-30b-instruct-2048,0.701923,[0.752],[0.6416382252559727],[0.9082125603864735],[207],0.50205,distribution,directly,False,categories_perplexity,cot_categorise,False


### 2) [Probabilistic predictions]: AUC-ROC, Brier Score (of non-finetuned and finetuned models with varied parse_style_prompt_style but effect_type fixed to 'indirectly' )

In [100]:
# Section 2 table: probabilistic metrics for the 30b runs that output a distribution.
# NOTE(review): the section heading says effect_type is fixed to 'indirectly',
# but this filter selects 'directly' -- confirm which one is intended.
li_exps_filt2 = filter_exp(
    li_all_experiments,
    effect_type=['directly'],
    edge_value=['distribution'],
    llm_names=['30b'],
)
li_exps_filt2_w_res = calc_eval_metrics(
    li_exps_filt2,
    metrics=['accuracy','f1','roc_auc','brier_score', 'calibration_error'],
)
df_exps_filt2 = convert_li_exp_to_df(li_exps_filt2_w_res)
display(df_exps_filt2)

0      1.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
411    0.0
412    0.0
413    1.0
414    1.0
415    1.0
Name: related_prob, Length: 416, dtype: float64 


0      0.721
1      0.911
2      0.886
3      0.973
4      0.996
       ...  
411    0.985
412    0.936
413    0.529
414    0.828
415    0.624
Name: pred_prob, Length: 416, dtype: float64 


0      0.279
1      0.911
2      0.886
3      0.973
4      0.996
       ...  
411    0.985
412    0.936
413    0.471
414    0.172
415    0.376
Length: 416, dtype: float64 


[0.279 0.911 0.886 0.973 0.996 0.048 0.974 0.256 0.226 0.83  0.775 0.146
 0.185 0.966 0.187 0.277 0.117 0.444 0.047 0.201 0.568 0.806 0.988 0.16
 0.449 0.989 0.187 0.988 0.967 0.643 0.295 0.263 0.988 0.383 0.5   0.973
 0.118 0.984 0.982 0.546 0.946 0.911 0.127 0.978 0.504 0.255 0.283 0.225
 0.92  0.763 0.479 0.842 0.055 0.156 0.513 0.607 0.989 0.962 0.225 0.358
 0.665 0.158 0.968 0.309 0.309 0.486 0.982 0.962 0.399 0.05  0.989 0.278
 0.959 0.997 0.323 0.722 0

Unnamed: 0,llm_name,accuracy,f1,support,auc_roc,brier_score,calibration_error,edge_value,effect_type,finetuned,parse_style,prompt_style,uc
0,upstage/llama-30b-instruct-2048,0.721154,[0.7025641025641026],[207],0.27327,0.415431,0.552457,distribution,directly,False,categories_perplexity,categorise,True
1,upstage/llama-30b-instruct-2048,0.721154,[0.6979166666666666],[207],0.262141,0.439655,0.550707,distribution,directly,False,categories_perplexity,categorise,False
2,upstage/llama-30b-instruct-2048,0.709135,[0.7603960396039603],[207],0.674398,0.495406,0.504517,distribution,directly,False,categories_perplexity,cot_categorise,True
3,upstage/llama-30b-instruct-2048,0.701923,[0.752],[207],0.498058,0.498558,0.50205,distribution,directly,False,categories_perplexity,cot_categorise,False


### 3) Performance cross sectioned by the budget item category

In [165]:
# Section 3 table: metrics broken down per budget item, for all runs with
# effect_type 'directly'.
li_exps_filt3 = filter_exp(li_all_experiments, effect_type=['directly'])
li_exps_filt3_w_res = calc_eval_metrics(
    li_exps_filt3,
    metrics=['accuracy','f1','auc_roc','brier_score'],
    breakdown_by_budgetitem=True,
)
df_exps_filt3 = convert_li_exp_to_df(li_exps_filt3_w_res)
df_exps_filt3

Unnamed: 0,accuracy_Healthcare,f1_Healthcare,support_Healthcare,f1_Child Health,support_Child Health,f1_Social Care - Adults,support_Social Care - Adults,f1_Env & Reg,support_Env & Reg,f1_Social Care - Child,...,f1_Education services,support_Education services,f1_Public mental health,support_Public mental health,f1_Substance misuse - Drug misuse - adults & Substance misuse - Alcohol misuse - adults & Substance misuse - Preventing and reducing harm from drug misuse in adults & Substance misuse - Preventing and reducing harm from alcohol misuse in adults & Substance misuse - (drugs and alcohol) - youth services,support_Substance misuse - Drug misuse - adults & Substance misuse - Alcohol misuse - adults & Substance misuse - Preventing and reducing harm from drug misuse in adults & Substance misuse - Preventing and reducing harm from alcohol misuse in adults & Substance misuse - (drugs and alcohol) - youth services,f1_Planning and development services,support_Planning and development services,f1_Central services,support_Central services
0,0.7,0.8125,,0.85,,0.901786,,0.75,,0.961538,...,,,,,,,,,,
1,,0.790698,,0.804924,,0.916667,,0.875,,0.923077,...,,,,,,,,,,
2,,,,,,,,,,,...,0.75,,0.595238,,0.75,,0.875,,0.8,
3,,,,,,,,,,,...,0.722222,,0.651261,,0.875,,0.7,,0.75,
4,,,,,,,,,,,...,0.76087,,0.227273,,0.25,,0.8,,0.8,
5,,,,,,,,,,,...,0.815789,,0.65,,0.75,,0.75,,0.833333,
6,,,,,,,,,,,...,0.671429,,0.6875,,0.8,,0.8,,0.833333,
7,,,,,,,,,,,...,0.563158,,0.70979,,0.875,,1.0,,0.25,
8,,,,,,,,,,,...,0.6875,,0.633333,,0.75,,0.875,,0.8,
9,,,,,,,,,,,...,0.5,,0.40625,,0.75,,0.75,,0.416667,


### 4) Performance cross sectioned by the budget item pre and post finetuning - in relation to the dataset distribution, e.g. there are more articles on 'health' than 'education', so we expect a larger performance increase on 'health' than on 'education'. Also the possible effects of this on the downstream model, e.g. are effects with more academic research better modelled? If there are correlated factors, we will essentially learn the better-studied factors more

### 5) Performance change in top performing models when effect type is rotated between effect types

### 6) Table showing performance change w/ and w/o the unbiased-predictions flag (`unbias_categorisations`) for top models