# Analyses for the SPOT Alignment Experiments

In [1]:
# Imports
import pandas as pd
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
import glob
import re
import yaml
dir_expirements = '../output/spot/exp_runs'
dir_figures = '../output/spot/figures'
dir_tables = '../output/spot/tables'

#### Factors to Compare Models on
- All models and variants comparison
- Comparing effect of 'parse_sytle' and 'prompt_style' : ['yes_no','open','categorise','cot_categorise']  and ['rules','categories_rules', 'categories_perplexity']
- Effect Type : ['indirectly','directly','arbitrary']
- 'model parameter count' : judged by model name
- finetuned : [True, False]
- unbias predictions

#### TODOs Diagrams/Tables to Produce
- 1) Tables: Accuracy, F1, Precision and Recall (of non-finetuned and finetuned models with varied parse_style_prompt_style but effect_type fixed to 'directly')
- 2) [Probabilistic predictions]: AUC-ROC, Brier Score (of non-finetuned and finetuned models with varied parse_style_prompt_style but effect_type fixed to 'indirectly' )
- 3) Performance cross sectioned by the budget item category
- 4) Performance cross sectioned by the budget item pre and post finetuning - In relation to dataset distribution e.g. more articles on 'health' than 'education' so we expect better performance increase on 'health' than 'education'. And the possible effects on downstream model from this e.g. are effects with more academic research better modelled. Or if there are correllated factors, we will essentially learn better studied factors more
- 5) Performance change in top performing models when effect tyle is rotated between effect types
- 6) Table showing Performance change w/ and w/o using -unbiase predictions flag for top models


### Helper Functions

In [2]:
import json
import pandas as pd
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# import roc curve printing, roc_auc_score, brier_score_loss
from sklearn.metrics import roc_curve, auc, roc_auc_score, brier_score_loss
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
import matplotlib.pyplot as plt
import operator
import copy

def load_data() -> list[dict]:
    """Experiments are stored as a csv file in the following format: budget_item,indicator,label,pred_aggregated,prompts,predictions,discourse
        Each experiment also contains a config file

        Returns a list of dictionaries, where 

        Input format of csv file containing predictions: budget_item,indicator,label,pred_aggregated,prompts,predictions,discourse
       """
    
    experiment_paths = sorted(glob.glob(dir_expirements+'/*'))
    li_exp = []
    for path in experiment_paths:
        path_config = os.path.join(path,'config.yaml')
        path_results = os.path.join(path,'predictions_b2i.csv')

        # Load config file
        with open(path_config) as f:
            config   =  yaml.safe_load(f)
              
        # Load results file
        df = pd.read_csv(path_results)

        li_exp.append( {'config':config, 'results':df} )
    
    return li_exp

def filter_exp( li_exp, **filter_kwargs ) -> list[dict]:

    li_exp_filtered = []

    # first filter logic based on checking if any item in  the filter_kwarg is a substring of the config value
    for exp in li_exp:
        if 'llm_names' in filter_kwargs:
            if any( llm_name in exp['config']['llm_name'].lower() for llm_name in filter_kwargs['llm_names'] ):
                li_exp_filtered.append(exp)
        else:
            li_exp_filtered.append(exp)
    
    filter_kwargs.pop('llm_names', None)

    # second filter logic based on checking if any item in list is in the config    
    for idx in range(len(li_exp_filtered)-1, -1, -1):
        exp = li_exp_filtered[idx]
        if any( ( (exp['config'].get(filter_name) not in filter_values) for filter_name, filter_values in filter_kwargs.items() ) ):
            # li_exp_filtered.append(exp)
            li_exp_filtered.pop(idx)

    return li_exp_filtered

def calc_eval_metrics( li_exp, metrics:list = ['accuracy','precision','recall','f1','auc_roc','brier_score'], breakdown_by_budgetitem:bool=False ) -> dict:
    """
        kwargs must be: keys,values = config argument, list of values to filter on

        Filters experiments based on kwargs, then calculates metrics for each experiment
    """

    li_exp = copy.deepcopy(li_exp)

    # Calculate metrics for each experiment
    tgt_col = 'related'
    pred_col_dict = 'pred_aggregated'
    pred_col_label = 'pred_label' # For deterministic evaluation
    pred_col_prob = 'pred_prob'
    
    ## Making pred_label column
    for exp in li_exp:
        # print(exp['results'].columns)
        exp['results'][pred_col_label] = exp['results'][pred_col_dict].apply(lambda dict_: max(eval(dict_).items(), key=operator.itemgetter(1))[0])
    
    if 'auc_roc' in metrics or 'brier_score' in metrics:
        # Making pred_prob column e.g. Prob of predicting Yes
        for exp in li_exp:
            exp['results'][pred_col_prob] = exp['results'][pred_col_dict].apply(lambda dict_: max(eval(dict_).items(), key=operator.itemgetter(1))[1])

    if breakdown_by_budgetitem is False:
        ## Accuracy
        if 'accuracy' in metrics:
            for exp in li_exp:
                # Calculate accuracy
                accuracy = accuracy_score(exp['results'][tgt_col], exp['results'][pred_col_label])
                if 'metrics' in exp:
                    exp['metrics'].update({'accuracy':accuracy})
                else:
                    exp['metrics'] = {'accuracy':accuracy}
            
        ## Precision, recall, f1
        fpr_metrics = [metric for metric in metrics if metric in ['precision','recall','f1']]
        if len(fpr_metrics) > 0:
            for exp in li_exp:
                # Calculate li_exp, recall, f1
                precision, recall, f1, support = precision_recall_fscore_support(exp['results'][tgt_col],
                                                                        exp['results'][pred_col_label],
                                                                        labels=['Yes','No'],
                                                                        #    average = None,
                                                                            # pos_label = 'Yes',
                                                                            average='macro',
                                                                            #  average='micro'
                                                                            zero_division=0
                                                                            )
                
                metrics = {k:v for k,v in zip(['f1', 'precision','recall'], [precision, recall, f1]) if k in fpr_metrics}
                metrics['support'] = support
                if 'metrics' in exp:
                    exp['metrics'].update(metrics)
                else:
                    exp['metrics'] = metrics
        
        ## AUC_ROC
        if 'auc_roc' in metrics:
            for exp in li_exp:
                # Calculate auc_roc
                auc_roc = roc_auc_score(exp['results'][tgt_col], exp['results'][pred_col_prob], labels=['Yes','No'])
                if 'metrics' in exp:
                    exp['metrics'].update({'auc_roc':auc_roc})
                else:
                    exp['metrics'] = {'auc_roc':auc_roc}
        
        ## Brier Score
        if 'brier_score' in metrics:
            for exp in li_exp:
                # Calculate brier_score
                brier_score = brier_score_loss(exp['results'][tgt_col], exp['results'][pred_col_prob], pos_label='Yes')
                if 'metrics' in exp:
                    exp['metrics'].update({'brier_score':brier_score})
                else:
                    exp['metrics'] = {'brier_score':brier_score}
    
    else:
        for exp in li_exp:
            exp['metrics'] = {}
            for budget_item in exp['results']['budget_item'].unique():
                df = exp['results'][exp['results']['budget_item']==budget_item]
                ## Accuracy
                if 'accuracy' in metrics:
                    # Calculate accuracy
                    accuracy = accuracy_score(df[tgt_col], df[pred_col_label])
                    exp['metrics'].update({f'accuracy_{budget_item}':accuracy})
                
                ## Precision, recall, f1
                fpr_metrics = [metric for metric in metrics if metric in ['precision','recall','f1']]
                if len(fpr_metrics) > 0:
                    # Calculate li_exp, recall, f1
                    precision, recall, f1, support = precision_recall_fscore_support(df[tgt_col],
                                                                            df[pred_col_label],
                                                                            labels=['Yes','No'],
                                                                                average='macro',
                                                                                zero_division=1
                                                                                )
                    
                    metrics = {k:v for k,v in zip(['f1', 'precision','recall'], [precision, recall, f1]) if k in fpr_metrics}
                    metrics['support'] = support
                    exp['metrics'].update({f'{k}_{budget_item}':v for k,v in metrics.items()})
            
                ## AUC_ROC
                if 'auc_roc' in metrics:
                    # Calculate auc_roc
                    auc_roc = roc_auc_score(df[tgt_col], df[pred_col_prob], labels=['Yes','No'])
                    exp['metrics'].update({f'auc_roc_{budget_item}':auc_roc})
                
                ## Brier Score
                if 'brier_score' in metrics:
                    # Calculate brier_score
                    brier_score = brier_score_loss(df[tgt_col], df[pred_col_prob], pos_label='Yes')
                    exp['metrics'].update({f'brier_score_{budget_item}':brier_score})


    return li_exp

def convert_li_exp_to_df(li_exp: list) -> pd.DataFrame:
    """Converts list of experiments to a dataframe"""
    li_exp_metrics = []
    for exp in li_exp:
        exp_metrics = exp['metrics']
        
        llm_name = exp['config']['llm_name']
        edge_value = exp['config']['edge_value']
        effect_type = exp['config']['effect_type']
        finetuned = exp['config']['finetuned']
        parse_style = exp['config']['parse_style']
        prompt_style = exp['config']['prompt_style']
        exp_metrics.update({'llm_name':llm_name,
                            'edge_value':edge_value,
                            'effect_type':effect_type,
                            'finetuned':finetuned,
                            'parse_style':parse_style,
                            'prompt_style':prompt_style})
        
        li_exp_metrics.append(exp_metrics)

    df = pd.DataFrame(li_exp_metrics)

    # Put the llm_name column first
    cols = df.columns.tolist()
    cols.insert(0, cols.pop(cols.index('llm_name')))
    df = df.reindex(columns=cols)

    return df

def create_diagrams_from_dataframe(df_exp: pd.DataFrame,
                                   columns_to_create_diagrams_for=['accuracy','auc_roc','brier_score','precision','recall','f1'],
                                   save_dir='./prompt_engineering/analysis/spot_output',
                                   exp_name='CompareAll') -> dict:
    
    # Create directory if it doesn't exist
    save_dir = os.path.join(save_dir, exp_name)
    os.makedirs(save_dir, exist_ok=True)
    
    # Dictionary to store the paths of the saved diagrams
    saved_diagrams = {}
    
    # Iterate over each column to create a diagram
    for column in columns_to_create_diagrams_for:
        if column in df_exp.columns:
            plt.figure(figsize=(10, 6))
            df_exp[column].hist(bins=20)
            plt.title(f'Histogram of {column}')
            plt.xlabel(column)
            plt.ylabel('Frequency')
            
            # Save the diagram
            file_path = os.path.join(save_dir, f'{column}_histogram.png')
            plt.savefig(file_path)
            
            # Store the path in the dictionary
            saved_diagrams[column] = file_path
            
            # Close the plot to free up memory
            plt.close()
            
    return saved_diagrams


In [3]:
# Comparing the results of the spot experiments
li_all_experiments = load_data( )

## Analyses

### 0) Determining the best model for each parameter size

In [7]:
#Best 7bn Model
li_exps_filt7b = filter_exp(li_all_experiments, effect_type=['directly'], llm_names=['7b'] )
li_exps_filt7b_w_res = calc_eval_metrics(li_exps_filt7b, metrics=['accuracy','precision','recall','f1'] )
df_exps_filt7b = convert_li_exp_to_df(li_exps_filt7b_w_res)
# Sort by accuracy
df_exps_filt7b.sort_values(by=['accuracy'], ascending=False, inplace=True)
print(df_exps_filt7b.head(4), "\n\n\n")

#Best 13bn Model
li_exps_filt13b = filter_exp(li_all_experiments, effect_type=['directly'], llm_names=['13b'] )
li_exps_filt13b_w_res = calc_eval_metrics(li_exps_filt13b, metrics=['accuracy','precision','recall','f1'] )
df_exps_filt13b = convert_li_exp_to_df(li_exps_filt13b_w_res)
# Sort by accuracy
df_exps_filt13b.sort_values(by=['accuracy'], ascending=False, inplace=True)
print(df_exps_filt13b.head(6), "\n\n\n" )

#Best 30bn Model
li_exps_filt30b = filter_exp(li_all_experiments, effect_type=['directly'], llm_names=['30b','60b','70b'] )
li_exps_filt30b_w_res = calc_eval_metrics(li_exps_filt30b, metrics=['accuracy','precision','recall','f1'] )
df_exps_filt30b = convert_li_exp_to_df(li_exps_filt30b_w_res)
# Sort by accuracy
df_exps_filt30b.sort_values(by=['accuracy'], ascending=False, inplace=True)
print(df_exps_filt30b.head(4), "\n\n\n")

                       llm_name  accuracy        f1  precision    recall  \
6   stabilityai/StableBeluga-7B  0.692308  0.699033   0.692740  0.689986   
11  stabilityai/StableBeluga-7B  0.687500  0.689552   0.687239  0.686456   
0   stabilityai/StableBeluga-7B  0.685096  0.694299   0.685609  0.681695   
4   stabilityai/StableBeluga-7B  0.685096  0.688183   0.684777  0.683558   

   support     edge_value effect_type  finetuned            parse_style  \
6     None  binary_weight    directly       True                  rules   
11    None   distribution    directly       True  categories_perplexity   
0     None  binary_weight    directly      False                  rules   
4     None   distribution    directly      False  categories_perplexity   

   prompt_style  
6        yes_no  
11   categorise  
0        yes_no  
4    categorise   



                        llm_name  accuracy        f1  precision    recall  \
0   stabilityai/StableBeluga-13B  0.723558  0.728661   0.723910  0.72220

In [10]:
li_exps_filt7b

[]

### 1) Accuracy, F1, Precision and Recall [ non-probabilistic / non-finetuned and finetuned with varied parse_style_prompt_style but effect type == 'directly' ]

In [154]:
li_exps_filt1 = filter_exp(li_all_experiments, effect_type=['directly'] )
li_exps_filt1_w_res = calc_eval_metrics(li_exps_filt1, metrics=['accuracy','precision','recall','f1'] )
df_exps_filt1 = convert_li_exp_to_df(li_exps_filt1_w_res)


### 2) [Probabilistic predictions]: AUC-ROC, Brier Score (of non-finetuned and finetuned models with varied parse_style_prompt_style but effect_type fixed to 'indirectly' )

In [162]:
li_exps_filt2 = filter_exp(li_all_experiments, effect_type=['directly'], edge_value=['distribution'] )
li_exps_filt2_w_res = calc_eval_metrics(li_exps_filt2, metrics=['accuracy','f1','auc_roc','brier_score'] )
df_exps_filt2 = convert_li_exp_to_df(li_exps_filt2_w_res)
df_exps_filt2

Unnamed: 0,accuracy,f1,support
0,0.665865,0.699348,
1,0.584135,0.710598,
2,0.675481,0.681994,
3,0.550481,0.577744,
4,0.709135,0.726176,
5,0.661058,0.688462,
6,0.543269,0.717336,
7,0.533654,0.588595,


### 3) Performance cross sectioned by the budget item category

In [165]:
li_exps_filt3 = filter_exp(li_all_experiments, effect_type=['directly'] )
li_exps_filt3_w_res = calc_eval_metrics(li_exps_filt3, metrics=['accuracy','f1','auc_roc','brier_score'], breakdown_by_budgetitem=True )
df_exps_filt3 = convert_li_exp_to_df(li_exps_filt3_w_res)
df_exps_filt3

Unnamed: 0,accuracy_Healthcare,f1_Healthcare,support_Healthcare,f1_Child Health,support_Child Health,f1_Social Care - Adults,support_Social Care - Adults,f1_Env & Reg,support_Env & Reg,f1_Social Care - Child,...,f1_Education services,support_Education services,f1_Public mental health,support_Public mental health,f1_Substance misuse - Drug misuse - adults & Substance misuse - Alcohol misuse - adults & Substance misuse - Preventing and reducing harm from drug misuse in adults & Substance misuse - Preventing and reducing harm from alcohol misuse in adults & Substance misuse - (drugs and alcohol) - youth services,support_Substance misuse - Drug misuse - adults & Substance misuse - Alcohol misuse - adults & Substance misuse - Preventing and reducing harm from drug misuse in adults & Substance misuse - Preventing and reducing harm from alcohol misuse in adults & Substance misuse - (drugs and alcohol) - youth services,f1_Planning and development services,support_Planning and development services,f1_Central services,support_Central services
0,0.7,0.8125,,0.85,,0.901786,,0.75,,0.961538,...,,,,,,,,,,
1,,0.790698,,0.804924,,0.916667,,0.875,,0.923077,...,,,,,,,,,,
2,,,,,,,,,,,...,0.75,,0.595238,,0.75,,0.875,,0.8,
3,,,,,,,,,,,...,0.722222,,0.651261,,0.875,,0.7,,0.75,
4,,,,,,,,,,,...,0.76087,,0.227273,,0.25,,0.8,,0.8,
5,,,,,,,,,,,...,0.815789,,0.65,,0.75,,0.75,,0.833333,
6,,,,,,,,,,,...,0.671429,,0.6875,,0.8,,0.8,,0.833333,
7,,,,,,,,,,,...,0.563158,,0.70979,,0.875,,1.0,,0.25,
8,,,,,,,,,,,...,0.6875,,0.633333,,0.75,,0.875,,0.8,
9,,,,,,,,,,,...,0.5,,0.40625,,0.75,,0.75,,0.416667,


### 4) Performance cross sectioned by the budget item pre and post finetuning - In relation to dataset distribution e.g. more articles on 'health' than 'education' so we expect better performance increase on 'health' than 'education'. And the possible effects on downstream model from this e.g. are effects with more academic research better modelled. Or if there are correllated factors, we will essentially learn better studied factors more

### 5) Performance change in top performing models when effect tyle is rotated between effect types

### 6) Table showing Performance change w/ and w/o using -unbiase predictions flag for top models