# Analyses for the SPOT Alignment Experiments

In [1]:
# Imports
import pandas as pd
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
import glob
import re
import yaml

### Helper Functions

In [2]:
import json
import pandas as pd
import os
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# import roc curve printing, roc_auc_score, brier_score_loss
from sklearn.metrics import roc_curve, auc, roc_auc_score, brier_score_loss
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
import matplotlib.pyplot as plt
import operator
import copy
import numpy as np

def load_data(dir_expirements) -> list[dict]:
    """Load every experiment stored under ``dir_expirements``.

    Each immediate sub-directory is treated as one experiment and is expected
    to contain:

    * ``config.yaml``         -- the experiment configuration
    * ``predictions_b2i.csv`` -- the predictions; per the original notes its
      columns are ``budget_item,indicator,label,pred_aggregated,prompts,
      predictions,discourse``

    Args:
        dir_expirements: Path of the directory holding one sub-directory per
            experiment.

    Returns:
        One dict per experiment, ordered by sorted path, of the form
        ``{'config': <parsed yaml>, 'results': <pandas.DataFrame>}``.

    Raises:
        FileNotFoundError: If an experiment directory lacks either file.
    """
    li_exp = []
    for path in sorted(glob.glob(os.path.join(dir_expirements, '*'))):
        # Stray files next to the experiment folders (README, .DS_Store, ...)
        # are not experiments; previously they crashed the open() below.
        if not os.path.isdir(path):
            continue

        path_config = os.path.join(path, 'config.yaml')
        path_results = os.path.join(path, 'predictions_b2i.csv')

        # Load config file
        with open(path_config) as f:
            config = yaml.safe_load(f)

        # Load results file
        df = pd.read_csv(path_results)

        li_exp.append({'config': config, 'results': df})

    return li_exp

def filter_exp( li_exp: list[dict], **filter_kwargs ) -> list[dict]:
    """Return the experiments whose config satisfies every given filter.

    Two kinds of filter are supported:

    * ``llm_names``: an iterable of name fragments. An experiment passes when
      at least one fragment is a substring of ``config['llm_name']``. The
      comparison is case-insensitive on both sides.
    * any other keyword ``name=[v1, v2, ...]``: an experiment passes when
      ``config.get(name)`` is one of the listed values. A config that lacks
      the key yields ``None`` and therefore only passes if ``None`` is listed.

    Args:
        li_exp: Experiments as dicts holding at least a ``'config'`` dict.
        **filter_kwargs: The filters described above. With no filters every
            experiment is kept.

    Returns:
        A new list (input order preserved) containing the same experiment
        objects; the experiments themselves are not copied.
    """
    llm_names = filter_kwargs.pop('llm_names', None)
    # The config name is lower-cased before matching, so the fragments must be
    # lower-cased too; otherwise a fragment such as 'Llama' can never match.
    needles = None if llm_names is None else [str(name).lower() for name in llm_names]

    def _keep(exp: dict) -> bool:
        config = exp['config']
        if needles is not None:
            model_name = config['llm_name'].lower()
            if not any(needle in model_name for needle in needles):
                return False
        return all(config.get(filter_name) in filter_values
                   for filter_name, filter_values in filter_kwargs.items())

    return [exp for exp in li_exp if _keep(exp)]

def calc_eval_metrics( li_exp, metrics:list = ['accuracy','precision','recall','f1','roc_auc','brier_score','mae'], breakdown_by_budgetitem:bool=False, average_type=None ) -> list[dict]:
    """Compute evaluation metrics for every experiment in ``li_exp``.

    The input is deep-copied, so the caller's experiments are left untouched.
    Each experiment's ``'results'`` frame must provide:

    * ``related``         -- gold label, compared against ``'Yes'`` / ``'No'``
    * ``pred_aggregated`` -- string form of a dict mapping label -> score; the
      probability metrics read its ``'Yes'`` and ``'No'`` entries

    Columns added to each copied frame:

    * ``pred_label`` -- the label with the highest score
    * ``pred_yes_prob``, ``pred_no_prob``, ``related_yes_prob``,
      ``related_no_prob`` -- only when a probability-based metric is requested

    Args:
        li_exp: Experiments as dicts with a ``'results'`` DataFrame.
        metrics: Names of the metrics to compute. Supported: ``accuracy``,
            ``precision``, ``recall``, ``f1``, ``roc_auc``, ``brier_score``,
            ``mae``. Unknown names are ignored. The default list is never
            mutated here.
        breakdown_by_budgetitem: Must be ``False``; a per-budget-item
            breakdown is not implemented.
        average_type: Forwarded as ``average`` to
            ``precision_recall_fscore_support``. With ``None`` the
            precision / recall / f1 values are length-1 arrays (one entry for
            the ``'Yes'`` label) rather than scalars.

    Returns:
        The copied experiments, each with a ``'metrics'`` dict holding the
        requested values (merged into an existing ``'metrics'`` dict if any).

    Raises:
        NotImplementedError: If ``breakdown_by_budgetitem`` is not ``False``.
    """
    if breakdown_by_budgetitem is not False:
        raise NotImplementedError('breakdown_by_budgetitem=True not implemented yet')

    li_exp = copy.deepcopy(li_exp)

    tgt_col = 'related'
    pred_col_dict = 'pred_aggregated'
    pred_col_label = 'pred_label' # For deterministic evaluation

    # Metrics that need the per-row 'Yes'/'No' probabilities.
    prob_metrics = ('roc_auc', 'brier_score', 'calibration_error', 'mae')
    needs_probs = any(metric in prob_metrics for metric in metrics)
    fpr_metrics = [metric for metric in metrics if metric in ('precision', 'recall', 'f1')]

    for exp in li_exp:
        results = exp['results']
        exp_metrics = exp.setdefault('metrics', {})

        # NOTE(security): eval() executes whatever text is stored in the CSV.
        # Only run this on prediction files you produced yourself. Each row is
        # parsed once and reused below.
        parsed = [eval(dict_) for dict_ in results[pred_col_dict]]

        ## Making pred_label column: the label with the highest score
        results[pred_col_label] = [max(scores.items(), key=operator.itemgetter(1))[0] for scores in parsed]

        if needs_probs:
            # Probability assigned to each label, plus the gold label encoded
            # as a 0/1 target for the same label.
            results['pred_yes_prob'] = [scores['Yes'] for scores in parsed]
            results['pred_no_prob'] = [scores['No'] for scores in parsed]
            results[tgt_col+'_yes_prob'] = results[tgt_col].map(lambda x: 1.0 if x=='Yes' else 0.0)
            results[tgt_col+'_no_prob'] = results[tgt_col].map(lambda x: 1.0 if x=='No' else 0.0)

        ## Accuracy
        if 'accuracy' in metrics:
            exp_metrics['accuracy'] = accuracy_score(results[tgt_col], results[pred_col_label])

        ## Precision, recall, f1 (restricted to the 'Yes' label)
        if fpr_metrics:
            precision, recall, f1, _support = precision_recall_fscore_support(
                results[tgt_col],
                results[pred_col_label],
                labels=['Yes'],
                pos_label='Yes' if average_type == 'binary' else 1,
                average=average_type,
                zero_division=np.nan,
            )
            exp_metrics.update({k: v for k, v in zip(['f1', 'precision', 'recall'], [f1, precision, recall]) if k in fpr_metrics})

        ## ROC_AUC of the predicted 'Yes' probability
        if 'roc_auc' in metrics:
            exp_metrics['roc_auc'] = roc_auc_score(results[tgt_col+'_yes_prob'], results['pred_yes_prob'])

        ## Brier score: mean squared error of the predicted 'Yes' probability
        if 'brier_score' in metrics:
            exp_metrics['brier_score'] = np.mean((results[tgt_col+'_yes_prob'] - results['pred_yes_prob']) ** 2)

        ## MAE: mean absolute error of the predicted 'Yes' probability
        if 'mae' in metrics:
            exp_metrics['mae'] = np.mean(np.abs(results[tgt_col+'_yes_prob'] - results['pred_yes_prob']))

    return li_exp

def convert_li_exp_to_df(li_exp: list) -> pd.DataFrame:
    """Flatten a list of evaluated experiments into one row per experiment.

    Each row holds the experiment's ``'metrics'`` entries followed by the
    config fields ``llm_name``, ``edge_value``, ``effect_type``, ``finetuned``,
    ``parse_style`` and ``prompt_style``, plus ``uc`` (the config's
    ``unbias_categorisations`` value, ``False`` when absent). The ``llm_name``
    column is moved to the front.

    Args:
        li_exp: Experiments as dicts with ``'config'`` and ``'metrics'`` dicts.
            The input dicts are not modified.

    Returns:
        The combined DataFrame; an empty DataFrame when ``li_exp`` is empty.

    Raises:
        KeyError: If an experiment lacks ``'metrics'`` or one of the required
            config fields.
    """
    config_fields = ('llm_name', 'edge_value', 'effect_type', 'finetuned',
                     'parse_style', 'prompt_style')

    rows = []
    for exp in li_exp:
        config = exp['config']
        # Copy first: updating exp['metrics'] in place would leak the config
        # fields into the caller's metrics dict.
        row = dict(exp['metrics'])
        row.update({field: config[field] for field in config_fields})
        row['uc'] = config.get('unbias_categorisations', False)
        rows.append(row)

    df = pd.DataFrame(rows)

    # Nothing to reorder when there are no experiments.
    if 'llm_name' not in df.columns:
        return df

    # Put the llm_name column first
    cols = ['llm_name'] + [col for col in df.columns if col != 'llm_name']
    return df.reindex(columns=cols)

def create_diagrams_from_dataframe(df_exp: pd.DataFrame,
                                   columns_to_create_diagrams_for=['accuracy','roc_auc','brier_score','precision','recall','f1'],
                                   save_dir='./prompt_engineering/analysis/spot_output',
                                   exp_name='CompareAll') -> dict:
    """Save one 20-bin histogram PNG per requested column of ``df_exp``.

    The images are written to ``<save_dir>/<exp_name>/<column>_histogram.png``;
    that directory is created when missing. Requested columns that are not
    present in ``df_exp`` are skipped.

    Returns:
        Mapping of column name -> path of the PNG written for it.
    """
    output_dir = os.path.join(save_dir, exp_name)
    os.makedirs(output_dir, exist_ok=True)

    diagram_paths = {}
    for metric_name in columns_to_create_diagrams_for:
        if metric_name not in df_exp.columns:
            continue

        # Draw the histogram on a fresh figure.
        plt.figure(figsize=(10, 6))
        df_exp[metric_name].hist(bins=20)
        plt.title(f'Histogram of {metric_name}')
        plt.xlabel(metric_name)
        plt.ylabel('Frequency')

        # Write it to disk, remember where, then release the figure.
        target_path = os.path.join(output_dir, f'{metric_name}_histogram.png')
        plt.savefig(target_path)
        diagram_paths[metric_name] = target_path
        plt.close()

    return diagram_paths

## Analyses

In [75]:
# Load every experiment of the "extensive" SPOT sweep so the results can be compared.
dir_exp_extensive = "../../prompt_engineering/output/spot/extensive"
li_all_experiments_extensive = load_data(dir_exp_extensive)

### Ablation - Model Size

In [None]:
# Model-size ablation: load the runs, score them, and show one row per run.
dir_exp_ablation_size = '../../prompt_engineering/output/spot/ablation_size'
li_all_experiments_ablation_size = load_data(dir_exp_ablation_size)
li_exps_ablation_size = calc_eval_metrics(
    li_all_experiments_ablation_size,
    metrics=['f1','mae','roc_auc'],
)
df_exps_ablation_size = convert_li_exp_to_df(li_exps_ablation_size)
display(df_exps_ablation_size)

### Prompt Style CPUQ - With or without Question & Reasoning




In [None]:
# Reasoning-step ablation: load the runs, score them, and show one row per run.
# NOTE(review): the folder name below is spelled "abaltion" -- confirm it
# matches the directory on disk before correcting it.
dir_ablation_reasoning_step = "../../prompt_engineering/output/spot/abaltion_reasoning_step"
li_all_experiments_ablation_reasoning_step = load_data(dir_ablation_reasoning_step)

# No filters are passed, so every loaded experiment is kept.
li_exps_ablation_reasoning_step = filter_exp(li_all_experiments_ablation_reasoning_step)

li_exps_ablation_reasoning_step_w_res = calc_eval_metrics(
    li_exps_ablation_reasoning_step,
    metrics=['f1','mae','roc_auc'],
)

df_exps_ablation_reasoning_step = convert_li_exp_to_df(li_exps_ablation_reasoning_step_w_res)
display(df_exps_ablation_reasoning_step)

### Performance Metrics for binary vs prob


In [None]:
# Binary-vs-probability ablation: load the runs, score them, and show one row per run.
dir_ablation_binary_vs_prob = "../../prompt_engineering/output/spot/ablation_binary_vs_prob"
li_all_experiments_ablation_binary_vs_prob = load_data(dir_ablation_binary_vs_prob)

li_exps_ablation_binary_vs_prob_w_res = calc_eval_metrics(
    li_all_experiments_ablation_binary_vs_prob,
    metrics=['f1','mae','roc_auc'],
)

df_exps_ablation_binary_vs_prob = convert_li_exp_to_df(li_exps_ablation_binary_vs_prob_w_res)
display(df_exps_ablation_binary_vs_prob)


## Ablation naive prompting

In [None]:
# Naive-prompting ablation: reuse the "extensive" runs, keeping only those
# whose config has prompt_style == 'yes_no'.
dir_ablation_naive_prompting = "../../prompt_engineering/output/spot/extensive"
li_all_experiments_ablation_naive_prompting = load_data(dir_ablation_naive_prompting)
li_exps_ablation_naive_prompting = filter_exp(
    li_all_experiments_ablation_naive_prompting,
    prompt_style=['yes_no'],
)

li_exps_ablation_naive_prompting_w_res = calc_eval_metrics(
    li_exps_ablation_naive_prompting,
    metrics=['f1','mae','roc_auc'],
)

df_exps_ablation_naive_prompting = convert_li_exp_to_df(li_exps_ablation_naive_prompting_w_res)
display(df_exps_ablation_naive_prompting)

# Varying Unbiased Categorisations


In [None]:
# Unbiased-categorisation (UC) ablation, three metrics: for each model size,
# plot how much f1 / mae / roc_auc change when UC is switched on.
# NOTE(review): 'calibration_error' is requested below but does not appear to
# be produced by the implemented path of calc_eval_metrics -- confirm.
metrics= ['f1','mae','roc_auc','accuracy','precision','recall','calibration_error']

dir_ablation_uc = "../../prompt_engineering/output/spot/ablation_uc"
li_all_experiments_ablation_uc = load_data(dir_ablation_uc)
li_exps_ablation_uc = filter_exp(li_all_experiments_ablation_uc )

li_exps_ablation_uc_w_res = calc_eval_metrics(li_exps_ablation_uc, metrics=metrics )

df_exps_ablation_uc = convert_li_exp_to_df(li_exps_ablation_uc_w_res)
display(df_exps_ablation_uc)


# 3. Filtering the data and calculating the performance differences
df = df_exps_ablation_uc

def _extract_model_size(llm_name):
    """Turn the '<digits>B' tag in a model name into '<digits>bn'; None when the name has no such tag."""
    match = re.search(r'\d+B', llm_name)
    return match.group(0).replace('B', 'bn') if match else None

df['model_size'] = df['llm_name'].apply(_extract_model_size)

model_sizes = ['3bn', '8bn','70bn']
filtered_df = df[(df['uc'].isin([False,True])) &
                 (df['model_size'].isin(model_sizes))]

def _first_scalar(value):
    """Unwrap metrics stored as length-1 arrays (e.g. f1); plain scalars pass through."""
    return np.ravel(value)[0]

# One row per (model size, metric) holding UC value minus non-UC value.
# The rows are collected in a list because concatenating onto an empty
# DataFrame is deprecated in recent pandas.
diff_rows = []
for model in model_sizes:
    is_model = filtered_df['model_size'] == model
    non_uc_rows = filtered_df[is_model & (filtered_df['uc'] == False)]
    uc_rows = filtered_df[is_model & (filtered_df['uc'] == True)]

    for metric in ['f1', 'mae', 'roc_auc']:
        non_uc_value = _first_scalar(non_uc_rows[metric].values[0])
        uc_value = _first_scalar(uc_rows[metric].values[0])
        diff_rows.append({"model_size": model, "metric": metric, "uc": uc_value - non_uc_value})

diff_df = pd.DataFrame(diff_rows, columns=["model_size", "metric", "uc"])


# 4. Plotting the differences in a bar chart with reduced gap
barWidth = 0.7
gap = 0.7
r1 = np.arange(0, 2.8*len(diff_df[diff_df['metric'] == 'f1']) - gap, 2.8)
r2 = [x + barWidth for x in r1]
r3 = [x + 2*barWidth for x in r1]


fig, ax = plt.subplots(figsize=(7, 5))
bars1 = ax.bar(r1, diff_df[diff_df['metric'] == 'f1']['uc'].values  , width=barWidth, label='f1', color='b')
bars2 = ax.bar(r2, diff_df[diff_df['metric'] == 'mae']['uc'].values, width=barWidth, label='mae', color='r')
bars3 = ax.bar(r3, diff_df[diff_df['metric'] == 'roc_auc']['uc'].values, width=barWidth, label='roc_auc', color='c')

def label_bars(bars):
    """Write each bar's height (3 decimals) above positive bars and below the others."""
    for bar in bars:
        yval = bar.get_height()
        if yval > 0:
            ax.text(bar.get_x() + bar.get_width()/2, yval, round(yval, 3), ha='center', va='bottom')
        else:
            ax.text(bar.get_x() + bar.get_width()/2, yval, round(yval, 3), ha='center', va='top')

label_bars(bars1)
label_bars(bars2)
label_bars(bars3)
# ax.set_title('Difference in Performance Metrics Due to Unbiased Categorisation')
ax.set_xlabel('Model Size', fontweight='bold', fontsize=14)
ax.set_ylabel('Difference', fontweight='bold', fontsize=14)
ax.set_xticks([(r + 1.1*barWidth) for r in r1])
ax.set_xticklabels(model_sizes, fontsize=12)
ax.legend(fontsize=14)
plt.tight_layout()
plt.show()
import os
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/ablation_alignment_unbiased_categorisation.png', dpi=300)


In [None]:
# Unbiased-categorisation (UC) ablation, all metrics: for each model size,
# plot how much every metric changes when UC is switched on.
metrics= ['f1','mae','roc_auc','accuracy','precision','recall']
# metrics= ['f1','mae']

dir_ablation_uc = "../../prompt_engineering/output/spot/ablation_uc"
li_all_experiments_ablation_uc = load_data(dir_ablation_uc)
li_exps_ablation_uc = filter_exp(li_all_experiments_ablation_uc )

li_exps_ablation_uc_w_res = calc_eval_metrics(li_exps_ablation_uc, metrics=metrics )

df_exps_ablation_uc = convert_li_exp_to_df(li_exps_ablation_uc_w_res)
display(df_exps_ablation_uc)

# 3. Filtering the data and calculating the performance differences
df = df_exps_ablation_uc

def _extract_model_size(llm_name):
    """Turn the '<digits>B' tag in a model name into '<digits>bn'; None when the name has no such tag."""
    match = re.search(r'\d+B', llm_name)
    return match.group(0).replace('B', 'bn') if match else None

df['model_size'] = df['llm_name'].apply(_extract_model_size)

model_sizes = ['3bn', '8bn','70bn']
filtered_df = df[(df['uc'].isin([False,True])) &
                 (df['model_size'].isin(model_sizes))]

# One row per (model size, metric) holding UC value minus non-UC value.
# The rows are collected in a list because concatenating onto an empty
# DataFrame is deprecated in recent pandas.
diff_rows = []
for model in model_sizes:
    is_model = filtered_df['model_size'] == model
    non_uc_rows = filtered_df[is_model & (filtered_df['uc'] == False)]
    uc_rows = filtered_df[is_model & (filtered_df['uc'] == True)]

    for metric in metrics:
        non_uc_value = non_uc_rows[metric].values[0]
        uc_value = uc_rows[metric].values[0]

        # Handle metrics that are stored as lists or arrays (e.g. f1)
        if isinstance(non_uc_value, (list, np.ndarray)):
            non_uc_value = non_uc_value[0]
        if isinstance(uc_value, (list, np.ndarray)):
            uc_value = uc_value[0]

        diff_rows.append({
            "model_size": model,
            "metric": metric,
            "uc": uc_value - non_uc_value
        })

diff_df = pd.DataFrame(diff_rows, columns=["model_size", "metric", "uc"])

# Grouped bar chart: one group per model size, one bar per metric
barWidth =0.3
gap = 0.9
num_metrics = len(metrics)
r_base = np.arange(0, 2.8*len(model_sizes) - gap, 2.8)

fig, ax = plt.subplots(figsize=(7, 7))
bars = []
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was removed in matplotlib 3.9.
colors = plt.get_cmap('tab20')(np.linspace(0, 1, num_metrics))

for idx, metric in enumerate(metrics):
    r = [x + idx*barWidth for x in r_base]
    metric_data = diff_df[diff_df['metric'] == metric]['uc'].values
    bars.append(ax.bar(r, metric_data, width=barWidth, label=metric, color=colors[idx]))

def label_bars(bars):
    """Write each bar's height (3 decimals) above positive bars and below the others."""
    for bar in bars:
        yval = bar.get_height()
        if yval > 0:
            ax.text(bar.get_x() + bar.get_width()/2, yval, round(yval, 3), ha='center', va='bottom')
        else:
            ax.text(bar.get_x() + bar.get_width()/2, yval, round(yval, 3), ha='center', va='top')

for bar_group in bars:
    label_bars(bar_group)

ax.set_xlabel('Model Size', fontweight='bold', fontsize=14)
ax.set_ylabel('Difference', fontweight='bold', fontsize=14)
ax.set_xticks([r + (num_metrics-1)*barWidth/2 for r in r_base])
ax.set_xticklabels(model_sizes, fontsize=12)
ax.legend(fontsize=14)
plt.tight_layout()
plt.show()

# Create the output folder so this cell also works when run on its own.
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/ablation_alignment_unbiased_categorisation.png', dpi=300)
