In [2]:
import pandas as pd
import json
import numpy as np
from scipy.stats import spearmanr



df = pd.read_csv("../../data/eval_results/gpt/100q_with_gpt_f1_scores.csv")

In [4]:
# correlation between simple_conf and gpt_conf_score > 50
df['gpt_conf_above_50'] = df['gpt_conf_score'] > 50
simple_vs_gpt = df['simple_conf'].corr(df['gpt_conf_above_50'].astype(int))
print(f"Correlation between simple_conf and gpt_conf_score > 50: {simple_vs_gpt}")
# correlation between adv_conf and gpt_conf_score > 50
adv_vs_gpt = df['adv_conf'].corr(df['gpt_conf_above_50'].astype(int))
print(f"Correlation between adv_conf and gpt_conf_score > 50: {adv_vs_gpt}")

Correlation between simple_conf and gpt_conf_score > 50: -0.08212660237876346
Correlation between adv_conf and gpt_conf_score > 50: -0.032580551837601474


In [3]:
# see the length of the intersection of simple_conf and adv_conf where gpt_conf_score > 50
# so get simple_conf == true and adv_conf == true and gpt_conf_score > 50
simple_and_gpt = df[(df['simple_conf'] == True)  & (df['gpt_conf_score'] > 50)]
print(f"Number of instances where simple_conf == true and gpt_conf_score > 50: {len(simple_and_gpt)}")
adv_and_gpt = df[(df['adv_conf'] == True)  & (df['gpt_conf_score'] > 50)]
print(f"Number of instances where adv_conf == true and gpt_conf_score > 50: {len(adv_and_gpt)}")
simple_and_adv_and_gpt = df[(df['simple_conf'] == True) & (df['adv_conf'] == True) & (df['gpt_conf_score'] > 50)]
print(f"Number of instances where simple_conf == true and adv_conf == true and gpt_conf_score > 50: {len(simple_and_adv_and_gpt)}")


Number of instances where simple_conf == true and gpt_conf_score > 50: 541
Number of instances where adv_conf == true and gpt_conf_score > 50: 125
Number of instances where simple_conf == true and adv_conf == true and gpt_conf_score > 50: 125


In [8]:
rho_simple_intersect, _ = spearmanr(simple_and_gpt['simple_metric'], simple_and_gpt['gpt_f1_score'])
rho_adv_intersect, _ = spearmanr(adv_and_gpt['adv_metric'], adv_and_gpt['gpt_f1_score'])
rho_simple_all, _ = spearmanr(df['simple_metric'], df['gpt_f1_score'])
rho_adv_all, _ = spearmanr(df['adv_metric'], df['gpt_f1_score'])
rho_all_intersect_simple, _ = spearmanr(simple_and_adv_and_gpt['simple_metric'], simple_and_adv_and_gpt['gpt_f1_score'])
rho_all_intersect_adv, _ = spearmanr(simple_and_adv_and_gpt['adv_metric'], simple_and_adv_and_gpt['gpt_f1_score'])
# correlation between just_reward and gpt_f1_score for when gpt_conf_score > 50, we dont look at simple_conf or adv_conf here

df_gpt_above_50 = df[df['gpt_conf_score'] > 50]
rho_just_reward, _ = spearmanr(df_gpt_above_50['just_reward'], df_gpt_above_50['gpt_f1_score'])

print(f"Spearman correlation (rho) between simple_metric and gpt_f1_score for simple_conf == true and gpt_conf_score > 50: {rho_simple_intersect}")
print(f"Spearman correlation (rho) between adv_metric and gpt_f1_score for adv_conf == true and gpt_conf_score > 50: {rho_adv_intersect}")
print(f"Spearman correlation (rho) between simple_metric and gpt_f1_score for all data: {rho_simple_all}")
print(f"Spearman correlation (rho) between adv_metric and gpt_f1_score for all data: {rho_adv_all}")
print(f"Spearman correlation (rho) between simple_metric and gpt_f1_score for simple_conf == true and adv_conf == true and gpt_conf_score > 50: {rho_all_intersect_simple}")
print(f"Spearman correlation (rho) between adv_metric and gpt_f1_score for simple_conf == true and adv_conf == true and gpt_conf_score > 50: {rho_all_intersect_adv}")  
print(f"Spearman correlation (rho) between just_reward and gpt_f1_score for simple_conf == true and gpt_conf_score > 50: {rho_just_reward}")

Spearman correlation (rho) between simple_metric and gpt_f1_score for simple_conf == true and gpt_conf_score > 50: 0.8417196806214539
Spearman correlation (rho) between adv_metric and gpt_f1_score for adv_conf == true and gpt_conf_score > 50: 0.8146418627983509
Spearman correlation (rho) between simple_metric and gpt_f1_score for all data: nan
Spearman correlation (rho) between adv_metric and gpt_f1_score for all data: nan
Spearman correlation (rho) between simple_metric and gpt_f1_score for simple_conf == true and adv_conf == true and gpt_conf_score > 50: 0.8146418627983509
Spearman correlation (rho) between adv_metric and gpt_f1_score for simple_conf == true and adv_conf == true and gpt_conf_score > 50: 0.8146418627983509
Spearman correlation (rho) between just_reward and gpt_f1_score for simple_conf == true and gpt_conf_score > 50: 0.8709836637300012


In [None]:
score_to_cands ,_ = spearmanr(df['gpt_conf_score'], df['gpt_cands'].apply(lambda x: len(eval(x)) if isinstance(x,str) else np.nan))
print(f"Spearman correlation (rho) between gpt_conf_score and number of gpt_cands: {score_to_cands}")

In [11]:
# average number of elements in gpt_cands column where gpt_conf_score > 50
avg_num_gpt_cands_above_50 = df[df['gpt_conf_score'] > 50]['gpt_cands'].apply(lambda x: len(json.loads(x))).mean()
print(f"Average number of elements in gpt_cands where gpt_conf_score > 50: {avg_num_gpt_cands_above_50}")

Average number of elements in gpt_cands where gpt_conf_score > 50: 5.145812807881773


In [3]:
# general correlation between simple_metric and gpt_f1_score
general_rho_simple, _ = spearmanr(df['simple_metric'], df['gpt_f1_score'])
general_rho_adv, _ = spearmanr(df['adv_metric'], df['gpt_f1_score'])
print(f"General Spearman correlation (rho) between simple_metric and gpt_f1_score: {general_rho_simple}")
print(f"General Spearman correlation (rho) between adv_metric and gpt_f1_score: {general_rho_adv}")

General Spearman correlation (rho) between simple_metric and gpt_f1_score: nan
General Spearman correlation (rho) between adv_metric and gpt_f1_score: nan
