In [16]:
import pandas as pd
import json
import numpy as np
from scipy.stats import spearmanr



df = pd.read_csv("../../data/eval_results/gpt/100q_with_gpt_f1_scores.csv")

In [41]:
# correlation between simple_conf and gpt_conf_score > gpt_threshold
gpt_threshold = 75
df[f'gpt_conf_above_threshold'] = df['gpt_conf_score'] > gpt_threshold
simple_vs_gpt = df['simple_conf'].corr(df[f'gpt_conf_above_threshold'].astype(int))
print(f"Correlation between simple_conf and gpt_conf_score > threshold: {simple_vs_gpt}")
# correlation between adv_conf and gpt_conf_score > gpt_threshold
adv_vs_gpt = df['adv_conf'].corr(df[f'gpt_conf_above_threshold'].astype(int))
print(f"Correlation between adv_conf and gpt_conf_score > threshold: {adv_vs_gpt}")

Correlation between simple_conf and gpt_conf_score > threshold: -0.1484949005973215
Correlation between adv_conf and gpt_conf_score > threshold: -0.07913986195278683


In [42]:
# average num_mentions where gpt_conf_score > gpt_threshold
df_gpt_above_threshold = df[df['gpt_conf_score'] > gpt_threshold]
avg_num_mentions = df_gpt_above_threshold['num_mentions'].mean()
print(f"Average num_mentions where gpt_conf_score > {gpt_threshold}: {avg_num_mentions}")


Average num_mentions where gpt_conf_score > 75: 0.6134453781512605


In [25]:
# see the length of the intersection of simple_conf and adv_conf where gpt_conf_score > gpt_threshold
# so get simple_conf == true and adv_conf == true and gpt_conf_score > gpt_threshold
simple_and_gpt = df[(df['simple_conf'] == True)  & (df['gpt_conf_score'] > gpt_threshold)]
print(f"Number of instances where simple_conf == true and gpt_conf_score > {gpt_threshold}: {len(simple_and_gpt)}")
adv_and_gpt = df[(df['adv_conf'] == True)  & (df['gpt_conf_score'] > gpt_threshold)]
print(f"Number of instances where adv_conf == true and gpt_conf_score > {gpt_threshold}: {len(adv_and_gpt)}")
simple_and_adv_and_gpt = df[(df['simple_conf'] == True) & (df['adv_conf'] == True) & (df['gpt_conf_score'] > gpt_threshold)]
print(f"Number of instances where simple_conf == true and adv_conf == true and gpt_conf_score > {gpt_threshold}: {len(simple_and_adv_and_gpt)}")


Number of instances where simple_conf == true and gpt_conf_score > 75: 90
Number of instances where adv_conf == true and gpt_conf_score > 75: 15
Number of instances where simple_conf == true and adv_conf == true and gpt_conf_score > 75: 15


In [26]:
rho_simple_intersect, _ = spearmanr(simple_and_gpt['simple_metric'], simple_and_gpt['gpt_f1_score'])
rho_adv_intersect, _ = spearmanr(adv_and_gpt['adv_metric'], adv_and_gpt['gpt_f1_score'])
rho_simple_all, _ = spearmanr(df['simple_metric'], df['gpt_f1_score'])
rho_adv_all, _ = spearmanr(df['adv_metric'], df['gpt_f1_score'])
rho_all_intersect_simple, _ = spearmanr(simple_and_adv_and_gpt['simple_metric'], simple_and_adv_and_gpt['gpt_f1_score'])
rho_all_intersect_adv, _ = spearmanr(simple_and_adv_and_gpt['adv_metric'], simple_and_adv_and_gpt['gpt_f1_score'])
# correlation between just_reward and gpt_f1_score for when gpt_conf_score > {gpt_threshold}, we dont look at simple_conf or adv_conf here

df_gpt_above_threshold = df[df['gpt_conf_score'] > gpt_threshold]
rho_just_reward, _ = spearmanr(df_gpt_above_threshold['just_reward'], df_gpt_above_threshold['gpt_f1_score'])

print(f"Spearman correlation (rho) between simple_metric and gpt_f1_score for simple_conf == true and gpt_conf_score > {gpt_threshold}: {rho_simple_intersect}")
print(f"Spearman correlation (rho) between adv_metric and gpt_f1_score for adv_conf == true and gpt_conf_score > {gpt_threshold}: {rho_adv_intersect}")
print(f"Spearman correlation (rho) between simple_metric and gpt_f1_score for all data: {rho_simple_all}")
print(f"Spearman correlation (rho) between adv_metric and gpt_f1_score for all data: {rho_adv_all}")
print(f"Spearman correlation (rho) between simple_metric and gpt_f1_score for simple_conf == true and adv_conf == true and gpt_conf_score > {gpt_threshold}: {rho_all_intersect_simple}")
print(f"Spearman correlation (rho) between adv_metric and gpt_f1_score for simple_conf == true and adv_conf == true and gpt_conf_score > {gpt_threshold}: {rho_all_intersect_adv}")  
print(f"Spearman correlation (rho) between just_reward and gpt_f1_score for simple_conf == true and gpt_conf_score > {gpt_threshold}: {rho_just_reward}")

Spearman correlation (rho) between simple_metric and gpt_f1_score for simple_conf == true and gpt_conf_score > 75: 0.8913115148713601
Spearman correlation (rho) between adv_metric and gpt_f1_score for adv_conf == true and gpt_conf_score > 75: nan
Spearman correlation (rho) between simple_metric and gpt_f1_score for all data: nan
Spearman correlation (rho) between adv_metric and gpt_f1_score for all data: nan
Spearman correlation (rho) between simple_metric and gpt_f1_score for simple_conf == true and adv_conf == true and gpt_conf_score > 75: nan
Spearman correlation (rho) between adv_metric and gpt_f1_score for simple_conf == true and adv_conf == true and gpt_conf_score > 75: nan
Spearman correlation (rho) between just_reward and gpt_f1_score for simple_conf == true and gpt_conf_score > 75: 0.8886934519031919


  rho_adv_intersect, _ = spearmanr(adv_and_gpt['adv_metric'], adv_and_gpt['gpt_f1_score'])
  rho_all_intersect_simple, _ = spearmanr(simple_and_adv_and_gpt['simple_metric'], simple_and_adv_and_gpt['gpt_f1_score'])
  rho_all_intersect_adv, _ = spearmanr(simple_and_adv_and_gpt['adv_metric'], simple_and_adv_and_gpt['gpt_f1_score'])


In [20]:
# general correlation between simple_metric and gpt_f1_score
general_rho_simple, _ = spearmanr(df['simple_metric'], df['gpt_f1_score'])
general_rho_adv, _ = spearmanr(df['adv_metric'], df['gpt_f1_score'])
print(f"General Spearman correlation (rho) between simple_metric and gpt_f1_score: {general_rho_simple}")
print(f"General Spearman correlation (rho) between adv_metric and gpt_f1_score: {general_rho_adv}")

General Spearman correlation (rho) between simple_metric and gpt_f1_score: nan
General Spearman correlation (rho) between adv_metric and gpt_f1_score: nan
