In [1]:
import pandas as pd
import os
import numpy as np
from scipy.stats import spearmanr
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy.stats import mannwhitneyu
from sklearn.metrics import roc_auc_score
from cliffs_delta import cliffs_delta as cd

In [2]:
def cohens_d(x, y):
    x, y = np.asarray(x), np.asarray(y)
    nx, ny = len(x), len(y)
    vx, vy = np.var(x, ddof=1), np.var(y, ddof=1)
    sp = np.sqrt(((nx-1)*vx + (ny-1)*vy) / (nx + ny - 2))
    return (np.mean(x) - np.mean(y)) / sp

def get_corr_stats(df_analysis, binary=False, feats=["surprisal"], score_col="score", binary_col='labels'):

    pearson_coef = []
    pearson_p = []
    spearman_coef = []
    spearman_p = []
    kendall_coef = []
    kendall_p = []
    
    rank_biserial_r = []
    mw_u = []
    mw_p = []
    cohens_ds = []
    auc_s = []
    cliffs_delta = []
    cliffs_size = []

    if not binary:
        threshold = 0.5
        y_true = np.array([1]*len(df_analysis[df_analysis[score_col]>=threshold]) + [0]*len(df_analysis[df_analysis[score_col]<threshold]))
    else:
        y_true = np.array([1]*len(df_analysis[df_analysis[binary_col]=='novel']) + [0]*len(df_analysis[df_analysis[binary_col]=="conventional"]))

    for col in feats:

        if not binary:
        
            # 1. Pearson
            pearson_c, pearson_pv = pearsonr(df_analysis[score_col], df_analysis[col])
            pearson_coef.append(pearson_c)
            pearson_p.append(pearson_pv)

            # 2. Spearman
            spearman_c, spearman_pv = spearmanr(df_analysis[score_col], df_analysis[col])
            spearman_coef.append(spearman_c)
            spearman_p.append(spearman_pv)

            # 3. Kendall
            kendall_c, kendall_pv = kendalltau(df_analysis[score_col], df_analysis[col])
            kendall_coef.append(kendall_c)
            kendall_p.append(kendall_pv)

            # 4. Mann-Whitney U
            mw_u_stat, mw_pv = mannwhitneyu(df_analysis[df_analysis[score_col]>=threshold][col],
                                                df_analysis[df_analysis[score_col]<threshold][col])
            mw_u.append(mw_u_stat)
            mw_p.append(mw_pv)

            # 5. Rank-biserial correlation
            n1, n2 = len(df_analysis[df_analysis[score_col]>=threshold]), len(df_analysis[df_analysis[score_col]<threshold])
            rank_biserial = 2*mw_u_stat/(n1*n2) - 1
            rank_biserial_r.append(rank_biserial)

            # 6. Cohen's d
            d = cohens_d(df_analysis[df_analysis[score_col]>=threshold][col], df_analysis[df_analysis[score_col]<threshold][col])
            cohens_ds.append(d)

            # 7. AUC
            surpr_all = np.concatenate([df_analysis[df_analysis[score_col]>=threshold][col], df_analysis[df_analysis[score_col]<threshold][col]])
            auc = roc_auc_score(y_true, surpr_all)   # use +surpr_all if higher surprisal => Positive
            auc_s.append(auc)

            # 8. Cliff's delta
            d, size = cd(df_analysis[df_analysis[score_col]>=threshold][col], df_analysis[df_analysis[score_col]<threshold][col])
            cliffs_delta.append(d)
            cliffs_size.append(size)
        else:

            # 4. Mann-Whitney U
            mw_u_stat, mw_pv = mannwhitneyu(df_analysis[df_analysis[binary_col]=='novel'][col],
                                                df_analysis[df_analysis[binary_col]=="conventional"][col])
            mw_u.append(mw_u_stat)
            mw_p.append(mw_pv)
            
            # 5. Rank-biserial correlation
            n1, n2 = len(df_analysis[df_analysis[binary_col]=='novel']), len(df_analysis[df_analysis[binary_col]=="conventional"])
            rank_biserial = 2*mw_u_stat/(n1*n2) - 1
            rank_biserial_r.append(rank_biserial)
            
            # 6. Cohen's d
            d = cohens_d(df_analysis[df_analysis[binary_col]=='novel'][col], df_analysis[df_analysis[binary_col]=="conventional"][col])
            cohens_ds.append(d)
            
            # 7. AUC
            surpr_all = np.concatenate([df_analysis[df_analysis[binary_col]=='novel'][col], df_analysis[df_analysis[binary_col]=="conventional"][col]])
            auc = roc_auc_score(y_true, surpr_all)   # use +surpr_all if higher surprisal => Positive
            auc_s.append(auc)
            
            # 8. Cliff's delta
            d, size = cd(df_analysis[df_analysis[binary_col]=='novel'][col], df_analysis[df_analysis[binary_col]=="conventional"][col])
            cliffs_delta.append(d)
            cliffs_size.append(size)
            

    if not binary:        
        df_stats = pd.DataFrame({
            "feature": feats,
            "pearson_coef": pearson_coef,
            "pearson_p": pearson_p,
            "spearman_coef": spearman_coef,
            "spearman_p": spearman_p,
            "rank_biserial_r": rank_biserial_r,
            "auc": auc_s,
            "mw_p": mw_p,
            "cohens_d": cohens_ds,
            "cliffs_delta": cliffs_delta,
            "cliffs_size": cliffs_size
        })
    else:
        df_stats = pd.DataFrame({
            "feature": feats,
            "rank_biserial_r": rank_biserial_r,
            "auc": auc_s,
            "mw_p": mw_p,
            "cohens_d": cohens_ds,
            "cliffs_delta": cliffs_delta,
            "cliffs_size": cliffs_size
        })

    return df_stats


# VUA-ratings results

In [3]:
all_results = []
models_ids = []
for f in os.listdir("results"):
    if f.endswith(".parquet") and "vua" in f:
        print(f"Loading {f}...")
        df_part = pd.read_parquet(os.path.join("results", f))
        all_results.append(df_part)
        models_ids.append(f.split("_")[-3])

Loading vua-metanov_surprisal_meta-llama_Llama-3.2-3B_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_openai-community_gpt2_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_meta-llama_Llama-3.1-8B-Instruct_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_Qwen_Qwen2.5-7B_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_openai-community_gpt2-xl_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_meta-llama_Llama-3.2-3B-Instruct_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_meta-llama_Llama-3.1-8B_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_Qwen_Qwen2.5-14B_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_openai-community_gpt2-medium_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_meta-llama_Llama-3.2-1B_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_Qwen_Qwen2.5-0.5B-Instruct_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_Qwen_Qwen2.5-0.5B_cloze_pimentel.parquet...
Loading vua-metanov_surprisal_openai

In [4]:
models_ids, len(models_ids)

(['Llama-3.2-3B',
  'gpt2',
  'Llama-3.1-8B-Instruct',
  'Qwen2.5-7B',
  'gpt2-xl',
  'Llama-3.2-3B-Instruct',
  'Llama-3.1-8B',
  'Qwen2.5-14B',
  'gpt2-medium',
  'Llama-3.2-1B',
  'Qwen2.5-0.5B-Instruct',
  'Qwen2.5-0.5B',
  'gpt2-large',
  'Llama-3.2-1B-Instruct',
  'Qwen2.5-7B-Instruct',
  'Qwen2.5-14B-Instruct'],
 16)

In [5]:
all_results[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16202 entries, 0 to 16201
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   document_id             16202 non-null  object
 1   sentence_id             16202 non-null  object
 2   sentence                16202 non-null  object
 3   words_list              16202 non-null  object
 4   offsets                 16202 non-null  object
 5   vua_metaphor_labels     16202 non-null  object
 6   do_dinh_scores          16202 non-null  object
 7   reimann_novelty_labels  16202 non-null  object
 8   genre                   16202 non-null  object
 9   subtoken_ids            16202 non-null  object
 10  subtoken_strs           16202 non-null  object
 11  surprisal_buggy         16202 non-null  object
 12  surprisal_fixed         16202 non-null  object
 13  subtoken_ids_cloze      16202 non-null  object
 14  subtoken_strs_cloze     16202 non-null  object
 15  su

In [6]:
df_analysis = pd.DataFrame()
for k, df in enumerate(all_results):
    do_dinh_scores = []
    surp_buggy = []
    surp_fixed = []
    surp_buggy_cloze = []
    surp_fixed_cloze = []
    for i, row in df.iterrows():
        for j, label in enumerate(row['vua_metaphor_labels']):
            if label == True:
                scs = row['do_dinh_scores'][j]
                scs = max([float(s) for s in scs.split(",")])
                if scs > -1:
                    do_dinh_scores.append(scs)
                    surp_buggy.append(row['surprisal_buggy'][j].item())
                    surp_fixed.append(row['surprisal_fixed'][j].item())
                    surp_buggy_cloze.append(row['surprisal_buggy_cloze'][j].item())
                    surp_fixed_cloze.append(row['surprisal_fixed_cloze'][j].item())
        
                    
    df_analysis['score'] = do_dinh_scores
    df_analysis[f"surprisal_buggy_{models_ids[k]}"] = surp_buggy
    df_analysis[f"surprisal_fixed_{models_ids[k]}"] = surp_fixed
    df_analysis[f"surprisal_buggy_cloze_{models_ids[k]}"] = surp_buggy_cloze
    df_analysis[f"surprisal_fixed_cloze_{models_ids[k]}"] = surp_fixed_cloze


In [7]:
df_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15155 entries, 0 to 15154
Data columns (total 65 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   score                                        15155 non-null  float64
 1   surprisal_buggy_Llama-3.2-3B                 15155 non-null  float64
 2   surprisal_fixed_Llama-3.2-3B                 15155 non-null  float64
 3   surprisal_buggy_cloze_Llama-3.2-3B           15155 non-null  float64
 4   surprisal_fixed_cloze_Llama-3.2-3B           15155 non-null  float64
 5   surprisal_buggy_gpt2                         15155 non-null  float64
 6   surprisal_fixed_gpt2                         15155 non-null  float64
 7   surprisal_buggy_cloze_gpt2                   15155 non-null  float64
 8   surprisal_fixed_cloze_gpt2                   15155 non-null  float64
 9   surprisal_buggy_Llama-3.1-8B-Instruct        15155 non-null  float64
 10

In [8]:
corr_res_df = get_corr_stats(df_analysis, binary=False, feats=[f"surprisal_buggy_{mid}" for mid in models_ids] + [f"surprisal_fixed_{mid}" for mid in models_ids] + [f"surprisal_buggy_cloze_{mid}" for mid in models_ids] + [f"surprisal_fixed_cloze_{mid}" for mid in models_ids], score_col="score", binary_col='labels')
corr_res_df

Unnamed: 0,feature,pearson_coef,pearson_p,spearman_coef,spearman_p,rank_biserial_r,auc,mw_p,cohens_d,cliffs_delta,cliffs_size
0,surprisal_buggy_Llama-3.2-3B,0.320215,0.000000e+00,0.301315,0.000000e+00,0.491383,0.745691,2.973274e-56,1.004674,0.491383,large
1,surprisal_buggy_gpt2,0.412258,0.000000e+00,0.409555,0.000000e+00,0.630008,0.815004,2.839013e-91,1.358969,0.630008,large
2,surprisal_buggy_Llama-3.1-8B-Instruct,0.307090,0.000000e+00,0.288649,1.260471e-288,0.495135,0.747568,4.350308e-57,1.020528,0.495135,large
3,surprisal_buggy_Qwen2.5-7B,0.328355,0.000000e+00,0.309615,0.000000e+00,0.493323,0.746662,1.102671e-56,1.015082,0.493323,large
4,surprisal_buggy_gpt2-xl,0.366349,0.000000e+00,0.355127,0.000000e+00,0.557553,0.778777,6.767423e-72,1.171678,0.557553,large
...,...,...,...,...,...,...,...,...,...,...,...
59,surprisal_fixed_cloze_Qwen2.5-0.5B,0.392178,0.000000e+00,0.384340,0.000000e+00,0.576742,0.788371,8.463839e-77,1.304927,0.576742,large
60,surprisal_fixed_cloze_gpt2-large,0.455469,0.000000e+00,0.459231,0.000000e+00,0.647690,0.823845,2.329887e-96,1.400866,0.647690,large
61,surprisal_fixed_cloze_Llama-3.2-1B-Instruct,0.349787,0.000000e+00,0.333738,0.000000e+00,0.533265,0.766632,6.308102e-66,1.191905,0.533265,large
62,surprisal_fixed_cloze_Qwen2.5-7B-Instruct,0.232095,1.752780e-184,0.215450,1.207894e-158,0.416679,0.708339,6.015453e-41,0.895953,0.416679,medium


In [9]:
corr_res_df[(corr_res_df['feature'].str.contains("surprisal_fixed")) & (corr_res_df['feature'].str.contains("gpt"))]

Unnamed: 0,feature,pearson_coef,pearson_p,spearman_coef,spearman_p,rank_biserial_r,auc,mw_p,cohens_d,cliffs_delta,cliffs_size
17,surprisal_fixed_gpt2,0.419356,0.0,0.41697,0.0,0.638173,0.819087,1.3243690000000001e-93,1.387056,0.638173,large
20,surprisal_fixed_gpt2-xl,0.37267,0.0,0.36161,0.0,0.566399,0.783199,3.897887e-74,1.198972,0.566399,large
24,surprisal_fixed_gpt2-medium,0.38876,0.0,0.382743,0.0,0.599864,0.799932,6.316364e-83,1.281146,0.599864,large
28,surprisal_fixed_gpt2-large,0.381318,0.0,0.372997,0.0,0.585406,0.792703,4.566711e-79,1.222675,0.585406,large
49,surprisal_fixed_cloze_gpt2,0.489977,0.0,0.499379,0.0,0.686721,0.84336,4.412006e-108,1.564582,0.686721,large
52,surprisal_fixed_cloze_gpt2-xl,0.446167,0.0,0.451687,0.0,0.629093,0.814546,5.159335e-91,1.334066,0.629093,large
56,surprisal_fixed_cloze_gpt2-medium,0.466418,0.0,0.47307,0.0,0.668073,0.834037,2.1405529999999997e-102,1.431831,0.668073,large
60,surprisal_fixed_cloze_gpt2-large,0.455469,0.0,0.459231,0.0,0.64769,0.823845,2.329887e-96,1.400866,0.64769,large


In [10]:
models_ids_ordered = [
    'gpt2',
    'gpt2-medium',
    'gpt2-large',
    'gpt2-xl',
        
    'Llama-3.2-1B',
    'Llama-3.2-1B-Instruct',
    'Llama-3.2-3B',
    'Llama-3.2-3B-Instruct',
    'Llama-3.1-8B',
    'Llama-3.1-8B-Instruct',    

    'Qwen2.5-0.5B',    
    'Qwen2.5-0.5B-Instruct',
    'Qwen2.5-7B',
    'Qwen2.5-7B-Instruct',    
    'Qwen2.5-14B',    
    'Qwen2.5-14B-Instruct',
]

In [11]:
ps = []
sp = []
rb = []
auc = []
mw_p = []
c_delta = []
cloze_gain = []
instruct_gain = []
for m in models_ids_ordered:
    feat = f"surprisal_fixed_{m}"
    ps.append(corr_res_df[corr_res_df['feature']==feat].pearson_coef.item())
    sp.append(corr_res_df[corr_res_df['feature']==feat].spearman_coef.item())
    rb.append(corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())
    auc.append(corr_res_df[corr_res_df['feature']==feat].auc.item())
    mw_p.append(corr_res_df[corr_res_df['feature']==feat].mw_p.item())
    c_delta.append(corr_res_df[corr_res_df['feature']==feat].cliffs_delta.item())
    feat_cloze = f"surprisal_fixed_cloze_{m}"
    cloze_gain.append(corr_res_df[corr_res_df['feature']==feat_cloze].rank_biserial_r.item() - corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())

In [12]:
vua_dinh_t_df = pd.DataFrame({
    "model": models_ids_ordered,
    "pearson_coef": ps,
    "spearman_coef": sp,
    "rank_biserial": rb,
    "auc": auc,
    "mw_p": mw_p,
    "cliffs_delta": c_delta,
    "cloze_gain": cloze_gain
})
vua_dinh_t_df

Unnamed: 0,model,pearson_coef,spearman_coef,rank_biserial,auc,mw_p,cliffs_delta,cloze_gain
0,gpt2,0.419356,0.41697,0.638173,0.819087,1.3243690000000001e-93,0.638173,0.048548
1,gpt2-medium,0.38876,0.382743,0.599864,0.799932,6.316364e-83,0.599864,0.068209
2,gpt2-large,0.381318,0.372997,0.585406,0.792703,4.566711e-79,0.585406,0.062283
3,gpt2-xl,0.37267,0.36161,0.566399,0.783199,3.897887e-74,0.566399,0.062694
4,Llama-3.2-1B,0.345477,0.329036,0.531529,0.765764,1.64612e-65,0.531529,0.072926
5,Llama-3.2-1B-Instruct,0.352542,0.33652,0.571543,0.785772,1.8710540000000002e-75,0.571543,-0.038278
6,Llama-3.2-3B,0.327924,0.307517,0.501866,0.750933,1.335718e-58,0.501866,-0.005119
7,Llama-3.2-3B-Instruct,0.334822,0.318221,0.54171,0.770855,5.6796790000000005e-68,0.54171,-0.110761
8,Llama-3.1-8B,0.314009,0.292605,0.488208,0.744104,1.4948400000000002e-55,0.488208,0.019069
9,Llama-3.1-8B-Instruct,0.313899,0.294948,0.503829,0.751915,4.792042e-59,0.503829,-0.077552


In [13]:
instruct_gain = []
for m in models_ids_ordered:
    feat = f"surprisal_fixed_{m}"
    feat_instruct = f"{feat}-Instruct"
    if feat_instruct in corr_res_df['feature'].values:
        instruct_gain.append(corr_res_df[corr_res_df['feature']==feat_instruct].rank_biserial_r.item() - corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())
    else:
        instruct_gain.append(None)

In [14]:
vua_dinh_t2_df = pd.DataFrame({
    "model": models_ids_ordered,
    "instruct_gain": instruct_gain
})
vua_dinh_t2_df

Unnamed: 0,model,instruct_gain
0,gpt2,
1,gpt2-medium,
2,gpt2-large,
3,gpt2-xl,
4,Llama-3.2-1B,0.040014
5,Llama-3.2-1B-Instruct,
6,Llama-3.2-3B,0.039844
7,Llama-3.2-3B-Instruct,
8,Llama-3.1-8B,0.015622
9,Llama-3.1-8B-Instruct,


# VUA-dictionary results

In [15]:
models_ids, len(models_ids)

(['Llama-3.2-3B',
  'gpt2',
  'Llama-3.1-8B-Instruct',
  'Qwen2.5-7B',
  'gpt2-xl',
  'Llama-3.2-3B-Instruct',
  'Llama-3.1-8B',
  'Qwen2.5-14B',
  'gpt2-medium',
  'Llama-3.2-1B',
  'Qwen2.5-0.5B-Instruct',
  'Qwen2.5-0.5B',
  'gpt2-large',
  'Llama-3.2-1B-Instruct',
  'Qwen2.5-7B-Instruct',
  'Qwen2.5-14B-Instruct'],
 16)

In [16]:
all_results[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16202 entries, 0 to 16201
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   document_id             16202 non-null  object
 1   sentence_id             16202 non-null  object
 2   sentence                16202 non-null  object
 3   words_list              16202 non-null  object
 4   offsets                 16202 non-null  object
 5   vua_metaphor_labels     16202 non-null  object
 6   do_dinh_scores          16202 non-null  object
 7   reimann_novelty_labels  16202 non-null  object
 8   genre                   16202 non-null  object
 9   subtoken_ids            16202 non-null  object
 10  subtoken_strs           16202 non-null  object
 11  surprisal_buggy         16202 non-null  object
 12  surprisal_fixed         16202 non-null  object
 13  subtoken_ids_cloze      16202 non-null  object
 14  subtoken_strs_cloze     16202 non-null  object
 15  su

In [17]:
df_analysis = pd.DataFrame()
for k, df in enumerate(all_results):
    reimann_labels = []
    surp_buggy = []
    surp_fixed = []
    surp_buggy_cloze = []
    surp_fixed_cloze = []
    gens = []
    for i, row in df.iterrows():
        for j, label in enumerate(row['vua_metaphor_labels']):
            if label == True:
                scs = row['do_dinh_scores'][j]
                scs = max([float(s) for s in scs.split(",")])
                if scs > -1:
                    if row['reimann_novelty_labels'][j] == True:
                        reimann_labels.append("novel")
                    else:
                        reimann_labels.append("conventional")
                    surp_buggy.append(row['surprisal_buggy'][j].item())
                    surp_fixed.append(row['surprisal_fixed'][j].item())
                    surp_buggy_cloze.append(row['surprisal_buggy_cloze'][j].item())
                    surp_fixed_cloze.append(row['surprisal_fixed_cloze'][j].item())
                    gens.append(row['genre'])

    df_analysis['reimann_labels'] = reimann_labels
    df_analysis[f"surprisal_buggy_{models_ids[k]}"] = surp_buggy
    df_analysis[f"surprisal_fixed_{models_ids[k]}"] = surp_fixed
    df_analysis[f"surprisal_buggy_cloze_{models_ids[k]}"] = surp_buggy_cloze
    df_analysis[f"surprisal_fixed_cloze_{models_ids[k]}"] = surp_fixed_cloze
    df_analysis[f"genre"] = gens


In [18]:
df_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15155 entries, 0 to 15154
Data columns (total 66 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   reimann_labels                               15155 non-null  object 
 1   surprisal_buggy_Llama-3.2-3B                 15155 non-null  float64
 2   surprisal_fixed_Llama-3.2-3B                 15155 non-null  float64
 3   surprisal_buggy_cloze_Llama-3.2-3B           15155 non-null  float64
 4   surprisal_fixed_cloze_Llama-3.2-3B           15155 non-null  float64
 5   genre                                        15155 non-null  object 
 6   surprisal_buggy_gpt2                         15155 non-null  float64
 7   surprisal_fixed_gpt2                         15155 non-null  float64
 8   surprisal_buggy_cloze_gpt2                   15155 non-null  float64
 9   surprisal_fixed_cloze_gpt2                   15155 non-null  float64
 10

In [19]:
corr_res_df = get_corr_stats(df_analysis, binary=True, feats=[f"surprisal_buggy_{mid}" for mid in models_ids] + [f"surprisal_fixed_{mid}" for mid in models_ids] + [f"surprisal_buggy_cloze_{mid}" for mid in models_ids] + [f"surprisal_fixed_cloze_{mid}" for mid in models_ids], score_col="score", binary_col='reimann_labels')
corr_res_df

Unnamed: 0,feature,rank_biserial_r,auc,mw_p,cohens_d,cliffs_delta,cliffs_size
0,surprisal_buggy_Llama-3.2-3B,0.447458,0.723729,6.424206e-54,0.975800,0.447458,medium
1,surprisal_buggy_gpt2,0.579478,0.789739,3.552858e-89,1.246111,0.579478,large
2,surprisal_buggy_Llama-3.1-8B-Instruct,0.437940,0.718970,1.004047e-51,0.929559,0.437940,medium
3,surprisal_buggy_Qwen2.5-7B,0.461694,0.730847,2.749185e-57,1.024853,0.461694,medium
4,surprisal_buggy_gpt2-xl,0.525908,0.762954,8.771888e-74,1.131283,0.525908,large
...,...,...,...,...,...,...,...
59,surprisal_fixed_cloze_Qwen2.5-0.5B,0.563635,0.781817,1.809192e-84,1.256417,0.563635,large
60,surprisal_fixed_cloze_gpt2-large,0.621402,0.810701,2.946428e-102,1.365454,0.621402,large
61,surprisal_fixed_cloze_Llama-3.2-1B-Instruct,0.483266,0.741633,1.365239e-62,1.064454,0.483266,large
62,surprisal_fixed_cloze_Qwen2.5-7B-Instruct,0.420359,0.710180,8.532921e-48,0.905683,0.420359,medium


In [20]:
rb = []
auc = []
mw_p = []
c_delta = []
cloze_gain = []
for m in models_ids_ordered:
    feat = f"surprisal_fixed_{m}"
    rb.append(corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())
    auc.append(corr_res_df[corr_res_df['feature']==feat].auc.item())
    mw_p.append(corr_res_df[corr_res_df['feature']==feat].mw_p.item())
    c_delta.append(corr_res_df[corr_res_df['feature']==feat].cliffs_delta.item())
    feat_cloze = f"surprisal_fixed_cloze_{m}"
    cloze_gain.append(corr_res_df[corr_res_df['feature']==feat_cloze].rank_biserial_r.item() - corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())

In [21]:
vua_reimann_t_df = pd.DataFrame({
    "model": models_ids_ordered,
    "rank_biserial": rb,
    "auc": auc,
    "mw_p": mw_p,
    "cliffs_delta": c_delta,
    "cloze_gain": cloze_gain
})
vua_reimann_t_df

Unnamed: 0,model,rank_biserial,auc,mw_p,cliffs_delta,cloze_gain
0,gpt2,0.581174,0.790587,1.094118e-89,0.581174,0.039752
1,gpt2-medium,0.556553,0.778276,2.086702e-82,0.556553,0.045184
2,gpt2-large,0.538914,0.769457,2.199687e-77,0.538914,0.082488
3,gpt2-xl,0.527637,0.763819,2.94751e-74,0.527637,0.0253
4,Llama-3.2-1B,0.479858,0.739929,9.754953e-62,0.479858,0.079907
5,Llama-3.2-1B-Instruct,0.494999,0.7475,1.41047e-65,0.494999,-0.011733
6,Llama-3.2-3B,0.445622,0.722811,1.716969e-53,0.445622,0.055624
7,Llama-3.2-3B-Instruct,0.469793,0.734897,2.992496e-59,0.469793,-0.023346
8,Llama-3.1-8B,0.431481,0.715741,2.9092439999999997e-50,0.431481,0.026892
9,Llama-3.1-8B-Instruct,0.437062,0.718531,1.591749e-51,0.437062,-0.001929


In [22]:
instruct_gain = []
for m in models_ids_ordered:
    feat = f"surprisal_fixed_{m}"
    feat_instruct = f"{feat}-Instruct"
    if feat_instruct in corr_res_df['feature'].values:
        instruct_gain.append(corr_res_df[corr_res_df['feature']==feat_instruct].rank_biserial_r.item() - corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())
    else:
        instruct_gain.append(None)

In [23]:
vua_reimann_t2_df = pd.DataFrame({
    "model": models_ids_ordered,
    "instruct_gain": instruct_gain
})
vua_reimann_t2_df

Unnamed: 0,model,instruct_gain
0,gpt2,
1,gpt2-medium,
2,gpt2-large,
3,gpt2-xl,
4,Llama-3.2-1B,0.015141
5,Llama-3.2-1B-Instruct,
6,Llama-3.2-3B,0.024171
7,Llama-3.2-3B-Instruct,
8,Llama-3.1-8B,0.00558
9,Llama-3.1-8B-Instruct,


# LAI2009 results

In [24]:
all_results = []
models_ids = []
for f in os.listdir("results"):
    if f.endswith(".parquet") and "LAI" in f:
        print(f"Loading {f}...")
        df_part = pd.read_parquet(os.path.join("results", f))
        all_results.append(df_part)
        models_ids.append(f.split("_")[-3])

Loading LAI2009-METANOV_mod_openai-community_gpt2_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_meta-llama_Llama-3.2-1B_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_meta-llama_Llama-3.1-8B_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_meta-llama_Llama-3.2-1B-Instruct_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_Qwen_Qwen2.5-14B-Instruct_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_Qwen_Qwen2.5-7B-Instruct_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_openai-community_gpt2-xl_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_openai-community_gpt2-medium_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_meta-llama_Llama-3.2-3B-Instruct_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_Qwen_Qwen2.5-7B_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_meta-llama_Llama-3.2-3B_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_meta-llama_Llama-3.1-8B-Instruct_cloze_pimentel.parquet...
Loading LAI2009-METANOV_mod_Qwen_Qwen2

In [25]:
models_ids, len(models_ids)

(['gpt2',
  'Llama-3.2-1B',
  'Llama-3.1-8B',
  'Llama-3.2-1B-Instruct',
  'Qwen2.5-14B-Instruct',
  'Qwen2.5-7B-Instruct',
  'gpt2-xl',
  'gpt2-medium',
  'Llama-3.2-3B-Instruct',
  'Qwen2.5-7B',
  'Llama-3.2-3B',
  'Llama-3.1-8B-Instruct',
  'Qwen2.5-0.5B',
  'Qwen2.5-0.5B-Instruct',
  'gpt2-large',
  'Qwen2.5-14B'],
 16)

In [26]:
all_results[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sentence               208 non-null    object 
 1   target_word            208 non-null    object 
 2   novelty_label          208 non-null    object 
 3   offsets                208 non-null    object 
 4   subtoken_ids           208 non-null    object 
 5   subtoken_strs          208 non-null    object 
 6   surprisal_buggy        208 non-null    float64
 7   surprisal_fixed        208 non-null    float64
 8   subtoken_ids_cloze     208 non-null    object 
 9   subtoken_strs_cloze    208 non-null    object 
 10  surprisal_buggy_cloze  208 non-null    float64
 11  surprisal_fixed_cloze  208 non-null    float64
dtypes: float64(4), object(8)
memory usage: 19.6+ KB


In [27]:
df_analysis = pd.DataFrame()
for k, df in enumerate(all_results):
    labels = []
    surp_buggy = []
    surp_fixed = []
    surp_buggy_cloze = []
    surp_fixed_cloze = []
    for i, row in df.iterrows():
        labels.append(row['novelty_label'])
        surp_buggy.append(row['surprisal_buggy'])
        surp_fixed.append(row['surprisal_fixed'])
        surp_buggy_cloze.append(row['surprisal_buggy_cloze'])
        surp_fixed_cloze.append(row['surprisal_fixed_cloze'])
    df_analysis['labels'] = labels
    df_analysis[f"surprisal_buggy_{models_ids[k]}"] = surp_buggy
    df_analysis[f"surprisal_fixed_{models_ids[k]}"] = surp_fixed
    df_analysis[f"surprisal_buggy_cloze_{models_ids[k]}"] = surp_buggy_cloze
    df_analysis[f"surprisal_fixed_cloze_{models_ids[k]}"] = surp_fixed_cloze

In [28]:
df_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 65 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   labels                                       208 non-null    object 
 1   surprisal_buggy_gpt2                         208 non-null    float64
 2   surprisal_fixed_gpt2                         208 non-null    float64
 3   surprisal_buggy_cloze_gpt2                   208 non-null    float64
 4   surprisal_fixed_cloze_gpt2                   208 non-null    float64
 5   surprisal_buggy_Llama-3.2-1B                 208 non-null    float64
 6   surprisal_fixed_Llama-3.2-1B                 208 non-null    float64
 7   surprisal_buggy_cloze_Llama-3.2-1B           208 non-null    float64
 8   surprisal_fixed_cloze_Llama-3.2-1B           208 non-null    float64
 9   surprisal_buggy_Llama-3.1-8B                 208 non-null    float64
 10  su

In [29]:
corr_res_df = get_corr_stats(df_analysis, binary=True, feats=[f"surprisal_buggy_{mid}" for mid in models_ids] + [f"surprisal_fixed_{mid}" for mid in models_ids] + [f"surprisal_buggy_cloze_{mid}" for mid in models_ids] + [f"surprisal_fixed_cloze_{mid}" for mid in models_ids], score_col="score", binary_col='labels')
corr_res_df

Unnamed: 0,feature,rank_biserial_r,auc,mw_p,cohens_d,cliffs_delta,cliffs_size
0,surprisal_buggy_gpt2,0.275888,0.637944,5.894364e-04,0.476211,0.275888,small
1,surprisal_buggy_Llama-3.2-1B,0.462278,0.731139,8.467884e-09,0.808066,0.462278,medium
2,surprisal_buggy_Llama-3.1-8B,0.492419,0.746209,8.546854e-10,0.864563,0.492419,large
3,surprisal_buggy_Llama-3.2-1B-Instruct,0.460244,0.730122,9.836398e-09,0.780932,0.460244,medium
4,surprisal_buggy_Qwen2.5-14B-Instruct,0.479845,0.739922,2.262393e-09,0.776822,0.479845,large
...,...,...,...,...,...,...,...
59,surprisal_fixed_cloze_Llama-3.1-8B-Instruct,0.520340,0.760170,9.031750e-11,0.971510,0.520340,large
60,surprisal_fixed_cloze_Qwen2.5-0.5B,0.349667,0.674834,1.326250e-05,0.650370,0.349667,medium
61,surprisal_fixed_cloze_Qwen2.5-0.5B-Instruct,0.336908,0.668454,2.707700e-05,0.617381,0.336908,medium
62,surprisal_fixed_cloze_gpt2-large,0.255178,0.627589,1.481004e-03,0.483504,0.255178,small


In [30]:
corr_res_df[(corr_res_df['feature'].str.contains("surprisal_fixed")) & (corr_res_df['feature'].str.contains("gpt2"))]

Unnamed: 0,feature,rank_biserial_r,auc,mw_p,cohens_d,cliffs_delta,cliffs_size
16,surprisal_fixed_gpt2,0.276442,0.638221,0.0005745739,0.475562,0.276442,small
22,surprisal_fixed_gpt2-xl,0.413646,0.706823,2.564672e-07,0.74859,0.413646,medium
23,surprisal_fixed_gpt2-medium,0.361871,0.680936,6.550535e-06,0.633297,0.361871,medium
30,surprisal_fixed_gpt2-large,0.397374,0.698687,7.415136e-07,0.698448,0.397374,medium
48,surprisal_fixed_cloze_gpt2,0.199889,0.599945,0.01279296,0.345545,0.199889,small
54,surprisal_fixed_cloze_gpt2-xl,0.253513,0.626757,0.001590496,0.456952,0.253513,small
55,surprisal_fixed_cloze_gpt2-medium,0.243713,0.621857,0.00240112,0.441738,0.243713,small
62,surprisal_fixed_cloze_gpt2-large,0.255178,0.627589,0.001481004,0.483504,0.255178,small


In [31]:
models_ids_ordered = [
    'gpt2',
    'gpt2-medium',
    'gpt2-large',
    'gpt2-xl',
        
    'Llama-3.2-1B',
    'Llama-3.2-1B-Instruct',
    'Llama-3.2-3B',
    'Llama-3.2-3B-Instruct',
    'Llama-3.1-8B',
    'Llama-3.1-8B-Instruct',    

    'Qwen2.5-0.5B',    
    'Qwen2.5-0.5B-Instruct',
    'Qwen2.5-7B',
    'Qwen2.5-7B-Instruct',    
    'Qwen2.5-14B',    
    'Qwen2.5-14B-Instruct',
]

In [32]:
rb = []
auc = []
mw_p = []
c_delta = []
cloze_gain = []
for m in models_ids_ordered:
    feat = f"surprisal_fixed_{m}"
    rb.append(corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())
    auc.append(corr_res_df[corr_res_df['feature']==feat].auc.item())
    mw_p.append(corr_res_df[corr_res_df['feature']==feat].mw_p.item())
    c_delta.append(corr_res_df[corr_res_df['feature']==feat].cliffs_delta.item())
    feat_cloze = f"surprisal_fixed_cloze_{m}"
    cloze_gain.append(corr_res_df[corr_res_df['feature']==feat_cloze].rank_biserial_r.item() - corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())

In [33]:
lai_df = pd.DataFrame({
    "model": models_ids_ordered,
    "rank_biserial": rb,
    "auc": auc,
    "mw_p": mw_p,
    "cliffs_delta": c_delta,
    "cloze_gain": cloze_gain
})
lai_df

Unnamed: 0,model,rank_biserial,auc,mw_p,cliffs_delta,cloze_gain
0,gpt2,0.276442,0.638221,0.0005745739,0.276442,-0.076553
1,gpt2-medium,0.361871,0.680936,6.550535e-06,0.361871,-0.118158
2,gpt2-large,0.397374,0.698687,7.415136e-07,0.397374,-0.142197
3,gpt2-xl,0.413646,0.706823,2.564672e-07,0.413646,-0.160133
4,Llama-3.2-1B,0.449519,0.72476,2.144836e-08,0.449519,-0.132581
5,Llama-3.2-1B-Instruct,0.453957,0.726979,1.556742e-08,0.453957,-0.024408
6,Llama-3.2-3B,0.450814,0.725407,1.954043e-08,0.450814,0.003143
7,Llama-3.2-3B-Instruct,0.393306,0.696653,9.609318e-07,0.393306,0.047152
8,Llama-3.1-8B,0.483358,0.741679,1.727791e-09,0.483358,0.025148
9,Llama-3.1-8B-Instruct,0.460244,0.730122,9.836398e-09,0.460244,0.060096


In [34]:
instruct_gain = []
for m in models_ids_ordered:
    feat = f"surprisal_fixed_{m}"
    feat_instruct = f"{feat}-Instruct"
    if feat_instruct in corr_res_df['feature'].values:
        instruct_gain.append(corr_res_df[corr_res_df['feature']==feat_instruct].rank_biserial_r.item() - corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())
    else:
        instruct_gain.append(None)

In [35]:
lai_2_df = pd.DataFrame({
    "model": models_ids_ordered,
    "instruct_gain": instruct_gain
})
lai_2_df

Unnamed: 0,model,instruct_gain
0,gpt2,
1,gpt2-medium,
2,gpt2-large,
3,gpt2-xl,
4,Llama-3.2-1B,0.004438
5,Llama-3.2-1B-Instruct,
6,Llama-3.2-3B,-0.057507
7,Llama-3.2-3B-Instruct,
8,Llama-3.1-8B,-0.023114
9,Llama-3.1-8B-Instruct,


# GPT-4o results

In [36]:
all_results = []
models_ids = []
for f in os.listdir("results"):
    if f.endswith(".parquet") and "GPT" in f:
        print(f"Loading {f}...")
        df_part = pd.read_parquet(os.path.join("results", f))
        all_results.append(df_part)
        models_ids.append(f.split("_")[-3])

Loading GPT-4o-METANOV_mod_openai-community_gpt2-xl_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_openai-community_gpt2_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_Qwen_Qwen2.5-0.5B_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_Qwen_Qwen2.5-0.5B-Instruct_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_meta-llama_Llama-3.1-8B_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_Qwen_Qwen2.5-7B_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_meta-llama_Llama-3.2-3B_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_Qwen_Qwen2.5-14B_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_meta-llama_Llama-3.2-1B-Instruct_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_meta-llama_Llama-3.1-8B-Instruct_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_meta-llama_Llama-3.2-1B_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_openai-community_gpt2-large_cloze_pimentel.parquet...
Loading GPT-4o-METANOV_mod_openai-community_gpt2-medium_cloze_pimentel.pa

In [37]:
models_ids

['gpt2-xl',
 'gpt2',
 'Qwen2.5-0.5B',
 'Qwen2.5-0.5B-Instruct',
 'Llama-3.1-8B',
 'Qwen2.5-7B',
 'Llama-3.2-3B',
 'Qwen2.5-14B',
 'Llama-3.2-1B-Instruct',
 'Llama-3.1-8B-Instruct',
 'Llama-3.2-1B',
 'gpt2-large',
 'gpt2-medium',
 'Llama-3.2-3B-Instruct',
 'Qwen2.5-14B-Instruct',
 'Qwen2.5-7B-Instruct']

In [38]:
all_results[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sentence               200 non-null    object 
 1   target_word            200 non-null    object 
 2   novelty_label          200 non-null    object 
 3   offsets                200 non-null    object 
 4   subtoken_ids           200 non-null    object 
 5   subtoken_strs          200 non-null    object 
 6   surprisal_buggy        200 non-null    float64
 7   surprisal_fixed        200 non-null    float64
 8   subtoken_ids_cloze     200 non-null    object 
 9   subtoken_strs_cloze    200 non-null    object 
 10  surprisal_buggy_cloze  200 non-null    float64
 11  surprisal_fixed_cloze  200 non-null    float64
dtypes: float64(4), object(8)
memory usage: 18.9+ KB


In [39]:
df_analysis = pd.DataFrame()
for k, df in enumerate(all_results):
    labels = []
    surp_buggy = []
    surp_fixed = []
    surp_buggy_cloze = []
    surp_fixed_cloze = []
    for i, row in df.iterrows():
        labels.append(row['novelty_label'])
        surp_buggy.append(row['surprisal_buggy'])
        surp_fixed.append(row['surprisal_fixed'])
        surp_buggy_cloze.append(row['surprisal_buggy_cloze'])
        surp_fixed_cloze.append(row['surprisal_fixed_cloze'])
    df_analysis['labels'] = labels
    df_analysis[f"surprisal_buggy_{models_ids[k]}"] = surp_buggy
    df_analysis[f"surprisal_fixed_{models_ids[k]}"] = surp_fixed
    df_analysis[f"surprisal_buggy_cloze_{models_ids[k]}"] = surp_buggy_cloze
    df_analysis[f"surprisal_fixed_cloze_{models_ids[k]}"] = surp_fixed_cloze

In [40]:
df_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 65 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   labels                                       200 non-null    object 
 1   surprisal_buggy_gpt2-xl                      200 non-null    float64
 2   surprisal_fixed_gpt2-xl                      200 non-null    float64
 3   surprisal_buggy_cloze_gpt2-xl                200 non-null    float64
 4   surprisal_fixed_cloze_gpt2-xl                200 non-null    float64
 5   surprisal_buggy_gpt2                         200 non-null    float64
 6   surprisal_fixed_gpt2                         200 non-null    float64
 7   surprisal_buggy_cloze_gpt2                   200 non-null    float64
 8   surprisal_fixed_cloze_gpt2                   200 non-null    float64
 9   surprisal_buggy_Qwen2.5-0.5B                 200 non-null    float64
 10  su

In [41]:
corr_res_df = get_corr_stats(df_analysis, binary=True, feats=[f"surprisal_buggy_{mid}" for mid in models_ids] + [f"surprisal_fixed_{mid}" for mid in models_ids] + [f"surprisal_buggy_cloze_{mid}" for mid in models_ids] + [f"surprisal_fixed_cloze_{mid}" for mid in models_ids], score_col="score", binary_col='labels')
corr_res_df

Unnamed: 0,feature,rank_biserial_r,auc,mw_p,cohens_d,cliffs_delta,cliffs_size
0,surprisal_buggy_gpt2-xl,0.5667,0.78335,4.449074e-12,1.130037,0.5667,large
1,surprisal_buggy_gpt2,0.4922,0.74610,1.832929e-09,0.988014,0.4922,large
2,surprisal_buggy_Qwen2.5-0.5B,0.3609,0.68045,1.043881e-05,0.612829,0.3609,medium
3,surprisal_buggy_Qwen2.5-0.5B-Instruct,0.3861,0.69305,2.408269e-06,0.671574,0.3861,medium
4,surprisal_buggy_Llama-3.1-8B,0.5372,0.76860,5.318944e-11,1.058400,0.5372,large
...,...,...,...,...,...,...,...
59,surprisal_fixed_cloze_gpt2-large,0.3782,0.68910,3.851729e-06,0.654295,0.3782,medium
60,surprisal_fixed_cloze_gpt2-medium,0.3168,0.65840,1.092350e-04,0.587034,0.3168,small
61,surprisal_fixed_cloze_Llama-3.2-3B-Instruct,0.5196,0.75980,2.199787e-10,0.945969,0.5196,large
62,surprisal_fixed_cloze_Qwen2.5-14B-Instruct,0.5244,0.76220,1.500308e-10,1.058946,0.5244,large


In [42]:
corr_res_df[(corr_res_df['feature'].str.contains("surprisal_fixed")) & (corr_res_df['feature'].str.contains("gpt"))]

Unnamed: 0,feature,rank_biserial_r,auc,mw_p,cohens_d,cliffs_delta,cliffs_size
16,surprisal_fixed_gpt2-xl,0.5871,0.79355,7.425936e-13,1.175363,0.5871,large
17,surprisal_fixed_gpt2,0.511,0.7555,4.330087e-10,1.029852,0.511,large
27,surprisal_fixed_gpt2-large,0.6289,0.81445,1.565953e-14,1.283981,0.6289,large
28,surprisal_fixed_gpt2-medium,0.5862,0.7931,8.04691e-13,1.158512,0.5862,large
48,surprisal_fixed_cloze_gpt2-xl,0.3062,0.6531,0.0001843084,0.507835,0.3062,small
49,surprisal_fixed_cloze_gpt2,0.3442,0.6721,2.624122e-05,0.57912,0.3442,medium
59,surprisal_fixed_cloze_gpt2-large,0.3782,0.6891,3.851729e-06,0.654295,0.3782,medium
60,surprisal_fixed_cloze_gpt2-medium,0.3168,0.6584,0.000109235,0.587034,0.3168,small


In [43]:
models_ids_ordered = [
    'gpt2',
    'gpt2-medium',
    'gpt2-large',
    'gpt2-xl',
        
    'Llama-3.2-1B',
    'Llama-3.2-1B-Instruct',
    'Llama-3.2-3B',
    'Llama-3.2-3B-Instruct',
    'Llama-3.1-8B',
    'Llama-3.1-8B-Instruct',    

    'Qwen2.5-0.5B',    
    'Qwen2.5-0.5B-Instruct',
    'Qwen2.5-7B',
    'Qwen2.5-7B-Instruct',    
    'Qwen2.5-14B',    
    'Qwen2.5-14B-Instruct',
]

In [44]:
rb = []
auc = []
mw_p = []
c_delta = []
cloze_gain = []
instruct_gain = []
for m in models_ids_ordered:
    feat = f"surprisal_fixed_{m}"
    rb.append(corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())
    auc.append(corr_res_df[corr_res_df['feature']==feat].auc.item())
    mw_p.append(corr_res_df[corr_res_df['feature']==feat].mw_p.item())
    c_delta.append(corr_res_df[corr_res_df['feature']==feat].cliffs_delta.item())
    feat_cloze = f"surprisal_fixed_cloze_{m}"
    cloze_gain.append(corr_res_df[corr_res_df['feature']==feat_cloze].rank_biserial_r.item() - corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())

In [45]:
gpt_t_df = pd.DataFrame({
    "model": models_ids_ordered,
    "rank_biserial": rb,
    "auc": auc,
    "mw_p": mw_p,
    "cliffs_delta": c_delta,
    "cloze_gain": cloze_gain
})
gpt_t_df

Unnamed: 0,model,rank_biserial,auc,mw_p,cliffs_delta,cloze_gain
0,gpt2,0.511,0.7555,4.330087e-10,0.511,-0.1668
1,gpt2-medium,0.5862,0.7931,8.04691e-13,0.5862,-0.2694
2,gpt2-large,0.6289,0.81445,1.565953e-14,0.6289,-0.2507
3,gpt2-xl,0.5871,0.79355,7.425936e-13,0.5871,-0.2809
4,Llama-3.2-1B,0.5077,0.75385,5.59887e-10,0.5077,-0.1569
5,Llama-3.2-1B-Instruct,0.4563,0.72815,2.49849e-08,0.4563,-0.1025
6,Llama-3.2-3B,0.5107,0.75535,4.432672e-10,0.5107,0.0071
7,Llama-3.2-3B-Instruct,0.4549,0.72745,2.755964e-08,0.4549,0.0647
8,Llama-3.1-8B,0.5568,0.7784,1.037689e-11,0.5568,0.127
9,Llama-3.1-8B-Instruct,0.5056,0.7528,6.588538e-10,0.5056,0.109


In [46]:
instruct_gain = []
for m in models_ids_ordered:
    feat = f"surprisal_fixed_{m}"
    feat_instruct = f"{feat}-Instruct"
    if feat_instruct in corr_res_df['feature'].values:
        instruct_gain.append(corr_res_df[corr_res_df['feature']==feat_instruct].rank_biserial_r.item() - corr_res_df[corr_res_df['feature']==feat].rank_biserial_r.item())
    else:
        instruct_gain.append(None)

In [47]:
gpt_t2_df = pd.DataFrame({
    "model": models_ids_ordered,
    "instruct_gain": instruct_gain
})
gpt_t2_df

Unnamed: 0,model,instruct_gain
0,gpt2,
1,gpt2-medium,
2,gpt2-large,
3,gpt2-xl,
4,Llama-3.2-1B,-0.0514
5,Llama-3.2-1B-Instruct,
6,Llama-3.2-3B,-0.0558
7,Llama-3.2-3B-Instruct,
8,Llama-3.1-8B,-0.0512
9,Llama-3.1-8B-Instruct,
