In [None]:
!pip install --upgrade datasets

import json
import math
import pandas as pd
import numpy as np
import pandas as pd
from collections import defaultdict

from huggingface_hub import login
from datasets import Dataset, DatasetDict, load_dataset
from scipy import stats

GENERATION WITH CONTEXT VS. WITHOUT

In [13]:
# Download the dataset, turn its "rawcases" entry into a DataFrame and leave out
# the four "problem context only" ILRAlign columns.
dataset = load_dataset("Ramitha/unique-records-snippet-combination")
df = pd.DataFrame(dataset["rawcases"]).drop(
    columns=[
        "ILRAlign_with_problem_context_only_llama",
        "ILRAlign_with_problem_context_only_falcon",
        "ILRAlign_with_problem_context_only_gemma",
        "ILRAlign_with_problem_context_only_mistral",
    ]
)

In [14]:
# One "without context" ILRAlign score column per base model.
model_cols = [
    "ILRAlign_without_context_llama",
    "ILRAlign_without_context_falcon",
    "ILRAlign_without_context_gemma",
    "ILRAlign_without_context_mistral",
]

# Keep only the identifying columns plus the per-model score columns.
df_sub = df.loc[:, ["dataset", "snippet_percentage", "gold_standard_cos", *model_cols]]

# Wide -> long: one row per (original row, model) with the score in a single column.
df_long = pd.melt(
    df_sub,
    id_vars=["dataset", "snippet_percentage", "gold_standard_cos"],
    value_vars=model_cols,
    var_name="base_model",
    value_name="ILRAlign_score",
)

# Strip the shared column prefix so `base_model` holds just the model name.
df_long["base_model"] = df_long["base_model"].str.replace("ILRAlign_without_context_", "")

def compute_corr(group):
    """Correlate the ``ILRAlign_score`` and ``gold_standard_cos`` columns of ``group``.

    Returns a one-element Series keyed ``"correlation"`` so that
    ``groupby(...).apply`` expands it into a column of that name.
    """
    scores = group["ILRAlign_score"]
    gold = group["gold_standard_cos"]
    return pd.Series({"correlation": scores.corr(gold)})

# Correlation for every (dataset, base_model, snippet_percentage) group.
# Only the two score columns are handed to `compute_corr`: letting
# `groupby(...).apply` operate on the grouping columns themselves is deprecated
# in pandas and triggers a warning, and `compute_corr` never reads them anyway.
corr_table = (
    df_long.groupby(["dataset", "base_model", "snippet_percentage"])[
        ["ILRAlign_score", "gold_standard_cos"]
    ]
    .apply(compute_corr)
    .reset_index()
)

# Rows: (base_model, snippet_percentage); columns: one per dataset.
pivot_table = corr_table.pivot_table(
    index=["base_model", "snippet_percentage"],
    columns="dataset",
    values="correlation"
).sort_index(level=["base_model", "snippet_percentage"])

pivot_table

  .apply(compute_corr)


Unnamed: 0_level_0,dataset,alqa,newsqa,sl
base_model,snippet_percentage,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
falcon,0,0.873095,0.778709,0.761182
falcon,25,0.907755,0.682617,0.753227
falcon,50,0.867485,0.650496,0.790188
falcon,75,0.934951,0.731213,0.73932
falcon,100,0.886178,0.629811,0.716722
gemma,0,0.799492,0.918992,0.778577
gemma,25,0.776362,0.942538,0.773888
gemma,50,0.74444,0.951503,0.817797
gemma,75,0.796942,0.93708,0.751949
gemma,100,0.767854,0.919336,0.73002


In [15]:
# NOTE(review): this rebuilds the same `model_cols` / `df_sub` / `df_long` as the
# previous cell; kept so that this cell can be run on its own.
model_cols = [
    "ILRAlign_without_context_llama",
    "ILRAlign_without_context_falcon",
    "ILRAlign_without_context_gemma",
    "ILRAlign_without_context_mistral",
]

# Identifying columns followed by the per-model score columns.
df_sub = df.loc[:, ["dataset", "snippet_percentage", "gold_standard_cos", *model_cols]]

# Reshape to long form: one row per (original row, model).
df_long = pd.melt(
    df_sub,
    id_vars=["dataset", "snippet_percentage", "gold_standard_cos"],
    value_vars=model_cols,
    var_name="base_model",
    value_name="ILRAlign_score",
)

# Reduce each former column name to the bare model name.
df_long["base_model"] = df_long["base_model"].str.replace("ILRAlign_without_context_", "")

# NOTE(review): `compute_corr` is also defined in the previous cell with the same
# behavior; this redefinition shadows it.
def compute_corr(group):
    """Return ``{"correlation": r}`` as a Series for one group.

    ``r`` is the correlation between the group's ``ILRAlign_score`` and
    ``gold_standard_cos`` columns.
    """
    correlation = group["ILRAlign_score"].corr(group["gold_standard_cos"])
    return pd.Series({"correlation": correlation})

# Correlation for every (dataset, base_model, snippet_percentage) group.
# The score columns are selected before `apply` because operating on the
# grouping columns inside `groupby(...).apply` is deprecated in pandas.
corr_table = (
    df_long.groupby(["dataset", "base_model", "snippet_percentage"])[
        ["ILRAlign_score", "gold_standard_cos"]
    ]
    .apply(compute_corr)
    .reset_index()
)

# Average the per-model correlations, then lay the result out with one row per
# snippet percentage (ascending) and one column per dataset.
mean_corr_table = (
    corr_table.groupby(["dataset", "snippet_percentage"])["correlation"]
    .mean()
    .reset_index()
    .pivot(index="snippet_percentage", columns="dataset", values="correlation")
    .sort_index()
)

mean_corr_table

  .apply(compute_corr)


dataset,alqa,newsqa,sl
snippet_percentage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.832554,0.831446,0.834433
25,0.84734,0.836911,0.835237
50,0.83253,0.840207,0.867312
75,0.868897,0.856062,0.824781
100,0.84196,0.826873,0.822456


In [None]:
def compute_corr_with_n(group):
    """Correlate ``ILRAlign_score`` with ``gold_standard_cos`` and report the sample size.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain the columns ``ILRAlign_score`` and ``gold_standard_cos``.

    Returns
    -------
    pandas.Series
        ``"correlation"``: correlation over the rows where both values are present.
        ``"n"``: number of those rows.
    """
    # `Series.corr` ignores rows where either value is missing, so `n` has to
    # count the same complete pairs; `len(group)` would overstate the sample
    # size whenever NaNs are present.
    pairs = group[["ILRAlign_score", "gold_standard_cos"]].dropna()
    return pd.Series({
        "correlation": pairs["ILRAlign_score"].corr(pairs["gold_standard_cos"]),
        "n": len(pairs)
    })

# Correlation and sample size for every (dataset, base_model, snippet_percentage)
# group.  The score columns are selected before `apply` because operating on the
# grouping columns inside `groupby(...).apply` is deprecated in pandas.
corr_table = (
    df_long.groupby(["dataset", "base_model", "snippet_percentage"])[
        ["ILRAlign_score", "gold_standard_cos"]
    ]
    .apply(compute_corr_with_n)
    .reset_index()
)

# Baseline rows (snippet_percentage == 0), with columns renamed so they stay
# distinguishable after being merged with another percentage's rows.
df_0 = (
    corr_table[corr_table["snippet_percentage"] == 0]
    .rename(columns={"correlation": "correlation_0", "n": "n_0"})
)

def fisher_z(r):
    return np.arctanh(r)

def test_significance(r0, rX, n0, nX):
    if abs(r0) >= 1 or abs(rX) >= 1:
        return False
    z0 = fisher_z(r0)
    zX = fisher_z(rX)
    se = np.sqrt(1 / (n0 - 3) + 1 / (nX - 3))
    z = (z0 - zX) / se
    p = 2 * (1 - stats.norm.cdf(abs(z)))
    return p < 0.05

from functools import reduce

# For each non-zero snippet percentage, test every (base_model, dataset) pair's
# correlation against its baseline in `df_0`.
result_frames = []
for perc in [25, 50, 75, 100]:
    df_x = (
        corr_table[corr_table["snippet_percentage"] == perc]
        .rename(columns={"correlation": "correlation_x", "n": "n_x"})
    )

    # Inner merge: only pairs present at both the baseline and this percentage.
    merged = pd.merge(df_0, df_x, on=["base_model", "dataset"])

    # The result column is named after the percentage directly, so the final
    # table needs no positional renaming.  Building the values as a list also
    # works when `merged` has no rows.
    merged[str(perc)] = [
        test_significance(r0, rx, n0, nx)
        for r0, rx, n0, nx in zip(
            merged["correlation_0"],
            merged["correlation_x"],
            merged["n_0"],
            merged["n_x"],
        )
    ]

    result_frames.append(merged[["base_model", "dataset", str(perc)]])

# Join the per-percentage results into one table with columns
# base_model, dataset, "25", "50", "75", "100".
final = reduce(
    lambda left, right: pd.merge(left, right, on=["base_model", "dataset"]),
    result_frames
)

final

  .apply(compute_corr_with_n)


Unnamed: 0,base_model,dataset,25,50,75,100
0,falcon,alqa,False,False,True,False
1,gemma,alqa,False,False,False,False
2,llama,alqa,False,False,False,False
3,mistral,alqa,False,True,True,True
4,falcon,newsqa,True,True,False,True
5,gemma,newsqa,False,True,False,False
6,llama,newsqa,False,True,True,False
7,mistral,newsqa,True,False,False,True
8,falcon,sl,False,False,False,False
9,gemma,sl,False,False,False,False
