In [None]:
!pip install --upgrade datasets

import json
import math
import pandas as pd
import numpy as np
import pandas as pd
from collections import defaultdict

from huggingface_hub import login
from datasets import Dataset, DatasetDict, load_dataset
from scipy import stats

EMBEDDING CONTEXT VS. WITHOUT CONTEXT

In [17]:
# Fetch both result sets with `load_dataset`, then turn the "rawcases" entry
# of each into a pandas DataFrame for the analyses below.
dataset1 = load_dataset("Ramitha/unique-records-snippet-combination")
dataset2 = load_dataset("Ramitha/snippet-less-output-all-models")

df1 = pd.DataFrame(dataset1["rawcases"])
df2 = pd.DataFrame(dataset2["rawcases"])

In [18]:
# "With context" rows are the df1 records whose snippet_percentage equals 100;
# every df2 record is used as the "without context" condition.
df_with_context = df1.loc[df1["snippet_percentage"] == 100]
df_without_context = df2

df_combined = pd.concat([df_without_context, df_with_context])

# One ILRAlign score column per (context, model) pair.
model_cols = [
    # scored without context
    "ILRAlign_without_context_llama",
    "ILRAlign_without_context_falcon",
    "ILRAlign_without_context_gemma",
    "ILRAlign_without_context_mistral",
    # scored with the problem context only
    "ILRAlign_with_problem_context_only_llama",
    "ILRAlign_with_problem_context_only_falcon",
    "ILRAlign_with_problem_context_only_gemma",
    "ILRAlign_with_problem_context_only_mistral",
]
df_sub = df_combined[["dataset", "gold_standard_cos", *model_cols]]

# Long format: one row per (record, score column).
df_long = df_sub.melt(
    id_vars=["dataset", "gold_standard_cos"],
    var_name="model",
    value_name="score",
)


def _context_label(column_name):
    """Return the context tag carried by an ILRAlign column name."""
    if "without_context" in column_name:
        return "without_context"
    return "with_problem_context_only"


def _base_model_label(column_name):
    """Return the model tag carried by an ILRAlign column name ("mistral" as fallback)."""
    for candidate in ("llama", "falcon", "gemma"):
        if candidate in column_name:
            return candidate
    return "mistral"


df_long["context"] = df_long["model"].map(_context_label)
df_long["base_model"] = df_long["model"].map(_base_model_label)

def compute_corr(group):
    """Correlate ``score`` with ``gold_standard_cos`` for one group.

    Parameters
    ----------
    group : pandas.DataFrame
        Must provide the columns ``score`` and ``gold_standard_cos``.

    Returns
    -------
    pandas.Series
        Single entry ``correlation`` holding ``Series.corr`` of the two columns.
    """
    pearson_r = group["score"].corr(group["gold_standard_cos"])
    return pd.Series([pearson_r], index=["correlation"])

# Correlation of score vs. gold_standard_cos for every
# (dataset, base_model, context) combination.
# Only the two columns that compute_corr reads are passed to `apply`; applying
# on a frame that still contains the grouping columns is deprecated in recent
# pandas and produced the warning previously shown under this cell.
corr_table = (
    df_long
    .groupby(["dataset", "base_model", "context"])[["score", "gold_standard_cos"]]
    .apply(compute_corr)
    .reset_index()
)

# Wide layout: one row per (dataset, base_model), one column per context.
pivot_table = corr_table.pivot_table(
    index=["dataset", "base_model"],
    columns="context",
    values="correlation"
).reset_index()

pretty_table = pivot_table.set_index(["dataset", "base_model"])
pretty_table

  .apply(compute_corr)


Unnamed: 0_level_0,context,with_problem_context_only,without_context
dataset,base_model,Unnamed: 2_level_1,Unnamed: 3_level_1
alqa,falcon,0.887612,0.722925
alqa,gemma,0.837832,0.7408
alqa,llama,0.947742,0.898506
alqa,mistral,0.917195,0.775233
newsqa,falcon,0.666437,0.565388
newsqa,gemma,0.913844,0.814009
newsqa,llama,0.944567,0.859017
newsqa,mistral,0.877796,0.8855
sl,falcon,0.759538,0.601082
sl,gemma,0.705582,0.668835


In [19]:
# Mean correlation across base models for each dataset, with one column per
# context condition.
summary_table = (
    corr_table
    .groupby(["dataset", "context"], as_index=False)["correlation"]
    .mean()
    .pivot(index="dataset", columns="context", values="correlation")
    .reset_index()
)

summary_table

context,dataset,with_problem_context_only,without_context
0,alqa,0.897595,0.784366
1,newsqa,0.850661,0.780978
2,sl,0.845212,0.772508


In [20]:
def compute_corr_with_n(group):
    """Correlate ``score`` with ``gold_standard_cos`` and report the sample size used.

    ``Series.corr`` ignores rows where either value is missing, so ``n`` is
    counted on the same pairwise-complete rows. Counting ``len(group)`` instead
    would overstate the sample whenever a group contains NaN scores, which in
    turn understates the standard error of any Fisher z-test built on ``n``.

    Parameters
    ----------
    group : pandas.DataFrame
        Must provide the columns ``score`` and ``gold_standard_cos``.

    Returns
    -------
    pandas.Series
        ``correlation`` (NaN when undefined) and ``n`` (number of rows with
        both values present).
    """
    complete_pairs = group[["score", "gold_standard_cos"]].dropna()
    return pd.Series({
        "correlation": complete_pairs["score"].corr(complete_pairs["gold_standard_cos"]),
        "n": len(complete_pairs)
    })

# Correlation and sample size for every (dataset, base_model, context).
# Only the two columns that compute_corr_with_n reads are passed to `apply`;
# applying on a frame that still contains the grouping columns is deprecated in
# recent pandas and produced the warning previously shown under this cell.
corr_table = (
    df_long
    .groupby(["dataset", "base_model", "context"])[["score", "gold_standard_cos"]]
    .apply(compute_corr_with_n)
    .reset_index()
)

# NOTE: from here on df_without_context / df_with_context hold per-group
# correlation rows, not the raw record frames assigned to these names earlier.
df_without_context = corr_table[corr_table["context"] == "without_context"].copy()
df_with_context = corr_table[corr_table["context"] == "with_problem_context_only"].copy()

# Put the two conditions side by side: one row per (dataset, base_model) with
# *_without and *_with columns.
merged = pd.merge(
    df_without_context, df_with_context,
    on=["dataset", "base_model"],
    suffixes=("_without", "_with")
)

def fisher_z(r):
    """Return the Fisher z-transform of correlation ``r`` (``arctanh(r)``)."""
    return np.arctanh(r)

def compare_correlations(row):
    """Two-sided Fisher z-test for the difference between two correlations.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide ``correlation_without``, ``correlation_with``,
        ``n_without`` and ``n_with``.

    Returns
    -------
    pandas.Series
        The four inputs plus ``z_score``, ``p_value``, ``significant``
        (``p_value < 0.05``) and ``note``. ``z_score`` and ``p_value`` are NaN
        and ``significant`` is False whenever the test is undefined: a
        correlation of exactly +/-1, a NaN correlation, or a group with
        ``n <= 3``.

    Notes
    -----
    The standard error ``sqrt(1/(n1-3) + 1/(n2-3))`` is the independent-samples
    form. NOTE(review): confirm the two groups do not share records; otherwise
    a dependent-correlations test would be the appropriate one.
    """
    r1, r2 = row["correlation_without"], row["correlation_with"]
    n1, n2 = row["n_without"], row["n_with"]

    def _result(z, p, significant, note):
        # Single place that fixes the output schema for every branch.
        return pd.Series({
            "correlation_without": r1,
            "correlation_with": r2,
            "n_without": n1,
            "n_with": n2,
            "z_score": z,
            "p_value": p,
            "significant": significant,
            "note": note
        })

    if abs(r1) >= 1 or abs(r2) >= 1:
        # arctanh(+/-1) is infinite, so the statistic cannot be formed.
        return _result(np.nan, np.nan, False, "Perfect correlation (±1), Z undefined")

    if pd.isna(r1) or pd.isna(r2):
        return _result(np.nan, np.nan, False, "Correlation undefined (NaN)")

    if not (n1 > 3 and n2 > 3):
        # 1 / (n - 3) is infinite or negative for n <= 3 (also catches NaN n).
        return _result(np.nan, np.nan, False, "Need n > 3 in both groups, Z undefined")

    z1 = fisher_z(r1)
    z2 = fisher_z(r2)
    se = np.sqrt(1 / (n1 - 3) + 1 / (n2 - 3))
    z = (z1 - z2) / se
    # Survival function instead of 1 - cdf: the latter cancels to exactly 0
    # once |z| is large, hiding the magnitude of very small p-values.
    p = 2 * stats.norm.sf(abs(z))

    return _result(z, p, p < 0.05, "")

# Run the Fisher z comparison on every (dataset, base_model) row, then put the
# identifying columns in front of the test output.
results = merged.apply(compare_correlations, axis="columns")
final_results = pd.concat(
    [merged[["dataset", "base_model"]], results],
    axis="columns",
)
final_results

  .apply(compute_corr_with_n)


Unnamed: 0,dataset,base_model,correlation_without,correlation_with,n_without,n_with,z_score,p_value,significant,note
0,alqa,falcon,0.722925,0.887612,400.0,400.0,-6.99957,2.567502e-12,True,
1,alqa,gemma,0.7408,0.837832,400.0,400.0,-3.685742,0.0002280372,True,
2,alqa,llama,0.898506,0.947742,400.0,400.0,-4.856532,1.194596e-06,True,
3,alqa,mistral,0.775233,0.917195,400.0,400.0,-7.576421,3.552714e-14,True,
4,newsqa,falcon,0.565388,0.666437,400.0,400.0,-2.30479,0.02117836,True,
5,newsqa,gemma,0.814009,0.913844,400.0,400.0,-5.798398,6.695157e-09,True,
6,newsqa,llama,0.859017,0.944567,400.0,400.0,-6.892696,5.47451e-12,True,
7,newsqa,mistral,0.8855,0.877796,400.0,400.0,0.487573,0.6258524,False,
8,sl,falcon,0.601082,0.759538,400.0,400.0,-4.230675,2.329914e-05,True,
9,sl,gemma,0.668835,0.705582,400.0,400.0,-0.982004,0.3260977,False,


TEMPERATURE BREAKDOWN EXPERIMENTS

In [None]:
# Same two datasets as loaded at the top of the notebook; reloaded here
# (presumably so this section can be run on its own).
dataset1 = load_dataset("Ramitha/unique-records-snippet-combination")
df1 = pd.DataFrame(dataset1["rawcases"])
dataset2 = load_dataset("Ramitha/snippet-less-output-all-models")
df2 = pd.DataFrame(dataset2["rawcases"])

df_with_context = df1[df1["snippet_percentage"] == 100]
df_without_context = df2
df_combined = pd.concat([df_with_context, df_without_context], ignore_index=True)

# Keep only the ILRAlign_* columns that carry a context tag and a model tag the
# labelling below recognises. Without this filter any other ILRAlign_* column
# would be silently labelled "with_problem_context_only" / "mistral" by the
# fallback branches and mixed into those groups.
known_contexts = ("without_context", "with_problem_context_only")
known_models = ("llama", "falcon", "gemma", "mistral")
model_cols = [
    c for c in df_combined.columns
    if c.startswith("ILRAlign_")
    and any(tag in c for tag in known_contexts)
    and any(tag in c for tag in known_models)
]

# Long format: one row per (record, score column), keeping the temperature.
df_long = df_combined.melt(
    id_vars=["dataset", "gold_standard_cos", "temperature"],
    value_vars=model_cols,
    var_name="model",
    value_name="score"
)

df_long["context"] = df_long["model"].apply(
    lambda x: "without_context" if "without_context" in x else "with_problem_context_only"
)
df_long["base_model"] = df_long["model"].apply(
    lambda x: "llama" if "llama" in x else
              "falcon" if "falcon" in x else
              "gemma" if "gemma" in x else
              "mistral"
)

# Rows without a score for a given column carry no information for it.
df_long = df_long.dropna(subset=["score"])

def assign_temp_bin(subdf):
    """Label each row's temperature as "low", "mid" or "high" by tertile.

    The cut points are the 1/3 and 2/3 quantiles of ``subdf["temperature"]``:
    values up to the lower quantile are "low", values up to the upper quantile
    are "mid", the rest are "high". When fewer than three distinct
    temperatures are present, every row is labelled "mid".

    Rows whose temperature is missing stay missing (NaN) instead of falling
    through both comparisons and being labelled "high".

    Parameters
    ----------
    subdf : pandas.DataFrame
        Must provide a ``temperature`` column.

    Returns
    -------
    pandas.Series
        Labels indexed like ``subdf``.
    """
    temps = subdf["temperature"]
    if temps.nunique() < 3:
        labels = pd.Series(["mid"] * len(subdf), index=subdf.index)
    else:
        q_low = temps.quantile(1/3)
        q_high = temps.quantile(2/3)
        # Conditions are checked in order, so "mid" means q_low < t <= q_high.
        labels = pd.Series(
            np.select(
                [(temps <= q_low).to_numpy(), (temps <= q_high).to_numpy()],
                ["low", "mid"],
                default="high",
            ),
            index=subdf.index,
            name=temps.name,
            dtype=object,
        )
    # Comparisons with NaN are False, so mask missing temperatures explicitly.
    return labels.where(temps.notna())

# Bin temperatures separately inside every (dataset, base_model, context)
# group. Each per-group result keeps the original row index, so concatenating
# them and assigning aligns on index. This avoids `groupby.apply` on the
# grouping columns (deprecated in recent pandas) and does not depend on how
# `apply` reshapes its output.
bin_keys = ["dataset", "base_model", "context"]
binned_parts = [assign_temp_bin(part) for _, part in df_long.groupby(bin_keys)]
df_long["temp_bin"] = (
    pd.concat(binned_parts)
    if binned_parts
    else pd.Series(index=df_long.index, dtype=object)
)

# Correlation of score vs. gold_standard_cos inside every temperature bin.
group_cols = ["dataset", "base_model", "context", "temp_bin"]
corr_list = []
for keys, group in df_long.groupby(group_cols):
    # `corr` uses pairwise-complete rows only; report n on the same basis.
    valid_pairs = group[["score", "gold_standard_cos"]].dropna()
    corr = valid_pairs["score"].corr(valid_pairs["gold_standard_cos"])
    corr_list.append({
        "dataset": keys[0],
        "base_model": keys[1],
        "context": keys[2],
        "temperature": keys[3],  # holds the bin label, not a raw temperature
        "correlation": corr,
        "n": len(valid_pairs)
    })
corr_df = pd.DataFrame(corr_list)

pivot_tbl = corr_df.pivot_table(
    index=["dataset", "base_model", "context"],
    columns="temperature",
    values="correlation"
).reset_index()

# A bin that never occurs has no column after pivoting; add it as missing
# values so the column selection below always succeeds.
for col in ["low", "mid", "high"]:
    if col not in pivot_tbl.columns:
        pivot_tbl[col] = np.nan

pivot_tbl = pivot_tbl[["dataset", "base_model", "context", "low", "mid", "high"]]
pretty = pivot_tbl.set_index(["dataset", "base_model", "context"])

pretty


temperature                                        low       mid      high
dataset base_model context                                                
alqa    falcon     with_problem_context_only  0.806630  0.871037  0.914028
                   without_context            0.591144  0.637479  0.747607
        gemma      with_problem_context_only  0.764217  0.666492  0.708016
                   without_context            0.711607  0.397696  0.469556
        llama      with_problem_context_only  0.858059  0.956672  0.974975
                   without_context            0.774305  0.831182  0.900002
        mistral    with_problem_context_only  0.677096  0.758566  0.906369
                   without_context            0.708803  0.565460  0.741699
newsqa  falcon     with_problem_context_only  0.807877  0.625030  0.898849
                   without_context            0.727865  0.496250  0.576811
        gemma      with_problem_context_only  0.064951 -0.138586  0.976255
                   withou

  .apply(assign_temp_bin)
