In [None]:
import os
import pandas as pd


In [2]:
# Root directory of the results tree; the cells below look for per-model
# sub-folders inside it. The location can be overridden with the
# QA_MODEL_RESULTS_DIR environment variable so the notebook is not tied to a
# single user's home directory; when the variable is unset or empty the
# original path is used.
RESULTS_DIR = os.path.abspath(
    os.environ.get("QA_MODEL_RESULTS_DIR") or "/home/h5/albu670g/qa-model/results"
)


In [6]:
# Model name -> path of its validated MCQ file, located at
# <RESULTS_DIR>/<model>/codex_high_validated_mcq_<model>.tsv
validated_paths = {
    model: os.path.join(
        RESULTS_DIR,
        model,
        f"codex_high_validated_mcq_{model}.tsv",
    )
    for model in ("llama_3_8b_instruct", "mistral_7b_instruct")
}


In [7]:
def accuracy_from_validated(path: str) -> float:
    """Return the mean of the ``conclusion`` column of a tab-separated file.

    Args:
        path: Location of a TSV file that contains a ``conclusion`` column.

    Returns:
        The column mean as computed by pandas.

    Note:
        Presumably ``conclusion`` is a per-question correctness flag, which
        makes the mean an accuracy -- confirm against whatever writes these
        files.
    """
    validated = pd.read_csv(path, sep="\t")
    conclusion = validated["conclusion"]
    return conclusion.mean()

# One record per model: its name and the accuracy read from its validated file.
rows = [
    {"model": name, "accuracy": accuracy_from_validated(path)}
    for name, path in validated_paths.items()
]

# Best-scoring model first; the bare name on the last line displays the table.
results = pd.DataFrame(rows).sort_values("accuracy", ascending=False)
results


Unnamed: 0,model,accuracy
0,llama_3_8b_instruct,0.622912
1,mistral_7b_instruct,0.582339


In [None]:
import glob

# Codex submission file; its choices are used as the reference answers that
# the model submissions are compared against.
CODEX_GROUND_TRUTH = os.path.join(
    RESULTS_DIR,
    "codex",
    "mcq_codex_submission.tsv",
)

def _pick_choice(row):
    """Return the single option among A-D that is marked true in ``row``.

    A cell counts as marked when its string form, stripped of surrounding
    whitespace and lower-cased, equals ``"true"``.

    Args:
        row: Any mapping-like object indexable by "A", "B", "C" and "D".

    Returns:
        The letter of the marked option, or ``None`` when zero or more than
        one option is marked.
    """
    def _is_marked(option):
        return str(row[option]).strip().lower() == "true"

    # Every option is inspected so that ambiguous rows can be rejected.
    marked = list(filter(_is_marked, ("A", "B", "C", "D")))
    return marked[0] if len(marked) == 1 else None

def _load_predictions(path: str) -> pd.DataFrame:
    """Load a tab-separated submission and reduce it to one choice per row.

    Args:
        path: Location of a TSV file with an ``MCQID`` column and the option
            columns read by ``_pick_choice`` (A, B, C, D).

    Returns:
        A two-column frame: ``MCQID`` and ``pred_choice``, where
        ``pred_choice`` is the result of ``_pick_choice`` for that row.
    """
    submission = pd.read_csv(path, sep="\t")
    # Row-wise: collapse the option columns into a single letter (or None).
    choices = submission.apply(_pick_choice, axis=1)
    return submission.assign(pred_choice=choices)[["MCQID", "pred_choice"]]

# Reference answers: the Codex submission, with its chosen option stored as
# "codex_choice" so it can sit next to a model's "pred_choice" after a merge.
codex = (
    _load_predictions(CODEX_GROUND_TRUTH)
    .rename(columns={"pred_choice": "codex_choice"})
)

# Candidate submissions: mcq_submission*.tsv in any direct sub-folder of RESULTS_DIR.
model_files = glob.glob(
    os.path.join(RESULTS_DIR, "*", "mcq_submission*.tsv")
)

# Compare every model submission with the Codex reference answers.
rows = []
for path in sorted(model_files):
    # The folder that contains the submission file is used as the model name.
    model_name = os.path.basename(os.path.dirname(path))
    preds = _load_predictions(path)
    # Left merge keeps every reference question, including ones the model
    # file has no row for.
    merged = codex.merge(preds, on="MCQID", how="left")
    # Share of reference rows whose model choice equals the Codex choice;
    # rows without a model choice compare unequal and count as wrong.
    acc = (merged["pred_choice"] == merged["codex_choice"]).mean()
    # Rows without a usable model choice: the MCQID is absent from the model
    # file, or its row did not have exactly one option marked.
    missing = merged["pred_choice"].isna().sum()
    rows.append({"model": model_name, "file": path, "accuracy_vs_codex": acc, "missing": missing})

# Explicit columns keep the sort working when no submission file was found;
# without them an empty `rows` gives a frame with no columns and
# sort_values raises KeyError.
codex_results = pd.DataFrame(
    rows,
    columns=["model", "file", "accuracy_vs_codex", "missing"],
).sort_values("accuracy_vs_codex", ascending=False)
codex_results
