In [1]:
import json
import glob
from pathlib import Path
import numpy as np

# ------------------------------------------------------------------
# CONFIG: where each task's headline metric lives in the eval JSON
# ------------------------------------------------------------------
# Every entry maps  task -> (top-level JSON section, metric key),
# so a score is read as  data[section][task][metric_key]:
#   arc_challenge -> results["arc_challenge"]["acc,none"]
#   hellaswag     -> results["hellaswag"]["acc,none"]
#   bbh (group)   -> groups["bbh"]["exact_match,get-answer"]
#   mmlu (group)  -> groups["mmlu"]["acc,none"]
METRIC_CONFIG = {
    "arc_challenge": ("results", "acc,none"),
    "hellaswag":     ("results", "acc,none"),
    "bbh":           ("groups",  "exact_match,get-answer"),
    "mmlu":          ("groups",  "acc,none"),
}

# Task names in evaluation order (dicts preserve insertion order, so this
# is exactly the key order of METRIC_CONFIG above).
TASKS = list(METRIC_CONFIG)

def load_json(path):
    """
    Read a JSON file and return the decoded Python object.

    The file is always decoded as UTF-8 instead of the platform's locale
    encoding, so files containing non-ASCII text load the same way on
    every operating system.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file content
        (a dict for the eval files used in this notebook).

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Explicit encoding: without it, open() falls back to the locale's
    # preferred encoding, which is not UTF-8 on every platform.
    with Path(path).open("r", encoding="utf-8") as f:
        return json.load(f)

def extract_task_scores(eval_json_path, tasks=TASKS, metric_cfg=METRIC_CONFIG):
    """
    Return a dict: {task_name: score} for the given eval json.

    For every task, ``metric_cfg[task]`` supplies ``(section_name, metric_key)``
    and the score is read from ``data[section_name][task][metric_key]``.

    Parameters
    ----------
    eval_json_path : str or os.PathLike
        Path of the evaluation JSON file to read.
    tasks : iterable of str
        Task names to extract; each must be a key of ``metric_cfg``.
    metric_cfg : dict
        Maps task name -> (top-level JSON section, metric key).

    Returns
    -------
    dict
        ``{task_name: score}`` in the iteration order of ``tasks``.

    Raises
    ------
    KeyError
        If a task has no entry in ``metric_cfg``, or if the section, task or
        metric key is absent from the JSON. The message names the file and
        the lookup that failed so a bad eval file is easy to identify.
    """
    data = load_json(eval_json_path)
    scores = {}
    for task in tasks:
        try:
            section_name, metric_key = metric_cfg[task]
        except KeyError:
            raise KeyError(
                f"no metric configured for task {task!r} in metric_cfg"
            ) from None
        try:
            scores[task] = data[section_name][task][metric_key]
        except KeyError as err:
            # Same exception type as a bare lookup, but with enough context
            # to tell which file / JSON path is incomplete.
            raise KeyError(
                f"{eval_json_path}: cannot read "
                f"[{section_name!r}][{task!r}][{metric_key!r}] (missing key {err})"
            ) from err
    return scores


## OM_perf — Overall Performance Metric

We define the overall performance of a knowledge-distillation (KD) method as its teacher-normalized score, averaged across **5 student models** and **4 tasks**, using the following formula:

$$
\text{OM\_perf}
=
\frac{1}{N_{\text{models}}}
\sum_{i=1}^{N_{\text{models}}}
\left(
    \frac{1}{T}
    \sum_{t=1}^{T}
    \frac{s_{i,t}}{s^{(\text{teacher})}_t}
\right)
$$

Where:

- $T = 4$ — number of evaluation tasks
- $N_{\text{models}} = 5$ — number of distilled student models
- $s_{i,t}$ — score of student model $i$ on task $t$
- $s^{(\text{teacher})}_t$ — score of the teacher model on task $t$

### Interpretation

1. **Task averaging:**  
   For each student model $i$, we compute the average of the student/teacher score ratios across all 4 tasks (accuracy for ARC-Challenge, HellaSwag and MMLU; exact match for BBH).

2. **Model averaging:**  
   We then average these per-model scores across all 5 KD student models.

This gives a single scalar value measuring **how well the KD method performs relative to the teacher**, averaged across all tasks and student runs.


In [None]:
# ------------------------------------------------------------------
# PATHS you need to customize:
# ------------------------------------------------------------------

# Single teacher eval JSON
TEACHER_EVAL = "Base/eval/results/harness_meta-llama_Llama-3.1-70B-Instruct_20251113_144303_2025-11-14T01-20-58.355548.json"   # <- Base teacher evaluation path

# All KD eval JSONs for ONE KD method (e.g., response KD).
# The formula assumes 5 student runs, but every file matched by the pattern
# is averaged, whatever the count.
KD_EVAL_GLOB = "evals/resp_kd/*.json"   # <- change to your folder pattern

# Sorted so the per-model output order is stable across runs.
kd_eval_paths = sorted(glob.glob(KD_EVAL_GLOB))
print("KD eval files:", kd_eval_paths)

# Fail fast: with no student files the final average would otherwise end in
# an unexplained ZeroDivisionError at the bottom of the cell.
if not kd_eval_paths:
    raise FileNotFoundError(f"No KD eval JSON files match pattern {KD_EVAL_GLOB!r}")

# ------------------------------------------------------------------
# 1) Load teacher scores
# ------------------------------------------------------------------
teacher_scores = extract_task_scores(TEACHER_EVAL, TASKS, METRIC_CONFIG)
print("Teacher scores per task:")
for t in TASKS:
    print(f"  {t:15s}: {teacher_scores[t]:.4f}")

# Teacher scores are the denominators of every ratio below, so a zero score
# makes the metric undefined; report which task is affected.
zero_score_tasks = [task for task in TASKS if teacher_scores[task] == 0]
if zero_score_tasks:
    raise ValueError(
        f"Teacher score is 0 for task(s) {zero_score_tasks}; "
        "student/teacher ratios are undefined"
    )

# ------------------------------------------------------------------
# 2) Load KD scores for each model
# ------------------------------------------------------------------
# List of (eval file path, {task: score}) pairs, one per student model.
kd_models_scores = [
    (p, extract_task_scores(p, TASKS, METRIC_CONFIG)) for p in kd_eval_paths
]

# ------------------------------------------------------------------
# 3) Compute OM_perf per model and overall average
# ------------------------------------------------------------------
om_perf_per_model = []  # list of floats, aligned with kd_models_scores

for path, scores in kd_models_scores:
    # Student/teacher ratio for every task, then the mean over tasks.
    ratios = [scores[t] / teacher_scores[t] for t in TASKS]
    om_model = sum(ratios) / len(TASKS)     # average over tasks
    om_perf_per_model.append(om_model)
    print(f"OM_perf for model {Path(path).name}: {om_model:.4f}")

# Mean of the per-model values = OM_perf of the KD method.
om_perf_overall = sum(om_perf_per_model) / len(om_perf_per_model)
print("\n========================================")
# Report the number of models actually averaged instead of a fixed "5".
print(f"Overall OM_perf (KD method, avg over {len(om_perf_per_model)} models): {om_perf_overall:.4f}")
print("========================================")


In [None]:
import pandas as pd

# Per-model summary table: one row per KD eval file, holding its OM_perf plus
# the raw score and the student/teacher ratio for every task.
rows = []
for (eval_path, task_scores), om_val in zip(kd_models_scores, om_perf_per_model):
    per_task_columns = {}
    for task in TASKS:
        raw_score = task_scores[task]
        per_task_columns[f"{task}_score"] = raw_score
        per_task_columns[f"{task}_ratio"] = raw_score / teacher_scores[task]
    # Identification columns first, then the interleaved score/ratio columns.
    rows.append({"model_json": Path(eval_path).name, "OM_perf": om_val, **per_task_columns})

df = pd.DataFrame(rows)
display(df)

print("Overall OM_perf:", om_perf_overall)
