$$
Eff_{overal} = 
\frac{1}{K}
\sum_{i=1}^K
\left[
    OM_{perf} (S_i | R)
    \cdot
    \left(
        \frac{E_A^T}{E_A^{S_i}}
    \right)
\right]
$$

In [13]:
import json
import math
from pathlib import Path

import pandas as pd


In [14]:

TEACHER_EVAL_PATH = Path("../../logs/ept_logs/teacher_base_parallel/eval_results_2025-11-26T11-15-36.434403.json")  # <--- EDIT
TEACHER_EPT_JSON = Path("../../eval/ept/benchmark/results/ept_teacher_23901.json")

# Relation KD
REL_EVAL_DIR  = Path("../../results/relation/")   # EDIT
REL_EPT_DIR = Path("../../eval/ept/benchmark/relation_23898/")

# Response KD
RESP_EVAL_DIR  = Path("../../results/response/")  # EDIT
RESP_EPT_DIR = Path("../../eval/ept/benchmark/response_23899/")

# Feature KD
FEAT_EVAL_DIR  = Path("../../results/feature/")   # EDIT
FEAT_EPT_DIR = Path("../../eval/ept/benchmark/feature_23897/")

# Tasks used in OM_perf and Eff_overall
TASKS = ["arc_challenge", "bbh", "hellaswag", "mmlu"]


In [15]:
def load_eval_json_maybe_zip(path: Path) -> dict:
    """
    Load an LM-Eval JSON.

    - If 'path' is a .json: read it directly.
    - If 'path' is a .zip: open the zip, read the first .json inside.
    """
    if path.suffix == ".json":
        with path.open("r") as f:
            return json.load(f)

    if path.suffix == ".zip":
        import zipfile
        with zipfile.ZipFile(path, "r") as z:
            # take the first JSON file inside
            json_names = [n for n in z.namelist() if n.endswith(".json")]
            if not json_names:
                raise FileNotFoundError(f"No JSON file found inside {path}")
            with z.open(json_names[0], "r") as f:
                return json.load(f)

    raise ValueError(f"Unsupported eval file type: {path}")

In [16]:
def extract_main_task_scores(eval_dict: dict) -> dict:
    """
    Extract the 4 main task scores into a flat dict:

      - arc_challenge: acc_norm
      - bbh:           acc
      - hellaswag:     acc_norm
      - mmlu:          acc

    This assumes the LM-Eval JSON structure where metrics live under
    eval_dict["results"][task][metric_name]["value"] or ["acc"] style.
    Adjust if your schema is slightly different.
    """
    results = eval_dict.get("results", eval_dict)  # be a bit robust

    out = {}

    # ARC-Challenge
    arc = results["arc_challenge"]
    if isinstance(arc, dict):
        # LM-Eval usually: {"acc": ..., "acc_norm": ...}
        out["arc_challenge"] = arc.get("acc_norm", arc.get("acc"))

    # BBH
    bbh = results["bbh"]
    out["bbh"] = bbh.get("acc", bbh.get("accuracy", None))

    # HellaSwag
    hs = results["hellaswag"]
    out["hellaswag"] = hs.get("acc_norm", hs.get("acc"))

    # MMLU
    mmlu = results["mmlu"]
    out["mmlu"] = mmlu.get("acc", mmlu.get("accuracy", None))

    # Sanity check
    missing = [k for k, v in out.items() if v is None]
    if missing:
        raise KeyError(f"Missing metrics for tasks: {missing}")

    return out


In [17]:
def get_normalizer(eval_dict: dict) -> tuple[float, str]:
    """
    Try to automatically get a normalizer for energy:

      1) Prefer token-based fields (per-token metric):
         total_generated_tokens / total_tokens / num_eval_tokens / num_generated_tokens

      2) Fallback: total_evaluation_time_seconds (per-second metric)

    Returns:
        (value, kind) where kind is "tokens" or "seconds".
    """
    token_keys = [
        "total_generated_tokens",
        "total_tokens",
        "num_eval_tokens",
        "num_generated_tokens",
    ]

    for k in token_keys:
        if k in eval_dict:
            return float(eval_dict[k]), "tokens"

    if "total_evaluation_time_seconds" in eval_dict:
        return float(eval_dict["total_evaluation_time_seconds"]), "seconds"

    raise KeyError(
        "Could not find token count or total_evaluation_time_seconds in eval JSON.\n"
        "Add one of: total_generated_tokens / total_tokens / num_eval_tokens / "
        "num_generated_tokens / total_evaluation_time_seconds."
    )

In [18]:
def load_energy_mj_series(jsonl_path: Path, gpu_index: int | None = None) -> pd.Series:
    """
    Load cumulative energy_mJ from a monitor.py JSONL file.
    If gpu_index is None: sum across all GPUs.
    """
    vals = []

    with jsonl_path.open("r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            d = json.loads(line)
            gpus = d["gpus"]

            if gpu_index is None:
                total_mJ = sum(g["energy_mJ"] for g in gpus)
                vals.append(total_mJ)
            else:
                for g in gpus:
                    if g["gpu_index"] == gpu_index:
                        vals.append(g["energy_mJ"])
                        break

    return pd.Series(vals, name="energy_mJ")

In [19]:
def compute_E_run_J(energy_mJ_series: pd.Series) -> float:
    """
    Compute total energy for a run in Joules
    from cumulative energy_mJ series.
    """
    if energy_mJ_series.empty:
        return 0.0
    return (energy_mJ_series.iloc[-1] - energy_mJ_series.iloc[0]) / 1000.0

In [20]:
def compute_OM_perf(student_scores: dict,
                    teacher_scores: dict,
                    tasks = TASKS) -> float:
    """
    OM_perf(S|T) = sqrt( (1/M) * sum_t (A_t^S / A_t^T)^2 )

    Captures how well the student retains the teacher's ability across M tasks.
    """
    ratios_sq = []
    for t in tasks:
        A_s = student_scores[t]
        A_t = teacher_scores[t]
        ratios_sq.append((A_s / A_t) ** 2)

    M = len(tasks)
    return math.sqrt(sum(ratios_sq) / M)


In [21]:
# ---- Load teacher eval (from JSON or ZIP) ----
teacher_eval_dict = load_eval_json_maybe_zip(TEACHER_EVAL_PATH)
teacher_scores    = extract_main_task_scores(teacher_eval_dict)
teacher_norm_val, teacher_norm_kind = get_normalizer(teacher_eval_dict)

print("Teacher scores:", teacher_scores)
print("Teacher normalizer:", teacher_norm_val, f"({teacher_norm_kind})")

KeyError: "Missing metrics for tasks: ['arc_challenge', 'bbh', 'hellaswag', 'mmlu']"

In [None]:
# ---- Load teacher EPT benchmark and compute energy per token ----
# We now use the EPT benchmark summary JSON (token file) instead of raw monitor.py telemetry
# for the Eff_overall calculation.
# TEACHER_EPT_JSON should point to a single JSON file like:
# {
#   "E_run_J": ...,
#   "T_in": ...,
#   "T_out": ...,
#   "EPT_in_J_per_tok": ...,
#   "EPT_out_J_per_tok": ...,
#   "EPT_total_J_per_tok": ...
# }

with TEACHER_EPT_JSON.open("r") as f:
    teacher_ept = json.load(f)

teacher_E_run_J = teacher_ept["E_run_J"]
teacher_T_in    = teacher_ept.get("T_in")
teacher_T_out   = teacher_ept.get("T_out")

teacher_EPT_in_J_per_tok   = teacher_ept.get("EPT_in_J_per_tok")
teacher_EPT_out_J_per_tok  = teacher_ept.get("EPT_out_J_per_tok")
teacher_E_per_unit_J       = teacher_ept["EPT_total_J_per_tok"]  # J per token from EPT/token file

print(f"Teacher E_run_J (EPT): {teacher_E_run_J:.3f} J")
if teacher_T_in is not None and teacher_T_out is not None:
    print(f"Teacher tokens (EPT run): T_in={teacher_T_in}, T_out={teacher_T_out}, "
          f"total={teacher_T_in + teacher_T_out}")
print(f"Teacher EPT_total_J_per_tok: {teacher_E_per_unit_J:.6e} J/token")


In [None]:
def extract_tag_from_eval(path: Path) -> str:
    """
    Extract a stable experiment tag from an LM-Eval JSON filename.

    Example:
      23797_harness_meta-llama_Llama-3.1-8B-Instruct__20251113_2057_RelB_1n_20251123_185443_2025-11-24T07-44-58.217405.json
    -> tag: 20251113_2057_RelB_1n
    """
    stem = path.stem
    if "__" in stem:
        suffix = stem.split("__", 1)[1]
    else:
        suffix = stem

    parts = suffix.split("_")
    if len(parts) < 4:
        raise ValueError(f"Cannot extract tag from eval filename: {path.name}")
    tag = "_".join(parts[:4])
    return tag


def extract_tag_from_ept(path: Path) -> str:
    """
    Extract the same experiment tag from an EPT JSON filename.

    Example:
      ept_REL_1_20251113_2057_RelB_1n_23898.json
    -> tag: 20251113_2057_RelB_1n
    """
    stem = path.stem
    parts = stem.split("_")

    date_idx = None
    for i, p in enumerate(parts):
        if len(p) == 8 and p.isdigit():
            date_idx = i
            break

    if date_idx is None or len(parts) < date_idx + 4:
        raise ValueError(f"Cannot extract tag from EPT filename: {path.name}")

    tag_parts = parts[date_idx:date_idx + 4]
    tag = "_".join(tag_parts)
    return tag


def build_kd_df(eval_dir: Path, ept_dir: Path, kd_name: str) -> pd.DataFrame:
    """
    Build a dataframe of OM_perf, energy-per-token, and Eff_model
    for all student models in a given KD family (relation / response / feature).

    Matching is done via a shared experiment TAG, e.g. '20251113_2057_RelB_1n',
    which appears in BOTH the eval filename and the EPT filename, even though
    the full names differ.
    """
    eval_files = sorted(eval_dir.glob("*.json"))
    if not eval_files:
        print(f"[WARN] No eval JSON files found in {eval_dir}")
        return pd.DataFrame()

    ept_files = sorted(ept_dir.glob("*.json"))
    if not ept_files:
        print(f"[WARN] No EPT JSON files found in {ept_dir}")
        return pd.DataFrame()

    # Map tag -> EPT path
    ept_by_tag: dict[str, Path] = {}
    for ept_path in ept_files:
        try:
            tag = extract_tag_from_ept(ept_path)
            ept_by_tag[tag] = ept_path
        except ValueError as e:
            print(f"[WARN] {e}")
            continue

    rows: list[dict] = []

    for eval_path in eval_files:
        try:
            tag = extract_tag_from_eval(eval_path)
        except ValueError as e:
            print(f"[WARN] {e}")
            continue

        ept_path = ept_by_tag.get(tag)
        if ept_path is None:
            print(f"[WARN] No matching EPT file for eval {eval_path.name} with tag {tag}, skipping.")
            continue

        # --- Eval side ---
        eval_dict      = load_eval_json_maybe_zip(eval_path)
        student_scores = extract_main_task_scores(eval_dict)
        norm_val, norm_kind = get_normalizer(eval_dict)

        om_perf = compute_OM_perf(student_scores, teacher_scores, tasks=TASKS)

        # --- EPT side ---
        with ept_path.open("r") as f:
            ept = json.load(f)

        E_run_J  = ept["E_run_J"]
        T_in     = ept.get("T_in")
        T_out    = ept.get("T_out")
        EPT_in_J_per_tok   = ept.get("EPT_in_J_per_tok")
        EPT_out_J_per_tok  = ept.get("EPT_out_J_per_tok")
        E_per_unit_J       = ept["EPT_total_J_per_tok"]  # J/token

        energy_ratio_T_over_S = teacher_E_per_unit_J / E_per_unit_J
        Eff_model = om_perf * energy_ratio_T_over_S

        rows.append({
            "kd_type": kd_name,
            "model_name": eval_path.stem,
            "eval_file": str(eval_path),
            "ept_file": str(ept_path),
            "tag": tag,
            "normalizer_value": norm_val,
            "normalizer_kind": norm_kind,
            "OM_perf": om_perf,
            "E_run_J": E_run_J,
            "T_in": T_in,
            "T_out": T_out,
            "EPT_in_J_per_tok": EPT_in_J_per_tok,
            "EPT_out_J_per_tok": EPT_out_J_per_tok,
            "E_per_unit_J": E_per_unit_J,
            "energy_ratio_T_over_S": energy_ratio_T_over_S,
            "Eff_model": Eff_model,
        })

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)
    df.sort_values("Eff_model", ascending=False, inplace=True)
    return df


# Build dataframes for each KD family
relation_df = build_kd_df(REL_EVAL_DIR, REL_EPT_DIR, "relation")
response_df = build_kd_df(RESP_EVAL_DIR, RESP_EPT_DIR, "response")
feature_df  = build_kd_df(FEAT_EVAL_DIR, FEAT_EPT_DIR, "feature")

relation_df, response_df, feature_df


In [None]:
# ---- KD Matching Sanity Check ----

def sanity_check_kd_pairing(eval_dir: Path, ept_dir: Path, kd_name: str):
    print(f"\n=== Sanity Check for {kd_name.upper()} KD ===")
    
    eval_files = sorted(eval_dir.glob("*.json"))
    ept_files  = sorted(ept_dir.glob("*.json"))
    
    print(f"Found {len(eval_files)} eval files and {len(ept_files)} EPT files")
    
    # Build map tag -> ept path
    ept_map = {}
    for e in ept_files:
        try:
            tag = extract_tag_from_ept(e)
            ept_map[tag] = e
        except Exception as ex:
            print(f"[WARN] Could not parse EPT file {e.name}: {ex}")

    rows = []
    for ev in eval_files:
        try:
            tag = extract_tag_from_eval(ev)
        except Exception as ex:
            print(f"[WARN] Could not parse eval file {ev.name}: {ex}")
            continue
        
        matched = ept_map.get(tag)
        rows.append({
            "kd_type": kd_name,
            "eval_file": ev.name,
            "ept_file": matched.name if matched else "❌ NO MATCH",
            "tag": tag,
        })
    
    df_check = pd.DataFrame(rows)
    display(df_check)
    
    missing = df_check[df_check["ept_file"] == "❌ NO MATCH"]
    if not missing.empty:
        print("\n❌ Missing matches detected:")
        display(missing)
    else:
        print("\n✅ All eval files correctly matched to EPT files!")
    
    return df_check


check_relation = sanity_check_kd_pairing(REL_EVAL_DIR, REL_EPT_DIR, "relation")
check_response = sanity_check_kd_pairing(RESP_EVAL_DIR, RESP_EPT_DIR, "response")
check_feature  = sanity_check_kd_pairing(FEAT_EVAL_DIR, FEAT_EPT_DIR, "feature")


In [None]:
def build_kd_df(eval_dir: Path, ept_dir: Path, kd_name: str) -> pd.DataFrame:
    """
    Build a dataframe of OM_perf, energy-per-token, and Eff_model
    for all student models in a given KD family (relation / response / feature).

    We treat eval, telemetry, and token/EPT files as separate:
      - eval_dir: LM-eval JSON files (metrics per task)
      - ept_dir:  EPT/token JSON files (E_run_J, T_in, T_out, EPT_*_J_per_tok)
    Raw telemetry JSONLs can live separately (e.g., REL_TELEM_DIR) but are
    not needed for Eff_overall because E_run_J already appears in the EPT files.
    """
    eval_files = sorted(eval_dir.glob("*.json"))
    if not eval_files:
        print(f"[WARN] No eval JSON files found in {eval_dir}")
        return pd.DataFrame()

    rows: list[dict] = []

    for eval_path in eval_files:
        stem = eval_path.stem
        ept_path = ept_dir / f"{stem}.json"

        if not ept_path.exists():
            print(f"[WARN] No EPT/token file for {eval_path.name}, expected {ept_path.name}, skipping.")
            continue

        # --- Eval side: scores + normalizer (tokens or seconds if present) ---
        eval_dict      = load_eval_json_maybe_zip(eval_path)
        student_scores = extract_main_task_scores(eval_dict)
        norm_val, norm_kind = get_normalizer(eval_dict)

        # OM_perf across the 4 tasks
        om_perf = compute_OM_perf(student_scores, teacher_scores, tasks=TASKS)

        # --- EPT/token side: energy per token ---
        with ept_path.open("r") as f:
            ept = json.load(f)

        E_run_J  = ept["E_run_J"]
        T_in     = ept.get("T_in")
        T_out    = ept.get("T_out")
        EPT_in_J_per_tok   = ept.get("EPT_in_J_per_tok")
        EPT_out_J_per_tok  = ept.get("EPT_out_J_per_tok")
        E_per_unit_J       = ept["EPT_total_J_per_tok"]  # J per token

        # Teacher / Student energy-per-token ratio
        energy_ratio_T_over_S = teacher_E_per_unit_J / E_per_unit_J

        # Overall efficiency for this model:
        # high if (a) performance is close to teacher, and (b) student uses
        # fewer Joules per token than the teacher.
        Eff_model = om_perf * energy_ratio_T_over_S

        rows.append({
            "kd_type": kd_name,
            "model_name": stem,
            "eval_file": str(eval_path),
            "ept_file": str(ept_path),
            "normalizer_value": norm_val,
            "normalizer_kind": norm_kind,   # "tokens" or "seconds" from eval harness (if available)
            "OM_perf": om_perf,
            "E_run_J": E_run_J,
            "T_in": T_in,
            "T_out": T_out,
            "EPT_in_J_per_tok": EPT_in_J_per_tok,
            "EPT_out_J_per_tok": EPT_out_J_per_tok,
            "E_per_unit_J": E_per_unit_J,
            "energy_ratio_T_over_S": energy_ratio_T_over_S,
            "Eff_model": Eff_model,
        })

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)
    df.sort_values("Eff_model", ascending=False, inplace=True)
    return df


# Build dataframes for each KD family
relation_df = build_kd_df(REL_EVAL_DIR, REL_EPT_DIR, "relation")
response_df = build_kd_df(RESP_EVAL_DIR, RESP_EPT_DIR, "response")
feature_df  = build_kd_df(FEAT_EVAL_DIR, FEAT_EPT_DIR, "feature")

relation_df, response_df, feature_df


In [None]:
# Compute Eff_overall for each KD type
if relation_df is None or relation_df.empty:
    raise RuntimeError("No relation KD runs processed. Check eval / EPT paths and naming.")
if response_df is None or response_df.empty:
    print("[WARN] No response KD runs processed.")
if feature_df is None or feature_df.empty:
    print("[WARN] No feature KD runs processed.")

Eff_overall_relation = relation_df["Eff_model"].mean()
print(f"Eff_overall (Relation KD): {Eff_overall_relation:.4f}")

Eff_overall_response = None
Eff_overall_feature  = None

if not response_df.empty:
    Eff_overall_response = response_df["Eff_model"].mean()
    print(f"Eff_overall (Response KD): {Eff_overall_response:.4f}")

if not feature_df.empty:
    Eff_overall_feature = feature_df["Eff_model"].mean()
    print(f"Eff_overall (Feature KD): {Eff_overall_feature:.4f}")

# Optional: combined view of all models from all KD families
all_kd_df = pd.concat(
    [df for df in [relation_df, response_df, feature_df] if df is not None and not df.empty],
    ignore_index=True
)
all_kd_df
