## General Look
This notebook builds a single summary table of the detailed experiment results (training energy, accuracy, energy per token, and OMperf) for every model:

| Category             | Model Name                | Training Energy Usage | Accuracy Test Results | Energy Per Token | OMperf Results | EffOverall |
|----------------------|---------------------------|------------------------|------------------------|------------------|----------------|------------|
| Feature              | Model No 1                |                        |                        |                  |                |            |
|                      | Model No 2                |                        |                        |                  |                |            |
|                      | Model No 3                |                        |                        |                  |                |            |
|                      | Model No 4                |                        |                        |                  |                |            |
|                      | Model No 5                |                        |                        |                  |                |            |
| Response             | Model No 1                |                        |                        |                  |                |            |
|                      | Model No 2                |                        |                        |                  |                |            |
|                      | Model No 3                |                        |                        |                  |                |            |
|                      | Model No 4                |                        |                        |                  |                |            |
|                      | Model No 5                |                        |                        |                  |                |            |
| Relation             | Model No 1                |                        |                        |                  |                |            |
|                      | Model No 2                |                        |                        |                  |                |            |
|                      | Model No 3                |                        |                        |                  |                |            |
|                      | Model No 4                |                        |                        |                  |                |            |
|                      | Model No 5                |                        |                        |                  |                |            |
| Base Model           | Llama3.1-70B-Instruct     | -                      |                        |                  |                |            |
|                      | Llama3.1-8B-Instruct      | -                      |                        |                  |                |            |
| Traditional Training | Llama3.1-8B-Instruct      | -                      |                        |                  |                |            |


In [20]:
# =======================
# CELL 1 - IMPORTS & DIRS
# =======================

import json
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

# Adjust this to your repo layout
# Root results directory. Later cells treat each sub-directory as a category
# and each of its sub-directories as one model's result folder.
# The relative path is resolved against the current working directory.
BASE_DIR = Path("./../../0EXPERIMENT_RESULTS").resolve()
# Fail fast with the resolved path if the directory is not where we expect it.
assert BASE_DIR.is_dir(), f"BASE_DIR not found: {BASE_DIR}"
# Bare expression: shown as the cell output in the notebook.
BASE_DIR


PosixPath('/mnt/DISCL/work/bsencer/Reproductivity-test/Energy-Aware-Knowledge-Distillation/0EXPERIMENT_RESULTS')

In [21]:
# =======================
# CELL 2 - TELEMETRY ENERGY HELPERS
# =======================

def parse_ts(ts: str) -> float:
    """Convert an ISO 8601 timestamp string to epoch seconds.

    A trailing "Z" (UTC designator) is rewritten to "+00:00" before parsing,
    because ``datetime.fromisoformat`` only accepts the "Z" form from
    Python 3.11 onward.

    Args:
        ts: ISO 8601 timestamp, e.g. "2024-01-01T00:00:00+00:00".

    Returns:
        Seconds since the Unix epoch as a float.

    Note:
        A timestamp without a UTC offset is parsed as a naive datetime, which
        ``.timestamp()`` interprets as local time.
    """
    if ts.endswith("Z"):
        ts = ts[:-1] + "+00:00"
    return datetime.fromisoformat(ts).timestamp()

def energy_from_power_integration(telemetry_path: Path):
    """
    Estimate total GPU energy from a JSONL telemetry file.

    Each non-blank line is one JSON record with a "timestamp" and an optional
    "gpus" list. For every record after the first, the summed "power_watts"
    of that record is multiplied by the time elapsed since the previous
    record; non-positive time steps contribute nothing.

    Returns:
      None when the file has no records, otherwise a dict with
      duration_s, E_gpu_J_power, E_gpu_Wh_power, E_gpu_kWh_power
    """
    joules = 0.0
    t_start = None
    t_prev = None

    with open(telemetry_path, "r") as fh:
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            sample = json.loads(raw)

            t_now = parse_ts(sample["timestamp"])
            if t_start is None:
                t_start = t_now

            # Sum the instantaneous draw of every GPU that reported a value.
            watts = 0.0
            for gpu in sample.get("gpus", []):
                reading = gpu.get("power_watts")
                if reading is None:
                    continue
                watts += float(reading)

            if t_prev is not None:
                elapsed = t_now - t_prev
                if elapsed > 0:
                    joules += watts * elapsed  # W*s = J

            t_prev = t_now

    if t_start is None:
        return None

    # After the loop t_prev holds the timestamp of the last record.
    watt_hours = joules / 3600.0
    return {
        "duration_s": t_prev - t_start,
        "E_gpu_J_power": joules,
        "E_gpu_Wh_power": watt_hours,
        "E_gpu_kWh_power": watt_hours / 1000.0,
    }

def energy_from_energy_counters(telemetry_path: Path, *, counter_units_per_joule: float = 1000.0):
    """
    Optional cross-check:
    Use the per-GPU energy counter field ("energy_mJ") and take the delta
    between the first and the last record of the telemetry file.

    Only GPUs present (by "gpu_index") in both records are counted, and a
    negative delta for a GPU is ignored.

    NOTE(review): in the run output recorded in this notebook the counter-based
    joules are about 1000x the power-integrated joules (e.g. 513326810.0 vs
    513595.5 for the same run). That suggests the logged field may not be in
    millijoules (possibly microjoules) - TODO confirm against the telemetry
    logger. Until then treat this as diagnostic; pass
    ``counter_units_per_joule=1e6`` to interpret the field as microjoules.

    Args:
      telemetry_path: JSONL file, one JSON record per non-blank line.
      counter_units_per_joule: how many counter units make one joule.
        The default 1000.0 treats the field as millijoules (original behaviour).

    Returns:
      None when the file has no records, otherwise a dict with
      E_gpu_J_counter, E_gpu_Wh_counter, E_gpu_kWh_counter

    Raises:
      ValueError: if counter_units_per_joule is not positive.
    """
    if counter_units_per_joule <= 0:
        raise ValueError(
            f"counter_units_per_joule must be positive, got {counter_units_per_joule!r}"
        )

    first_line = None
    last_line = None

    with open(telemetry_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if first_line is None:
                first_line = line
            last_line = line

    if first_line is None:
        return None

    # Only the two boundary records are needed, so only those are decoded.
    first = json.loads(first_line)
    last = json.loads(last_line)

    def energy_map(rec):
        """Map gpu_index -> counter value for GPUs that report both fields."""
        m = {}
        for g in rec.get("gpus", []):
            idx = g.get("gpu_index")
            emj = g.get("energy_mJ")
            if idx is not None and emj is not None:
                m[idx] = float(emj)
        return m

    e0 = energy_map(first)
    e1 = energy_map(last)

    total_delta = 0.0
    for idx in sorted(set(e0) & set(e1)):
        delta = e1[idx] - e0[idx]
        # A negative delta means the counter went backwards; skip that GPU.
        if delta >= 0:
            total_delta += delta

    total_J = total_delta / counter_units_per_joule
    total_Wh = total_J / 3600.0
    total_kWh = total_Wh / 1000.0

    return {
        "E_gpu_J_counter": total_J,
        "E_gpu_Wh_counter": total_Wh,
        "E_gpu_kWh_counter": total_kWh,
    }


In [22]:
# =======================
# CELL 3 - EVAL (ACCURACY) HELPERS
# =======================

# Task names looked up in each eval file; also the score column names used
# when the per-model rows are assembled.
TASKS = ["arc_challenge", "bbh", "hellaswag", "mmlu"]

# For each task, the metric name that counts as its headline score.
# _get_metric_value matches result keys against this name.
MAIN_METRIC_BASE = {
    "arc_challenge": "acc_norm",
    "bbh":           "exact_match",
    "hellaswag":     "acc_norm",
    "mmlu":          "acc",
}

def _get_metric_value(metrics_dict: dict, base_name: str):
    """Return the value of the metric called ``base_name`` as a float.

    Keys containing "stderr" are always ignored. A key matches exactly when
    the part before its first comma equals ``base_name`` (so "acc,none" matches
    "acc" but "acc_norm,none" does not). An exact match wins regardless of key
    order; otherwise the first key that merely starts with ``base_name`` is
    used, which keeps the previous prefix-matching behaviour as a fallback.

    Args:
        metrics_dict: mapping of metric key -> value for a single task.
        base_name: metric name to look for, e.g. "acc" or "acc_norm".

    Returns:
        The metric as a float, or None when no key matches.
    """
    prefix_key = None
    for key in metrics_dict:
        if "stderr" in key:
            continue
        if key.split(",", 1)[0] == base_name:
            return float(metrics_dict[key])
        # Remember only the first prefix match, mirroring the old lookup order.
        if prefix_key is None and key.startswith(base_name):
            prefix_key = key

    if prefix_key is None:
        return None
    return float(metrics_dict[prefix_key])

def extract_main_task_scores(eval_dict: dict):
    """Collect the headline score of every task in TASKS.

    Per-task blocks are read from ``eval_dict["results"]`` when that key
    exists, otherwise from ``eval_dict`` itself. A task whose block is missing
    or is not a dict gets None.

    Returns:
        dict mapping each task name in TASKS to a float score or None.
    """
    per_task = eval_dict.get("results", eval_dict)

    headline = {}
    for name in TASKS:
        entry = per_task.get(name)
        if isinstance(entry, dict):
            headline[name] = _get_metric_value(entry, MAIN_METRIC_BASE[name])
        else:
            headline[name] = None
    return headline

def load_eval_scores(eval_path: Path):
    """Read the JSON eval file at ``eval_path`` and return its per-task headline scores."""
    raw_text = eval_path.read_text()
    parsed = json.loads(raw_text)
    return extract_main_task_scores(parsed)


In [23]:
# =======================
# CELL 4 - EPT HELPERS
# =======================

def load_ept_file(ept_path: Path) -> dict:
    """Load an energy-per-token JSON file into a flat summary dict.

    Copies E_run_J, T_in, T_out and the three EPT_*_J_per_tok fields from the
    file (None for any that are absent) and adds EPT_total_Wh_per_tok, which is
    EPT_total_J_per_tok divided by 3600, or None when that field is missing.
    """
    payload = json.loads(ept_path.read_text())

    total_j_per_tok = payload.get("EPT_total_J_per_tok")
    if total_j_per_tok is None:
        total_wh_per_tok = None
    else:
        total_wh_per_tok = float(total_j_per_tok) / 3600.0  # J -> Wh

    passthrough = (
        "E_run_J",
        "T_in",
        "T_out",
        "EPT_in_J_per_tok",
        "EPT_out_J_per_tok",
    )
    summary = {field: payload.get(field) for field in passthrough}
    summary["EPT_total_J_per_tok"] = total_j_per_tok
    summary["EPT_total_Wh_per_tok"] = total_wh_per_tok
    return summary




In [29]:
# =======================
# CELL 5 - OMperf HELPERS
# =======================
# Eval file of the teacher model; its scores are the denominators of the
# per-task ratios computed by compute_omperf_i.
TEACHER_EVAL_PATH = BASE_DIR / "BASE" / "TEACHER" / "eval.json"

# Loaded once at cell execution time: dict of task name -> score (or None).
teacher_scores = load_eval_scores(TEACHER_EVAL_PATH)
# teacher_scores

def compute_omperf_i(row: pd.Series, teacher: dict, tasks: list[str]) -> float:
    """Mean student/teacher score ratio for one model row.

    For every task in ``tasks`` the row's score is divided by the teacher's
    score; the mean of those ratios is returned. A task is skipped when the
    row's score is missing/NaN, or when the teacher's score is missing, None,
    NaN or zero (no meaningful ratio exists).

    Args:
        row: one model's row; task scores are read by task name.
        teacher: mapping of task name -> teacher score.
        tasks: task names to include in the average.

    Returns:
        The mean ratio as a float, or NaN when no task produced a ratio.
    """
    ratios = []
    # Iterate the ``tasks`` argument (not the module-level TASKS) so the
    # caller actually controls which tasks are averaged.
    for task in tasks:
        student_score = row.get(task, np.nan)
        teacher_score = teacher.get(task)
        if pd.isna(student_score) or pd.isna(teacher_score) or teacher_score == 0:
            continue
        ratios.append(float(student_score) / float(teacher_score))

    return float(np.mean(ratios)) if ratios else np.nan


In [30]:
# =======================
# CELL 6 - BUILD DATAFRAMES
# =======================

energy_rows = []
eval_rows = []
ept_rows = []


def _rows_to_sorted_df(rows: list) -> pd.DataFrame:
    """Build a DataFrame from row dicts, sorted by (Category, Model).

    With no rows, return an empty frame that still has the two key columns:
    a column-less frame would make sort_values (and any later merge on these
    keys) raise KeyError.
    """
    if not rows:
        return pd.DataFrame(columns=["Category", "Model"])
    return pd.DataFrame(rows).sort_values(["Category", "Model"])


# Layout walked here: BASE_DIR/<category>/<model>/{telemetry.jsonl, eval.json, ept.json}
# Each artefact is optional; a model only gets a row in the frames whose file exists.
for category_dir in sorted(p for p in BASE_DIR.iterdir() if p.is_dir()):
    for model_dir in sorted(p for p in category_dir.iterdir() if p.is_dir()):
        # Identity columns shared by every row produced for this model.
        key_cols = {"Category": category_dir.name, "Model": model_dir.name}

        telemetry_path = model_dir / "telemetry.jsonl"
        if telemetry_path.exists():
            power_res = energy_from_power_integration(telemetry_path)
            counter_res = energy_from_energy_counters(telemetry_path)

            row = {**key_cols, "telemetry_path": str(telemetry_path)}
            # Either helper returns None for a telemetry file without records.
            if power_res:
                row.update(power_res)
            if counter_res:
                row.update(counter_res)
            energy_rows.append(row)

        eval_path = model_dir / "eval.json"
        if eval_path.exists():
            scores = load_eval_scores(eval_path)
            row = {**key_cols, "eval_path": str(eval_path)}
            row.update(scores)
            eval_rows.append(row)

        ept_path = model_dir / "ept.json"
        if ept_path.exists():
            ept = load_ept_file(ept_path)
            row = dict(key_cols)
            row.update(ept)
            ept_rows.append(row)

energy_df = _rows_to_sorted_df(energy_rows)
eval_df = _rows_to_sorted_df(eval_rows)
ept_df = _rows_to_sorted_df(ept_rows)

print("energy_df rows:", len(energy_df))
print("eval_df rows:", len(eval_df))
print("ept_df rows:", len(ept_df))

# Bare expression: shown as the cell output in the notebook.
energy_df.head(10), eval_df.head(10), ept_df.head(10)


energy_df rows: 15
eval_df rows: 15
ept_df rows: 15


(   Category    Model                                     telemetry_path  \
 0   FEATURE  MODEL 1  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 1   FEATURE  MODEL 2  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 2   FEATURE  MODEL 3  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 3   FEATURE  MODEL 4  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 4   FEATURE  MODEL 5  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 5  RELATION  MODEL 1  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 6  RELATION  MODEL 2  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 7  RELATION  MODEL 3  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 8  RELATION  MODEL 4  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 9  RELATION  MODEL 5  /mnt/DISCL/work/bsencer/Reproductivity-test/En...   
 
     duration_s  E_gpu_J_power  E_gpu_Wh_power  E_gpu_kWh_power  \
 0  1162.402112  513595.516758      142.665421         0.142665   
 1  1229.894308  539364.1912

In [31]:
# =======================
# CELL 7 - MERGE + PLOTTING-READY VIEWS
# =======================


# One row per (Category, Model): outer-join the energy, accuracy and
# energy-per-token frames so a model missing one artefact still appears.
general_df = (
    energy_df
    .merge(eval_df, on=["Category", "Model"], how="outer", suffixes=("", ""))
    .merge(ept_df, on=["Category", "Model"], how="outer")
    .sort_values(["Category", "Model"])
)

# Plain average of the task scores that are present for the row.
general_df["acc_avg"] = general_df[TASKS].mean(axis=1, skipna=True)

# Per-model score relative to the teacher.
general_df["OM_perf_i"] = general_df.apply(
    compute_omperf_i,
    axis=1,
    args=(teacher_scores, TASKS),
)

# Category-level value: mean of the per-model values that are not NaN.
omperf_by_category = (
    general_df[general_df["OM_perf_i"].notna()]
    .groupby("Category", as_index=False)["OM_perf_i"]
    .mean()
    .rename(columns={"OM_perf_i": "OM_perf"})
)

# Attach the category value to every row of that category.
general_df = pd.merge(general_df, omperf_by_category, on="Category", how="left")

general_df


Unnamed: 0,Category,Model,telemetry_path,duration_s,E_gpu_J_power,E_gpu_Wh_power,E_gpu_kWh_power,E_gpu_J_counter,E_gpu_Wh_counter,E_gpu_kWh_counter,...,E_run_J,T_in,T_out,EPT_in_J_per_tok,EPT_out_J_per_tok,EPT_total_J_per_tok,EPT_total_Wh_per_tok,acc_avg,OM_perf_i,OM_perf
0,BASE,TEACHER,,,,,,,,,...,,,,,,,,0.646619,1.0,1.0
1,FEATURE,MODEL 1,/mnt/DISCL/work/bsencer/Reproductivity-test/En...,1162.402112,513595.516758,142.665421,0.142665,513326810.0,142590.780556,142.590781,...,9626.843775,15582.0,6400.0,0.617818,1.504194,0.437942,0.000122,0.193905,0.265485,0.26491
2,FEATURE,MODEL 2,/mnt/DISCL/work/bsencer/Reproductivity-test/En...,1229.894308,539364.191274,149.823386,0.149823,538938889.0,149705.246944,149.705247,...,9714.22107,15582.0,6400.0,0.623426,1.517847,0.441917,0.000123,,,0.26491
3,FEATURE,MODEL 3,/mnt/DISCL/work/bsencer/Reproductivity-test/En...,1219.150956,535499.427027,148.749841,0.14875,535135080.0,148648.633333,148.648633,...,9832.732804,15582.0,6400.0,0.631031,1.536365,0.447308,0.000124,0.191112,0.260475,0.26491
4,FEATURE,MODEL 4,/mnt/DISCL/work/bsencer/Reproductivity-test/En...,1162.509435,513580.368879,142.661214,0.142661,513249652.0,142569.347778,142.569348,...,9852.72964,15582.0,6400.0,0.632315,1.539489,0.448218,0.000125,0.194659,0.266507,0.26491
5,FEATURE,MODEL 5,/mnt/DISCL/work/bsencer/Reproductivity-test/En...,1285.365638,554512.489703,154.031247,0.154031,554099396.0,153916.498889,153.916499,...,9887.997686,15582.0,6400.0,0.634578,1.545,0.449822,0.000125,0.194976,0.267171,0.26491
6,RELATION,MODEL 1,/mnt/DISCL/work/bsencer/Reproductivity-test/En...,642.48988,416814.770822,115.781881,0.115782,416287573.0,115635.436944,115.635437,...,9943.076487,15582.0,6341.0,0.638113,1.568061,0.453545,0.000126,0.644052,1.103316,1.092834
7,RELATION,MODEL 2,/mnt/DISCL/work/bsencer/Reproductivity-test/En...,779.190602,466093.113905,129.470309,0.12947,465883702.0,129412.139444,129.412139,...,10102.933549,15582.0,6400.0,0.648372,1.578583,0.4596,0.000128,0.642167,1.10943,1.092834
8,RELATION,MODEL 3,/mnt/DISCL/work/bsencer/Reproductivity-test/En...,578.980734,395151.625367,109.76434,0.109764,394944071.0,109706.686389,109.706686,...,10136.379975,15582.0,6341.0,0.650519,1.598546,0.462363,0.000128,0.641572,1.101994,1.092834
9,RELATION,MODEL 4,/mnt/DISCL/work/bsencer/Reproductivity-test/En...,584.919042,400535.056718,111.259738,0.11126,400245066.0,111179.185,111.179185,...,10199.401877,15582.0,6400.0,0.654563,1.593657,0.463989,0.000129,0.632201,1.087392,1.092834
