I am assuming we have access to two JSONL files:
- Student Outputs with the following fields:
    - `index`: Unique identifier for each question.
    - `output`: The model's response to the question.

- Hidden Test Set with the following fields:
    - `index`: Unique identifier for each question.
    - `task`: The name of the task (e.g., "mmlu_med").
    - `prompt`: The question prompt presented to the model.
    - `gold_answer`: The correct answer to the question.
    - `meta` (not needed): Additional metadata about the question, including its unique id in the dataset and other fields.

For this grading logic, we assume that the task name and other information for each question can be looked up by `index` in the hidden test set and joined with the student outputs on `index`.
""

In [14]:
import json
import os
from tqdm import tqdm
from typing import List, Dict, Any
from grader import (
    InfoBenchEvaluator,
    GraphEvaluator,
    MMLUEvaluator,
    ResponseParser,
    evaluate_single,
)
import nest_asyncio

nest_asyncio.apply()

In [15]:
def load_hidden_test(path: str) -> List[Dict[str, Any]]:
    """Load the hidden test set from a JSONL file.

    Args:
        path: Path to a JSONL file holding one JSON value per line.

    Returns:
        The parsed lines, in file order. Blank and whitespace-only lines
        are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which can mis-decode non-ASCII prompts on some systems.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def load_student_outputs(path: str) -> Dict[int, str]:
    """Load student outputs from a JSONL file into an index -> output mapping.

    Args:
        path: Path to a JSONL file whose non-blank lines are JSON objects
            with an "index" key and, optionally, an "output" key.

    Returns:
        A dict keyed by each line's "index" value. A missing or null
        "output" is stored as the empty string. If an index appears more
        than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a line has no "index" key.
    """
    outputs: Dict[int, str] = {}
    # Read explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which can mis-decode non-ASCII model output on some systems.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            output = item.get("output")
            # Treat an explicit JSON null like a missing key so callers
            # never receive None where a string is expected.
            outputs[item["index"]] = "" if output is None else output
    return outputs

In [16]:
def save_jsonl(data: list, path: str):
    """Write every element of ``data`` to ``path`` as one JSON document per line.

    The parent directory of ``path`` is created if it does not exist, and
    an existing file at ``path`` is overwritten.
    """
    parent_dir = os.path.dirname(path) or "."
    os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w") as out_file:
        # The generator is consumed lazily, so records are serialized and
        # written one after another in their original order.
        out_file.writelines(json.dumps(record) + "\n" for record in data)


def save_json(data: dict, path: str):
    """Serialize ``data`` to ``path`` as JSON indented by two spaces.

    The parent directory of ``path`` is created if it does not exist, and
    an existing file at ``path`` is overwritten.
    """
    target_dir = os.path.dirname(path)
    if not target_dir:
        # A bare filename has no directory component; use the current directory.
        target_dir = "."
    os.makedirs(target_dir, exist_ok=True)

    with open(path, "w") as out_file:
        json.dump(data, out_file, indent=2)

In [17]:
# NOTE: Temporary helper for printing metrics; remove later.


def print_metrics(metrics: dict):
    """Print a summary of ``metrics`` to stdout.

    Reads ``metrics["student_id"]``, ``metrics["overall_accuracy"]`` and,
    for every entry of ``metrics["task_metrics"]``, its ``"accuracy"`` and
    ``"count"`` values.
    """
    heavy_rule = "=" * 50
    light_rule = "-" * 50

    # Header
    print("\n" + heavy_rule)
    print(f"RESULTS: {metrics['student_id']}")
    print(heavy_rule)

    # One row per task, with the task name padded to 12 characters.
    for task_name, stats in metrics["task_metrics"].items():
        print(f"{task_name:12s}: {stats['accuracy']:.4f} ({stats['count']} examples)")

    # Footer with the overall figure.
    print(light_rule)
    print(f"{'OVERALL':12s}: {metrics['overall_accuracy']:.4f}")
    print(heavy_rule)


In [18]:
# ============================================================================
# METRICS
# ============================================================================
def calculate_metrics(results: list, student_id: str) -> dict:
    """Aggregate per-item scores into per-task and overall accuracy.

    Args:
        results: Items exposing a "task" and a "score" key. Only items whose
            task is "mmlu_med", "graph" or "infobench" contribute to the
            accuracies; every item still counts toward "total_examples".
        student_id: Identifier copied into the returned dict.

    Returns:
        A dict with the keys "student_id", "total_examples", "task_metrics"
        (one entry per task that has at least one score, each holding
        "count", "accuracy" and "total_score") and "overall_accuracy"
        (the mean over all contributing scores, or 0.0 when there are none).
    """
    buckets = {name: [] for name in ("mmlu_med", "graph", "infobench")}
    for record in results:
        bucket = buckets.get(record["task"])
        if bucket is not None:
            bucket.append(record["score"])

    per_task = {}
    pooled = []
    for name, scores in buckets.items():
        if not scores:
            continue
        total = sum(scores)
        per_task[name] = {
            "count": len(scores),
            "accuracy": total / len(scores),
            "total_score": total,
        }
        pooled += scores

    overall = sum(pooled) / len(pooled) if pooled else 0.0

    return {
        "student_id": student_id,
        "total_examples": len(results),
        "task_metrics": per_task,
        "overall_accuracy": overall,
    }

In [19]:
def run_eval(
    hidden_test: list, student_outputs: dict, infobench_evaluator: InfoBenchEvaluator
) -> list:
    """Evaluate every hidden-test item against the matching student output.

    Each item's "index" is looked up in ``student_outputs``; an item with no
    entry is evaluated with an empty string as the response. Returns the
    ``evaluate_single`` results in the order of ``hidden_test``.
    """
    progress = tqdm(hidden_test, desc="Evaluating")
    return [
        evaluate_single(
            position,
            item,
            student_outputs.get(item["index"], ""),
            infobench_evaluator,
        )
        for position, item in enumerate(progress)
    ]


# RUN EVALUATION

In [20]:
# API key for the OpenAI-backed evaluator; None when the variable is unset.
openai_key = os.environ.get("OPENAI_API_KEY")

In [26]:
# Transform the simulation summary into the .jsonl format required here.

run_name = "run15"
sim_summary_path = f"../request/{run_name}/simulation_summary.json"
output_path = f"student_outputs_{run_name}.jsonl"

# Read explicitly as UTF-8 instead of relying on the platform's default
# encoding, which can mis-decode non-ASCII model output on some systems.
with open(sim_summary_path, "r", encoding="utf-8") as f:
    sim_summary = json.load(f)

results = sim_summary["results"]

# Each item in `results` is a dict: {'prompt_idxs': [int, ...], 'response': {'choices': [{'index': int, 'text': str}, ...]}}
# The .jsonl format we need here has each line as: {'index': int, 'output': str}

student_outputs = {}
for item in results:
    prompt_idxs = item["prompt_idxs"]
    response = item["response"]
    # A null response, a response without a "choices" entry, and an empty
    # "choices" list are all treated as "no response" for the whole batch.
    choices = response.get("choices") if isinstance(response, dict) else None
    if not choices:
        for overall_index in prompt_idxs:
            student_outputs[overall_index] = "Error: no response"
        continue
    for choice in choices:
        # choice["index"] is the position within this batch; map it back to
        # the overall prompt index.
        choice_index = choice["index"]
        overall_index = prompt_idxs[choice_index]
        student_outputs[overall_index] = choice["text"]

# Save the student outputs to a .jsonl file.
with open(output_path, "w", encoding="utf-8") as f:
    for index, output in student_outputs.items():
        json_line = {"index": index, "output": output}
        f.write(json.dumps(json_line) + "\n")

In [27]:
# === Configuration ===
# Paths and identifiers for this evaluation run; `run_name` comes from the
# cell that converts the simulation summary.
HIDDEN_TEST_PATH = "combined_dataset_full.jsonl"
STUDENT_OUTPUT_PATH = f"student_outputs_{run_name}.jsonl"
OUTPUT_DIR = f"./eval_results_{run_name}"
STUDENT_ID = "test_student"
EVAL_MODEL = "gpt-5-nano-2025-08-07"

# Stop before loading any data when no API key is available.
if not openai_key:
    raise ValueError("OPENAI_API_KEY environment variable not set.")

# === Load data ===
print("Loading data...")
hidden_test = load_hidden_test(path=HIDDEN_TEST_PATH)
student_outputs = load_student_outputs(path=STUDENT_OUTPUT_PATH)

n_hidden = len(hidden_test)
n_student = len(student_outputs)
print(f"Hidden test size: {n_hidden}")
print(f"Student outputs: {n_student}")

Loading data...
Hidden test size: 300
Student outputs: 300


In [28]:
# === Initialize InfoBench Evaluator ===
print("\nInitializing InfoBench evaluator...")
# Constructed from the OpenAI API key and the evaluation model name (EVAL_MODEL).
infobench_evaluator = InfoBenchEvaluator(openai_key, EVAL_MODEL)

# Optional connectivity check, currently disabled.
# NOTE(review): `verify_connection` is only referenced in this commented-out
# code; confirm it exists on InfoBenchEvaluator before re-enabling.
# print("Verifying OpenAI connection...")
# if not infobench_evaluator.verify_connection():
#     raise RuntimeError("OpenAI connection failed - cannot proceed")
# print("OpenAI connection verified ✓")



Initializing InfoBench evaluator...


In [29]:
# === Run Evaluation ===
# NOTE(review): in the recorded run this cell processed 300 items almost
# instantly and the infobench accuracy came out as 0.0000 -- confirm that the
# InfoBench judge calls are actually being executed.
print(f"\nEvaluating: {STUDENT_ID}", "-" * 50, sep="\n")
results = run_eval(
    hidden_test=hidden_test,
    student_outputs=student_outputs,
    infobench_evaluator=infobench_evaluator,
)

# === Calculate Metrics ===
metrics = calculate_metrics(results=results, student_id=STUDENT_ID)


Evaluating: test_student
--------------------------------------------------


Evaluating: 100%|██████████| 300/300 [00:00<00:00, 85627.17it/s]


In [30]:
# === Save Results ===
# Per-item results go to a .jsonl file, aggregated metrics to a .json file,
# both under OUTPUT_DIR.
results_path = os.path.join(OUTPUT_DIR, f"{STUDENT_ID}_results_full.jsonl")
metrics_path = os.path.join(OUTPUT_DIR, f"{STUDENT_ID}_metrics_full.json")

save_jsonl(data=results, path=results_path)
save_json(data=metrics, path=metrics_path)

# === Print Summary ===
print_metrics(metrics=metrics)
print(f"\nResults saved to: {results_path}")
print(f"Metrics saved to: {metrics_path}")


RESULTS: test_student
mmlu_med    : 0.8500 (100 examples)
graph       : 0.9500 (100 examples)
infobench   : 0.0000 (100 examples)
--------------------------------------------------
OVERALL     : 0.6000

Results saved to: ./eval_results_run15/test_student_results_full.jsonl
Metrics saved to: ./eval_results_run15/test_student_metrics_full.json
