In [2]:
import json
import os
import time
import logging
from pathlib import Path

from preprocessing import DataPreprocessor
from server import SingleVariantServer
from load_generator import ClosedLoopLoadGenerator
from metrics import MetricsCalculator
from evaluation import HeldOutEvaluator

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

In [8]:
# -------------------------
# Configuration
# -------------------------

PREPROCESS = False

DATA_DIR = "data/raw"
PROCESSED_DIR = "data/processed"

MODEL_NAME_OR_PATH = "meta-llama/Llama-2-7b-chat-hf"
# OR local path (we'll discuss this below)
# MODEL_NAME_OR_PATH = "/mnt/models/llama-2-7b-chat"

DEVICE = "mps"
DTYPE = "auto"

NUM_REQUESTS = 5000
CONCURRENCIES = [1, 2, 4, 8, 16, 32]

DATA_SUBSET = 0  # 0 = full data

OUTPUT_DIR = "results/baseline_med"
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [9]:
if PREPROCESS:
    logger.info("[STEP 0] Preprocessing data")

    preprocessor = DataPreprocessor(
        data_dir=DATA_DIR,
        output_dir=PROCESSED_DIR
    )
    train_data, val_data, test_data = preprocessor.run_pipeline()

In [10]:
def load_data(data_dir):
    splits = {"train": [], "val": [], "test": []}

    for split in splits:
        path = os.path.join(data_dir, f"{split}_data.jsonl")
        if not os.path.exists(path):
            logger.warning(f"Missing {path}")
            continue

        with open(path) as f:
            for line in f:
                if line.strip():
                    splits[split].append(json.loads(line))

    return splits["train"], splits["val"], splits["test"]


train_data, val_data, test_data = load_data(PROCESSED_DIR)

assert len(val_data) > 0 and len(test_data) > 0, "Validation/Test data missing"

if DATA_SUBSET > 0:
    val_data = val_data[:DATA_SUBSET]
    test_data = test_data[:DATA_SUBSET]

logger.info(f"Loaded val={len(val_data)}, test={len(test_data)}")


2026-01-15 08:42:37,456 - __main__ - INFO - Loaded val=4873, test=4874


In [None]:
logger.info("[STEP 2] Initializing server")

server = SingleVariantServer(
    model_name=MODEL_NAME_OR_PATH,
    variant="med",
    device=DEVICE,
    dtype=DTYPE
)

2026-01-15 08:42:38,577 - __main__ - INFO - [STEP 2] Initializing server
2026-01-15 08:42:38,578 - server - INFO - Initializing MED server
2026-01-15 08:42:38,579 - server - INFO -   Model: meta-llama/Llama-2-7b-chat-hf
2026-01-15 08:42:38,579 - server - INFO -   Device: mps
2026-01-15 08:42:38,579 - server - INFO -   Dtype: auto
2026-01-15 08:42:39,511 - server - INFO - Tokenizer loaded: LlamaTokenizerFast
2026-01-15 08:42:39,511 - server - INFO - Loading model with 8-bit quantization...
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2026-01-15 08:42:53,562 - server - INFO - Model loaded successfully
2026-01-15 08:42:53,564 - server - INFO - Model size: 6.74B parameters


In [12]:
load_gen = ClosedLoopLoadGenerator(
    inference_func=server.generate,
    max_concurrency=1,
    num_requests=10,
    data_loader=val_data
)

metrics = load_gen.run()
calc = MetricsCalculator(metrics)
calc.print_report("SANITY CHECK (Concurrency=1)")

2026-01-15 08:44:52,706 - load_generator - INFO - Initialized ClosedLoopLoadGenerator
2026-01-15 08:44:52,707 - load_generator - INFO -   Concurrency: 1
2026-01-15 08:44:52,707 - load_generator - INFO -   Total requests: 10
2026-01-15 08:44:52,708 - load_generator - INFO -   Data pool size: 4873
2026-01-15 08:44:52,709 - load_generator - INFO - STARTING LOAD TEST: 10 requests @ 1 concurrency
2026-01-15 08:45:00,104 - load_generator - INFO - Load test complete in 7.4s
2026-01-15 08:45:00,105 - metrics - INFO - Initialized MetricsCalculator with 2 metrics



SANITY CHECK (Concurrency=1)

SUMMARY:
  Total Requests:             2
  Successful:                 2
  Failed:                     0
  Success Rate:          100.00%
  Total Duration:          7.39 seconds
  Throughput:               0.3 tokens/sec
  SLO Compliance:          0.00%
  SLO Violations:             2
  Escalation Rate:         0.00%

TTFT (Time-to-First-Token) in milliseconds:
  P50:  2616.81 ms
  P75:  3320.93 ms
  P90:  3743.41 ms
  P95:  3884.23 ms
  P99:  3996.89 ms
  Mean: 2616.81 ms (±1408.25)

TPOT (Time-Per-Output-Token) in milliseconds:
  P50:  1070.81 ms
  P75:  1111.84 ms
  P90:  1136.46 ms
  P95:  1144.67 ms
  P99:  1151.23 ms
  Mean: 1070.81 ms (±82.06)

E2E Latency (End-to-End) in milliseconds:
  P50:  3695.24 ms
  P75:  4443.13 ms
  P90:  4891.87 ms
  P95:  5041.45 ms
  P99:  5161.11 ms
  Mean: 3695.24 ms (±1495.79)

Queue Wait Time in milliseconds:
  P50:     0.03 ms
  P95:     0.05 ms
  P99:     0.05 ms
  Mean:    0.03 ms

Sample SLO Violations (first 10

{'summary': {'total_requests': 2,
  'successful_requests': 2,
  'failed_requests': 0,
  'success_rate': 1.0,
  'slo_compliant': 0,
  'slo_violations': 2,
  'slo_compliance': 0.0,
  'escalation_rate': 0.0,
  'total_duration_sec': 7.3905720710754395,
  'throughput_tokens_per_sec': 0.27061504586734514},
 'ttft': {'p50': 2616.8066263198853,
  'p75': 3320.931851863861,
  'p90': 3743.4069871902466,
  'p95': 3884.2320322990417,
  'p99': 3996.892068386078,
  'mean': 2616.8066263198853,
  'std': 1408.2504510879517},
 'tpot': {'p50': 1070.8125829696655,
  'p75': 1111.8443608283997,
  'p90': 1136.4634275436401,
  'p95': 1144.669783115387,
  'p99': 1151.2348675727844,
  'mean': 1070.8125829696655,
  'std': 82.06355571746826},
 'e2e_latency': {'p50': 3695.2404975891113,
  'p75': 4443.134784698486,
  'p90': 4891.871356964111,
  'p95': 5041.450214385986,
  'p99': 5161.113300323486,
  'mean': 3695.2404975891113,
  'std': 1495.78857421875},
 'queue_wait': {'p50': 0.027179718017578125,
  'p75': 0.039219

In [13]:
all_metrics_summary = {}

for concurrency in CONCURRENCIES:
    logger.info(f"Running load test: concurrency={concurrency}")

    load_gen = ClosedLoopLoadGenerator(
        inference_func=server.generate,
        max_concurrency=concurrency,
        num_requests=NUM_REQUESTS,
        data_loader=val_data
    )

    start = time.time()
    raw_metrics = load_gen.run()
    duration = time.time() - start

    calc = MetricsCalculator(raw_metrics)
    metrics = calc.compute_all_metrics()

    calc.print_report(f"Concurrency {concurrency}")

    # Save
    calc.save_metrics(f"{OUTPUT_DIR}/metrics_{concurrency}.json")
    load_gen.save_metrics(f"{OUTPUT_DIR}/requests_{concurrency}.jsonl")

    all_metrics_summary[concurrency] = {
        "metrics": metrics,
        "duration_sec": duration
    }


2026-01-15 08:46:08,898 - __main__ - INFO - Running load test: concurrency=1
2026-01-15 08:46:08,900 - load_generator - INFO - Initialized ClosedLoopLoadGenerator
2026-01-15 08:46:08,901 - load_generator - INFO -   Concurrency: 1
2026-01-15 08:46:08,902 - load_generator - INFO -   Total requests: 5000
2026-01-15 08:46:08,903 - load_generator - INFO -   Data pool size: 4873
2026-01-15 08:46:08,904 - load_generator - INFO - STARTING LOAD TEST: 5000 requests @ 1 concurrency
2026-01-15 08:46:13,699 - load_generator - INFO - Load test complete in 4.8s
2026-01-15 08:46:13,699 - metrics - INFO - Initialized MetricsCalculator with 2 metrics
2026-01-15 08:46:13,704 - metrics - INFO - Saved metrics to results/baseline_med/metrics_1.json
2026-01-15 08:46:13,704 - load_generator - INFO - Saved metrics to results/baseline_med/requests_1.jsonl
2026-01-15 08:46:13,704 - __main__ - INFO - Running load test: concurrency=2
2026-01-15 08:46:13,705 - load_generator - INFO - Initialized ClosedLoopLoadGener


Concurrency 1

SUMMARY:
  Total Requests:             2
  Successful:                 2
  Failed:                     0
  Success Rate:          100.00%
  Total Duration:          4.79 seconds
  Throughput:               0.4 tokens/sec
  SLO Compliance:          0.00%
  SLO Violations:             2
  Escalation Rate:         0.00%

TTFT (Time-to-First-Token) in milliseconds:
  P50:  1281.75 ms
  P75:  1441.69 ms
  P90:  1537.65 ms
  P95:  1569.63 ms
  P99:  1595.22 ms
  Mean: 1281.75 ms (±319.87)

TPOT (Time-Per-Output-Token) in milliseconds:
  P50:  1111.31 ms
  P75:  1173.47 ms
  P90:  1210.76 ms
  P95:  1223.19 ms
  P99:  1233.14 ms
  Mean: 1111.31 ms (±124.31)

E2E Latency (End-to-End) in milliseconds:
  P50:  2394.89 ms
  P75:  2617.53 ms
  P90:  2751.11 ms
  P95:  2795.64 ms
  P99:  2831.26 ms
  Mean: 2394.89 ms (±445.27)

Queue Wait Time in milliseconds:
  P50:     0.00 ms
  P95:     0.00 ms
  P99:     0.00 ms
  Mean:    0.00 ms

Sample SLO Violations (first 10):
  1. Request 

2026-01-15 08:46:26,259 - load_generator - INFO - Load test complete in 12.6s
2026-01-15 08:46:26,260 - metrics - INFO - Initialized MetricsCalculator with 4 metrics
2026-01-15 08:46:26,265 - metrics - INFO - Saved metrics to results/baseline_med/metrics_2.json
2026-01-15 08:46:26,266 - load_generator - INFO - Saved metrics to results/baseline_med/requests_2.jsonl
2026-01-15 08:46:26,266 - __main__ - INFO - Running load test: concurrency=4
2026-01-15 08:46:26,267 - load_generator - INFO - Initialized ClosedLoopLoadGenerator
2026-01-15 08:46:26,267 - load_generator - INFO -   Concurrency: 4
2026-01-15 08:46:26,268 - load_generator - INFO -   Total requests: 5000
2026-01-15 08:46:26,268 - load_generator - INFO -   Data pool size: 4873
2026-01-15 08:46:26,268 - load_generator - INFO - STARTING LOAD TEST: 5000 requests @ 4 concurrency



Concurrency 2

SUMMARY:
  Total Requests:             4
  Successful:                 4
  Failed:                     0
  Success Rate:          100.00%
  Total Duration:         12.55 seconds
  Throughput:               0.3 tokens/sec
  SLO Compliance:          0.00%
  SLO Violations:             4
  Escalation Rate:         0.00%

TTFT (Time-to-First-Token) in milliseconds:
  P50:  3085.85 ms
  P75:  3923.11 ms
  P90:  4255.57 ms
  P95:  4366.38 ms
  P99:  4455.04 ms
  Mean: 3225.87 ms (±921.97)

TPOT (Time-Per-Output-Token) in milliseconds:
  P50:  3095.33 ms
  P75:  3255.49 ms
  P90:  3337.23 ms
  P95:  3364.48 ms
  P99:  3386.28 ms
  Mean: 2992.59 ms (±378.26)

E2E Latency (End-to-End) in milliseconds:
  P50:  6274.92 ms
  P75:  6963.99 ms
  P90:  7399.58 ms
  P95:  7544.78 ms
  P99:  7660.94 ms
  Mean: 6221.18 ms (±1123.34)

Queue Wait Time in milliseconds:
  P50:     0.00 ms
  P95:     0.00 ms
  P99:     0.00 ms
  Mean:    0.00 ms

Sample SLO Violations (first 10):
  1. Request

2026-01-15 08:46:52,672 - load_generator - INFO - Load test complete in 26.4s
2026-01-15 08:46:52,672 - metrics - INFO - Initialized MetricsCalculator with 8 metrics
2026-01-15 08:46:52,676 - metrics - INFO - Saved metrics to results/baseline_med/metrics_4.json
2026-01-15 08:46:52,677 - load_generator - INFO - Saved metrics to results/baseline_med/requests_4.jsonl
2026-01-15 08:46:52,677 - __main__ - INFO - Running load test: concurrency=8
2026-01-15 08:46:52,678 - load_generator - INFO - Initialized ClosedLoopLoadGenerator
2026-01-15 08:46:52,678 - load_generator - INFO -   Concurrency: 8
2026-01-15 08:46:52,678 - load_generator - INFO -   Total requests: 5000
2026-01-15 08:46:52,679 - load_generator - INFO -   Data pool size: 4873
2026-01-15 08:46:52,679 - load_generator - INFO - STARTING LOAD TEST: 5000 requests @ 8 concurrency



Concurrency 4

SUMMARY:
  Total Requests:             8
  Successful:                 8
  Failed:                     0
  Success Rate:          100.00%
  Total Duration:         26.40 seconds
  Throughput:               0.3 tokens/sec
  SLO Compliance:          0.00%
  SLO Violations:             8
  Escalation Rate:         0.00%

TTFT (Time-to-First-Token) in milliseconds:
  P50:  6757.58 ms
  P75:  7325.58 ms
  P90:  7435.77 ms
  P95:  7559.20 ms
  P99:  7657.93 ms
  Mean: 6721.22 ms (±719.60)

TPOT (Time-Per-Output-Token) in milliseconds:
  P50:  6228.26 ms
  P75:  6627.55 ms
  P90:  6711.32 ms
  P95:  6771.49 ms
  P99:  6819.62 ms
  Mean: 6262.53 ms (±386.05)

E2E Latency (End-to-End) in milliseconds:
  P50:  13140.19 ms
  P75:  13290.64 ms
  P90:  13456.18 ms
  P95:  13559.44 ms
  P99:  13642.05 ms
  Mean: 12988.45 ms (±537.56)

Queue Wait Time in milliseconds:
  P50:     0.00 ms
  P95:     0.00 ms
  P99:     0.00 ms
  Mean:    0.00 ms

Sample SLO Violations (first 10):
  1. Re

2026-01-15 08:47:53,378 - load_generator - INFO - Load test complete in 60.7s
2026-01-15 08:47:53,378 - metrics - INFO - Initialized MetricsCalculator with 16 metrics
2026-01-15 08:47:53,382 - metrics - INFO - Saved metrics to results/baseline_med/metrics_8.json
2026-01-15 08:47:53,383 - load_generator - INFO - Saved metrics to results/baseline_med/requests_8.jsonl
2026-01-15 08:47:53,383 - __main__ - INFO - Running load test: concurrency=16
2026-01-15 08:47:53,383 - load_generator - INFO - Initialized ClosedLoopLoadGenerator
2026-01-15 08:47:53,384 - load_generator - INFO -   Concurrency: 16
2026-01-15 08:47:53,384 - load_generator - INFO -   Total requests: 5000
2026-01-15 08:47:53,384 - load_generator - INFO -   Data pool size: 4873
2026-01-15 08:47:53,385 - load_generator - INFO - STARTING LOAD TEST: 5000 requests @ 16 concurrency



Concurrency 8

SUMMARY:
  Total Requests:            16
  Successful:                16
  Failed:                     0
  Success Rate:          100.00%
  Total Duration:         60.69 seconds
  Throughput:               0.3 tokens/sec
  SLO Compliance:          0.00%
  SLO Violations:            16
  Escalation Rate:         0.00%

TTFT (Time-to-First-Token) in milliseconds:
  P50:  14543.21 ms
  P75:  14823.85 ms
  P90:  15485.02 ms
  P95:  15504.69 ms
  P99:  15519.87 ms
  Mean: 14551.40 ms (±632.95)

TPOT (Time-Per-Output-Token) in milliseconds:
  P50:  15475.24 ms
  P75:  15989.28 ms
  P90:  16716.01 ms
  P95:  16781.79 ms
  P99:  16863.31 ms
  Mean: 15426.79 ms (±944.81)

E2E Latency (End-to-End) in milliseconds:
  P50:  30215.49 ms
  P75:  30804.57 ms
  P90:  31739.73 ms
  P95:  32305.38 ms
  P99:  32348.75 ms
  Mean: 29984.84 ms (±1442.20)

Queue Wait Time in milliseconds:
  P50:     0.00 ms
  P95:     0.00 ms
  P99:     0.01 ms
  Mean:    0.00 ms

Sample SLO Violations (first

2026-01-15 08:50:28,771 - load_generator - INFO - Load test complete in 155.4s
2026-01-15 08:50:28,781 - metrics - INFO - Initialized MetricsCalculator with 32 metrics
2026-01-15 08:50:28,795 - metrics - INFO - Saved metrics to results/baseline_med/metrics_16.json
2026-01-15 08:50:28,797 - load_generator - INFO - Saved metrics to results/baseline_med/requests_16.jsonl
2026-01-15 08:50:28,797 - __main__ - INFO - Running load test: concurrency=32
2026-01-15 08:50:28,798 - load_generator - INFO - Initialized ClosedLoopLoadGenerator
2026-01-15 08:50:28,798 - load_generator - INFO -   Concurrency: 32
2026-01-15 08:50:28,798 - load_generator - INFO -   Total requests: 5000
2026-01-15 08:50:28,798 - load_generator - INFO -   Data pool size: 4873
2026-01-15 08:50:28,799 - load_generator - INFO - STARTING LOAD TEST: 5000 requests @ 32 concurrency



Concurrency 16

SUMMARY:
  Total Requests:            32
  Successful:                32
  Failed:                     0
  Success Rate:          100.00%
  Total Duration:        155.35 seconds
  Throughput:               0.2 tokens/sec
  SLO Compliance:          0.00%
  SLO Violations:            32
  Escalation Rate:         0.00%

TTFT (Time-to-First-Token) in milliseconds:
  P50:  37592.08 ms
  P75:  40403.20 ms
  P90:  42069.41 ms
  P95:  42248.20 ms
  P99:  42398.76 ms
  Mean: 38039.26 ms (±2919.52)

TPOT (Time-Per-Output-Token) in milliseconds:
  P50:  38674.96 ms
  P75:  39873.35 ms
  P90:  40542.34 ms
  P95:  40744.07 ms
  P99:  41164.26 ms
  Mean: 38572.72 ms (±1595.66)

E2E Latency (End-to-End) in milliseconds:
  P50:  77225.19 ms
  P75:  78951.26 ms
  P90:  79522.24 ms
  P95:  81365.00 ms
  P99:  82064.32 ms
  Mean: 76625.64 ms (±3134.41)

Queue Wait Time in milliseconds:
  P50:     0.00 ms
  P95:     0.00 ms
  P99:     0.01 ms
  Mean:    0.00 ms

Sample SLO Violations (fi

2026-01-15 09:00:29,236 - load_generator - INFO - Load test complete in 600.4s
2026-01-15 09:00:29,274 - metrics - INFO - Initialized MetricsCalculator with 64 metrics
2026-01-15 09:00:29,313 - metrics - INFO - Saved metrics to results/baseline_med/metrics_32.json
2026-01-15 09:00:29,319 - load_generator - INFO - Saved metrics to results/baseline_med/requests_32.jsonl



Concurrency 32

SUMMARY:
  Total Requests:            64
  Successful:                64
  Failed:                     0
  Success Rate:          100.00%
  Total Duration:        599.75 seconds
  Throughput:               0.1 tokens/sec
  SLO Compliance:          0.00%
  SLO Violations:            64
  Escalation Rate:         0.00%

TTFT (Time-to-First-Token) in milliseconds:
  P50:  139861.80 ms
  P75:  179087.96 ms
  P90:  183124.62 ms
  P95:  185788.28 ms
  P99:  186177.00 ms
  Mean: 142121.64 ms (±37386.87)

TPOT (Time-Per-Output-Token) in milliseconds:
  P50:  147082.75 ms
  P75:  165830.44 ms
  P90:  169659.12 ms
  P95:  175246.54 ms
  P99:  176073.36 ms
  Mean: 148604.50 ms (±20837.08)

E2E Latency (End-to-End) in milliseconds:
  P50:  296736.57 ms
  P75:  344216.96 ms
  P90:  356668.63 ms
  P95:  360648.07 ms
  P99:  364478.01 ms
  Mean: 293574.13 ms (±55068.37)

Queue Wait Time in milliseconds:
  P50:     0.00 ms
  P95:     0.03 ms
  P99:     0.89 ms
  Mean:    0.04 ms

Samp

In [14]:
evaluator = HeldOutEvaluator(
    model=server,
    data_loader=test_data,
    batch_size=32
)

eval_results = evaluator.evaluate()

with open(f"{OUTPUT_DIR}/eval_results.json", "w") as f:
    json.dump(eval_results, f, indent=2)

eval_results

2026-01-15 09:01:28,337 - evaluation - INFO - EVALUATING ON 4874 EXAMPLES
2026-01-15 09:04:55,114 - evaluation - INFO -   Generated 32/4874 predictions
2026-01-15 09:07:02,069 - evaluation - INFO -   Generated 64/4874 predictions
2026-01-15 09:08:42,536 - evaluation - INFO -   Generated 96/4874 predictions
2026-01-15 09:10:45,126 - evaluation - INFO -   Generated 128/4874 predictions
2026-01-15 09:13:13,881 - evaluation - INFO -   Generated 160/4874 predictions
2026-01-15 09:15:21,007 - evaluation - INFO -   Generated 192/4874 predictions
2026-01-15 09:17:28,949 - evaluation - INFO -   Generated 224/4874 predictions
2026-01-15 09:19:50,728 - evaluation - INFO -   Generated 256/4874 predictions
2026-01-15 09:22:08,255 - evaluation - INFO -   Generated 288/4874 predictions
2026-01-15 09:30:35,928 - evaluation - INFO -   Generated 320/4874 predictions
2026-01-15 09:33:29,084 - evaluation - INFO -   Generated 352/4874 predictions
2026-01-15 09:35:36,179 - evaluation - INFO -   Generated 38

{'mmlu': {'accuracy': 0.003244646333549643,
  'em': 0.003244646333549643,
  'correct_count': 10,
  'total_count': 3082},
 'gsm8k': {'accuracy': 0.0,
  'em': 0.0,
  'correct_count': 0,
  'total_count': 1792},
 'overall': {'accuracy': 0.002051702913418137,
  'em': 0.002051702913418137,
  'correct_count': 10,
  'total_count': 4874}}

In [15]:
summary = {
    "load_tests": all_metrics_summary,
    "eval_results": eval_results,
    "config": {
        "model": MODEL_NAME_OR_PATH,
        "device": DEVICE,
        "num_requests": NUM_REQUESTS,
        "concurrencies": CONCURRENCIES
    }
}

with open(f"{OUTPUT_DIR}/summary.json", "w") as f:
    json.dump(summary, f, indent=2)

logger.info("Evaluation complete")


2026-01-15 14:55:15,647 - __main__ - INFO - Evaluation complete
