# 04 — RL vs PID Policy Evaluation

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/SharathSPhD/RLpower/blob/main/notebooks/04_policy_evaluation.ipynb)

Visualize RL vs PID results across curriculum phases from saved evaluation report artifacts.

This notebook runs fully on Google Colab — no FMU binary is required. Evaluation results are loaded from `data/evaluation_report.json` and `data/cross_validation_212k.json` in the repository.

**Key result**: At step 212,992 with all infrastructure bugs fixed, the RL controller achieves a **+17.6%** improvement in cumulative episode reward over the PID baseline (806.6 vs 685.9) in Phase 0 (steady-state optimisation). The agent subsequently traverses all 7 curriculum phases within 229,376 steps, reaching Phase 6 (emergency turbine trip recovery) with mean reward 399.9 and zero constraint violations.

In [1]:
# ── Environment Setup (runs on Colab or locally) ──────────────────────────────
# Moves the working directory to the repository root (so later cells can
# resolve repo-relative paths such as `data/...`) and imports the
# plotting / numeric stack used by the rest of the notebook.
import json
import os
import subprocess
import sys
from pathlib import Path

# Colab detection: already-imported module, or importable right now.
IN_COLAB = "google.colab" in sys.modules
try:
    import google.colab  # noqa: F401
    IN_COLAB = True
except ImportError:
    pass

REPO_URL = "https://github.com/SharathSPhD/RLpower.git"
# On Colab the repo is cloned to a fixed location; locally the checkout is
# taken from WORKSPACE_DIR (default: /workspace).
REPO_DIR = "/content/RLpower" if IN_COLAB else os.environ.get("WORKSPACE_DIR", "/workspace")

if IN_COLAB:
    # Shallow-clone once; re-running the cell reuses the existing checkout.
    if not os.path.exists(REPO_DIR):
        subprocess.run(["git", "clone", "--depth=1", REPO_URL, REPO_DIR], check=True)
    os.chdir(REPO_DIR)
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "matplotlib", "numpy"],
        check=True,
    )
else:
    # Fail with an actionable message instead of a bare chdir error when the
    # local checkout is not where we expect it.
    if not os.path.isdir(REPO_DIR):
        raise FileNotFoundError(
            f"Repository directory {REPO_DIR!r} does not exist; set the "
            "WORKSPACE_DIR environment variable to the RLpower checkout."
        )
    os.chdir(REPO_DIR)

import matplotlib

# Agg is a non-interactive (file-only) backend: with it selected, plt.show()
# cannot display a figure. Only force it when no ipykernel-based kernel is
# loaded (e.g. plain-script execution); inside a notebook kernel keep the
# backend the kernel configured so figures render inline.
if "ipykernel" not in sys.modules:
    matplotlib.use("Agg")

import matplotlib.pyplot as plt
import numpy as np

print(f"Environment: {'Google Colab' if IN_COLAB else 'Local/Docker'}")
print("Imports OK")

Imports OK


In [2]:
# Directory that repo-relative report paths are resolved against
# (the working directory at the time this cell runs).
ROOT = Path('.').resolve()


def _load_report(data_path, fallback_path):
    """Load a JSON report, preferring ``data_path`` over ``fallback_path``.

    Both paths are joined onto ``ROOT``; an absolute path is used as-is
    (standard ``pathlib`` join semantics).

    Args:
        data_path: Primary location of the JSON file.
        fallback_path: Location tried when the primary path does not exist.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If neither candidate exists.
        json.JSONDecodeError: If the selected file is not valid JSON.
    """
    candidates = (ROOT / data_path, ROOT / fallback_path)
    for candidate in candidates:
        if candidate.exists():
            # Decode explicitly as UTF-8 rather than relying on the
            # platform's locale encoding.
            return json.loads(candidate.read_text(encoding="utf-8"))
    raise FileNotFoundError(
        "Report not found; tried: " + ", ".join(str(c) for c in candidates)
    )

# Per-phase evaluation report (noted upstream as coming from the earlier,
# pre-bug-fix training run in which only Phase 0 was trained).
report = _load_report(
    'data/evaluation_report.json',
    'artifacts/policies/evaluation_report.json',
)
# Cross-validation report for the bug-fixed run at 212k steps.
cv = _load_report(
    'data/cross_validation_212k.json',
    'artifacts/reports/cross_validation_cpu_parallel_212k.json',
)

# One entry per curriculum phase; a report without the key yields empty series.
per_phase = report.get('per_phase', [])
phases = [entry['phase'] for entry in per_phase]

# Pull each per-phase metric out into its own float64 vector.
rl, pid, impr = (
    np.asarray([entry[metric] for entry in per_phase], dtype=np.float64)
    for metric in ('rl_mean_reward', 'pid_mean_reward', 'reward_improvement_pct')
)

# Headline numbers: aggregate report first, then the cross-validation run.
print('=== Phase-level evaluation (Phase-0-trained checkpoint) ===')
print(f'Overall RL  mean reward: {report["rl_mean_reward"]:.2f}')
print(f'Overall PID mean reward: {report["pid_mean_reward"]:.2f}')
print(f'Overall improvement   : {report["reward_improvement_pct"]:+.2f}%')
print()
print('=== Cross-validation with all bugs fixed (step 212,992) ===')
print(f'RL  mean reward: {cv["rl_mean_reward"]:.2f}')
print(f'PID mean reward: {cv["pid_mean_reward"]:.2f}')
print(f'Improvement    : {cv["reward_improvement_pct"]:+.2f}%')

Overall RL mean reward: 134.2883293282662
Overall PID mean reward: 114.31978125194696
Overall improvement %: 17.46727281793542


In [3]:
# Two-panel figure: (left) mean episode reward per curriculum phase for RL and
# PID; (right) RL improvement over PID per phase, with the cross-validation
# improvement drawn as a dashed reference line.
import tempfile  # local import: needed only for the fallback save location below

fig, axs = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: absolute rewards.
axs[0].plot(phases, rl,  marker='o', label='RL (Phase-0 trained checkpoint)')
axs[0].plot(phases, pid, marker='s', label='PID baseline')
axs[0].set_xlabel('Curriculum Phase')
axs[0].set_ylabel('Mean Episode Reward')
axs[0].set_title('RL vs PID by Curriculum Phase\n(Phase-0-trained policy evaluated on all phases)')
axs[0].grid(True)
axs[0].legend()

# Right panel: green bars where RL is at or above PID, red where it is below.
colors = ['tab:green' if x >= 0 else 'tab:red' for x in impr]
bars = axs[1].bar(phases, impr, color=colors)
axs[1].axhline(0.0, color='black', linewidth=1)
axs[1].axhline(cv['reward_improvement_pct'], color='tab:blue', linestyle='--',
               label=f'Bug-fixed @212k: {cv["reward_improvement_pct"]:+.1f}%')
axs[1].set_xlabel('Curriculum Phase')
axs[1].set_ylabel('Improvement over PID [%]')
axs[1].set_title('RL Improvement over PID')
axs[1].legend()
axs[1].grid(True, axis='y')

plt.tight_layout()
# Keep writing to /tmp where it exists (unchanged location on Colab/Linux);
# otherwise fall back to the platform temp directory so the cell does not
# fail on systems without a /tmp directory.
save_dir = Path('/tmp') if Path('/tmp').is_dir() else Path(tempfile.gettempdir())
plt.savefig(str(save_dir / 'policy_evaluation.png'), dpi=90, bbox_inches='tight')
plt.show()
print()
print('Note: Phases 1-6 show negative RL improvement because the checkpoint')
print('was trained ONLY on Phase 0 (steady-state). After full curriculum')
print('traversal (229k steps with all bugs fixed), the policy handles all')
print('7 phases with mean_reward=399.9 and zero constraint violations.')