In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability, report Python/PyTorch versions, and seed the random number generators

import torch
import sys

# Select the compute device once, then report what was found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cpu':
    # CPU fallback: everything still runs, only more slowly.
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    # Report the name and total memory (in GB) of the first CUDA device.
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Interpreter and framework versions, useful when reproducing results.
print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42

# Seed every RNG the notebook touches: Python's `random`, NumPy's global
# generator, and PyTorch (CPU always, all CUDA devices when present).
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# RL Case Study: Adaptive Chemotherapy Dose Optimization

## NovaCure Therapeutics -- Personalizing Cancer Treatment with Q-Learning

In this notebook, we implement the full case study: an RL agent that learns to optimize chemotherapy dosing for individual patients. We build a pharmacokinetic/pharmacodynamic (PK/PD) patient simulator, wrap it in a lightweight Gymnasium-style environment (`reset` / `step`), and train a tabular Q-learning agent, which we then compare against fixed-dose and rule-based protocols.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import warnings
# Silence every warning category so the notebook output stays uncluttered.
warnings.filterwarnings('ignore')

# Title banner for the notebook output.
for _banner_line in (
    "RL Case Study: Adaptive Chemotherapy Dose Optimization",
    "NovaCure Therapeutics -- Vizuara",
    "=" * 55,
):
    print(_banner_line)

## Part A: Patient Simulator (PK/PD Model)

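We model tumor dynamics using Gompertzian growth with a log-kill drug effect. In each treatment cycle the tumor size $V$ (in mm) is updated as

$$V_{t+1} = V_t + r\,V_t \ln\!\left(\frac{K}{V_t}\right) - s\,d_t\,V_t + \varepsilon_t,$$

where $r$ is the growth rate, $K$ the carrying capacity, $s$ the drug sensitivity, $d_t$ the dose (as a fraction of the standard dose), and $\varepsilon_t$ Gaussian noise. Inter-patient variability is captured by sampling the PK/PD parameters independently for each patient.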

In [None]:
class PatientSimulator:
    """
    Simulated patient for chemotherapy dose optimization.

    Tumor dynamics: Gompertzian growth with log-kill drug effect
    Toxicity: WBC suppression proportional to dose, with recovery

    State: [tumor_size, wbc, toxicity_grade, kidney_function, cycle_number]

    Randomness: every random draw comes from a per-patient stream
    (``self._rng``). With a ``seed`` the patient owns a private
    ``np.random.RandomState(seed)``, so constructing or stepping a patient
    never reseeds or consumes NumPy's global generator (which agents use for
    exploration). Without a seed the global ``np.random`` stream is used.
    """

    # (minimum WBC in cells/uL, toxicity grade), checked from the top down.
    # A WBC count below the last floor is grade 4.
    _WBC_GRADE_FLOORS = ((4000, 0), (3000, 1), (2000, 2), (1000, 3))

    def __init__(self, seed=None):
        """
        Sample one patient's parameters and initial condition.

        Args:
            seed: optional integer seed. The same seed always produces the
                same patient; ``None`` draws from NumPy's global generator.
        """
        # A private RandomState yields the same values a freshly seeded
        # global generator would, but without the global side effect.
        if seed is None:
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(seed)
        rng = self._rng

        # Patient-specific parameters (sampled for inter-patient variability)
        self.tumor_growth_rate = rng.uniform(0.02, 0.08)
        self.tumor_carrying_capacity = rng.uniform(80, 120)  # mm
        self.drug_sensitivity = rng.uniform(0.15, 0.35)
        self.toxicity_sensitivity = rng.uniform(0.3, 0.7)
        self.wbc_recovery_rate = rng.uniform(0.4, 0.8)
        self.baseline_wbc = rng.uniform(6000, 10000)  # cells/uL
        self.baseline_egfr = rng.uniform(80, 120)  # mL/min

        # Initial state
        self.tumor_size = rng.uniform(30, 60)  # mm
        self.wbc = self.baseline_wbc
        self.toxicity_grade = 0
        self.egfr = self.baseline_egfr
        self.cycle = 1
        self.max_cycles = 8

        # Track cumulative drug exposure
        self.cumulative_dose = 0.0

    def get_state(self):
        """Return the normalized 5-element state vector as a NumPy array."""
        return np.array([
            self.tumor_size / 100.0,          # normalize to ~[0, 1]
            self.wbc / 10000.0,               # normalize to ~[0, 1]
            self.toxicity_grade / 4.0,         # normalize to [0, 1]
            self.egfr / 120.0,                 # normalize to ~[0, 1]
            self.cycle / self.max_cycles,      # normalize to [0, 1]
        ])

    def step(self, dose_fraction):
        """
        Simulate one treatment cycle.

        Args:
            dose_fraction: fraction of standard dose (e.g., 0.8 = 80% of standard)

        Returns:
            state, reward, done
        """
        rng = self._rng

        # --- Tumor dynamics (Gompertzian growth + log-kill) ---
        # Natural growth; the max(..., 1.0) keeps the log argument bounded.
        growth = self.tumor_growth_rate * self.tumor_size * np.log(
            self.tumor_carrying_capacity / max(self.tumor_size, 1.0)
        )

        # Drug effect (log-kill: kills a fraction proportional to dose)
        drug_kill = self.drug_sensitivity * dose_fraction * self.tumor_size

        # Add noise
        noise = rng.normal(0, 0.5)

        tumor_change = growth - drug_kill + noise
        old_tumor = self.tumor_size
        self.tumor_size = max(1.0, self.tumor_size + tumor_change)  # floor at 1 mm

        # --- Hematological toxicity (WBC suppression) ---
        # Drug suppresses WBC proportional to dose
        wbc_suppression = self.toxicity_sensitivity * dose_fraction * self.wbc * 0.3
        # Natural recovery toward baseline
        wbc_recovery = self.wbc_recovery_rate * (self.baseline_wbc - self.wbc) * 0.2

        self.wbc = max(500, self.wbc - wbc_suppression + wbc_recovery + rng.normal(0, 200))

        # --- Toxicity grading (based on WBC count, CTCAE-like) ---
        self.toxicity_grade = 4
        for wbc_floor, grade in self._WBC_GRADE_FLOORS:
            if self.wbc >= wbc_floor:
                self.toxicity_grade = grade
                break

        # --- Kidney function (declines with each dose, floored at 30) ---
        self.cumulative_dose += dose_fraction
        egfr_decline = 0.5 * dose_fraction + rng.normal(0, 0.3)
        self.egfr = max(30, self.egfr - egfr_decline)

        # --- Reward computation ---
        # Component 1: Tumor shrinkage (positive reward)
        tumor_shrinkage = (old_tumor - self.tumor_size) / old_tumor  # fractional change
        r_tumor = 10.0 * tumor_shrinkage

        # Component 2: Toxicity penalty (grade 4 overrides the grade 3 value)
        r_toxicity = 0.0
        if self.toxicity_grade >= 3:
            r_toxicity = -5.0
        if self.toxicity_grade >= 4:
            r_toxicity = -15.0

        # Component 3: per-cycle bonus for staying below grade 3
        r_completion = 0.5 if self.toxicity_grade < 3 else 0.0

        reward = r_tumor + r_toxicity + r_completion

        # --- Episode termination ---
        self.cycle += 1
        done = False

        # End conditions are independent: more than one can fire in the same
        # cycle, in which case their reward adjustments add up.
        if self.cycle > self.max_cycles:
            done = True
            # Bonus for completing all cycles
            reward += 5.0
        # The random draw only happens at grade 4 (short-circuit `and`).
        if self.toxicity_grade >= 4 and rng.random() < 0.5:
            done = True  # treatment discontinuation
            reward -= 10.0
        if self.tumor_size < 5.0:
            done = True
            reward += 20.0  # near-complete response

        return self.get_state(), reward, done


# Smoke-test the simulator on one reproducible patient
patient = PatientSimulator(seed=42)
print("\n".join([
    "Initial patient state:",
    f"  Tumor size: {patient.tumor_size:.1f} mm",
    f"  WBC count:  {patient.wbc:.0f} cells/uL",
    f"  Toxicity:   Grade {patient.toxicity_grade}",
    f"  Kidney (eGFR): {patient.egfr:.1f} mL/min",
    f"  Cycle: {patient.cycle}/{patient.max_cycles}",
]))

# Give the standard dose three times and show how the patient responds.
# The label i+2 is the cycle the patient is entering after each dose.
print("\nSimulating 3 cycles at standard dose (1.0):")
for i in range(3):
    state, reward, done = patient.step(1.0)
    print(
        f"  Cycle {i+2}: tumor={patient.tumor_size:.1f}mm, "
        f"WBC={patient.wbc:.0f}, tox=Grade {patient.toxicity_grade}, "
        f"reward={reward:.2f}, done={done}"
    )

## Part B: Gymnasium-Style Environment

In [None]:
class ChemoDoseEnv:
    """
    Gymnasium-like environment for chemotherapy dose optimization.

    Action space: 6 discrete dose levels
        0: 0.6x standard dose
        1: 0.7x standard dose
        2: 0.8x standard dose
        3: 0.9x standard dose
        4: 1.0x standard dose (standard)
        5: 1.1x standard dose

    Observation: 5-dimensional normalized state

    API shape: ``reset()`` returns only the observation and ``step()``
    returns ``(state, reward, done, info)``.
    """

    DOSE_LEVELS = [0.6, 0.7, 0.8, 0.9, 1.0, 1.1]

    def __init__(self):
        self.n_actions = len(self.DOSE_LEVELS)
        self.n_obs = 5
        self.patient = None  # created by reset()

    def reset(self, seed=None):
        """Start a new episode with a fresh patient and return its initial state."""
        self.patient = PatientSimulator(seed=seed)
        return self.patient.get_state()

    def step(self, action):
        """
        Administer the dose selected by ``action`` for one treatment cycle.

        Args:
            action: integer index into ``DOSE_LEVELS`` (0 .. n_actions - 1).

        Returns:
            (state, reward, done, info), where ``info`` holds the raw
            (un-normalized) patient values and the dose that was given.

        Raises:
            RuntimeError: if called before ``reset()``.
            IndexError: if ``action`` is outside 0 .. n_actions - 1. Negative
                indices are rejected too, so they cannot silently wrap
                around to a high dose.
        """
        if self.patient is None:
            raise RuntimeError("ChemoDoseEnv.step() called before reset()")
        if not 0 <= action < self.n_actions:
            raise IndexError(
                f"action {action} is out of range; expected 0..{self.n_actions - 1}"
            )

        dose = self.DOSE_LEVELS[action]
        state, reward, done = self.patient.step(dose)
        info = {
            'tumor_size': self.patient.tumor_size,
            'wbc': self.patient.wbc,
            'toxicity_grade': self.patient.toxicity_grade,
            'egfr': self.patient.egfr,
            'cycle': self.patient.cycle,
            'dose': dose,
        }
        return state, reward, done, info


# Smoke-test the environment: build it, reset with a fixed seed, show its shape
env = ChemoDoseEnv()
state = env.reset(seed=0)
print("\n".join([
    "ChemoDoseEnv created.",
    f"  Action space: {env.n_actions} actions (dose levels: {env.DOSE_LEVELS})",
    f"  State dimension: {env.n_obs}",
    f"  Initial state: {state}",
]))

## Part C: Baseline Agents

In [None]:
class FixedDoseAgent:
    """Baseline policy that ignores the state and always gives the standard dose (1.0x)."""

    # Action index 4 is the 1.0x standard dose level.
    STANDARD_DOSE_ACTION = 4

    def choose_action(self, state):
        """Return the standard-dose action regardless of ``state``."""
        return self.STANDARD_DOSE_ACTION

    def name(self):
        """Human-readable label for this agent."""
        return "Fixed Dose (1.0x)"


class RuleBasedAgent:
    """
    Reactive baseline that moves the dose level according to the current
    toxicity grade (read from state[2]).

    - Grade 0-1: step up one dose level (capped at index 5)
    - Grade 2: step down one dose level
    - Grade 3: step down two dose levels
    - Grade 4: drop to the minimum dose level (index 0)

    The chosen level is remembered between calls, so call ``reset()`` at the
    start of each new patient.
    """

    def __init__(self):
        self.current_dose_idx = 4  # start at standard

    def choose_action(self, state):
        """Update the remembered dose level from the toxicity grade and return it."""
        grade = state[2] * 4.0  # denormalize
        level = self.current_dose_idx

        if grade <= 1:
            level = min(5, level + 1)   # tolerated well: escalate one level
        elif grade >= 4:
            level = 0                   # 0.6x
        elif grade >= 3:
            level = max(0, level - 2)   # reduce by 2 levels
        elif grade >= 2:
            level = max(0, level - 1)   # reduce by 1 level
        # Any value strictly between 1 and 2 leaves the level unchanged.

        self.current_dose_idx = level
        return level

    def reset(self):
        """Return to the standard dose level for a new patient."""
        self.current_dose_idx = 4

    def name(self):
        """Human-readable label for this agent."""
        return "Rule-Based"


# Confirm the baseline policies are available.
print("\n".join([
    "Baseline agents defined:",
    "  1. Fixed Dose: always prescribes 1.0x standard",
    "  2. Rule-Based: adjusts dose reactively based on toxicity grade",
]))

## Part D: Q-Learning Agent

In [None]:
class ChemoQLearningAgent:
    """
    Q-learning agent for chemotherapy dose optimization.

    Discretizes the 5-dimensional continuous state into bins
    for table-based Q-learning.

    The Q-table is a ``defaultdict`` keyed by the discretized state tuple;
    each value is an array of ``n_actions`` action values, initialised to
    zero. Only ``update()`` adds entries to the table.
    """

    def __init__(self, n_actions=6, n_bins=6, alpha=0.1, gamma=0.95,
                 epsilon=1.0, epsilon_decay=0.999, epsilon_min=0.05):
        """
        Args:
            n_actions: number of discrete actions.
            n_bins: number of bin edges per state dimension.
            alpha: learning rate.
            gamma: discount factor.
            epsilon: initial exploration probability.
            epsilon_decay: multiplicative decay applied by ``decay_epsilon()``.
            epsilon_min: lower bound for epsilon under decay.
        """
        self.n_actions = n_actions
        self.n_bins = n_bins
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        self.q_table = defaultdict(lambda: np.zeros(n_actions))

        # Bin edges for each state dimension
        self.bins = [
            np.linspace(0, 1, n_bins),      # tumor size (normalized)
            np.linspace(0, 1.2, n_bins),     # WBC (normalized)
            np.linspace(0, 1, n_bins),       # toxicity grade (normalized)
            np.linspace(0.25, 1, n_bins),    # kidney function (normalized)
            np.linspace(0, 1, n_bins),       # cycle number (normalized)
        ]

    def discretize(self, state):
        """Map a continuous state to a tuple of per-dimension bin indices."""
        # Plain ints hash and compare equal to NumPy ints, so keys produced
        # here match keys already stored in the table.
        return tuple(
            int(np.digitize(val, self.bins[i])) for i, val in enumerate(state)
        )

    def choose_action(self, state):
        """
        Epsilon-greedy action selection.

        With probability ``epsilon`` return a uniformly random action;
        otherwise return the action with the highest Q-value (the first one
        on ties, which is action 0 for a state never updated).

        When ``epsilon <= 0`` no random number is drawn, so purely greedy
        evaluation leaves NumPy's global generator untouched. The greedy
        lookup is read-only and never adds entries to the Q-table.
        """
        disc_state = self.discretize(state)

        if self.epsilon > 0 and np.random.random() < self.epsilon:
            return np.random.randint(self.n_actions)

        q_values = self.q_table.get(disc_state)
        if q_values is None:
            return 0  # argmax of the all-zero default row
        return int(np.argmax(q_values))

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the transition (s, a, r, s')."""
        s = self.discretize(state)

        if done:
            # Terminal transition: nothing to bootstrap from.
            target = reward
        else:
            ns = self.discretize(next_state)
            target = reward + self.gamma * np.max(self.q_table[ns])

        self.q_table[s][action] += self.alpha * (target - self.q_table[s][action])

    def decay_epsilon(self):
        """Multiply epsilon by the decay factor, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def name(self):
        """Human-readable label for this agent."""
        return "Q-Learning"


# Confirm the agent class is available and restate its update rule.
print("\n".join([
    "Q-Learning agent defined.",
    "Update rule: Q(s,a) <- Q(s,a) + alpha * [r + gamma * max Q(s',a') - Q(s,a)]",
]))

## Part E: Training

In [None]:
def evaluate_agent(agent, n_episodes=200, verbose=False, seed_offset=1_000_000):
    """
    Evaluate an agent over many simulated patients.

    Args:
        agent: object with ``choose_action(state)`` (and optionally
            ``reset()``), or a plain callable ``agent(state) -> action``.
        n_episodes: number of simulated patients.
        verbose: if True, print a one-line summary after every episode.
        seed_offset: patient seeds are ``seed_offset + episode_index``. It
            must be larger than every seed used for training, otherwise the
            evaluation patients are not held out. The training loop in this
            notebook uses seeds 0..19999, so the default lies well outside it.

    Returns:
        dict of per-episode lists: 'rewards', 'tumor_responses' (1 if the
        final tumor is < 30 mm), 'toxicity_events' (number of cycles at grade
        3+), 'completions' (1 if all cycles were given), 'final_tumor_sizes'
        and 'dose_histories'.
    """
    env = ChemoDoseEnv()

    results = {
        'rewards': [],
        'tumor_responses': [],
        'toxicity_events': [],  # grade 3+
        'completions': [],      # completed all cycles
        'final_tumor_sizes': [],
        'dose_histories': [],
    }

    for ep in range(n_episodes):
        state = env.reset(seed=seed_offset + ep)
        # Take the horizon from the patient instead of repeating the constant.
        max_cycles = env.patient.max_cycles
        total_reward = 0
        tox_events = 0
        doses = []

        # Stateful agents must start every patient from their initial dose.
        if hasattr(agent, 'reset'):
            agent.reset()

        for _ in range(max_cycles):
            if hasattr(agent, 'choose_action'):
                action = agent.choose_action(state)
            else:
                action = agent(state)  # function-based agent

            next_state, reward, done, info = env.step(action)
            total_reward += reward
            doses.append(info['dose'])

            if info['toxicity_grade'] >= 3:
                tox_events += 1

            state = next_state
            if done:
                break

        # `info` is the last step's record, i.e. the end-of-episode values.
        results['rewards'].append(total_reward)
        results['final_tumor_sizes'].append(info['tumor_size'])
        results['tumor_responses'].append(1 if info['tumor_size'] < 30 else 0)
        results['toxicity_events'].append(tox_events)
        results['completions'].append(1 if info['cycle'] > max_cycles else 0)
        results['dose_histories'].append(doses)

        if verbose:
            print(f"  Patient {ep + 1:>4}/{n_episodes}: reward={total_reward:.2f}, "
                  f"final tumor={info['tumor_size']:.1f}mm, "
                  f"grade 3+ cycles={tox_events}, doses={doses}")

    return results


# Train the Q-learning agent
print("Training Q-learning agent...")
print("=" * 50)

env = ChemoDoseEnv()
# Hyperparameters are spelled out here even where they match the defaults.
q_agent = ChemoQLearningAgent(
    n_actions=6, n_bins=6,
    alpha=0.1, gamma=0.95,
    epsilon=1.0, epsilon_decay=0.9995, epsilon_min=0.05,
)

n_training_episodes = 20000
training_rewards = []

for ep in range(n_training_episodes):
    # One simulated patient per episode; the episode index is the patient seed.
    state = env.reset(seed=ep)
    total_reward = 0

    # At most 8 treatment cycles; the environment may end the episode earlier.
    for step in range(8):
        action = q_agent.choose_action(state)
        next_state, reward, done, info = env.step(action)
        total_reward += reward
        q_agent.update(state, action, reward, next_state, done)
        state = next_state
        if done:
            break

    training_rewards.append(total_reward)
    q_agent.decay_epsilon()  # exploration is annealed once per episode

    # Progress report every 5000 episodes.
    if (ep + 1) % 5000:
        continue
    recent_avg = np.mean(training_rewards[-500:])
    print(f"  Episode {ep+1:>6} | Avg reward (last 500): {recent_avg:>7.2f} | "
          f"Epsilon: {q_agent.epsilon:.4f} | Q-table size: {len(q_agent.q_table)}")

print(f"\nTraining complete. Q-table entries: {len(q_agent.q_table)}")

In [None]:
# Plot training curve: per-episode reward smoothed with a trailing moving average
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
window = 200
# Trailing mean over the current episode and up to `window` preceding ones
# (the slice is shorter at the start of training).
smoothed = [np.mean(training_rewards[max(0,i-window):i+1]) for i in range(len(training_rewards))]
ax.plot(smoothed, color='blue', alpha=0.8, linewidth=1.5)
ax.set_xlabel('Training Episode', fontsize=12)
ax.set_ylabel('Episode Reward (smoothed)', fontsize=12)
ax.set_title('Q-Learning Agent: Training Progress', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Part F: Evaluation and Comparison

In [None]:
# Evaluate all three agents
print("Evaluating agents on 200 held-out patients...")
print("=" * 50)

fixed_agent = FixedDoseAgent()
rule_agent = RuleBasedAgent()

fixed_results = evaluate_agent(fixed_agent, n_episodes=200)
rule_results = evaluate_agent(rule_agent, n_episodes=200)

# For Q-agent, set epsilon to 0 (greedy). The agent stays greedy afterwards.
q_agent.epsilon = 0.0
q_results = evaluate_agent(q_agent, n_episodes=200)

# Print summary table
agents = ['Fixed Dose', 'Rule-Based', 'Q-Learning']
all_results = [fixed_results, rule_results, q_results]

print(f"\n{'Metric':<30} | {'Fixed Dose':>12} | {'Rule-Based':>12} | {'Q-Learning':>12}")
print("-" * 75)

# (row label, results key, number format). Every cell is right-aligned to
# 12 characters so the values line up under the 12-character headers.
for metric, key, fmt in [
    ('Avg Total Reward', 'rewards', '.2f'),
    ('Tumor Response Rate', 'tumor_responses', '.1%'),
    ('Avg Grade 3+ Tox Events', 'toxicity_events', '.2f'),
    ('Treatment Completion Rate', 'completions', '.1%'),
    ('Avg Final Tumor Size (mm)', 'final_tumor_sizes', '.1f'),
]:
    values = [np.mean(res[key]) for res in all_results]
    cells = " | ".join(f"{val:>12{fmt}}" for val in values)
    print(f"{metric:<30} | {cells}")

In [None]:
# Comprehensive visualization: a 2x2 grid comparing the three agents.
# Uses `agents` (display names) and `all_results` (per-agent result dicts)
# defined in the evaluation cell; the colors are paired with them by position.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Plot 1: Reward distributions (overlaid, semi-transparent histograms)
# The enumerate index `i` is not used in this loop.
for i, (name, results, color) in enumerate(zip(
    agents, all_results, ['red', 'orange', 'blue'])):
    axes[0, 0].hist(results['rewards'], bins=20, alpha=0.5, label=name, color=color)
axes[0, 0].set_xlabel('Total Episode Reward', fontsize=11)
axes[0, 0].set_ylabel('Count', fontsize=11)
axes[0, 0].set_title('Reward Distribution by Agent', fontsize=13, fontweight='bold')
axes[0, 0].legend(fontsize=10)

# Plot 2: Final tumor sizes
# The enumerate index `i` is not used in this loop either.
for i, (name, results, color) in enumerate(zip(
    agents, all_results, ['red', 'orange', 'blue'])):
    axes[0, 1].hist(results['final_tumor_sizes'], bins=20, alpha=0.5, label=name, color=color)
axes[0, 1].set_xlabel('Final Tumor Size (mm)', fontsize=11)
axes[0, 1].set_ylabel('Count', fontsize=11)
axes[0, 1].set_title('Final Tumor Size Distribution', fontsize=13, fontweight='bold')
axes[0, 1].legend(fontsize=10)

# Plot 3: Bar chart comparison
metrics = ['Tumor\nResponse', 'Treatment\nCompletion', 'Low Toxicity\n(Grade < 3)']
# NOTE(review): `metric_keys` is not referenced again in this cell; the three
# values are computed explicitly in the loop below.
metric_keys = ['tumor_responses', 'completions', 'toxicity_events']

x = np.arange(len(metrics))
width = 0.25

# One group of three bars per metric; `i*width` offsets each agent's bar.
for i, (name, results, color) in enumerate(zip(
    agents, all_results, ['red', 'orange', 'blue'])):
    vals = [
        np.mean(results['tumor_responses']),
        np.mean(results['completions']),
        # Fraction of episodes with no grade 3+ cycle at all
        # (min(1, t) turns the event count into a 0/1 indicator).
        1.0 - np.mean([min(1, t) for t in results['toxicity_events']]),
    ]
    axes[1, 0].bar(x + i*width, vals, width, label=name, color=color, alpha=0.7)

axes[1, 0].set_ylabel('Rate', fontsize=11)
axes[1, 0].set_title('Clinical Outcomes Comparison', fontsize=13, fontweight='bold')
# Center the tick under the middle bar of each three-bar group.
axes[1, 0].set_xticks(x + width)
axes[1, 0].set_xticklabels(metrics, fontsize=10)
axes[1, 0].legend(fontsize=10)
axes[1, 0].set_ylim(0, 1.1)

# Plot 4: Average dosing trajectory
for name, results, color in zip(agents, all_results, ['red', 'orange', 'blue']):
    all_doses = results['dose_histories']
    max_len = max(len(d) for d in all_doses)
    avg_doses = []
    # Episodes can end early, so each cycle's mean is taken only over the
    # episodes that actually reached that cycle.
    for step in range(max_len):
        step_doses = [d[step] for d in all_doses if len(d) > step]
        avg_doses.append(np.mean(step_doses))
    axes[1, 1].plot(range(1, len(avg_doses)+1), avg_doses, 'o-',
                     color=color, label=name, linewidth=2, markersize=6)

axes[1, 1].axhline(y=1.0, color='gray', linestyle='--', alpha=0.5, label='Standard dose')
axes[1, 1].set_xlabel('Treatment Cycle', fontsize=11)
axes[1, 1].set_ylabel('Average Dose (fraction of standard)', fontsize=11)
axes[1, 1].set_title('Dosing Trajectories Over Treatment', fontsize=13, fontweight='bold')
axes[1, 1].legend(fontsize=10)
axes[1, 1].set_ylim(0.5, 1.2)

plt.tight_layout()
plt.show()

## Individual Patient Case Studies

In [None]:
# Show detailed dosing trajectory for 3 sample patients

def _rollout_trajectory(agent, seed, max_steps=8):
    """
    Run one episode for `agent` on the patient created from `seed`.

    Returns (tumors, wbcs, doses): tumor size and WBC count before treatment
    and after each cycle, and the dose given in each cycle.
    """
    rollout_env = ChemoDoseEnv()
    obs = rollout_env.reset(seed=seed)
    # Stateful agents must start every patient from their initial dose.
    if hasattr(agent, 'reset'):
        agent.reset()

    tumors = [rollout_env.patient.tumor_size]
    wbcs = [rollout_env.patient.wbc]
    doses = []

    for _ in range(max_steps):
        action = agent.choose_action(obs)
        obs, _reward, done, info = rollout_env.step(action)
        tumors.append(info['tumor_size'])
        wbcs.append(info['wbc'])
        doses.append(info['dose'])
        if done:
            break

    return tumors, wbcs, doses


fig, axes = plt.subplots(3, 3, figsize=(16, 12))

# Greedy policy for the case studies (no exploration).
q_agent.epsilon = 0.0

# Training uses patient seeds 0..19999, so the case-study patients are drawn
# from outside that range to keep them unseen.
CASE_STUDY_SEED_BASE = 1_000_000

for patient_idx in range(3):
    seed = CASE_STUDY_SEED_BASE + patient_idx * 7
    tumor_ax, wbc_ax, dose_ax = axes[patient_idx]

    # Baselines on the same patient, drawn as dashed reference lines.
    for agent_obj, agent_name, color in [
        (fixed_agent, 'Fixed', 'red'),
        (rule_agent, 'Rule-Based', 'orange'),
    ]:
        tumors, wbcs, doses = _rollout_trajectory(agent_obj, seed)
        tumor_ax.plot(tumors, '--s', color=color, label=agent_name, markersize=3, alpha=0.8)
        wbc_ax.plot(wbcs, '--s', color=color, label=agent_name, markersize=3, alpha=0.8)
        dose_ax.plot(range(1, len(doses)+1), doses, '--s', color=color,
                     label=agent_name, markersize=4, alpha=0.8)

    # Q-learning agent
    q_tumors, q_wbcs, q_doses = _rollout_trajectory(q_agent, seed)

    # Plot tumor trajectory
    tumor_ax.plot(q_tumors, 'b-o', label='Q-Learning', markersize=4)
    tumor_ax.set_ylabel('Tumor Size (mm)', fontsize=10)
    tumor_ax.set_title(f'Patient {patient_idx+1}: Tumor Trajectory', fontsize=11, fontweight='bold')
    tumor_ax.legend(fontsize=9)
    tumor_ax.grid(True, alpha=0.3)

    # Plot WBC trajectory (a WBC count below 2000 is toxicity grade 3 or worse)
    wbc_ax.plot(q_wbcs, 'b-o', label='Q-Learning', markersize=4)
    wbc_ax.axhline(y=2000, color='red', linestyle='--', alpha=0.5, label='Grade 3 threshold')
    wbc_ax.set_ylabel('WBC (cells/uL)', fontsize=10)
    wbc_ax.set_title(f'Patient {patient_idx+1}: WBC Trajectory', fontsize=11, fontweight='bold')
    wbc_ax.legend(fontsize=9)
    wbc_ax.grid(True, alpha=0.3)

    # Plot dose decisions (Q-learning as bars, baselines as the lines above)
    dose_ax.bar(range(1, len(q_doses)+1), q_doses, color='blue', alpha=0.7, label='Q-Learning')
    dose_ax.axhline(y=1.0, color='gray', linestyle='--', alpha=0.5)
    dose_ax.set_ylabel('Dose Fraction', fontsize=10)
    dose_ax.set_title(f'Patient {patient_idx+1}: Dose Decisions', fontsize=11, fontweight='bold')
    dose_ax.set_ylim(0.4, 1.2)
    dose_ax.legend(fontsize=9)
    dose_ax.grid(True, alpha=0.3)

for ax in axes[2, :]:
    ax.set_xlabel('Treatment Cycle', fontsize=10)

plt.suptitle('Q-Learning Agent: Individual Patient Trajectories', fontsize=14, fontweight='bold', y=1.01)
plt.tight_layout()
plt.show()

print("The Q-learning agent adapts its dosing based on each patient's response.")
print("Notice how it reduces dose when toxicity rises and increases when the patient tolerates treatment well.")

## Summary and Key Findings

In [None]:
# Average episode reward per agent, computed once and reused below.
mean_reward = {
    label: np.mean(res['rewards'])
    for label, res in (('q', q_results), ('fixed', fixed_results), ('rule', rule_results))
}
# Relative gain of Q-learning over the fixed-dose baseline, in percent.
improvement = (mean_reward['q'] - mean_reward['fixed']) / abs(mean_reward['fixed']) * 100

print("\n".join([
    "=" * 60,
    "CASE STUDY SUMMARY",
    "=" * 60,
    "",
    "Problem: Adaptive chemotherapy dose optimization",
    "Company: NovaCure Therapeutics (simulated)",
    "Drug:    NC-4817 for advanced non-small-cell lung cancer",
    "",
    "RL Formulation:",
    "  States:  tumor size, WBC, toxicity grade, kidney function, cycle",
    "  Actions: 6 dose levels (0.6x to 1.1x standard)",
    "  Reward:  tumor shrinkage - toxicity penalty + completion bonus",
    "  Method:  Tabular Q-learning (gamma=0.95, epsilon-greedy)",
    "",
    "Key Results:",
    f"  Q-Learning avg reward:     {mean_reward['q']:>7.2f}",
    f"  Fixed dose avg reward:     {mean_reward['fixed']:>7.2f}",
    f"  Rule-based avg reward:     {mean_reward['rule']:>7.2f}",
    "",
    f"  Improvement over fixed dose: {improvement:+.1f}%",
    "",
    "The RL agent learned to personalize dosing for each patient,",
    "balancing tumor control against toxicity risk -- exactly the kind",
    "of sequential decision problem that RL was designed to solve.",
]))