In [1]:
%%writefile single_file_rl_training_final.py
#!/usr/bin/env python3
"""
Enhanced Single-file implementation of Reinforcement Learning for Hierarchical Employee Training Optimization
Based on the research paper by Soumedhik Bharati, Rupsha Sadhukhan, Debanjali Saha

Phase 2 Implementation: Environmental Complexity & Robustness Testing
- ADDED: Stochastic costs and outcomes to simulate real-world uncertainty.
- ADDED: A strategic "Wait" action to allow for more complex temporal planning.
- ADDED: New configuration options to control environmental complexity.
- ADDED: A new 'complex_compare' mode to test agent robustness under new conditions.

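Usage:
    # Phase 1 (Deterministic)
    # NOTE: the argument parser in this file only accepts `--mode train` and
    # `--mode complex_compare`; the `--mode compare` invocation below is rejected by it.
    python single_file_rl_training_final.py --mode train --episodes 3000
    python single_file_rl_training_final.py --mode compare --episodes 1000

    # Phase 2 (Stochastic & Strategic)
    python single_file_rl_training_final.py --mode complex_compare --episodes 10000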
"""

import os
import sys
import argparse
import time
import copy
from collections import defaultdict, deque
from typing import Dict, List, Tuple, Optional, Literal
from dataclasses import dataclass

# Core libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import pandas as pd

# Gymnasium for RL environment
import gymnasium as gym
from gymnasium import spaces

# PyTorch for neural networks
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.optim.lr_scheduler import StepLR

# =============================================================================
# CONFIGURATION AND HYPERPARAMETERS
# =============================================================================

@dataclass
class TrainingConfig:
    """Hyperparameters and environment settings for one training run.

    All fields have defaults, so ``TrainingConfig()`` yields a deterministic
    environment (no cost/outcome noise, no 'Wait' action). ``alpha`` defaults to
    ``None`` and is replaced by a fresh four-element list in ``__post_init__`` so
    that instances never share one mutable default.
    """
    # --- Training parameters ---
    num_episodes: int = 3000           # number of training episodes
    gamma: float = 0.99                # discount factor used when computing returns
    learning_rate: float = 3e-4        # Adam learning rate (policy and value networks)
    hidden_dim: int = 128              # width of the hidden layers
    use_baseline: bool = True          # if True, a value network is trained as baseline
    entropy_coefficient: float = 0.001 # weight of the entropy term in the policy loss

    # --- Learning rate scheduling (StepLR) ---
    lr_step_size: int = 750
    lr_gamma: float = 0.9

    # --- Environment parameters ---
    D: int = 8  # Number of skills
    K: int = 4  # Number of training modules
    alpha: Optional[List[float]] = None  # per-module learning rates; None -> default list
    beta: float = 0.01                   # per-step proportional skill decay
    kappa: float = 1.2                   # exponent applied to the remaining skill gap (1 - skill)
    C_max: float = 120.0                 # total training budget per episode

    # --- Reward strategy ---
    reward_strategy: Literal['basic', 'terminal', 'efficiency', 'hybrid'] = 'hybrid'
    cost_penalty: float = 0.01              # multiplied by the step cost and subtracted from the reward
    skill_amplifier: float = 1.0            # multiplied by the per-step skill improvement
    terminal_bonus_multiplier: float = 1.5  # NOTE(review): not read by the code visible in this file

    # --- Budget penalty (interpolated between base and max by overrun size) ---
    base_budget_penalty: float = 4.0
    max_budget_penalty: float = 8.0

    # --- PHASE 2: ENVIRONMENTAL COMPLEXITY ---
    cost_noise: float = 0.0       # Std deviation for cost randomness. Default 0 = deterministic.
    outcome_noise: float = 0.0    # Std deviation for outcome randomness. Default 0 = deterministic.
    add_wait_action: bool = False # If True, adds a "Wait" action to the action space.

    # --- Logging and saving ---
    log_interval: int = 50
    save_interval: int = 1000
    model_save_path: str = 'models/employee_training_model.pth'
    plot_save_path: str = 'plots/'

    # --- Evaluation parameters ---
    eval_episodes: int = 100
    eval_render: bool = False

    def __post_init__(self):
        """Fill in the default per-module learning rates when none were supplied."""
        if self.alpha is None:
            self.alpha = [0.3, 0.25, 0.2, 0.35]

# =============================================================================
# COMPLEXITY-AWARE EMPLOYEE TRAINING ENVIRONMENT
# =============================================================================

class EmployeeTrainingEnv(gym.Env):
    """
    Gymnasium environment with optional stochasticity and a strategic 'Wait' action.

    Observation: the current skill vector, shape ``(D,)``, values clipped to [0, 1].
    Actions: ``0..K-1`` apply a training module; when ``config.add_wait_action`` is
    set, action ``K`` is 'Wait' (skills only decay, no cost is incurred).

    An episode ends when ``max_episode_length`` steps have been taken, when the
    accumulated cost reaches ``C_max``, or when a chosen module would push the
    accumulated cost above ``C_max`` (in which case the module is NOT applied and
    a budget penalty is returned as the reward).
    """

    metadata = {"render_modes": ["human"]}

    # Reward handed out for the 'Wait' action: small negative to discourage passivity.
    WAIT_REWARD = -0.1

    def __init__(self, config: TrainingConfig):
        """Build the environment from ``config``.

        Raises:
            ValueError: if ``config.D`` / ``config.K`` / ``config.alpha`` do not fit the
                module cost, target and synergy tables that are hard-coded below.
        """
        super().__init__()

        self.config = config
        self.D = config.D
        self.K = config.K
        self.beta = config.beta
        self.kappa = config.kappa
        self.C_max = config.C_max
        self.gamma = config.gamma

        # --- PHASE 2: Store complexity parameters ---
        self.cost_noise = config.cost_noise
        self.outcome_noise = config.outcome_noise
        self.add_wait_action = config.add_wait_action

        # Learning rates and costs
        self.alpha = np.array(config.alpha, dtype=np.float32)
        self.costs = np.array([10.0, 15.0, 20.0, 12.0], dtype=np.float32)

        # Module index -> indices of the skills it trains directly.
        self.module_targets = {0: [0, 1], 1: [2, 3], 2: [4, 5], 3: [6, 7]}

        # Fail fast: the tables above (and the synergy matrix) are fixed-size.
        self._validate_dimensions()
        self.synergy_matrix = self._initialize_synergy_matrix()

        # --- PHASE 2: Dynamic Action Space ---
        self.K_total = self.K + 1 if self.add_wait_action else self.K
        self.action_space = spaces.Discrete(self.K_total)
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(self.D,), dtype=np.float32)

        # Episode state
        self.current_skills = None
        self.current_cost = 0.0
        self.episode_length = 0
        self.max_episode_length = 50
        self.initial_skills = None

    def _validate_dimensions(self) -> None:
        """Reject configurations that the hard-coded module tables cannot serve."""
        if self.D < 8:
            # module_targets and the synergy matrix index skills 0..7.
            raise ValueError(f"config.D must be at least 8 (got {self.D})")
        supported_modules = min(len(self.costs), len(self.module_targets), len(self.alpha))
        if self.K > supported_modules:
            raise ValueError(
                f"config.K={self.K} exceeds the {supported_modules} modules for which "
                f"costs, targets and learning rates are defined"
            )

    def _initialize_synergy_matrix(self) -> np.ndarray:
        """Return the symmetric (D, D) matrix of cross-skill synergy weights."""
        synergy = np.zeros((self.D, self.D))
        synergy[0, 1] = synergy[1, 0] = 0.3; synergy[0, 2] = synergy[2, 0] = 0.2
        synergy[1, 2] = synergy[2, 1] = 0.4; synergy[2, 3] = synergy[3, 2] = 0.3
        synergy[4, 5] = synergy[5, 4] = 0.4; synergy[4, 6] = synergy[6, 4] = 0.3
        synergy[5, 6] = synergy[6, 5] = 0.2; synergy[6, 7] = synergy[7, 6] = 0.3
        synergy[1, 7] = synergy[7, 1] = 0.15; synergy[3, 5] = synergy[5, 3] = 0.1
        return synergy

    def reset(self, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[np.ndarray, Dict]:
        """Start a new episode with skills drawn uniformly from [0.1, 0.6).

        Returns:
            (observation, info) where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_skills = self.np_random.uniform(0.1, 0.6, size=self.D).astype(np.float32)
        self.initial_skills = self.current_skills.copy()
        self.current_cost = 0.0
        self.episode_length = 0
        return self.current_skills.copy(), {}

    def _build_info(self, reported_cost, skill_improvement, terminated: bool,
                    budget_exceeded: bool, waited: bool = False) -> Dict:
        """Assemble the info dict so that every step outcome exposes the same keys.

        ``reported_cost`` is the cost shown to the caller; on a budget overrun this is
        the attempted total, which is not committed to ``self.current_cost``.
        """
        return {
            "waited": waited,
            "current_cost": reported_cost,
            "episode_length": self.episode_length,
            "skill_improvement": skill_improvement,
            "total_skill_improvement": np.sum(self.current_skills - self.initial_skills),
            "budget_utilization": reported_cost / self.C_max,
            "terminated": terminated,
            "budget_exceeded": budget_exceeded,
        }

    def _step_wait(self) -> Tuple[np.ndarray, float, bool, bool, Dict]:
        """Handle the 'Wait' action: skills decay, no cost, fixed small penalty."""
        prev_skills = self.current_skills.copy()
        # Only apply skill decay (forgetting)
        self.current_skills = np.clip(self.current_skills - self.beta * self.current_skills, 0.0, 1.0)
        self.episode_length += 1
        terminated = self.episode_length >= self.max_episode_length

        info = self._build_info(
            reported_cost=self.current_cost,
            skill_improvement=np.sum(self.current_skills - prev_skills),  # will be negative
            terminated=terminated,
            budget_exceeded=False,
            waited=True,
        )
        return self.current_skills.copy(), self.WAIT_REWARD, terminated, False, info

    def _step_over_budget(self, attempted_cost) -> Tuple[np.ndarray, float, bool, bool, Dict]:
        """Terminate the episode because the chosen module would exceed the budget.

        The module is not applied and neither the cost nor the step counter is
        advanced; the reward is a penalty that grows with the size of the overrun.
        """
        overrun_amount = attempted_cost - self.C_max
        penalty_scale = min(1.0, overrun_amount / self.C_max)
        budget_penalty = -(self.config.base_budget_penalty
                           + penalty_scale * (self.config.max_budget_penalty - self.config.base_budget_penalty))

        info = self._build_info(
            reported_cost=attempted_cost,
            skill_improvement=0.0,
            terminated=True,
            budget_exceeded=True,
        )
        info["overrun_amount"] = overrun_amount
        info["budget_penalty"] = budget_penalty
        return self.current_skills.copy(), budget_penalty, True, False, info

    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict]:
        """Execute one step, now handling the 'Wait' action and stochasticity.

        Returns:
            ``(observation, reward, terminated, truncated, info)``; ``truncated`` is
            always False.

        Raises:
            ValueError: if ``action`` is outside ``[0, K_total)``.
        """
        if not 0 <= action < self.K_total:
            raise ValueError(f"Invalid action: {action}")

        # --- PHASE 2: Handle the strategic "Wait" action ---
        if self.add_wait_action and action == self.K:
            return self._step_wait()

        # --- Regular training action ---
        # --- PHASE 2: Implement Stochastic Costs ---
        base_cost = self.costs[action]
        if self.cost_noise > 0:
            # Noisy cost, floored at 1.0 so a module is never free or negative.
            action_cost = max(1.0, self.np_random.normal(loc=base_cost, scale=self.cost_noise))
        else:
            action_cost = base_cost

        attempted_cost = self.current_cost + action_cost
        if attempted_cost > self.C_max:
            return self._step_over_budget(attempted_cost)

        prev_skills = self.current_skills.copy()
        self.current_skills = self._apply_training(self.current_skills, action)

        self.current_cost += action_cost
        self.episode_length += 1

        terminated = (self.episode_length >= self.max_episode_length or self.current_cost >= self.C_max)
        reward = self._calculate_reward(prev_skills, self.current_skills, action_cost, terminated)

        info = self._build_info(
            reported_cost=self.current_cost,
            skill_improvement=np.sum(self.current_skills - prev_skills),
            terminated=terminated,
            budget_exceeded=False,
        )
        return self.current_skills.copy(), reward, terminated, False, info

    def _apply_training(self, skills: np.ndarray, action: int) -> np.ndarray:
        """Apply training module, now with stochastic outcomes.

        Targeted skills gain ``alpha * (1 - skill) ** kappa``; every other skill gains
        the same gap term weighted by its synergy with each targeted skill. All
        skills additionally decay by ``beta * skill``. The result is clipped to [0, 1].
        """
        new_skills = skills.copy()

        # --- PHASE 2: Implement Stochastic Outcomes ---
        base_alpha = self.alpha[action]
        if self.outcome_noise > 0:
            # Noisy learning rate, floored at 0 so training never actively harms.
            alpha_a = max(0.0, self.np_random.normal(loc=base_alpha, scale=self.outcome_noise))
        else:
            alpha_a = base_alpha

        target_attributes = self.module_targets[action]
        for j in range(self.D):
            gap = (1 - skills[j]) ** self.kappa  # diminishing returns near mastery
            if j in target_attributes:
                delta_j = gap
            else:
                delta_j = 0.0
                for k in target_attributes:
                    delta_j += self.synergy_matrix[j, k] * gap
            new_skills[j] = skills[j] + alpha_a * delta_j - self.beta * skills[j]

        return np.clip(new_skills, 0.0, 1.0)

    def _calculate_reward(self, prev_skills: np.ndarray, new_skills: np.ndarray,
                         cost: float, terminated: bool) -> float:
        """Reward for a regular (non-wait, within-budget) training step.

        With the 'hybrid' strategy the reward is amplified skill gain minus a cost
        penalty, plus, on a terminal step within budget, a bonus for total improvement
        and for unspent budget. Every other strategy value currently falls back to
        the plain gain-minus-cost reward.
        """
        skill_improvement = np.sum(new_skills - prev_skills)
        if self.config.reward_strategy == 'hybrid':
            base_reward = self.config.skill_amplifier * skill_improvement
            cost_penalty = self.config.cost_penalty * cost
            if terminated and self.current_cost <= self.C_max:
                total_improvement = np.sum(self.current_skills - self.initial_skills)
                efficiency_bonus = ((self.C_max - self.current_cost) / self.C_max) * 2.0
                improvement_bonus = total_improvement * 0.5
                return base_reward - cost_penalty + improvement_bonus + efficiency_bonus
            return base_reward - cost_penalty
        # Other reward strategies can be added here if needed
        else:
             return self.config.skill_amplifier * skill_improvement - self.config.cost_penalty * cost

    def render(self, mode: str = "human") -> None:
        """No-op placeholder; nothing is rendered."""
        pass # Render logic omitted for brevity, but would be included here

# =============================================================================
# NEURAL NETWORKS AND AGENT (ADAPTIVE TO ACTION SPACE)
# =============================================================================

class PolicyNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a state vector to action probabilities."""

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        # Layer names are part of the checkpoint format (state_dict keys).
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for every linear layer."""
        linear_layers = [layer for layer in self.modules() if isinstance(layer, nn.Linear)]
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return a probability distribution over actions along the last dimension."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return torch.softmax(logits, dim=-1)

class ValueNetwork(nn.Module):
    """Two-hidden-layer MLP mapping a state vector to a single scalar value."""

    def __init__(self, input_dim: int, hidden_dim: int):
        super().__init__()
        # Layer names are part of the checkpoint format (state_dict keys).
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)
        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights and zero biases for every linear layer."""
        linear_layers = [layer for layer in self.modules() if isinstance(layer, nn.Linear)]
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the unbounded value estimate, with a trailing dimension of size 1."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

class EnhancedREINFORCEAgent:
    """REINFORCE agent with an optional learned value baseline and entropy bonus.

    Usage per episode: call ``select_action`` for every step, append each step's
    reward to ``self.rewards``, then call ``update_policy`` once at episode end.
    """

    def __init__(self, config: TrainingConfig):
        """Build the policy (and optionally value) network, optimizers and schedulers."""
        self.config = config
        self.state_dim = config.D
        # --- PHASE 2: Dynamic Action Dimension ---
        self.action_dim = config.K + 1 if config.add_wait_action else config.K
        self.use_baseline = config.use_baseline
        self.entropy_coefficient = config.entropy_coefficient

        self.policy_net = PolicyNetwork(self.state_dim, config.hidden_dim, self.action_dim)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=config.learning_rate)
        self.policy_scheduler = StepLR(self.policy_optimizer, step_size=config.lr_step_size, gamma=config.lr_gamma)

        if self.use_baseline:
            self.value_net = ValueNetwork(self.state_dim, config.hidden_dim)
            self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=config.learning_rate)
            self.value_scheduler = StepLR(self.value_optimizer, step_size=config.lr_step_size, gamma=config.lr_gamma)

        # Per-episode buffers, cleared at the end of update_policy().
        self.log_probs, self.rewards, self.states, self.entropies = [], [], [], []
        self.training_stats = defaultdict(list)

    def select_action(self, state: np.ndarray) -> int:
        """Sample an action from the current policy and record it for the next update.

        Side effects: appends the action's log-probability, the distribution entropy
        and the state tensor to the per-episode buffers.
        """
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        action_probs = self.policy_net(state_tensor)
        dist = Categorical(action_probs)
        action = dist.sample()
        self.log_probs.append(dist.log_prob(action))
        self.entropies.append(dist.entropy())
        self.states.append(state_tensor)
        return action.item()

    def _discounted_returns(self, gamma: float) -> torch.Tensor:
        """Return the discounted return for every buffered step, in step order."""
        returns = []
        running = 0.0
        # Accumulate back-to-front, then flip once (avoids quadratic list.insert(0, ...)).
        for reward in reversed(self.rewards):
            running = reward + gamma * running
            returns.append(running)
        returns.reverse()
        return torch.tensor(returns, dtype=torch.float32)

    def update_policy(self, gamma: float) -> Dict[str, float]:
        """Run one policy-gradient update from the buffered episode and clear the buffers.

        Returns are normalised within the episode (when it has more than one step).
        With a baseline, the advantage is ``return - V(state)`` and the value network
        is regressed onto the same normalised returns.

        Returns:
            ``{}`` if no rewards were buffered, otherwise a dict with ``policy_loss``,
            ``value_loss`` and ``entropy_loss`` (the mean policy entropy).

        Raises:
            ValueError: if the number of buffered rewards differs from the number of
                actions recorded by ``select_action``.
        """
        if not self.rewards:
            return {}
        if len(self.log_probs) != len(self.rewards):
            raise ValueError(
                f"Buffered {len(self.rewards)} rewards for {len(self.log_probs)} selected actions"
            )

        returns = self._discounted_returns(gamma)
        if len(returns) > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Each buffered log-prob has shape (1,); concatenate to one (T,) vector.
        log_probs = torch.cat(self.log_probs)

        if self.use_baseline:
            # One batched forward pass instead of one pass per step.
            states = torch.cat(self.states, dim=0)           # (T, state_dim)
            baselines = self.value_net(states).squeeze(-1)   # (T,) to match `returns`
            advantages = returns - baselines.detach()        # no policy gradient into the critic
            value_loss = F.mse_loss(baselines, returns)
        else:
            advantages = returns
            value_loss = torch.tensor(0.0)

        self.policy_optimizer.zero_grad()
        policy_loss = -(log_probs * advantages).mean()
        entropy_loss = torch.stack(self.entropies).mean()
        # Subtracting the entropy term rewards more exploratory policies.
        total_policy_loss = policy_loss - self.entropy_coefficient * entropy_loss
        total_policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
        self.policy_optimizer.step()
        self.policy_scheduler.step()

        if self.use_baseline:
            self.value_optimizer.zero_grad()
            value_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.value_net.parameters(), 1.0)
            self.value_optimizer.step()
            self.value_scheduler.step()

        self.log_probs, self.rewards, self.states, self.entropies = [], [], [], []
        return {"policy_loss": policy_loss.item(), "value_loss": value_loss.item(), "entropy_loss": entropy_loss.item()}

    def load_model(self, filepath: str):
        """Load policy weights from ``filepath``; print a warning if the file is missing.

        The checkpoint is expected to be a dict with a ``'policy_net'`` state dict.
        """
        if not os.path.exists(filepath):
            print(f"Warning: No model found at {filepath}")
            return

        # SECURITY: torch.load unpickles the file; only load checkpoints you trust.
        # map_location keeps GPU-saved checkpoints loadable on CPU-only hosts
        # (the networks here are never moved off the CPU).
        checkpoint = torch.load(filepath, map_location="cpu")
        self.policy_net.load_state_dict(checkpoint['policy_net'])
        print(f"Model loaded from {filepath}")


# =============================================================================
# ENHANCED TRAINING LOOP AND ANALYSIS
# =============================================================================

# Note: The EnhancedTrainingLoop class from the previous version is used here.
# For brevity, it is not repeated in its entirety. The key change is in the
# _plot_policy_trajectory method to handle the wait action.

class EnhancedTrainingLoop:
    """Drives training and greedy evaluation of an agent on an environment."""

    def __init__(self, env: EmployeeTrainingEnv, agent: EnhancedREINFORCEAgent, config: TrainingConfig):
        self.env = env
        self.agent = agent
        self.config = config
        # Full per-episode history.
        self.episode_rewards, self.episode_rois, self.skill_improvements, self.episode_costs = [], [], [], []
        self.budget_exceeded_episodes, self.success_episodes = [], []
        # Sliding windows (last 100 episodes) used for progress logging.
        self.recent_rewards, self.recent_budget_exceeded = deque(maxlen=100), deque(maxlen=100)

    def run_episode(self) -> Dict[str, float]:
        """Play one training episode with sampled actions, then update the agent.

        Returns:
            Episode metrics (reward, ROI, skill improvement, cost, budget flag) merged
            with the loss values returned by ``agent.update_policy``.
        """
        state, _ = self.env.reset()
        episode_reward = 0
        budget_exceeded = False
        info = {}

        while True:
            action = self.agent.select_action(state)
            next_state, reward, terminated, truncated, info = self.env.step(action)
            self.agent.rewards.append(reward)
            episode_reward += reward
            if info.get('budget_exceeded', False):
                budget_exceeded = True
            state = next_state
            if terminated or truncated:
                break

        losses = self.agent.update_policy(self.config.gamma)
        # `info` is the dict from the final step of the episode.
        total_skill_improvement = info.get('total_skill_improvement', 0)
        final_cost = info.get('current_cost', 0)
        # ROI = skill gained per unit of cost; 0 when nothing was spent.
        roi = (total_skill_improvement / final_cost) if final_cost > 0 else 0.0

        return {'episode_reward': episode_reward, 'roi': roi, 'skill_improvement': total_skill_improvement,
                'episode_cost': final_cost, 'budget_exceeded': budget_exceeded, **losses}

    def train(self):
        """Run ``config.num_episodes`` training episodes, logging every ``log_interval``."""
        print(f"Starting Training for {self.config.num_episodes} episodes...")
        if self.config.cost_noise > 0 or self.config.outcome_noise > 0:
            print(f"COMPLEXITY: Stochastic Env (Cost Noise: {self.config.cost_noise}, Outcome Noise: {self.config.outcome_noise})")
        if self.config.add_wait_action:
            print(f"COMPLEXITY: Strategic 'Wait' action is ENABLED.")

        for episode in range(self.config.num_episodes):
            metrics = self.run_episode()
            self.episode_rewards.append(metrics['episode_reward'])
            self.episode_rois.append(metrics['roi'])
            self.skill_improvements.append(metrics['skill_improvement'])
            self.episode_costs.append(metrics['episode_cost'])
            self.budget_exceeded_episodes.append(metrics['budget_exceeded'])
            # "Success" = positive total reward without blowing the budget.
            self.success_episodes.append(metrics['episode_reward'] > 0 and not metrics['budget_exceeded'])
            self.recent_rewards.append(metrics['episode_reward'])
            self.recent_budget_exceeded.append(metrics['budget_exceeded'])

            if (episode + 1) % self.config.log_interval == 0:
                avg_reward = np.mean(self.recent_rewards)
                budget_exceed_rate = np.mean(self.recent_budget_exceeded)
                print(f"Ep {episode+1:5d} | Avg Reward: {avg_reward:7.2f} | Budget Exceed Rate: {budget_exceed_rate:5.1%}")

    def evaluate(self, num_episodes: int = 100, visualize_trajectory: bool = False) -> Dict[str, object]:
        """Evaluate the greedy (argmax) policy without updating the agent.

        Args:
            num_episodes: number of evaluation episodes to run.
            visualize_trajectory: if True, plot a heatmap of the chosen actions.

        Returns:
            Dict with mean/std reward, mean cost, mean skill improvement, the rate of
            budget overruns, and the raw per-episode costs and skill improvements.
        """
        print(f"\nEvaluating policy over {num_episodes} episodes...")
        eval_rewards, eval_costs, eval_skill_improvements, eval_budget_exceeded, action_trajectories = [], [], [], [], []

        for _ in range(num_episodes):
            state, _ = self.env.reset()
            episode_reward, episode_cost, total_skill_improvement = 0, 0, 0
            current_trajectory = []
            while True:
                with torch.no_grad():
                    action = torch.argmax(self.agent.policy_net(torch.FloatTensor(state).unsqueeze(0))).item()
                current_trajectory.append(action)
                next_state, reward, terminated, truncated, info = self.env.step(action)
                episode_reward += reward
                state = next_state
                if terminated or truncated:
                    episode_cost = info.get('current_cost', 0)
                    total_skill_improvement = info.get('total_skill_improvement', 0)
                    eval_budget_exceeded.append(info.get('budget_exceeded', False))
                    break
            eval_rewards.append(episode_reward)
            eval_costs.append(episode_cost)
            eval_skill_improvements.append(total_skill_improvement)
            action_trajectories.append(current_trajectory)

        if visualize_trajectory:
            self._plot_policy_trajectory(action_trajectories)

        return {
            'mean_reward': np.mean(eval_rewards), 'std_reward': np.std(eval_rewards),
            'mean_cost': np.mean(eval_costs), 'mean_skill_improvement': np.mean(eval_skill_improvements),
            'budget_exceeded_rate': np.mean(eval_budget_exceeded),
            'raw_costs': eval_costs, 'raw_skill_improvements': eval_skill_improvements,
        }

    def _plot_policy_trajectory(self, action_trajectories: List[List[int]]):
        """Visualize agent's action sequences, now handling the Wait action.

        Rows are evaluation episodes, columns are steps; shorter episodes are padded
        and the padding is masked out so it is not drawn in any action's colour.
        """
        print("\nGenerating policy trajectory heatmap...")
        if not action_trajectories:
            # Nothing to draw (e.g. evaluate() was called with zero episodes).
            return

        fig = plt.figure(figsize=(15, 10))
        max_len = max(len(t) for t in action_trajectories)
        padded = np.full((len(action_trajectories), max_len), -1.0)
        for i, t in enumerate(action_trajectories):
            padded[i, :len(t)] = t

        # Dynamically create colormap and labels
        num_actions = self.env.K_total
        # pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
        cmap = plt.get_cmap('viridis', num_actions)
        ticks = np.arange(num_actions)
        tick_labels = [f'Module {i}' for i in range(self.env.K)]
        if self.env.add_wait_action:
            tick_labels.append('Wait')

        # Without the mask, the -1 padding is clipped to vmin and rendered as "Module 0".
        ax = sns.heatmap(padded, mask=padded < 0, cmap=cmap, vmin=-0.5, vmax=num_actions - 0.5,
                         cbar_kws={'ticks': ticks, 'label': 'Action Chosen'})
        cbar = ax.collections[0].colorbar
        cbar.set_ticklabels(tick_labels)

        plt.title(f'Policy Trajectories over {len(action_trajectories)} Episodes')
        plt.xlabel('Step in Episode'); plt.ylabel('Evaluation Episode')
        plt.show()
        # Release the figure; repeated evaluations would otherwise accumulate open figures.
        plt.close(fig)

# =============================================================================
# PERFORMANCE COMPARISON FRAMEWORK
# =============================================================================

class PerformanceComparison:
    """Trains and evaluates one agent per named configuration and tabulates the results."""

    def __init__(self):
        # name -> {'config': TrainingConfig, 'eval_results': dict returned by evaluate()}
        self.results = {}

    def run_comparison(self, configurations: Dict[str, TrainingConfig], episodes: int,
                       seed: Optional[int] = None, eval_episodes: int = 200,
                       visualize_trajectory: bool = True):
        """Train and evaluate every configuration in turn.

        Args:
            configurations: mapping of display name to training configuration. The
                passed-in objects are not modified; each run works on a copy.
            episodes: number of training episodes applied to every configuration.
            seed: seed for each environment's random generator. When None, one seed
                is drawn from NumPy's global generator, so a prior
                ``np.random.seed(...)`` makes the whole comparison reproducible.
            eval_episodes: number of greedy evaluation episodes per configuration.
            visualize_trajectory: whether to plot the action heatmap after evaluation.

        Returns:
            ``self.results``.
        """
        # Every configuration gets the same environment seed so that differences in
        # the results come from the configuration, not from the random stream.
        env_seed = seed if seed is not None else int(np.random.randint(0, 2**31 - 1))

        for name, base_config in configurations.items():
            print(f"\n{'='*30}\nTesting Configuration: {name}\n{'='*30}")
            # Copy before overriding num_episodes so the caller's config stays untouched.
            config = copy.copy(base_config)
            config.num_episodes = episodes
            env = EmployeeTrainingEnv(config)
            # Seeds the environment's generator; later unseeded resets continue this stream.
            env.reset(seed=env_seed)
            agent = EnhancedREINFORCEAgent(config)
            loop = EnhancedTrainingLoop(env, agent, config)
            loop.train()
            eval_results = loop.evaluate(num_episodes=eval_episodes, visualize_trajectory=visualize_trajectory)
            self.results[name] = {'config': config, 'eval_results': eval_results}
        return self.results

    def print_summary(self):
        """Print one table row of evaluation statistics per configuration."""
        print("\n" + "="*80)
        print("COMPLEX ENVIRONMENT PERFORMANCE COMPARISON SUMMARY")
        print("="*80)
        print(f"{'Configuration':<25} {'Mean Reward':<15} {'Std Reward':<15} {'Budget Exceed':<15} {'Mean Skill Imp.':<20}")
        print("-" * 80)
        for name, result in self.results.items():
            res = result['eval_results']
            print(f"{name:<25} {res['mean_reward']:<15.2f} {res['std_reward']:<15.2f} "
                  f"{res['budget_exceeded_rate']:<15.1%} {res['mean_skill_improvement']:<20.2f}")

# =============================================================================
# MAIN EXECUTION AND CONFIGURATIONS
# =============================================================================

def create_directories():
    """Ensure the 'models' and 'plots' folders exist under the current working directory."""
    for folder in ('models', 'plots'):
        os.makedirs(folder, exist_ok=True)

def get_complex_env_configurations() -> Dict[str, TrainingConfig]:
    """Build the three robustness-test configurations (deterministic, noisy, noisy + Wait).

    All three share the same optimisation and reward settings and differ only in
    the environment-complexity switches.
    """
    # Settings carried over from the Phase 1 champion.
    shared = dict(
        learning_rate=5e-4,
        cost_penalty=0.02,
        base_budget_penalty=5.0,
        max_budget_penalty=10.0,
        reward_strategy='hybrid',
        entropy_coefficient=0.001,
        lr_step_size=2500,
        lr_gamma=0.9,
    )

    # (name, cost noise std, outcome noise std, wait action enabled)
    variants = [
        ('A_Champion_Deterministic', 0.0, 0.0, False),    # baseline, no complexity
        ('B_Champion_Stochastic', 3.0, 0.05, False),      # noisy costs and outcomes
        ('C_Champion_Stochastic_Wait', 3.0, 0.05, True),  # noise plus the Wait action
    ]

    configurations = {}
    for label, cost_std, outcome_std, wait_enabled in variants:
        configurations[label] = TrainingConfig(
            **shared,
            cost_noise=cost_std,
            outcome_noise=outcome_std,
            add_wait_action=wait_enabled,
        )
    return configurations

def main():
    """Command-line entry point: parse arguments, seed the RNGs and run the chosen mode.

    Modes:
        complex_compare: train and evaluate every configuration returned by
            ``get_complex_env_configurations`` and print a summary table.
        train: train and evaluate a single stochastic configuration with the
            'Wait' action enabled.
    """
    parser = argparse.ArgumentParser(description='Robustness Testing for Employee Training RL Agent')
    parser.add_argument('--mode', choices=['train', 'complex_compare'], default='complex_compare')
    parser.add_argument('--episodes', type=int, default=10000, help='Number of training episodes')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    args = parser.parse_args()

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    create_directories()

    print("RL for Employee Training Optimization - Phase 2: Robustness Testing")
    print(f"Mode: {args.mode}, Episodes: {args.episodes}, Seed: {args.seed}")

    if args.mode == 'complex_compare':
        configs = get_complex_env_configurations()
        comparison = PerformanceComparison()
        comparison.run_comparison(configurations=configs, episodes=args.episodes)
        comparison.print_summary()

    elif args.mode == 'train':
        # Example of training a single complex configuration
        config = TrainingConfig(
            num_episodes=args.episodes,
            learning_rate=5e-4, cost_penalty=0.02,
            base_budget_penalty=5.0, max_budget_penalty=10.0,
            cost_noise=3.0, outcome_noise=0.05, add_wait_action=True
        )
        env = EmployeeTrainingEnv(config)
        # The environment draws from its own generator (env.np_random), which the
        # global seeds above do not touch. Seeding it once here makes --seed cover
        # the initial skills and the cost/outcome noise; the unseeded resets done
        # during training continue the same random stream.
        env.reset(seed=args.seed)
        agent = EnhancedREINFORCEAgent(config)
        loop = EnhancedTrainingLoop(env, agent, config)
        loop.train()
        loop.evaluate(num_episodes=200, visualize_trajectory=True)

    print("\nExecution completed.")

if __name__ == "__main__":
    main()

Writing single_file_rl_training_final.py


In [2]:
!python single_file_rl_training_final.py --mode complex_compare --episodes 100000

RL for Employee Training Optimization - Phase 2: Robustness Testing
Mode: complex_compare, Episodes: 100000, Seed: 42

Testing Configuration: A_Champion_Deterministic
Starting Training for 100000 episodes...
Ep    50 | Avg Reward:   -4.62 | Budget Exceed Rate: 92.0%
Ep   100 | Avg Reward:   -4.67 | Budget Exceed Rate: 93.0%
Ep   150 | Avg Reward:   -4.59 | Budget Exceed Rate: 93.0%
Ep   200 | Avg Reward:   -4.21 | Budget Exceed Rate: 89.0%
Ep   250 | Avg Reward:   -3.62 | Budget Exceed Rate: 81.0%
Ep   300 | Avg Reward:   -3.90 | Budget Exceed Rate: 85.0%
Ep   350 | Avg Reward:   -4.34 | Budget Exceed Rate: 91.0%
Ep   400 | Avg Reward:   -4.29 | Budget Exceed Rate: 89.0%
Ep   450 | Avg Reward:   -4.31 | Budget Exceed Rate: 89.0%
Ep   500 | Avg Reward:   -3.97 | Budget Exceed Rate: 85.0%
Ep   550 | Avg Reward:   -3.61 | Budget Exceed Rate: 79.0%
Ep   600 | Avg Reward:   -3.71 | Budget Exceed Rate: 81.0%
Ep   650 | Avg Reward:   -4.11 | Budget Exceed Rate: 88.0%
Ep   70

In [3]:
# !python single_file_rl_training_final.py --mode train \
#     --episodes 25000 \
#     --reward-strategy terminal \
#     --cost-penalty 0.008 \
#     --entropy-coef 0.001 \
#     --base-budget-penalty 5.0 \
#     --max-budget-penalty 10.0 \
#     --terminal-bonus-multiplier 3.5