In [37]:
%%writefile single_file_rl_training.py
#!/usr/bin/env python3
"""
Single-file implementation of Reinforcement Learning for Hierarchical Employee Training Optimization
Based on the research paper by Soumedhik Bharati, Rupsha Sadhukhan, Debanjali Saha

This file contains all components in a single executable:
- Custom Gymnasium environment for employee training
- REINFORCE agent with optional Actor-Critic baseline
- Training loop with logging and evaluation
- Visualization utilities
- Main execution and example usage

Usage:
    python single_file_rl_training.py --mode train --episodes 1000
    python single_file_rl_training.py --mode evaluate --model models/trained_model.pth
    python single_file_rl_training.py --mode visualize
"""

import os
import sys
import argparse
import time
import copy
from collections import defaultdict, deque
from typing import Dict, List, Tuple, Optional

# Core libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Gymnasium for RL environment
import gymnasium as gym
from gymnasium import spaces

# PyTorch for neural networks
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical


# =============================================================================
# EMPLOYEE TRAINING ENVIRONMENT
# =============================================================================

class EmployeeTrainingEnv(gym.Env):
    """
    Custom Gymnasium environment for employee training optimization.
    Implements hierarchical skill structure with cross-attribute synergy and forgetting.
    """
    
    metadata = {"render_modes": ["human"]}
    
    def __init__(self, 
                 D: int = 8,  # Number of leaf-level sub-attributes
                 K: int = 4,  # Number of training modules
                 alpha: List[float] = None,  # Learning rates for each module
                 beta: float = 0.02,  # Forgetting rate
                 kappa: float = 1.5,  # Diminishing returns exponent
                 C_max: float = 100.0,  # Maximum cost budget
                 gamma: float = 0.99):  # Discount factor
        
        super().__init__()
        
        self.D = D  # Number of sub-attributes
        self.K = K  # Number of training modules
        self.beta = beta
        self.kappa = kappa
        self.C_max = C_max
        self.gamma = gamma
        
        # Learning rates for each training module
        self.alpha = alpha if alpha is not None else [0.3, 0.25, 0.2, 0.35]
        
        # Training module costs
        self.costs = [10.0, 15.0, 20.0, 12.0]  # Cost for each training module
        
        # Define which sub-attributes each training module targets
        self.module_targets = {
            0: [0, 1],      # Technical Skills: Coding, Debugging
            1: [2, 3],      # Technical Skills: Testing, Architecture
            2: [4, 5],      # Soft Skills: Communication, Leadership
            3: [6, 7]       # Soft Skills: Teamwork, Problem-solving
        }
        
        # Cross-attribute synergy matrix (ρjk)
        self.synergy_matrix = self._initialize_synergy_matrix()
        
        # Gymnasium spaces
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(D,), dtype=np.float32)
        self.action_space = spaces.Discrete(K)
        
        # Episode state
        self.current_skills = None
        self.current_cost = 0.0
        self.episode_length = 0
        self.max_episode_length = 50
        
    def _initialize_synergy_matrix(self) -> np.ndarray:
        """Initialize the cross-attribute synergy matrix."""
        synergy = np.zeros((self.D, self.D))
        
        # Within technical skills
        synergy[0, 1] = synergy[1, 0] = 0.3  # Coding <-> Debugging
        synergy[0, 2] = synergy[2, 0] = 0.2  # Coding <-> Testing
        synergy[1, 2] = synergy[2, 1] = 0.4  # Debugging <-> Testing
        synergy[2, 3] = synergy[3, 2] = 0.3  # Testing <-> Architecture
        
        # Within soft skills
        synergy[4, 5] = synergy[5, 4] = 0.4  # Communication <-> Leadership
        synergy[4, 6] = synergy[6, 4] = 0.3  # Communication <-> Teamwork
        synergy[5, 6] = synergy[6, 5] = 0.2  # Leadership <-> Teamwork
        synergy[6, 7] = synergy[7, 6] = 0.3  # Teamwork <-> Problem-solving
        
        # Cross-domain synergies (technical to soft)
        synergy[1, 7] = synergy[7, 1] = 0.15  # Debugging <-> Problem-solving
        synergy[3, 5] = synergy[5, 3] = 0.1   # Architecture <-> Leadership
        
        return synergy
    
    def reset(self, seed: Optional[int] = None, options: Optional[Dict] = None) -> Tuple[np.ndarray, Dict]:
        """Reset the environment to initial state."""
        super().reset(seed=seed)
        
        # Initialize skills randomly between 0.1 and 0.6
        self.current_skills = self.np_random.uniform(0.1, 0.6, size=self.D).astype(np.float32)
        self.current_cost = 0.0
        self.episode_length = 0
        
        return self.current_skills.copy(), {}
    
    def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict]:
        """Execute one step in the environment."""
        if action < 0 or action >= self.K:
            raise ValueError(f"Invalid action: {action}")
        
        # Calculate cost and check budget constraint
        action_cost = self.costs[action]
        if self.current_cost + action_cost > self.C_max:
            # Episode terminates if budget exceeded
            return self.current_skills.copy(), -10.0, True, False, {"budget_exceeded": True}
        
        # Store previous skills for reward calculation
        prev_skills = self.current_skills.copy()
        
        # Apply training module
        self.current_skills = self._apply_training(self.current_skills, action)
        
        # Update cost and episode length
        self.current_cost += action_cost
        self.episode_length += 1
        
        # Calculate reward
        reward = self._calculate_reward(prev_skills, self.current_skills, action_cost)
        
        # Check termination conditions
        terminated = (self.episode_length >= self.max_episode_length or 
                     self.current_cost >= self.C_max)
        
        info = {
            "current_cost": self.current_cost,
            "episode_length": self.episode_length,
            "skill_improvement": np.sum(self.current_skills - prev_skills)
        }
        
        return self.current_skills.copy(), reward, terminated, False, info
    
    def _apply_training(self, skills: np.ndarray, action: int) -> np.ndarray:
        """Apply training module to current skills."""
        new_skills = skills.copy()
        alpha_a = self.alpha[action]
        target_attributes = self.module_targets[action]
        
        # Calculate potential gains for each attribute
        for j in range(self.D):
            # Direct training effect
            if j in target_attributes:
                delta_j = (1 - skills[j]) ** self.kappa
            else:
                # Cross-attribute synergy effect
                delta_j = 0.0
                for k in target_attributes:
                    delta_j += self.synergy_matrix[j, k] * (1 - skills[j]) ** self.kappa
            
            # Apply training gain and forgetting
            new_skills[j] = skills[j] + alpha_a * delta_j - self.beta * skills[j]
        
        # Clip skills to valid range [0, 1]
        new_skills = np.clip(new_skills, 0.0, 1.0)
        
        return new_skills
    
    def _calculate_reward(self, prev_skills: np.ndarray, new_skills: np.ndarray, cost: float) -> float:
        """Calculate reward based on skill improvement and cost."""
        skill_improvement = np.sum(new_skills - prev_skills)
        # FIXED: Reduced cost penalty from 0.1 to 0.01 to enable positive rewards
        return skill_improvement - 0.01 * cost
    
    def get_hierarchical_skills(self, skills: np.ndarray) -> Dict[str, float]:
        """Calculate hierarchical skill aggregations."""
        return {
            "technical_skills": np.mean(skills[0:4]),
            "soft_skills": np.mean(skills[4:8]),
            "coding_debugging": np.mean(skills[0:2]),
            "testing_architecture": np.mean(skills[2:4]),
            "communication_leadership": np.mean(skills[4:6]),
            "teamwork_problem_solving": np.mean(skills[6:8])
        }
    
    def render(self, mode: str = "human") -> None:
        """Render the current state."""
        if mode == "human":
            hierarchical = self.get_hierarchical_skills(self.current_skills)
            print(f"Episode Length: {self.episode_length}, Cost: {self.current_cost:.2f}/{self.C_max}")
            print("Hierarchical Skills:")
            for skill_name, value in hierarchical.items():
                print(f"  {skill_name}: {value:.3f}")
            print(f"Individual Skills: {self.current_skills}")


# =============================================================================
# NEURAL NETWORKS AND AGENT
# =============================================================================

class PolicyNetwork(nn.Module):
    """
    Neural network for policy approximation in REINFORCE algorithm.
    Takes employee skill vector as input and outputs action probabilities.
    """
    
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return F.softmax(x, dim=-1)


class ValueNetwork(nn.Module):
    """
    Value network for Actor-Critic implementation.
    Estimates state-value function V(s).
    """
    
    def __init__(self, input_dim: int, hidden_dim: int):
        super(ValueNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        return self.fc3(x)


class REINFORCEAgent:
    """
    REINFORCE agent for employee training optimization.
    Implements policy gradient method with optional baseline.
    """
    
    def __init__(self, 
                 state_dim: int,
                 action_dim: int,
                 hidden_dim: int = 128,
                 learning_rate: float = 1e-3,
                 use_baseline: bool = True):
        
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.use_baseline = use_baseline
        
        # Policy network
        self.policy_net = PolicyNetwork(state_dim, hidden_dim, action_dim)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        
        # Value network (optional baseline)
        if use_baseline:
            self.value_net = ValueNetwork(state_dim, hidden_dim)
            self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=learning_rate)
        
        # Episode memory
        self.log_probs: List[torch.Tensor] = []
        self.rewards: List[float] = []
        self.states: List[torch.Tensor] = []
        
    def select_action(self, state: np.ndarray) -> int:
        """Select action using current policy."""
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        action_probs = self.policy_net(state_tensor)
        
        # Sample action from probability distribution
        dist = Categorical(action_probs)
        action = dist.sample()
        
        # Store log probability for policy update
        self.log_probs.append(dist.log_prob(action))
        self.states.append(state_tensor)
        
        return action.item()
    
    def store_reward(self, reward: float) -> None:
        """Store reward for current step."""
        self.rewards.append(reward)
    
    def update_policy(self, gamma: float = 0.99) -> Dict[str, float]:
        """Update policy using REINFORCE algorithm."""
        if len(self.rewards) == 0:
            return {"policy_loss": 0.0, "value_loss": 0.0}
        
        # Calculate discounted returns
        returns = []
        R = 0
        for r in reversed(self.rewards):
            R = r + gamma * R
            returns.insert(0, R)
        
        returns = torch.tensor(returns, dtype=torch.float32)
        
        # Normalize returns for stability
        if len(returns) > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        
        policy_loss = 0
        value_loss = 0
        
        # Calculate policy loss
        for i, (log_prob, R) in enumerate(zip(self.log_probs, returns)):
            if self.use_baseline:
                # Use value function as baseline
                state = self.states[i]
                baseline = self.value_net(state).squeeze()
                advantage = R - baseline
                policy_loss += -log_prob * advantage.detach()
            else:
                policy_loss += -log_prob * R
        
        # Update policy network
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
        self.policy_optimizer.step()
        
        # Update value network if using baseline
        if self.use_baseline:
            # Compute value predictions for all states
            value_predictions = []
            for state in self.states:
                value_pred = self.value_net(state).squeeze()
                # Ensure value_pred is at least 1-dimensional for concatenation
                if value_pred.dim() == 0:
                    value_pred = value_pred.unsqueeze(0)
                value_predictions.append(value_pred)
            
            # Concatenate value predictions
            if len(value_predictions) > 1:
                value_preds = torch.cat(value_predictions)
            else:
                value_preds = value_predictions[0]
            
            # Ensure returns has the same shape as value predictions
            if returns.dim() == 0:
                returns = returns.unsqueeze(0)
            
            value_loss = F.mse_loss(value_preds, returns)
            
            self.value_optimizer.zero_grad()
            value_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.value_net.parameters(), 1.0)
            self.value_optimizer.step()
        
        # Clear episode memory
        self.log_probs.clear()
        self.rewards.clear()
        self.states.clear()
        
        return {
            "policy_loss": policy_loss.item(),
            "value_loss": value_loss.item() if self.use_baseline else 0.0
        }
    
    def get_state_value(self, state: np.ndarray) -> float:
        """Get state value estimate (if using baseline)."""
        if not self.use_baseline:
            return 0.0
        
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            return self.value_net(state_tensor).item()
    
    def save_model(self, filepath: str) -> None:
        """Save trained model."""
        checkpoint = {
            'policy_net': self.policy_net.state_dict(),
            'policy_optimizer': self.policy_optimizer.state_dict(),
            'state_dim': self.state_dim,
            'action_dim': self.action_dim,
            'use_baseline': self.use_baseline
        }
        
        if self.use_baseline:
            checkpoint['value_net'] = self.value_net.state_dict()
            checkpoint['value_optimizer'] = self.value_optimizer.state_dict()
        
        torch.save(checkpoint, filepath)
    
    def load_model(self, filepath: str) -> None:
        """Load trained model."""
        checkpoint = torch.load(filepath)
        self.policy_net.load_state_dict(checkpoint['policy_net'])
        self.policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        
        if self.use_baseline and 'value_net' in checkpoint:
            self.value_net.load_state_dict(checkpoint['value_net'])
            self.value_optimizer.load_state_dict(checkpoint['value_optimizer'])


# =============================================================================
# TRAINING LOOP
# =============================================================================

class TrainingLoop:
    """
    Main training loop for the employee training optimization system.
    Handles episode execution, logging, and progress tracking.
    """
    
    def __init__(self, env, agent, config: Dict):
        self.env = env
        self.agent = agent
        self.config = config
        
        # Training parameters
        self.num_episodes = config.get('num_episodes', 1000)
        self.gamma = config.get('gamma', 0.99)
        self.log_interval = config.get('log_interval', 50)
        self.save_interval = config.get('save_interval', 200)
        self.model_save_path = config.get('model_save_path', 'models/employee_training_model.pth')
        
        # Metrics tracking
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_costs = []
        self.policy_losses = []
        self.value_losses = []
        self.skill_improvements = []
        
        # Recent performance tracking
        self.recent_rewards = deque(maxlen=100)
        self.recent_lengths = deque(maxlen=100)
        
    def run_episode(self) -> Dict[str, float]:
        """Run a single training episode."""
        state, _ = self.env.reset()
        episode_reward = 0
        episode_length = 0
        total_skill_improvement = 0
        
        while True:
            # Select action
            action = self.agent.select_action(state)
            
            # Execute action
            next_state, reward, terminated, truncated, info = self.env.step(action)
            
            # Store reward
            self.agent.store_reward(reward)
            
            # Update metrics
            episode_reward += reward
            episode_length += 1
            total_skill_improvement += info.get('skill_improvement', 0)
            
            # Update state
            state = next_state
            
            # Check termination
            if terminated or truncated:
                break
        
        # Update policy at end of episode
        losses = self.agent.update_policy(self.gamma)
        
        return {
            'episode_reward': episode_reward,
            'episode_length': episode_length,
            'episode_cost': info.get('current_cost', 0),
            'skill_improvement': total_skill_improvement,
            'policy_loss': losses['policy_loss'],
            'value_loss': losses['value_loss']
        }
    
    def train(self) -> None:
        """Main training loop."""
        print("Starting training...")
        print(f"Episodes: {self.num_episodes}")
        print(f"Environment: {self.env.D} skills, {self.env.K} training modules")
        print(f"Agent: {'with' if self.agent.use_baseline else 'without'} baseline")
        print("-" * 50)
        
        start_time = time.time()
        
        for episode in range(self.num_episodes):
            # Run episode
            episode_metrics = self.run_episode()
            
            # Store metrics
            self.episode_rewards.append(episode_metrics['episode_reward'])
            self.episode_lengths.append(episode_metrics['episode_length'])
            self.episode_costs.append(episode_metrics['episode_cost'])
            self.skill_improvements.append(episode_metrics['skill_improvement'])
            self.policy_losses.append(episode_metrics['policy_loss'])
            self.value_losses.append(episode_metrics['value_loss'])
            
            # Update recent performance
            self.recent_rewards.append(episode_metrics['episode_reward'])
            self.recent_lengths.append(episode_metrics['episode_length'])
            
            # Logging
            if (episode + 1) % self.log_interval == 0:
                self._log_progress(episode + 1, episode_metrics)
            
            # Save model
            if (episode + 1) % self.save_interval == 0:
                self.agent.save_model(self.model_save_path)
                print(f"Model saved at episode {episode + 1}")
        
        # Final save
        self.agent.save_model(self.model_save_path)
        
        training_time = time.time() - start_time
        print(f"\nTraining completed in {training_time:.2f} seconds")
        print(f"Average reward (last 100 episodes): {np.mean(self.recent_rewards):.3f}")
        print(f"Average length (last 100 episodes): {np.mean(self.recent_lengths):.2f}")
    
    def _log_progress(self, episode: int, metrics: Dict[str, float]) -> None:
        """Log training progress."""
        avg_reward = np.mean(self.recent_rewards)
        avg_length = np.mean(self.recent_lengths)
        
        print(f"Episode {episode:4d} | "
              f"Reward: {metrics['episode_reward']:6.2f} | "
              f"Avg Reward: {avg_reward:6.2f} | "
              f"Length: {metrics['episode_length']:2d} | "
              f"Cost: {metrics['episode_cost']:5.1f} | "
              f"Skill Δ: {metrics['skill_improvement']:6.3f} | "
              f"Policy Loss: {metrics['policy_loss']:6.3f}")
    
    def evaluate(self, num_episodes: int = 100, render: bool = False) -> Dict[str, float]:
        """Evaluate trained policy."""
        print(f"\nEvaluating policy over {num_episodes} episodes...")
        
        eval_rewards = []
        eval_lengths = []
        eval_costs = []
        eval_skill_improvements = []
        
        for episode in range(num_episodes):
            state, _ = self.env.reset()
            episode_reward = 0
            episode_length = 0
            total_skill_improvement = 0
            
            while True:
                # Select action (no exploration)
                action = self.agent.select_action(state)
                next_state, reward, terminated, truncated, info = self.env.step(action)
                
                episode_reward += reward
                episode_length += 1
                total_skill_improvement += info.get('skill_improvement', 0)
                
                if render and episode == 0:  # Render first episode
                    self.env.render()
                
                state = next_state
                
                if terminated or truncated:
                    break
            
            eval_rewards.append(episode_reward)
            eval_lengths.append(episode_length)
            eval_costs.append(info.get('current_cost', 0))
            eval_skill_improvements.append(total_skill_improvement)
        
        results = {
            'mean_reward': np.mean(eval_rewards),
            'std_reward': np.std(eval_rewards),
            'mean_length': np.mean(eval_lengths),
            'mean_cost': np.mean(eval_costs),
            'mean_skill_improvement': np.mean(eval_skill_improvements),
            'success_rate': np.mean([r > 0 for r in eval_rewards])
        }
        
        print("Evaluation Results:")
        print(f"  Mean Reward: {results['mean_reward']:.3f} ± {results['std_reward']:.3f}")
        print(f"  Mean Length: {results['mean_length']:.2f}")
        print(f"  Mean Cost: {results['mean_cost']:.2f}")
        print(f"  Mean Skill Improvement: {results['mean_skill_improvement']:.3f}")
        print(f"  Success Rate: {results['success_rate']:.2%}")
        
        return results
    
    def plot_training_curves(self, save_path: str = None) -> None:
        """Plot training progress curves."""
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        
        # Episode rewards
        axes[0, 0].plot(self.episode_rewards, alpha=0.7)
        axes[0, 0].plot(self._smooth_curve(self.episode_rewards, window=50), 'r-', linewidth=2)
        axes[0, 0].set_title('Episode Rewards')
        axes[0, 0].set_xlabel('Episode')
        axes[0, 0].set_ylabel('Reward')
        axes[0, 0].grid(True)
        
        # Episode lengths
        axes[0, 1].plot(self.episode_lengths, alpha=0.7)
        axes[0, 1].plot(self._smooth_curve(self.episode_lengths, window=50), 'r-', linewidth=2)
        axes[0, 1].set_title('Episode Lengths')
        axes[0, 1].set_xlabel('Episode')
        axes[0, 1].set_ylabel('Length')
        axes[0, 1].grid(True)
        
        # Episode costs
        axes[0, 2].plot(self.episode_costs, alpha=0.7)
        axes[0, 2].plot(self._smooth_curve(self.episode_costs, window=50), 'r-', linewidth=2)
        axes[0, 2].set_title('Episode Costs')
        axes[0, 2].set_xlabel('Episode')
        axes[0, 2].set_ylabel('Cost')
        axes[0, 2].grid(True)
        
        # Skill improvements
        axes[1, 0].plot(self.skill_improvements, alpha=0.7)
        axes[1, 0].plot(self._smooth_curve(self.skill_improvements, window=50), 'r-', linewidth=2)
        axes[1, 0].set_title('Skill Improvements')
        axes[1, 0].set_xlabel('Episode')
        axes[1, 0].set_ylabel('Skill Δ')
        axes[1, 0].grid(True)
        
        # Policy losses
        axes[1, 1].plot(self.policy_losses, alpha=0.7)
        axes[1, 1].plot(self._smooth_curve(self.policy_losses, window=50), 'r-', linewidth=2)
        axes[1, 1].set_title('Policy Losses')
        axes[1, 1].set_xlabel('Episode')
        axes[1, 1].set_ylabel('Loss')
        axes[1, 1].grid(True)
        
        # Value losses (if using baseline)
        if self.agent.use_baseline:
            axes[1, 2].plot(self.value_losses, alpha=0.7)
            axes[1, 2].plot(self._smooth_curve(self.value_losses, window=50), 'r-', linewidth=2)
            axes[1, 2].set_title('Value Losses')
            axes[1, 2].set_xlabel('Episode')
            axes[1, 2].set_ylabel('Loss')
            axes[1, 2].grid(True)
        else:
            axes[1, 2].text(0.5, 0.5, 'No Baseline Used', 
                           horizontalalignment='center', verticalalignment='center',
                           transform=axes[1, 2].transAxes, fontsize=14)
            axes[1, 2].set_title('Value Losses')
        
        plt.tight_layout()
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Training curves saved to {save_path}")
        
        plt.show()
    
    def _smooth_curve(self, values: List[float], window: int = 50) -> List[float]:
        """Apply moving average smoothing to a curve."""
        if len(values) < window:
            return values
        
        smoothed = []
        for i in range(len(values)):
            start = max(0, i - window + 1)
            end = i + 1
            smoothed.append(np.mean(values[start:end]))
        
        return smoothed


# =============================================================================
# VISUALIZATION UTILITIES
# =============================================================================

class VisualizationUtils:
    """
    Utility class for visualizing training progress and employee skill development.
    """
    
    def __init__(self):
        plt.style.use('default')  # Use default style for compatibility
        self.skill_names = [
            'Coding', 'Debugging', 'Testing', 'Architecture',
            'Communication', 'Leadership', 'Teamwork', 'Problem-solving'
        ]
        
        self.hierarchical_names = {
            'technical_skills': 'Technical Skills',
            'soft_skills': 'Soft Skills',
            'coding_debugging': 'Coding & Debugging',
            'testing_architecture': 'Testing & Architecture',
            'communication_leadership': 'Communication & Leadership',
            'teamwork_problem_solving': 'Teamwork & Problem-solving'
        }
    
    def plot_skill_progression(self, env, agent, num_episodes: int = 5, save_path: str = None) -> None:
        """Plot skill progression over multiple episodes."""
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        
        colors = plt.cm.Set3(np.linspace(0, 1, len(self.skill_names)))
        
        for episode in range(num_episodes):
            state, _ = env.reset()
            initial_skills = state.copy()
            
            episode_skills = [initial_skills]
            episode_actions = []
            
            while True:
                action = agent.select_action(state)
                next_state, _, terminated, truncated, _ = env.step(action)
                
                episode_skills.append(next_state.copy())
                episode_actions.append(action)
                state = next_state
                
                if terminated or truncated:
                    break
            
            episode_skills = np.array(episode_skills)
            
            # Plot individual skills
            for i, (skill_name, color) in enumerate(zip(self.skill_names, colors)):
                axes[0, 0].plot(episode_skills[:, i], color=color, alpha=0.7, 
                               label=skill_name if episode == 0 else "")
            
            # Plot hierarchical skills
            hierarchical_skills = []
            for skills in episode_skills:
                hier_skills = env.get_hierarchical_skills(skills)
                hierarchical_skills.append(hier_skills)
            
            hierarchical_skills = pd.DataFrame(hierarchical_skills)
            
            for i, (key, name) in enumerate(self.hierarchical_names.items()):
                if key in hierarchical_skills.columns:
                    axes[0, 1].plot(hierarchical_skills[key], alpha=0.7,
                                   label=name if episode == 0 else "")
            
            # Plot action distribution
            action_counts = np.bincount(episode_actions, minlength=env.K)
            axes[1, 0].bar(range(env.K), action_counts, alpha=0.7, 
                          label=f'Episode {episode + 1}')
            
            # Plot skill improvement per step
            skill_improvements = np.diff(np.sum(episode_skills, axis=1))
            axes[1, 1].plot(skill_improvements, alpha=0.7, 
                           label=f'Episode {episode + 1}')
        
        # Configure subplots
        axes[0, 0].set_title('Individual Skill Progression')
        axes[0, 0].set_xlabel('Training Step')
        axes[0, 0].set_ylabel('Skill Level')
        axes[0, 0].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        axes[0, 0].grid(True, alpha=0.3)
        
        axes[0, 1].set_title('Hierarchical Skill Progression')
        axes[0, 1].set_xlabel('Training Step')
        axes[0, 1].set_ylabel('Skill Level')
        axes[0, 1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        axes[0, 1].grid(True, alpha=0.3)
        
        axes[1, 0].set_title('Action Distribution')
        axes[1, 0].set_xlabel('Training Module')
        axes[1, 0].set_ylabel('Usage Count')
        axes[1, 0].set_xticks(range(env.K))
        axes[1, 0].set_xticklabels([f'Module {i}' for i in range(env.K)])
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)
        
        axes[1, 1].set_title('Skill Improvement per Step')
        axes[1, 1].set_xlabel('Training Step')
        axes[1, 1].set_ylabel('Skill Δ')
        axes[1, 1].legend()
        axes[1, 1].grid(True, alpha=0.3)
        
        plt.tight_layout()
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Skill progression plot saved to {save_path}")
        
        plt.show()
    
    def plot_synergy_matrix(self, env, save_path: str = None) -> None:
        """Plot the cross-attribute synergy matrix."""
        plt.figure(figsize=(10, 8))
        
        sns.heatmap(env.synergy_matrix,
                   xticklabels=self.skill_names,
                   yticklabels=self.skill_names,
                   annot=True,
                   fmt='.2f',
                   cmap='coolwarm',
                   center=0,
                   cbar_kws={'label': 'Synergy Coefficient'})
        
        plt.title('Cross-Attribute Synergy Matrix')
        plt.xlabel('Source Skill')
        plt.ylabel('Target Skill')
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
            print(f"Synergy matrix plot saved to {save_path}")
        
        plt.show()


# =============================================================================
# CONFIGURATION AND UTILITIES
# =============================================================================

def create_directories():
    """Create necessary directories for saving models and plots."""
    os.makedirs('models', exist_ok=True)
    os.makedirs('plots', exist_ok=True)
    os.makedirs('logs', exist_ok=True)


def get_default_config():
    """Get default configuration for training."""
    return {
        'num_episodes': 1000,
        'gamma': 0.99,
        'learning_rate': 1e-3,
        'hidden_dim': 128,
        'use_baseline': True,
        'log_interval': 50,
        'save_interval': 200,
        'model_save_path': 'models/employee_training_model.pth',
        'plot_save_path': 'plots/',
        
        # Environment parameters
        'D': 8,  # Number of skills
        'K': 4,  # Number of training modules
        'alpha': [0.3, 0.25, 0.2, 0.35],  # Learning rates
        'beta': 0.02,  # Forgetting rate
        'kappa': 1.5,  # Diminishing returns
        'C_max': 100.0,  # Cost budget
        
        # Evaluation parameters
        'eval_episodes': 100,
        'eval_render': False
    }


# =============================================================================
# EXAMPLE USAGE FUNCTIONS
# =============================================================================

def example_basic_training():
    """Basic training example with default parameters."""
    print("=== Basic Training Example ===")
    
    # Create environment
    env = EmployeeTrainingEnv(D=8, K=4)
    
    # Create agent
    agent = REINFORCEAgent(
        state_dim=env.D,
        action_dim=env.K,
        use_baseline=True
    )
    
    # Training configuration
    config = {
        'num_episodes': 200,
        'gamma': 0.99,
        'log_interval': 50,
        'save_interval': 100,
        'model_save_path': 'models/basic_example.pth'
    }
    
    # Create and run training loop
    training_loop = TrainingLoop(env, agent, config)
    training_loop.train()
    
    # Evaluate
    results = training_loop.evaluate(num_episodes=20)
    print(f"Training completed. Final performance: {results['mean_reward']:.3f}")
    
    return env, agent, training_loop


def example_skill_analysis():
    """Example showing skill development analysis."""
    print("\n=== Skill Development Analysis ===")
    
    env = EmployeeTrainingEnv()
    agent = REINFORCEAgent(
        state_dim=env.D,
        action_dim=env.K,
        use_baseline=True
    )
    
    # Quick training
    config = {
        'num_episodes': 100,
        'log_interval': 25,
        'model_save_path': 'models/analysis_example.pth'
    }
    
    training_loop = TrainingLoop(env, agent, config)
    training_loop.train()
    
    # Analyze skill development
    state, _ = env.reset()
    print(f"Initial skills: {state}")
    print(f"Initial hierarchical skills: {env.get_hierarchical_skills(state)}")
    
    episode_history = []
    for step in range(20):
        action = agent.select_action(state)
        next_state, reward, terminated, truncated, info = env.step(action)
        
        episode_history.append({
            'step': step,
            'action': action,
            'reward': reward,
            'skills': next_state.copy(),
            'hierarchical': env.get_hierarchical_skills(next_state),
            'skill_improvement': info.get('skill_improvement', 0)
        })
        
        print(f"Step {step}: Action {action}, Reward {reward:.3f}, "
              f"Skill Δ {info.get('skill_improvement', 0):.3f}")
        
        state = next_state
        if terminated or truncated:
            break
    
    print(f"\nFinal skills: {state}")
    print(f"Final hierarchical skills: {env.get_hierarchical_skills(state)}")
    
    # Calculate total improvement
    total_improvement = sum(h['skill_improvement'] for h in episode_history)
    print(f"Total skill improvement: {total_improvement:.3f}")
    
    return env, agent, episode_history


# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Main entry point for the employee training optimization system."""
    parser = argparse.ArgumentParser(description='Employee Training Optimization with RL')
    parser.add_argument('--mode', choices=['train', 'evaluate', 'visualize', 'example'], 
                       default='train', help='Run mode')
    parser.add_argument('--model', type=str, help='Path to saved model for evaluation')
    parser.add_argument('--episodes', type=int, default=1000, help='Number of training episodes')
    parser.add_argument('--no-baseline', action='store_true', help='Disable baseline (Actor-Critic)')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    
    args = parser.parse_args()
    
    # Set random seeds for reproducibility
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    
    # Create directories
    create_directories()
    
    # Load configuration
    config = get_default_config()
    
    # Override config with command line arguments
    if args.episodes:
        config['num_episodes'] = args.episodes
    if args.no_baseline:
        config['use_baseline'] = False
    
    print("Employee Training Optimization System")
    print("=" * 50)
    print(f"Mode: {args.mode}")
    print(f"Seed: {args.seed}")
    print("=" * 50)
    
    if args.mode == 'example':
        # Run examples
        print("Running example demonstrations...")
        
        # Basic training example
        env1, agent1, loop1 = example_basic_training()
        
        # Skill analysis example
        env2, agent2, history = example_skill_analysis()
        
        # Visualize results
        viz = VisualizationUtils()
        viz.plot_skill_progression(env1, agent1, num_episodes=3)
        viz.plot_synergy_matrix(env1)
        
        print("\n=== All Examples Completed Successfully! ===")
        
    elif args.mode == 'train':
        # Create environment
        env = EmployeeTrainingEnv(
            D=config['D'],
            K=config['K'],
            alpha=config['alpha'],
            beta=config['beta'],
            kappa=config['kappa'],
            C_max=config['C_max'],
            gamma=config['gamma']
        )
        
        # Create agent
        agent = REINFORCEAgent(
            state_dim=env.D,
            action_dim=env.K,
            hidden_dim=config['hidden_dim'],
            learning_rate=config['learning_rate'],
            use_baseline=config['use_baseline']
        )
        
        # Create training loop
        training_loop = TrainingLoop(env, agent, config)
        
        print(f"Environment: {env.D} skills, {env.K} training modules")
        print(f"Agent: {'Actor-Critic' if config['use_baseline'] else 'REINFORCE'}")
        
        # Training mode
        print("\nStarting training...")
        training_loop.train()
        
        # Plot training curves
        training_loop.plot_training_curves(
            save_path=os.path.join(config['plot_save_path'], 'training_curves.png')
        )
        
        # Evaluate trained model
        print("\nEvaluating trained model...")
        eval_results = training_loop.evaluate(
            num_episodes=config['eval_episodes'],
            render=config['eval_render']
        )
        
        # Visualize skill progression
        viz = VisualizationUtils()
        viz.plot_skill_progression(
            env, agent, num_episodes=3,
            save_path=os.path.join(config['plot_save_path'], 'skill_progression.png')
        )
        
    elif args.mode == 'evaluate':
        # Create environment and agent
        env = EmployeeTrainingEnv(
            D=config['D'],
            K=config['K'],
            alpha=config['alpha'],
            beta=config['beta'],
            kappa=config['kappa'],
            C_max=config['C_max'],
            gamma=config['gamma']
        )
        
        agent = REINFORCEAgent(
            state_dim=env.D,
            action_dim=env.K,
            hidden_dim=config['hidden_dim'],
            learning_rate=config['learning_rate'],
            use_baseline=config['use_baseline']
        )
        
        # Load model if specified
        if args.model:
            agent.load_model(args.model)
            print(f"Loaded model from {args.model}")
        else:
            print("No model specified for evaluation. Using randomly initialized policy.")
        
        # Create training loop for evaluation
        training_loop = TrainingLoop(env, agent, config)
        
        eval_results = training_loop.evaluate(
            num_episodes=config['eval_episodes'],
            render=True
        )
        
        # Visualize skill progression
        viz = VisualizationUtils()
        viz.plot_skill_progression(env, agent, num_episodes=5)
        
    elif args.mode == 'visualize':
        # Create environment and agent
        env = EmployeeTrainingEnv(
            D=config['D'],
            K=config['K'],
            alpha=config['alpha'],
            beta=config['beta'],
            kappa=config['kappa'],
            C_max=config['C_max'],
            gamma=config['gamma']
        )
        
        agent = REINFORCEAgent(
            state_dim=env.D,
            action_dim=env.K,
            hidden_dim=config['hidden_dim'],
            learning_rate=config['learning_rate'],
            use_baseline=config['use_baseline']
        )
        
        # Load model if specified
        if args.model:
            agent.load_model(args.model)
            print(f"Loaded model from {args.model}")
        
        print("\nGenerating visualizations...")
        
        # Create visualization utils
        viz = VisualizationUtils()
        
        # Plot synergy matrix
        viz.plot_synergy_matrix(
            env, 
            save_path=os.path.join(config['plot_save_path'], 'synergy_matrix.png')
        )
        
        # Plot skill progression
        viz.plot_skill_progression(
            env, agent, num_episodes=5,
            save_path=os.path.join(config['plot_save_path'], 'skill_progression.png')
        )
        
        print("Visualizations completed!")
    
    print("\nSystem execution completed.")


if __name__ == "__main__":
    main()

Overwriting single_file_rl_training.py


In [38]:
!python single_file_rl_training.py --mode train --episodes 10000

Employee Training Optimization System
Mode: train
Seed: 42
Environment: 8 skills, 4 training modules
Agent: Actor-Critic

Starting training...
Starting training...
Episodes: 10000
Environment: 8 skills, 4 training modules
Agent: with baseline
--------------------------------------------------
Episode   50 | Reward:  -8.49 | Avg Reward:  -9.14 | Length:  9 | Cost:   0.0 | Skill Δ:  2.486 | Policy Loss: -1.425
Episode  100 | Reward:  -9.45 | Avg Reward:  -8.59 | Length:  9 | Cost:   0.0 | Skill Δ:  1.474 | Policy Loss: -4.817
Episode  150 | Reward:  -9.44 | Avg Reward:  -7.92 | Length: 10 | Cost:   0.0 | Skill Δ:  1.495 | Policy Loss:  0.645
Episode  200 | Reward:  -9.07 | Avg Reward:  -8.00 | Length: 10 | Cost:   0.0 | Skill Δ:  1.892 | Policy Loss:  0.642
Model saved at episode 200
Episode  250 | Reward:   1.44 | Avg Reward:  -7.45 | Length:  9 | Cost: 100.0 | Skill Δ:  2.445 | Policy Loss: -1.820
Episode  300 | Reward:  -9.21 | Avg Reward:  -7.08 | Length: 10 | Cost:   0.0 | Skill Δ: 