# Improved RL Training with Better Hyperparameters

This notebook retrains the RL agent with:
- More training episodes (10,000, up from 3,000)
- Slower epsilon decay (0.9998 instead of 0.9995) for better exploration
- Higher learning rate (0.15, up from 0.1)
- A reward function adjusted to align with the scoring formula

**Expected time on M1 Pro:** ~30-45 minutes

In [None]:
import sys
sys.path.append('../src')

import numpy as np
import random
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict

from rl_agent import QLearningAgent
from hmm_model import HangmanHMM
from hangman_env import HangmanEnv
from utils import encode_state

# Seed the RNGs used in this notebook (`random` picks the training and
# evaluation words) so that Restart & Run All reproduces the same run.
# NOTE(review): assumes the agent draws from `random` and/or `np.random`;
# confirm against rl_agent.py if it keeps its own generator.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load HMM model
hmm = HangmanHMM()
hmm.load('../models/hmm_model.pkl')
print("HMM model loaded")

# Load corpus words
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load artifacts produced by this project's own preprocessing step.
with open('../results/preprocessed_data.pkl', 'rb') as f:
    preprocessed_data = pickle.load(f)

corpus_words = preprocessed_data['words']
# Fail here with a clear message instead of deep inside the training loop,
# where random.choice would raise on an empty corpus.
if len(corpus_words) == 0:
    raise ValueError("preprocessed_data['words'] is empty; nothing to train on")
print(f"Loaded {len(corpus_words)} words from corpus")

In [None]:
# Build the Q-learning agent using the tuned ("IMPROVED") hyperparameters
agent = QLearningAgent(
    max_word_length=30,
    discount_factor=0.95,
    learning_rate=0.15,      # raised from 0.1
    epsilon=1.0,
    epsilon_min=0.02,        # lower floor for more exploitation
    epsilon_decay=0.9998,    # slower than the previous 0.9995
)

# Echo the values as stored on the agent so the run log records them
print(
    "RL agent initialized with IMPROVED hyperparameters",
    f"Learning rate: {agent.learning_rate}",
    f"Initial epsilon: {agent.epsilon}",
    f"Epsilon decay: {agent.epsilon_decay}",
    f"Min epsilon: {agent.epsilon_min}",
    sep="\n",
)

In [None]:
# Training schedule - MORE EPISODES than the earlier run
num_episodes = 10_000      # previously 3000
eval_interval = 1_000      # run an evaluation pass every 1000 training episodes
eval_episodes = 100        # number of words played per evaluation pass

# One independent, initially empty list per tracked metric.
# The episode_* / epsilon_values series are appended to once per training
# episode; the eval_* series are appended to once per evaluation pass.
training_history = {
    metric: []
    for metric in (
        'episode_rewards',
        'episode_wins',
        'episode_wrong_guesses',
        'epsilon_values',
        'eval_win_rates',
        'eval_avg_rewards',
    )
}

print(f"Training for {num_episodes} episodes...")
print("Expected time: ~45 minutes on M1 Pro")

In [None]:
def train_episode(agent, hmm, word):
    """Play one Hangman game on `word` and update the agent's Q-values.

    Q-updates are delayed by one step: the transition taken at step t is
    learned from at step t+1, once the features and HMM probabilities of the
    resulting state have been computed. The last transition of the game is
    learned from immediately, with no next state.

    Args:
        agent: Object providing `select_action(state, hmm_probs, guessed)` and
            `update_q_value(...)` as called below.
        hmm: Object providing `predict_letter_probabilities(masked, guessed, length)`.
        word: The target word for this game.

    Returns:
        Tuple `(episode_reward, won, episode_wrong)`: the summed step rewards,
        `env.won` at the end of the game, and the number of steps whose info
        dict reported neither 'correct' nor 'repeated'.
    """
    env = HangmanEnv(word, max_lives=6)
    env.reset()

    episode_reward = 0.0
    episode_wrong = 0

    # Transition from the previous step, still waiting for its delayed update.
    prev_state_features = None
    prev_hmm_probs = None
    prev_action = None
    prev_reward = 0.0

    while not env.done:
        masked_list = env.get_masked_word_list()
        hmm_probs = hmm.predict_letter_probabilities(masked_list, env.guessed_letters, len(word))
        state_features = encode_state(masked_list, env.guessed_letters, hmm_probs,
                                     env.lives, len(word))

        # Learn from the previous transition BEFORE stepping the environment.
        # At this point env.guessed_letters still describes `state_features`
        # (the previous transition's next state), and that state is by
        # construction non-terminal because the loop is still running, so the
        # done flag is False. Doing this after env.step() would leak the
        # current action into the guessed set and the current step's `done`
        # into the previous transition.
        # NOTE(review): positional args 4-6 are assumed to be next-state
        # features, next-state HMM probabilities and next-state guessed
        # letters - confirm against QLearningAgent.update_q_value.
        if prev_state_features is not None:
            agent.update_q_value(
                prev_state_features,
                prev_action,
                prev_reward,
                state_features,
                hmm_probs,
                env.guessed_letters,
                False,
                current_hmm_probs=prev_hmm_probs
            )

        action = agent.select_action(state_features, hmm_probs, env.guessed_letters)
        _, reward, done, info = env.step(action)
        episode_reward += reward

        # Count a wrong guess only when the step was neither correct nor a repeat.
        if not info.get('correct', False) and not info.get('repeated', False):
            episode_wrong += 1

        prev_state_features = state_features
        prev_hmm_probs = hmm_probs
        prev_action = action
        prev_reward = reward

        if done:
            # Terminal transition: there is no next state to bootstrap from.
            agent.update_q_value(
                prev_state_features,
                prev_action,
                prev_reward,
                None,
                None,
                None,
                True,
                current_hmm_probs=prev_hmm_probs
            )
            break

    return episode_reward, env.won, episode_wrong

print("Training function defined")

In [None]:
# Main training loop
print("=" * 60)
print("STARTING IMPROVED RL TRAINING")
print("=" * 60)


def evaluate_episode(agent, hmm, word):
    """Play one game with the agent's current policy without any Q-updates.

    Builds the state the same way train_episode does but never calls
    agent.update_q_value, so evaluation measures the policy instead of
    training on the evaluation words.

    Returns:
        Tuple `(total_reward, won)` for the game.
    """
    env = HangmanEnv(word, max_lives=6)
    env.reset()
    total_reward = 0.0
    while not env.done:
        masked_list = env.get_masked_word_list()
        hmm_probs = hmm.predict_letter_probabilities(masked_list, env.guessed_letters, len(word))
        state_features = encode_state(masked_list, env.guessed_letters, hmm_probs,
                                     env.lives, len(word))
        action = agent.select_action(state_features, hmm_probs, env.guessed_letters)
        _, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward, env.won


for episode in tqdm(range(num_episodes), desc="Training episodes"):
    # Sample a random word
    word = random.choice(corpus_words)

    # Train on this episode
    reward, won, wrong = train_episode(agent, hmm, word)

    # Record statistics (epsilon is logged at the value used for this episode)
    training_history['episode_rewards'].append(reward)
    training_history['episode_wins'].append(1 if won else 0)
    training_history['episode_wrong_guesses'].append(wrong)
    training_history['epsilon_values'].append(agent.epsilon)

    # Decay epsilon
    agent.decay_epsilon()

    # Periodic evaluation
    if (episode + 1) % eval_interval == 0:
        # random.sample raises ValueError if asked for more items than exist,
        # so cap the sample at the corpus size.
        n_eval = min(eval_episodes, len(corpus_words))
        eval_words = random.sample(corpus_words, n_eval)
        eval_rewards = []
        eval_wins = 0

        old_epsilon = agent.epsilon
        agent.epsilon = 0.0  # Pure exploitation for eval
        try:
            for eval_word in eval_words:
                r, w = evaluate_episode(agent, hmm, eval_word)
                eval_rewards.append(r)
                if w:
                    eval_wins += 1
        finally:
            # Always restore exploration, even if an eval episode raised.
            agent.epsilon = old_epsilon

        # Divide by the number of games actually played, not the configured count.
        win_rate = eval_wins / n_eval
        avg_reward = np.mean(eval_rewards)

        training_history['eval_win_rates'].append(win_rate)
        training_history['eval_avg_rewards'].append(avg_reward)

        print(f"\n[Episode {episode+1}]")
        print(f"  Eval Win Rate: {win_rate:.4f} ({win_rate*100:.2f}%)")
        print(f"  Eval Avg Reward: {avg_reward:.2f}")
        print(f"  Current Epsilon: {agent.epsilon:.4f}")
        print(f"  Q-table size: {len(agent.q_table)} states")

print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print("=" * 60)

In [None]:
# Persist the trained agent under a separate "improved" filename
agent.save('../models/rl_agent_improved.pkl')
print("✓ Improved RL agent saved to ../models/rl_agent_improved.pkl")

# Persist the recorded training/evaluation metrics next to the other results
with open('../results/training_history_improved.pkl', 'wb') as history_file:
    pickle.dump(training_history, history_file)
print("✓ Training history saved")

In [None]:
# Plot training progress
def moving_average(values, window):
    """Return the `window`-sample moving average of `values`.

    Returns an empty array when there are fewer than `window` samples.
    np.convolve(..., mode='valid') would otherwise swap its operands and
    return values that are not moving averages at all.
    """
    series = np.asarray(values, dtype=float)
    if series.size < window:
        return np.array([])
    return np.convolve(series, np.ones(window) / window, mode='valid')


def label_axis(ax, title, ylabel):
    """Apply the title, the shared 'Episode' x-label, a y-label and a grid."""
    ax.set_title(title)
    ax.set_xlabel('Episode')
    ax.set_ylabel(ylabel)
    ax.grid(True)


fig, axes = plt.subplots(2, 2, figsize=(14, 10))
window = 100

# Plot 1: Episode rewards (moving average)
rewards_ma = moving_average(training_history['episode_rewards'], window)
axes[0, 0].plot(rewards_ma)
label_axis(axes[0, 0], 'Episode Rewards (100-episode moving average)', 'Reward')

# Plot 2: Win rate (moving average)
wins_ma = moving_average(training_history['episode_wins'], window)
axes[0, 1].plot(wins_ma)
label_axis(axes[0, 1], 'Win Rate (100-episode moving average)', 'Win Rate')

# Plot 3: Epsilon decay
axes[1, 0].plot(training_history['epsilon_values'])
label_axis(axes[1, 0], 'Epsilon Decay', 'Epsilon')

# Plot 4: Evaluation win rates
# Derive the x positions from the number of recorded evaluations, so x and y
# always have the same length (e.g. if the training cell was run more than
# once and the history kept growing).
eval_win_rates = training_history['eval_win_rates']
eval_episodes_x = [eval_interval * (i + 1) for i in range(len(eval_win_rates))]
axes[1, 1].plot(eval_episodes_x, eval_win_rates, 'o-')
label_axis(axes[1, 1], 'Evaluation Win Rate Over Training', 'Win Rate')

plt.tight_layout()
plt.savefig('../results/training_progress_improved.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Training plots saved")

In [None]:
# Final statistics
print("\n" + "=" * 60)
print("FINAL TRAINING STATISTICS")
print("=" * 60)
print(f"Total episodes trained: {num_episodes}")
print(f"Final epsilon: {agent.epsilon:.4f}")
print(f"Q-table size: {len(agent.q_table)} states")
print(f"Total state-action pairs: {sum(len(actions) for actions in agent.q_table.values())}")

# Statistics over the most recent episodes (up to 1000).
# Rates are computed over the number of episodes actually recorded, so a run
# shorter than 1000 episodes is not reported with an understated win rate.
recent_wins = training_history['episode_wins'][-1000:]
recent_count = len(recent_wins)
last_1000_wins = sum(recent_wins)
last_1000_win_rate = last_1000_wins / recent_count if recent_count else 0.0

if recent_count:
    print(f"\nLast {recent_count} episodes:")
    print(f"  Win rate: {last_1000_win_rate:.4f} ({last_1000_win_rate*100:.2f}%)")
    print(f"  Avg reward: {np.mean(training_history['episode_rewards'][-1000:]):.2f}")
    print(f"  Avg wrong guesses: {np.mean(training_history['episode_wrong_guesses'][-1000:]):.2f}")

# Evaluation figures exist only if at least one evaluation pass ran.
if training_history['eval_win_rates']:
    print(f"\nBest evaluation win rate: {max(training_history['eval_win_rates']):.4f}")
    print(f"Final evaluation win rate: {training_history['eval_win_rates'][-1]:.4f}")

print("=" * 60)

## Next Step: Evaluate the Improved Model

Now go back to `07_evaluation.ipynb` and:
1. Change the model loading line to: `agent.load('../models/rl_agent_improved.pkl')`
2. Re-run the evaluation cells
3. Compare the new results with those of the previously evaluated model.