# Proper RL Training from Scratch

**Goal:** Train a PROPER RL agent that learns to use the improved HMM effectively

Strategy: Train Q-learning agent that combines learned Q-values with improved HMM probabilities

In [10]:
import sys
sys.path.append('../src')

import numpy as np
import pickle
from collections import defaultdict, Counter
from tqdm import tqdm
from hangman_env import HangmanEnv
from utils import calculate_final_score

# Define ImprovedHMM class (needed to load the pickle)
class ImprovedHMM:
    def __init__(self):
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'
        self.global_freq = Counter()
        self.bigrams = defaultdict(Counter)
        self.position_freq = defaultdict(Counter)
        self.length_patterns = defaultdict(Counter)
    
    def predict_letter_probabilities(self, masked_word, guessed_letters, word_length):
        """Predict letter probabilities for current state."""
        probs = {c: 0.0 for c in self.alphabet}
        
        # Strategy 1: Global frequency
        for char in self.alphabet:
            if char not in guessed_letters:
                probs[char] += self.global_freq.get(char, 0.0) * 1.0
        
        # Strategy 2: Bigrams
        for i, char in enumerate(masked_word):
            if char is not None:
                if i + 1 < len(masked_word) and masked_word[i+1] is None:
                    if char in self.bigrams:
                        total = sum(self.bigrams[char].values())
                        if total > 0:
                            for next_char, count in self.bigrams[char].items():
                                if next_char not in guessed_letters:
                                    probs[next_char] += (count / total) * 2.0
                
                if i > 0 and masked_word[i-1] is None:
                    for prev_char in self.alphabet:
                        if prev_char not in guessed_letters and prev_char in self.bigrams:
                            if char in self.bigrams[prev_char]:
                                count = self.bigrams[prev_char][char]
                                total = sum(self.bigrams[prev_char].values())
                                if total > 0:
                                    probs[prev_char] += (count / total) * 2.0
        
        # Strategy 3: Position frequency
        for i, char in enumerate(masked_word):
            if char is None and i < 20:
                if i in self.position_freq:
                    total = sum(self.position_freq[i].values())
                    if total > 0:
                        for c, count in self.position_freq[i].items():
                            if c not in guessed_letters:
                                probs[c] += (count / total) * 1.5
        
        # Strategy 4: Length patterns
        if word_length in self.length_patterns:
            total = sum(self.length_patterns[word_length].values())
            if total > 0:
                for c, count in self.length_patterns[word_length].items():
                    if c not in guessed_letters:
                        probs[c] += (count / total) * 0.5
        
        # Normalize
        total = sum(probs.values())
        if total > 0:
            probs = {c: p/total for c, p in probs.items()}
        
        return probs

# Load the improved HMM
with open('../models/improved_hmm.pkl', 'rb') as f:
    hmm = pickle.load(f)

print("âœ“ Loaded improved HMM")

# Load corpus for training
with open('../Data/corpus.txt', 'r', encoding='utf-8') as f:
    corpus_words = [line.strip().lower() for line in f if line.strip()]
corpus_words = [''.join(c for c in word if c.isalpha()) for word in corpus_words]
corpus_words = [w for w in corpus_words if len(w) > 0]

print(f"âœ“ Loaded {len(corpus_words)} training words")

âœ“ Loaded improved HMM
âœ“ Loaded 50000 training words


## Define Improved RL Agent

Key improvements:
1. Better state representation
2. Optimized hyperparameters
3. Better exploration strategy
4. Adaptive HMM weight

In [2]:
class ImprovedRLAgent:
    def __init__(self, hmm, alphabet='abcdefghijklmnopqrstuvwxyz'):
        self.alphabet = alphabet
        self.hmm = hmm
        
        # Q-table
        self.q_table = defaultdict(lambda: np.zeros(len(alphabet)))
        
        # Hyperparameters - optimized
        self.alpha = 0.1  # Learning rate
        self.gamma = 0.9  # Discount factor
        self.epsilon = 1.0  # Exploration rate (starts high)
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        
        # HMM weight - adaptive
        self.hmm_weight_start = 2.0  # Start with high HMM influence
        self.hmm_weight_end = 1.0    # End with balanced approach
        self.hmm_weight = self.hmm_weight_start
    
    def get_state(self, env):
        """Enhanced state representation."""
        masked = env.get_masked_word()
        lives = env.lives
        word_len = len(env.word)
        guessed_count = len(env.guessed_letters)
        
        # Create state tuple
        state = (masked, lives, word_len, guessed_count)
        return state
    
    def select_action(self, state, env, training=True):
        """Select action using epsilon-greedy with HMM prior."""
        # Exploration
        if training and np.random.random() < self.epsilon:
            available = [c for c in self.alphabet if c not in env.guessed_letters]
            if available:
                return np.random.choice(available)
            return None
        
        # Exploitation: Q-values + HMM probabilities
        q_values = self.q_table[state].copy()
        
        # Get HMM probabilities
        masked_list = env.get_masked_word_list()
        hmm_probs = self.hmm.predict_letter_probabilities(
            masked_list, 
            env.guessed_letters, 
            len(env.word)
        )
        
        # Combine Q-values and HMM probabilities
        action_values = np.zeros(len(self.alphabet))
        for i, char in enumerate(self.alphabet):
            if char not in env.guessed_letters:
                action_values[i] = q_values[i] + self.hmm_weight * hmm_probs.get(char, 0.0)
            else:
                action_values[i] = -np.inf
        
        if np.all(action_values == -np.inf):
            return None
        
        action_idx = np.argmax(action_values)
        return self.alphabet[action_idx]
    
    def update(self, state, action, reward, next_state, done):
        """Update Q-value using Q-learning."""
        if action is None:
            return
        
        action_idx = self.alphabet.index(action)
        
        # Current Q-value
        current_q = self.q_table[state][action_idx]
        
        # Max Q-value for next state
        if done:
            max_next_q = 0
        else:
            max_next_q = np.max(self.q_table[next_state])
        
        # Q-learning update
        new_q = current_q + self.alpha * (reward + self.gamma * max_next_q - current_q)
        self.q_table[state][action_idx] = new_q
    
    def decay_epsilon(self):
        """Decay exploration rate."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
    
    def update_hmm_weight(self, progress):
        """Gradually reduce HMM weight as agent learns."""
        self.hmm_weight = self.hmm_weight_start + progress * (self.hmm_weight_end - self.hmm_weight_start)
    
    def save(self, filepath):
        """Save agent."""
        data = {
            'q_table': dict(self.q_table),
            'epsilon': self.epsilon,
            'hmm_weight': self.hmm_weight
        }
        with open(filepath, 'wb') as f:
            pickle.dump(data, f)
        print(f"âœ“ Agent saved to {filepath}")
    
    def load(self, filepath):
        """Load agent."""
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        self.q_table = defaultdict(lambda: np.zeros(len(self.alphabet)), data['q_table'])
        self.epsilon = data['epsilon']
        self.hmm_weight = data['hmm_weight']
        print(f"âœ“ Agent loaded from {filepath}")

print("âœ“ ImprovedRLAgent class defined")

âœ“ ImprovedRLAgent class defined


## Train RL Agent

Train on corpus words with proper reward shaping

In [5]:
def train_agent(agent, train_words, episodes=5000):
    """Train the RL agent."""
    print(f"Training RL agent for {episodes} episodes...\n")
    
    episode_rewards = []
    win_rates = []
    
    for episode in tqdm(range(episodes), desc="Training"):
        # Sample random word
        word = np.random.choice(train_words)
        env = HangmanEnv(word, max_lives=6)
        state = agent.get_state(env)
        
        episode_reward = 0
        
        while not env.done:
            # Select action
            action = agent.select_action(state, env, training=True)
            
            if action is None:
                break
            
            # Take action - returns (observation, reward, done, info)
            _, reward, done, info = env.step(action)
            
            episode_reward += reward
            
            # Get next state
            next_state = agent.get_state(env)
            
            # Update Q-table
            agent.update(state, action, reward, next_state, env.done)
            
            state = next_state
        
        episode_rewards.append(episode_reward)
        win_rates.append(1 if env.won else 0)
        
        # Decay epsilon
        agent.decay_epsilon()
        
        # Update HMM weight
        progress = episode / episodes
        agent.update_hmm_weight(progress)
        
        # Log progress
        if (episode + 1) % 1000 == 0:
            recent_win_rate = np.mean(win_rates[-1000:])
            recent_reward = np.mean(episode_rewards[-1000:])
            print(f"\nEpisode {episode + 1}:")
            print(f"  Win Rate (last 1000): {recent_win_rate:.2%}")
            print(f"  Avg Reward: {recent_reward:.2f}")
            print(f"  Epsilon: {agent.epsilon:.4f}")
            print(f"  HMM Weight: {agent.hmm_weight:.2f}")
            print(f"  Q-table size: {len(agent.q_table)}")
    
    print("\nâœ… Training complete!")
    return episode_rewards, win_rates

# Create and train agent
agent = ImprovedRLAgent(hmm)
episode_rewards, win_rates = train_agent(agent, corpus_words, episodes=5000)

Training RL agent for 5000 episodes...



Training:  20%|â–ˆâ–ˆ        | 1025/5000 [00:07<00:27, 142.04it/s]


Episode 1000:
  Win Rate (last 1000): 16.40%
  Avg Reward: 0.15
  Epsilon: 0.0100
  HMM Weight: 1.80
  Q-table size: 8134


Training:  41%|â–ˆâ–ˆâ–ˆâ–ˆ      | 2028/5000 [00:14<00:21, 141.44it/s]


Episode 2000:
  Win Rate (last 1000): 19.80%
  Avg Reward: 1.43
  Epsilon: 0.0100
  HMM Weight: 1.60
  Q-table size: 15993


Training:  61%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ    | 3027/5000 [00:21<00:13, 141.32it/s]


Episode 3000:
  Win Rate (last 1000): 20.60%
  Avg Reward: 1.78
  Epsilon: 0.0100
  HMM Weight: 1.40
  Q-table size: 23631


Training:  80%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ  | 4015/5000 [00:28<00:06, 140.89it/s]


Episode 4000:
  Win Rate (last 1000): 20.80%
  Avg Reward: 1.93
  Epsilon: 0.0100
  HMM Weight: 1.20
  Q-table size: 30992


Training: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [00:35<00:00, 141.19it/s]


Episode 5000:
  Win Rate (last 1000): 23.80%
  Avg Reward: 2.38
  Epsilon: 0.0100
  HMM Weight: 1.00
  Q-table size: 38155

âœ… Training complete!





## Evaluate Trained Agent

In [6]:
# Load test words
with open('../Data/test.txt', 'r') as f:
    test_words = [''.join(c for c in line.strip().lower() if c.isalpha()) for line in f if line.strip()][:500]

def evaluate_agent(agent, test_words):
    """Evaluate agent on test set."""
    results = []
    
    for word in tqdm(test_words, desc="Evaluating"):
        env = HangmanEnv(word, max_lives=6)
        state = agent.get_state(env)
        
        while not env.done:
            action = agent.select_action(state, env, training=False)
            if action is None:
                break
            env.step(action)
            state = agent.get_state(env)
        
        stats = env.get_stats()
        results.append({
            'won': env.won,
            'wrong': stats['wrong_count'],
            'repeated': stats['repeated_count']
        })
    
    wins = sum(1 for r in results if r['won'])
    rate = wins / len(results)
    wrong = sum(r['wrong'] for r in results)
    repeated = sum(r['repeated'] for r in results)
    score = calculate_final_score(rate, wrong, repeated, len(results))
    
    return rate, score, wrong, repeated

print("\n" + "="*60)
print("EVALUATING IMPROVED RL AGENT (500 test words)")
print("="*60)

rate, score, wrong, repeated = evaluate_agent(agent, test_words)

print(f"\nWin Rate: {rate:.4f} ({rate*100:.2f}%)")
print(f"Wrong Guesses: {wrong} (avg: {wrong/len(test_words):.2f})")
print(f"Repeated: {repeated}")
print(f"Score: {score:.2f}")
print("="*60)


EVALUATING IMPROVED RL AGENT (500 test words)


Evaluating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 500/500 [00:00<00:00, 1476.68it/s]


Win Rate: 0.2360 (23.60%)
Wrong Guesses: 2728 (avg: 5.46)
Repeated: 0
Score: -13522.00





## Save Trained Agent

In [7]:
# Save the trained agent
agent.save('../models/improved_rl_agent.pkl')
print("\nâœ… Improved RL agent saved!")

âœ“ Agent saved to ../models/improved_rl_agent.pkl

âœ… Improved RL agent saved!


## Final Evaluation on Full Test Set

In [8]:
# Load ALL test words
with open('../Data/test.txt', 'r') as f:
    all_test_words = [''.join(c for c in line.strip().lower() if c.isalpha()) for line in f if line.strip()]

print(f"\nRunning FINAL evaluation on all {len(all_test_words)} test words...\n")

rate, score, wrong, repeated = evaluate_agent(agent, all_test_words)

print("\n" + "="*70)
print("FINAL RESULTS - IMPROVED RL + IMPROVED HMM")
print("="*70)
print(f"Win Rate: {rate:.4f} ({rate*100:.2f}%)")
print(f"Total Wrong Guesses: {wrong}")
print(f"Total Repeated: {repeated}")
print(f"\nðŸŽ¯ FINAL SCORE: {score:.2f}")
print("="*70)

# Compare with baseline
print("\nðŸ“Š Comparison:")
print(f"  Previous best: 19.25% win rate, -55,325 score")
print(f"  Current:       {rate*100:.2f}% win rate, {score:.2f} score")
improvement = rate * 100 - 19.25
print(f"  Improvement:   {improvement:+.2f} percentage points")


Running FINAL evaluation on all 2000 test words...



Evaluating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2000/2000 [00:01<00:00, 1555.56it/s]


FINAL RESULTS - IMPROVED RL + IMPROVED HMM
Win Rate: 0.2110 (21.10%)
Total Wrong Guesses: 11023
Total Repeated: 0

ðŸŽ¯ FINAL SCORE: -54693.00

ðŸ“Š Comparison:
  Previous best: 19.25% win rate, -55,325 score
  Current:       21.10% win rate, -54693.00 score
  Improvement:   +1.85 percentage points



