# FIX #1: Abstract State Space

## THE PROBLEM:
- Current: 38,155 unique states in Q-table
- Training: 5,000 episodes
- Result: 0.13 visits per state → NO LEARNING!

## THE SOLUTION:
- Abstract state representation
- Target: ~5,000 unique states
- Training: 20,000 episodes
- Result: 4.0 visits per state → LEARNING POSSIBLE!

## EXPECTED IMPACT:
+25-35% win rate improvement (21% → 46-56%)

In [1]:
import sys
sys.path.append('../src')

import numpy as np
import pickle
from collections import defaultdict, Counter
from tqdm import tqdm
from hangman_env import HangmanEnv
from utils import calculate_final_score

print("‚úì Imports successful")

# Load corpus FIRST (no dependencies).
# Each line is stripped and lowercased, reduced to its alphabetic characters,
# and kept only if something is left afterwards.
with open('../Data/corpus.txt', 'r', encoding='utf-8') as f:
    corpus_words = [
        word
        for word in (''.join(filter(str.isalpha, line.strip().lower())) for line in f)
        if word
    ]

print(f"‚úì Loaded {len(corpus_words)} training words")

# Define ImprovedHMM class (needed BEFORE loading pickle)
class ImprovedHMM:
    """Letter predictor that blends several corpus frequency tables.

    Instances are normally restored from a pickle, so this class has to be
    defined before ``pickle.load`` runs.  ``__init__`` only creates empty
    tables; the counts themselves come from whatever state is loaded.

    Tables, as used by ``predict_letter_probabilities``:
        global_freq: per-letter weight, looked up with ``.get(letter, 0.0)``.
        bigrams: ``bigrams[x][y]`` is read as the count of ``y`` directly
            after ``x``.
        position_freq: ``position_freq[i][c]`` is the count of ``c`` at
            index ``i`` (only indices below 20 are consulted).
        length_patterns: ``length_patterns[n][c]`` is the count of ``c``
            for words of length ``n``.
    """

    def __init__(self):
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'
        self.global_freq = Counter()
        self.bigrams = defaultdict(Counter)
        self.position_freq = defaultdict(Counter)
        self.length_patterns = defaultdict(Counter)

    def predict_letter_probabilities(self, masked_word, guessed_letters, word_length):
        """Score every alphabet letter for the current board.

        Args:
            masked_word: sequence with one entry per position; ``None`` marks
                an unrevealed position, anything else is the revealed letter.
            guessed_letters: container of letters already tried; these never
                receive any score.
            word_length: key used to look up ``length_patterns``.

        Returns:
            dict mapping each letter of ``self.alphabet`` to a score.  The
            scores are normalised to sum to 1 when at least one is positive;
            otherwise every score stays 0.0.
        """
        probs = {c: 0.0 for c in self.alphabet}

        # Only letters that are both in the alphabet and not yet guessed may
        # receive score.  The tables can contain other symbols (the corpus
        # filter uses str.isalpha, which accepts non-ASCII letters); those are
        # skipped instead of raising KeyError on ``probs``.
        candidates = {c for c in self.alphabet if c not in guessed_letters}

        # Source 1: global letter frequency (weight 1.0).
        for char in self.alphabet:
            if char in candidates:
                probs[char] += self.global_freq.get(char, 0.0) * 1.0

        # Source 2: bigram context around each revealed letter (weight 2.0).
        for i, char in enumerate(masked_word):
            if char is None:
                continue

            # Blank to the right: letters that follow ``char``.
            # ``in self.bigrams`` avoids inserting empty rows in the defaultdict.
            if i + 1 < len(masked_word) and masked_word[i + 1] is None:
                if char in self.bigrams:
                    total = sum(self.bigrams[char].values())
                    if total > 0:
                        for next_char, count in self.bigrams[char].items():
                            if next_char in candidates:
                                probs[next_char] += (count / total) * 2.0

            # Blank to the left: letters that are followed by ``char``.
            if i > 0 and masked_word[i - 1] is None:
                for prev_char in self.alphabet:
                    if prev_char in candidates and prev_char in self.bigrams:
                        if char in self.bigrams[prev_char]:
                            count = self.bigrams[prev_char][char]
                            total = sum(self.bigrams[prev_char].values())
                            if total > 0:
                                probs[prev_char] += (count / total) * 2.0

        # Source 3: positional frequency for each blank (weight 1.5).
        for i, char in enumerate(masked_word):
            if char is None and i < 20:
                if i in self.position_freq:
                    total = sum(self.position_freq[i].values())
                    if total > 0:
                        for c, count in self.position_freq[i].items():
                            if c in candidates:
                                probs[c] += (count / total) * 1.5

        # Source 4: letter frequency for words of this length (weight 0.5).
        if word_length in self.length_patterns:
            total = sum(self.length_patterns[word_length].values())
            if total > 0:
                for c, count in self.length_patterns[word_length].items():
                    if c in candidates:
                        probs[c] += (count / total) * 0.5

        # Normalise so the scores form a distribution (when any mass exists).
        total = sum(probs.values())
        if total > 0:
            probs = {c: p / total for c, p in probs.items()}

        return probs

# Confirmation that the class definition above has executed.
print("‚úì ImprovedHMM class defined")

# NOW we can load the pickle
# pickle records instances by class reference, so ImprovedHMM has to be
# resolvable when the file is read (presumably it was pickled from a notebook
# where the class lived in __main__ -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# that come from a trusted source.
with open('../models/improved_hmm.pkl', 'rb') as f:
    hmm = pickle.load(f)

print("‚úì Loaded improved HMM model")

✓ Imports successful
✓ Loaded 50000 training words
✓ ImprovedHMM class defined
✓ Loaded improved HMM model
✓ Loaded 50000 training words
✓ ImprovedHMM class defined
✓ Loaded improved HMM model


## Define RL Agent with Abstract State Space

In [7]:
class AbstractStateRLAgent:
    """Tabular Q-learning Hangman agent over a coarse abstract state space.

    A board is summarised as a small tuple (length bucket, game phase, lives
    bucket, first/last character type, common-letter flag) so that many
    concrete boards share one Q-table row.  When exploiting, the row's
    Q-values are added to ``hmm_weight`` times the HMM letter scores.

    The ``env`` objects passed in must provide ``word``, ``lives``,
    ``guessed_letters`` and ``get_masked_word_list()``; the ``hmm`` object
    must provide ``predict_letter_probabilities(masked, guessed, length)``.
    """

    def __init__(self, hmm, alphabet='abcdefghijklmnopqrstuvwxyz'):
        self.alphabet = alphabet
        self.hmm = hmm

        # Q-table with ABSTRACT states: one array of per-letter values per
        # state, created lazily.  Note that merely reading an unseen state
        # inserts a zero row.
        self.q_table = defaultdict(lambda: np.zeros(len(alphabet)))

        # Hyperparameters
        self.alpha = 0.15            # learning rate
        self.gamma = 0.9             # discount factor
        self.epsilon = 1.0           # current exploration probability
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.9995  # multiplicative decay per decay_epsilon() call

        # HMM weight: interpolated from start to end by update_hmm_weight()
        self.hmm_weight_start = 2.0
        self.hmm_weight_end = 1.0
        self.hmm_weight = self.hmm_weight_start

        # Track state space size: counts every get_abstract_state() call
        self.state_visits = Counter()

    @staticmethod
    def _char_type(char):
        """Classify a board cell as blank ('_'), vowel ('V') or other ('C')."""
        if char is None:
            return '_'
        return 'V' if char in 'aeiou' else 'C'

    def get_abstract_state(self, env):
        """Map the environment's current board to an abstract state tuple.

        Side effect: increments ``self.state_visits`` for the returned state
        on every call (training and evaluation alike).

        Returns:
            ``(len_bucket, phase, lives_bucket, (first_type, last_type),
            has_common)`` -- at most 4 * 4 * 3 * 9 * 2 = 864 distinct values.
        """
        masked = env.get_masked_word_list()
        word_len = len(env.word)

        # 1. Length bucket (coarser)
        if word_len <= 5:
            len_bucket = 0  # Short
        elif word_len <= 8:
            len_bucket = 1  # Medium
        elif word_len <= 12:
            len_bucket = 2  # Long
        else:
            len_bucket = 3  # Very long

        # 2. Game phase (based on blanks remaining)
        blanks = sum(1 for c in masked if c is None)
        blank_ratio = blanks / word_len if word_len > 0 else 1.0

        if blank_ratio > 0.8:
            phase = 0  # Early game (>80% blank)
        elif blank_ratio > 0.5:
            phase = 1  # Mid game (50-80% blank)
        elif blank_ratio > 0.2:
            phase = 2  # Late game (20-50% blank)
        else:
            phase = 3  # End game (<20% blank)

        # 3. Lives bucket
        if env.lives >= 5:
            lives_bucket = 2  # Safe
        elif env.lives >= 3:
            lives_bucket = 1  # Moderate
        else:
            lives_bucket = 0  # Danger

        # 4. Simple pattern: just first and last char type
        first_char = masked[0] if len(masked) > 0 else None
        last_char = masked[-1] if len(masked) > 0 else None
        first_type = self._char_type(first_char)
        last_type = self._char_type(last_char)

        # 5. Has common letters revealed?
        common_letters = 'etaoin'
        has_common = any(c in common_letters for c in masked if c is not None)

        state = (
            len_bucket,      # 4 values
            phase,           # 4 values
            lives_bucket,    # 3 values
            (first_type, last_type),  # 9 combinations (V,C,_)
            has_common      # 2 values
        )

        # Track visits
        self.state_visits[state] += 1

        return state

    def select_action(self, state, env, training=True):
        """Select a letter using epsilon-greedy with an HMM prior.

        Args:
            state: abstract state tuple from ``get_abstract_state``.
            env: environment supplying ``guessed_letters``, ``word`` and
                ``get_masked_word_list()``.
            training: when true, explore with probability ``self.epsilon``.

        Returns:
            An unguessed letter, or ``None`` when every letter of the
            alphabet has already been guessed.
        """
        # Exploration: uniform choice among letters not yet guessed
        if training and np.random.random() < self.epsilon:
            available = [c for c in self.alphabet if c not in env.guessed_letters]
            if available:
                return np.random.choice(available)
            return None

        # Exploitation: Q-values + HMM probabilities
        q_values = self.q_table[state].copy()

        # Get HMM probabilities
        masked_list = env.get_masked_word_list()
        hmm_probs = self.hmm.predict_letter_probabilities(
            masked_list,
            env.guessed_letters,
            len(env.word)
        )

        # Combine Q-values and HMM probabilities; guessed letters are masked
        # with -inf so argmax can never pick them.
        action_values = np.zeros(len(self.alphabet))
        for i, char in enumerate(self.alphabet):
            if char not in env.guessed_letters:
                action_values[i] = q_values[i] + self.hmm_weight * hmm_probs.get(char, 0.0)
            else:
                action_values[i] = -np.inf

        if np.all(action_values == -np.inf):
            return None

        action_idx = np.argmax(action_values)
        return self.alphabet[action_idx]

    def update(self, state, action, reward, next_state, done):
        """Apply one Q-learning update; a ``None`` action is ignored.

        Raises:
            ValueError: if ``action`` is not a letter of ``self.alphabet``.
        """
        if action is None:
            return

        action_idx = self.alphabet.index(action)

        # Current Q-value
        current_q = self.q_table[state][action_idx]

        # Max Q-value for next state (terminal transitions bootstrap from 0)
        if done:
            max_next_q = 0
        else:
            max_next_q = np.max(self.q_table[next_state])

        # Q-learning update
        new_q = current_q + self.alpha * (reward + self.gamma * max_next_q - current_q)
        self.q_table[state][action_idx] = new_q

    def decay_epsilon(self):
        """Decay exploration rate, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_hmm_weight(self, progress):
        """Interpolate the HMM weight linearly from start (0.0) to end (1.0)."""
        self.hmm_weight = self.hmm_weight_start + progress * (self.hmm_weight_end - self.hmm_weight_start)

    def get_statistics(self):
        """Get state space statistics.

        Returns:
            dict with ``unique_states`` (rows in the Q-table),
            ``total_visits`` (sum of ``state_visits``),
            ``avg_visits_per_state`` and the mean/std/min/max over all
            Q-values.  With an empty Q-table every Q statistic is 0.0
            instead of raising from a reduction over an empty array.
        """
        unique_states = len(self.q_table)
        total_visits = sum(self.state_visits.values())
        avg_visits = total_visits / unique_states if unique_states > 0 else 0

        if unique_states == 0:
            # Nothing learned yet: np.min/np.max would raise on an empty array.
            return {
                'unique_states': 0,
                'total_visits': total_visits,
                'avg_visits_per_state': avg_visits,
                'q_mean': 0.0,
                'q_std': 0.0,
                'q_min': 0.0,
                'q_max': 0.0
            }

        q_values_flat = np.array(
            [q for state_values in self.q_table.values() for q in state_values]
        )

        return {
            'unique_states': unique_states,
            'total_visits': total_visits,
            'avg_visits_per_state': avg_visits,
            'q_mean': np.mean(q_values_flat),
            'q_std': np.std(q_values_flat),
            'q_min': np.min(q_values_flat),
            'q_max': np.max(q_values_flat)
        }

    def save(self, filepath):
        """Pickle the Q-table, epsilon, HMM weight and visit counts to ``filepath``.

        The defaultdict is converted to a plain dict first because its
        lambda default factory cannot be pickled.
        """
        data = {
            'q_table': dict(self.q_table),
            'epsilon': self.epsilon,
            'hmm_weight': self.hmm_weight,
            'state_visits': dict(self.state_visits)
        }
        with open(filepath, 'wb') as f:
            pickle.dump(data, f)
        print(f"‚úì Agent saved to {filepath}")

# Cell-completion marker: printed once the class definition above has executed.
print("‚úì AbstractStateRLAgent class defined (REVISED - simpler abstraction)")

✓ AbstractStateRLAgent class defined (REVISED - simpler abstraction)


## Train Agent with 20K Episodes

In [8]:
def train_abstract_agent(agent, train_words, episodes=20000, log_every=5000):
    """Train the RL agent with abstract states.

    Each episode plays one Hangman game on a uniformly sampled word, updating
    the agent's Q-table after every guess, then decays epsilon and moves the
    HMM weight along its schedule.

    Args:
        agent: object providing ``get_abstract_state``, ``select_action``,
            ``update``, ``decay_epsilon``, ``update_hmm_weight``,
            ``get_statistics`` plus ``epsilon`` and ``hmm_weight`` attributes.
        train_words: non-empty indexable sequence of words.
        episodes: number of games to play.
        log_every: print a progress report every this many episodes; the
            report averages over the same number of most recent episodes.
            A falsy value disables the periodic reports.

    Returns:
        ``(episode_rewards, win_rates)``: per-episode total reward and a
        per-episode 1/0 win indicator.
    """
    print(f"Training agent with ABSTRACT states for {episodes} episodes...\n")
    # NOTE(review): these expectations look stale -- the agent's own state
    # comment bounds the revised abstraction at 864 states. Confirm and update.
    print("Expected state space: ~5,000 states (vs 38,155 before)")
    print("Expected visits per state: ~4.0 (vs 0.13 before)\n")

    episode_rewards = []
    win_rates = []
    num_words = len(train_words)

    for episode in tqdm(range(episodes), desc="Training"):
        # Sample a random word by index. np.random.choice on a Python list
        # would rebuild an array from the whole list on every episode.
        word = train_words[np.random.randint(num_words)]
        env = HangmanEnv(word, max_lives=6)
        state = agent.get_abstract_state(env)

        episode_reward = 0

        while not env.done:
            # Select action
            action = agent.select_action(state, env, training=True)

            # None means no letter is left to guess
            if action is None:
                break

            # Take action; termination is read from env.done below rather
            # than from the returned flag.
            _, reward, _done, _info = env.step(action)

            episode_reward += reward

            # Get next state
            next_state = agent.get_abstract_state(env)

            # Update Q-table
            agent.update(state, action, reward, next_state, env.done)

            state = next_state

        episode_rewards.append(episode_reward)
        win_rates.append(1 if env.won else 0)

        # Decay epsilon
        agent.decay_epsilon()

        # Update HMM weight (progress runs from 0 up to, but excluding, 1)
        progress = episode / episodes
        agent.update_hmm_weight(progress)

        # Log progress
        if log_every and (episode + 1) % log_every == 0:
            recent_win_rate = np.mean(win_rates[-log_every:])
            recent_reward = np.mean(episode_rewards[-log_every:])
            stats = agent.get_statistics()

            print(f"\nEpisode {episode + 1}:")
            print(f"  Win Rate (last {log_every}): {recent_win_rate:.2%}")
            print(f"  Avg Reward: {recent_reward:.2f}")
            print(f"  Epsilon: {agent.epsilon:.4f}")
            print(f"  HMM Weight: {agent.hmm_weight:.2f}")
            print(f"  Unique states: {stats['unique_states']:,}")
            print(f"  Avg visits/state: {stats['avg_visits_per_state']:.2f}")
            print(f"  Q-values: mean={stats['q_mean']:.3f}, std={stats['q_std']:.3f}")

    print("\n‚úÖ Training complete!")

    # Final statistics
    final_stats = agent.get_statistics()
    print("\n" + "="*70)
    print("FINAL STATE SPACE ANALYSIS")
    print("="*70)
    print(f"Unique states created: {final_stats['unique_states']:,}")
    print(f"Total state visits: {final_stats['total_visits']:,}")
    print(f"Average visits per state: {final_stats['avg_visits_per_state']:.2f}")
    print(f"Q-value statistics:")
    print(f"  Mean: {final_stats['q_mean']:.3f}")
    print(f"  Std: {final_stats['q_std']:.3f}")
    print(f"  Min: {final_stats['q_min']:.3f}")
    print(f"  Max: {final_stats['q_max']:.3f}")
    print("="*70)

    return episode_rewards, win_rates

# Create and train agent
# The agent wraps the unpickled `hmm`; training samples words from the
# cleaned `corpus_words` list. The two returned lists hold one entry per
# episode (total reward and a 1/0 win flag).
agent = AbstractStateRLAgent(hmm)
episode_rewards, win_rates = train_abstract_agent(agent, corpus_words, episodes=20000)

Training agent with ABSTRACT states for 20000 episodes...

Expected state space: ~5,000 states (vs 38,155 before)
Expected visits per state: ~4.0 (vs 0.13 before)



Training:  25%|██▌       | 5019/20000 [00:34<01:47, 139.33it/s]


Episode 5000:
  Win Rate (last 5000): 4.72%
  Avg Reward: -3.15
  Epsilon: 0.0820
  HMM Weight: 1.75
  Unique states: 464
  Avg visits/state: 120.17
  Q-values: mean=-0.007, std=0.900


Training:  50%|█████     | 10023/20000 [01:09<01:10, 141.38it/s]


Episode 10000:
  Win Rate (last 5000): 10.92%
  Avg Reward: -0.34
  Epsilon: 0.0100
  HMM Weight: 1.50
  Unique states: 482
  Avg visits/state: 241.29
  Q-values: mean=0.061, std=1.317


Training:  75%|███████▌  | 15019/20000 [01:45<00:35, 139.67it/s]


Episode 15000:
  Win Rate (last 5000): 11.40%
  Avg Reward: -0.07
  Epsilon: 0.0100
  HMM Weight: 1.25
  Unique states: 490
  Avg visits/state: 361.97
  Q-values: mean=0.104, std=1.565


Training: 100%|██████████| 20000/20000 [02:21<00:00, 141.64it/s]


Episode 20000:
  Win Rate (last 5000): 11.48%
  Avg Reward: -0.11
  Epsilon: 0.0100
  HMM Weight: 1.00
  Unique states: 493
  Avg visits/state: 483.71
  Q-values: mean=0.137, std=1.750

✅ Training complete!

FINAL STATE SPACE ANALYSIS
Unique states created: 493
Total state visits: 238,467
Average visits per state: 483.71
Q-value statistics:
  Mean: 0.137
  Std: 1.750
  Min: -4.897
  Max: 11.280





## Evaluate on Test Set

In [9]:
# Load test words, cleaned the same way as the training corpus: lowercase,
# letters only, decoded as UTF-8, and dropping any line that has no letters
# left (an empty word cannot be played).
with open('../Data/test.txt', 'r', encoding='utf-8') as f:
    test_words = [
        word
        for word in (''.join(c for c in line.strip().lower() if c.isalpha()) for line in f)
        if word
    ]

def evaluate_agent(agent, test_words):
    """Play every test word greedily and aggregate the outcome.

    Args:
        agent: object providing ``get_abstract_state`` and ``select_action``.
        test_words: iterable of words, one game each.

    Returns:
        ``(rate, score, wrong, repeated)``: win rate, the value returned by
        ``calculate_final_score``, and the summed wrong / repeated guess
        counts reported by each game's ``get_stats()``.
    """
    games = 0
    wins = 0
    wrong = 0
    repeated = 0

    for word in tqdm(test_words, desc="Evaluating"):
        env = HangmanEnv(word, max_lives=6)
        state = agent.get_abstract_state(env)

        # Greedy play (no exploration) until the game ends or no letter is left.
        while not env.done:
            guess = agent.select_action(state, env, training=False)
            if guess is None:
                break
            env.step(guess)
            state = agent.get_abstract_state(env)

        game_stats = env.get_stats()
        games += 1
        if env.won:
            wins += 1
        wrong += game_stats['wrong_count']
        repeated += game_stats['repeated_count']

    rate = wins / games
    score = calculate_final_score(rate, wrong, repeated, games)

    return rate, score, wrong, repeated

# Banner for the evaluation report.
print("\n" + "="*70)
print("EVALUATION: FIX #1 - ABSTRACT STATES")
print("="*70)

# Run the trained agent over the held-out test words.
rate, score, wrong, repeated = evaluate_agent(agent, test_words)

print(f"\nWin Rate: {rate:.4f} ({rate*100:.2f}%)")
print(f"Total Wrong Guesses: {wrong}")
print(f"Total Repeated: {repeated}")
print(f"\nüéØ FINAL SCORE: {score:.2f}")
print("="*70)

# Compare with baseline
# The baseline figures (21.10% win rate, -54,693 score) are hard-coded here,
# not recomputed in this notebook.
print("\nüìä Comparison:")
print(f"  Before Fix #1: 21.10% win rate, -54,693 score")
print(f"  After Fix #1:  {rate*100:.2f}% win rate, {score:.2f} score")
improvement = rate * 100 - 21.10
print(f"  Improvement:   {improvement:+.2f} percentage points")
print()
# Success threshold: a win rate of at least 45%.
if rate >= 0.45:
    print("‚úÖ FIX #1 SUCCESS! State abstraction is working!")
    print("   Ready to proceed to FIX #2")
else:
    print("‚ö†Ô∏è  Lower than expected. Analyzing...")


EVALUATION: FIX #1 - ABSTRACT STATES


Evaluating: 100%|██████████| 2000/2000 [00:01<00:00, 1352.85it/s]


Win Rate: 0.1195 (11.95%)
Total Wrong Guesses: 11557
Total Repeated: 0

🎯 FINAL SCORE: -57546.00

📊 Comparison:
  Before Fix #1: 21.10% win rate, -54,693 score
  After Fix #1:  11.95% win rate, -57546.00 score
  Improvement:   -9.15 percentage points

⚠️  Lower than expected. Analyzing...





## Save Fixed Agent

In [None]:
# Save the agent
# AbstractStateRLAgent.save pickles the Q-table, epsilon, HMM weight and
# visit counts to the given path.
agent.save('../models/fix1_abstract_agent.pkl')
print("\n‚úÖ Fix #1 agent saved!")
print("\nNext: Fix #2 - Analyze test set letter frequency")