In [5]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import pickle
import re
from typing import List, Dict, Tuple, Set
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [6]:
with open('C:/Users/Chara/OneDrive/Desktop/PESU/ML/Hackathon/Data/corpus.txt', 'r') as f:
    corpus = [word.strip().lower() for word in f.readlines()]

print(f"Total words in corpus: {len(corpus)}")
print(f"Sample words: {corpus[:10]}")

from collections import Counter
length_dist = Counter(len(word) for word in corpus)
print(f"\nWord length distribution:")
for length in sorted(length_dist.keys())[:15]:
    print(f"  Length {length}: {length_dist[length]} words")

all_letters = set(''.join(corpus))
print(f"\nUnique letters in corpus: {sorted(all_letters)}")
print(f"Total unique letters: {len(all_letters)}")

Total words in corpus: 50000
Sample words: ['suburbanize', 'asmack', 'hypotypic', 'promoderationist', 'consonantly', 'philatelically', 'cacomelia', 'thicklips', 'luciferase', 'cinematography']

Word length distribution:
  Length 1: 46 words
  Length 2: 84 words
  Length 3: 388 words
  Length 4: 1169 words
  Length 5: 2340 words
  Length 6: 3755 words
  Length 7: 5111 words
  Length 8: 6348 words
  Length 9: 6808 words
  Length 10: 6465 words
  Length 11: 5452 words
  Length 12: 4292 words
  Length 13: 3094 words
  Length 14: 2019 words
  Length 15: 1226 words

Unique letters in corpus: [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Total unique letters: 27


In [7]:
letter_freq = Counter()
for word in corpus:
    letter_freq.update(word)

total_letters = sum(letter_freq.values())
letter_probs = {letter: count/total_letters for letter, count in letter_freq.items()}

sorted_letters = sorted(letter_probs.items(), key=lambda x: x[1], reverse=True)

print("Most frequent letters:")
for letter, prob in sorted_letters[:15]:
    print(f"  '{letter}': {prob:.4f}")

LETTER_FREQUENCY = [letter for letter, _ in sorted_letters]

Most frequent letters:
  'e': 0.1037
  'a': 0.0887
  'i': 0.0886
  'o': 0.0754
  'r': 0.0708
  'n': 0.0702
  't': 0.0678
  's': 0.0612
  'l': 0.0577
  'c': 0.0457
  'u': 0.0387
  'p': 0.0346
  'm': 0.0309
  'd': 0.0302
  'h': 0.0287


In [8]:
class LengthSpecificHMM:
    
    def __init__(self, length):
        self.length = length
        self.position_letter_counts = [defaultdict(int) for _ in range(length)]
        self.position_totals = [0] * length
        self.vocabulary = set()
        
    def train(self, words):
        for word in words:
            if len(word) != self.length:
                continue
                
            for pos, letter in enumerate(word):
                self.position_letter_counts[pos][letter] += 1
                self.position_totals[pos] += 1
                self.vocabulary.add(letter)
    
    def get_letter_probabilities(self, masked_word, guessed_letters):
        letter_scores = defaultdict(float)
        available_letters = self.vocabulary - guessed_letters
        
        for letter in available_letters:
            score = 0.0
            valid_positions = 0
            
            for pos, char in enumerate(masked_word):
                if char == '_':
                    if self.position_totals[pos] > 0:
                        prob = self.position_letter_counts[pos].get(letter, 0.5) / self.position_totals[pos]
                        score += prob
                        valid_positions += 1
                elif char == letter:
                    score += 1.0
                    valid_positions += 1
            
            if valid_positions > 0:
                letter_scores[letter] = score / valid_positions
        
        total = sum(letter_scores.values())
        if total > 0:
            letter_probs = {letter: score/total for letter, score in letter_scores.items()}
        else:
            letter_probs = {letter: 1.0/len(available_letters) for letter in available_letters}
        
        return letter_probs

print("HMM class defined successfully!")

HMM class defined successfully!


In [9]:
words_by_length = defaultdict(list)
for word in corpus:
    words_by_length[len(word)].append(word)

print(f"Training HMMs for {len(words_by_length)} different word lengths...")

hmms = {}
for length, words in words_by_length.items():
    if len(words) >= 5:
        hmm = LengthSpecificHMM(length)
        hmm.train(words)
        hmms[length] = hmm
        print(f"  Length {length}: Trained on {len(words)} words")

print(f"\nTotal HMMs trained: {len(hmms)}")
print(f"Covered word lengths: {sorted(hmms.keys())}")

Training HMMs for 24 different word lengths...
  Length 11: Trained on 5452 words
  Length 6: Trained on 3755 words
  Length 9: Trained on 6808 words
  Length 16: Trained on 698 words
  Length 14: Trained on 2019 words
  Length 10: Trained on 6465 words
  Length 8: Trained on 6348 words
  Length 12: Trained on 4292 words
  Length 13: Trained on 3094 words
  Length 5: Trained on 2340 words
  Length 18: Trained on 174 words
  Length 4: Trained on 1169 words
  Length 3: Trained on 388 words
  Length 7: Trained on 5111 words
  Length 15: Trained on 1226 words
  Length 17: Trained on 375 words
  Length 22: Trained on 8 words
  Length 19: Trained on 88 words
  Length 2: Trained on 84 words
  Length 1: Trained on 46 words
  Length 20: Trained on 40 words
  Length 21: Trained on 16 words

Total HMMs trained: 22
Covered word lengths: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [10]:
def get_matching_words(corpus, masked_word, guessed_letters):
    length = len(masked_word)
    pattern = ""
    
    for char in masked_word:
        if char == '_':
            pattern += '.'
        else:
            pattern += char
    
    regex = re.compile(f"^{pattern}$")
    
    matching = []
    for word in corpus:
        if len(word) != length:
            continue
        if regex.match(word):
            word_letters = set(word)
            if not (guessed_letters - set(masked_word) - {'_'}).intersection(word_letters):
                matching.append(word)
    
    return matching

print("Pattern matching function defined!")

Pattern matching function defined!


In [11]:
class HangmanAgent:
    
    def __init__(self, hmms, corpus, letter_frequency):
        self.hmms = hmms
        self.corpus = corpus
        self.letter_frequency = letter_frequency
        
    def guess_letter(self, masked_word, guessed_letters, lives_remaining):
        length = len(masked_word)
        available_letters = set('abcdefghijklmnopqrstuvwxyz') - guessed_letters
        
        if not available_letters:
            return None
        
        hmm_probs = {}
        if length in self.hmms:
            hmm_probs = self.hmms[length].get_letter_probabilities(masked_word, guessed_letters)
        
        matching_words = get_matching_words(self.corpus, masked_word, guessed_letters)
        pattern_probs = {}
        
        if matching_words:
            letter_counts = Counter()
            for word in matching_words:
                for pos, char in enumerate(word):
                    if masked_word[pos] == '_' and char in available_letters:
                        letter_counts[char] += 1
            
            total = sum(letter_counts.values())
            if total > 0:
                pattern_probs = {letter: count/total for letter, count in letter_counts.items()}
        
        if hmm_probs and pattern_probs:
            combined_probs = {}
            all_letters = set(hmm_probs.keys()) | set(pattern_probs.keys())
            
            for letter in all_letters:
                hmm_score = hmm_probs.get(letter, 0)
                pattern_score = pattern_probs.get(letter, 0)
                combined_probs[letter] = 0.4 * hmm_score + 0.6 * pattern_score
            
            return max(combined_probs.items(), key=lambda x: x[1])[0]
        
        elif pattern_probs:
            return max(pattern_probs.items(), key=lambda x: x[1])[0]
        
        elif hmm_probs:
            return max(hmm_probs.items(), key=lambda x: x[1])[0]
        
        else:
            for letter in self.letter_frequency:
                if letter in available_letters:
                    return letter
        
        return list(available_letters)[0]

agent = HangmanAgent(hmms, corpus, LETTER_FREQUENCY)
print("Hangman Agent initialized successfully!")

Hangman Agent initialized successfully!


In [12]:
class HangmanGame:
    
    def __init__(self, word, max_lives=6):
        self.word = word.lower()
        self.max_lives = max_lives
        self.lives = max_lives
        self.guessed_letters = set()
        self.correct_guesses = set()
        self.wrong_guesses = 0
        self.repeated_guesses = 0
        
    def get_masked_word(self):
        return ''.join([char if char in self.correct_guesses else '_' for char in self.word])
    
    def guess(self, letter):
        letter = letter.lower()
        
        if letter in self.guessed_letters:
            self.repeated_guesses += 1
            return False, True, self.lives <= 0, self.is_won()
        
        self.guessed_letters.add(letter)
        
        if letter in self.word:
            self.correct_guesses.add(letter)
            is_correct = True
        else:
            self.lives -= 1
            self.wrong_guesses += 1
            is_correct = False
        
        game_over = self.lives <= 0 or self.is_won()
        return is_correct, False, game_over, self.is_won()
    
    def is_won(self):
        return set(self.word) == self.correct_guesses
    
    def get_state(self):
        return {
            'masked_word': self.get_masked_word(),
            'guessed_letters': self.guessed_letters.copy(),
            'lives': self.lives,
            'wrong_guesses': self.wrong_guesses,
            'repeated_guesses': self.repeated_guesses
        }

print("Hangman game environment created!")

Hangman game environment created!


In [None]:
def play_game(agent, word, max_lives=6, verbose=True):
    """Play a single game of Hangman"""
    game = HangmanGame(word, max_lives)
    
    if verbose:
        print(f"\nPlaying word: {'*' * len(word)} (length {len(word)})")
    
    while True:
        state = game.get_state()
        
        if verbose:
            print(f"  {state['masked_word']} | Lives: {state['lives']} | Guessed: {sorted(state['guessed_letters'])}")
        
        guess = agent.guess_letter(state['masked_word'], state['guessed_letters'], state['lives'])
        
        if guess is None:
            break
        
        is_correct, is_repeated, game_over, won = game.guess(guess)
        
        if verbose:
            status = "✓ CORRECT" if is_correct else ("⟳ REPEATED" if is_repeated else "✗ WRONG")
            print(f"    Guessed '{guess}': {status}")
        
        if game_over:
            break
    
    state = game.get_state()
    if verbose:
        if state['lives'] > 0 and game.is_won():
            print(f"WON! Word: {word}")
        else:
            print(f"LOST! Word was: {word}")
        print(f"  Final stats - Wrong: {state['wrong_guesses']}, Repeated: {state['repeated_guesses']}")
    
    return game.is_won(), state['wrong_guesses'], state['repeated_guesses']

print("play_game function defined!")

play_game function defined!


In [14]:
def evaluate_agent(agent, test_words, max_lives=6, verbose=True):
    """
    Evaluate agent on a set of test words
    Returns: success_rate, total_wrong, total_repeated, final_score
    """
    wins = 0
    total_wrong = 0
    total_repeated = 0
    
    for i, word in enumerate(test_words):
        won, wrong, repeated = play_game(agent, word, max_lives, verbose=False)
        
        if won:
            wins += 1
        total_wrong += wrong
        total_repeated += repeated
        
        if verbose and (i + 1) % 100 == 0:
            print(f"Progress: {i + 1}/{len(test_words)} games played")
    
    success_rate = wins / len(test_words)
    final_score = (success_rate * len(test_words)) - (total_wrong * 5) - (total_repeated * 2)
    
    print(f"\n{'='*60}")
    print(f"EVALUATION RESULTS ({len(test_words)} games)")
    print(f"{'='*60}")
    print(f"Games Won: {wins}/{len(test_words)}")
    print(f"Success Rate: {success_rate*100:.2f}%")
    print(f"Total Wrong Guesses: {total_wrong}")
    print(f"Total Repeated Guesses: {total_repeated}")
    print(f"Average Wrong Guesses per Game: {total_wrong/len(test_words):.2f}")
    print(f"Average Repeated Guesses per Game: {total_repeated/len(test_words):.2f}")
    print(f"\nFINAL SCORE: {final_score:.2f}")
    print(f"{'='*60}")
    
    return success_rate, total_wrong, total_repeated, final_score

print("Evaluation function ready!")

Evaluation function ready!


In [None]:
import numpy as np
np.random.seed(42)
test_subset = np.random.choice(corpus, 200, replace=False)

print("Running evaluation on 200 random words...")
success_rate, total_wrong, total_repeated, score = evaluate_agent(
    agent, test_subset, max_lives=6, verbose=True
)

Running evaluation on 200 random words...
Progress: 100/200 games played
Progress: 200/200 games played

EVALUATION RESULTS (200 games)
Games Won: 197/200
Success Rate: 98.50%
Total Wrong Guesses: 277
Total Repeated Guesses: 0
Average Wrong Guesses per Game: 1.39
Average Repeated Guesses per Game: 0.00

FINAL SCORE: -1188.00


In [None]:
import pickle

models_to_save = {
    'hmms': hmms,
    'corpus': corpus,
    'letter_frequency': LETTER_FREQUENCY,
    'words_by_length': dict(words_by_length)
}

with open('hmm_models.pkl', 'wb') as f:
    pickle.dump(models_to_save, f)

print("Models saved to 'hmm_models.pkl'")
print(f"Saved {len(hmms)} HMMs for word lengths: {sorted(hmms.keys())}")

Models saved to 'hmm_models.pkl'
Saved 22 HMMs for word lengths: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]


In [None]:
# When you receive test_set.txt, use this cell to evaluate
# For now, this is a template

def evaluate_on_test_set(test_file='test_set.txt'):
    """
    Evaluate agent on the official test set
    """
    try:
        with open(test_file, 'r') as f:
            test_words = [word.strip().lower() for word in f.readlines()]
        
        print(f"Loaded {len(test_words)} words from {test_file}")
        
        success_rate, total_wrong, total_repeated, final_score = evaluate_agent(
            agent, test_words, max_lives=6, verbose=True
        )
        
        return final_score
        
    except FileNotFoundError:
        print(f"Test file '{test_file}' not found.")
        print("This cell is ready to run once you receive the test set.")
        return None

final_score = evaluate_on_test_set('C:/Users/Chara/OneDrive/Desktop/PESU/ML/Hackathon/Data/test.txt')

print("Final evaluation function ready!")

Loaded 2000 words from C:/Users/Chara/OneDrive/Desktop/PESU/ML/Hackathon/Data/test.txt
Progress: 100/2000 games played
Progress: 200/2000 games played
Progress: 300/2000 games played
Progress: 400/2000 games played
Progress: 500/2000 games played
Progress: 600/2000 games played
Progress: 700/2000 games played
Progress: 800/2000 games played
Progress: 900/2000 games played
Progress: 1000/2000 games played
Progress: 1100/2000 games played
Progress: 1200/2000 games played
Progress: 1300/2000 games played
Progress: 1400/2000 games played
Progress: 1500/2000 games played
Progress: 1600/2000 games played
Progress: 1700/2000 games played
Progress: 1800/2000 games played
Progress: 1900/2000 games played
Progress: 2000/2000 games played

EVALUATION RESULTS (2000 games)
Games Won: 410/2000
Success Rate: 20.50%
Total Wrong Guesses: 11056
Total Repeated Guesses: 0
Average Wrong Guesses per Game: 5.53
Average Repeated Guesses per Game: 0.00

FINAL SCORE: -54870.00
Final evaluation function ready!
