In [1]:
import numpy as np
import tensorflow as tf
import random
import string

MAX_LEN = 10  # max word length
VOCAB_SIZE = 27  # 26 letters + 1 for "_" (padding)
PAD_IDX = 26

with open("words_250000_train.txt", "r") as f:
    words = [line.strip().lower() for line in f if line.strip().isalpha() and 4 <= len(line.strip()) <= MAX_LEN]


In [3]:
def generate_samples(words, num_samples=100000, max_len=10):
    samples = []
    PAD_IDX = 26

    for _ in range(num_samples):
        word = random.choice(words)
        guessed = set()
        wrong_guesses = set()
        remaining = list(set(word))
        random.shuffle(remaining)

        for next_letter in remaining:
            masked = [c if c in guessed else '_' for c in word]
            mask_encoded = [ord(c) - 97 if c != '_' else PAD_IDX for c in masked]
            
            # 🔧 Pad to MAX_LEN
            while len(mask_encoded) < max_len:
                mask_encoded.append(PAD_IDX)

            wrong_encoded = [1 if chr(i + 97) in wrong_guesses else 0 for i in range(26)]

            samples.append((mask_encoded, wrong_encoded, ord(next_letter) - 97))

            if next_letter in word:
                guessed.add(next_letter)
            else:
                wrong_guesses.add(next_letter)

    return samples


In [5]:
samples = generate_samples(words, num_samples=100000, max_len=MAX_LEN)

X_masked = np.array([s[0] for s in samples], dtype=np.int32)      # shape: (100000, MAX_LEN)
X_wrong  = np.array([s[1] for s in samples], dtype=np.float32)    # shape: (100000, 26)
y        = np.array([s[2] for s in samples], dtype=np.int32)      # shape: (100000,)

X_combined = np.concatenate([X_masked, X_wrong], axis=1)          # shape: (100000, MAX_LEN + 26)


In [6]:
def cnn_bilstm_model(input_dim, vocab_size=VOCAB_SIZE, embed_dim=64):
    model = tf.keras.Sequential([
        tf.keras.layers.Input((input_dim,)),
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim),
        tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(26, activation='softmax')  # 26 letters a–z
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

model = cnn_bilstm_model(input_dim=MAX_LEN + 26)
model.summary()


In [11]:
print(tf.config.list_physical_devices('GPU'))

[]


In [13]:
!nvidia-smi


Wed Jun 11 17:30:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.52                 Driver Version: 576.52         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   58C    P8              5W /  140W |    1978MiB /   6141MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_combined, y, test_size=0.1, random_state=42)
with tf.device('/GPU:0'):
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=10,
        batch_size=128,
        callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)]
    )


Epoch 1/10
[1m4587/4587[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 55ms/step - accuracy: 0.1342 - loss: 2.8595 - val_accuracy: 0.1654 - val_loss: 2.7087
Epoch 2/10
[1m3887/4587[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m40s[0m 58ms/step - accuracy: 0.1678 - loss: 2.6979 

KeyboardInterrupt: 