# 2 Training the model
Before starting, we will load a list of lemmas from the previous part.

In [17]:
# Dump the lemmas to a json file
import json
import random

# Change this variable to load another list of lemmas
locale = "cy_GB"

# Define the file path
file_path = f"lemmae.json"

# Write the lemmas list to the JSON file
try:
    with open(file_path) as f:
        content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        lemmas = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    lemmas = []
except ValueError as e:
    print(f"Error: {e}")
    lemmas = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON content in {file_path}")
    lemmas = []

print(f"{len(lemmas)} items loaded from {file_path}")

10722 items loaded from lemmae.json


In [18]:
# ensure you have the necessary library
%pip install 'numpy<2', torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## 2 Defining the Model

In this part we design our network. We first initialize a PyTorch [module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) by defining the different parts of the network: an embedding layer to turn each character in a 16 dimensional vector (an array of 16 numbers), one LSTM cell (`layers_number`) that will do the actual pattern recognition and prediction work and the linear fully connected (self.fc) layer converts these predictions in a simple discrete value, i.e. the index of the next character.

The forward function defines the order in which the input data will go through the network. It outputs the prediction and the updated hidden layer of the LSTM cells (these hidden states are updated even during the forward pass). And finally we have a function initializing the these hidden states with empty tensors of the good shape.

In [19]:
import torch.nn as nn

import random
import torch
from torch.utils.data import Dataset, DataLoader

class CharDataset(Dataset):
    def __init__(self, sequences, vocab, separator_tag=None):
        self.sequences = sequences
        self.vocab = vocab
        self.char_to_idx = {char: idx for idx, char in enumerate(vocab)}
        self.idx_to_char = {idx: char for idx, char in enumerate(vocab)}
        if separator_tag != None:
            self.sep_tag = separator_tag

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        input_seq = [self.char_to_idx[char] for char in sequence[:-1]]
        target_seq = [self.char_to_idx[char] for char in sequence[1:]]
        return torch.tensor(input_seq), torch.tensor(target_seq)

# In this case "vocab" is literally the latin alphabet
vocab = sorted(set("".join(lemmas)))
dataset = CharDataset(lemmas, vocab)

random.shuffle(lemmas)
percent_len = len(lemmas)//1000
sequences = ["\n" + "\n".join(lemmas[(n-1)*percent_len:n*percent_len])+ "\n" for n in range(1, 1001)]
seq_training = sequences[:85]
seq_validating = sequences[85:]
vocab = sorted(set("".join(sequences)))
dataset = CharDataset(seq_training, vocab, "\n")
dataset_eval = CharDataset(seq_validating, vocab, "\n")
dataloader = DataLoader(dataset, shuffle=True)
dataloader_eval = DataLoader(dataset_eval, shuffle=True)
print("Data loaders ready:\n", dataloader, "\n", dataloader_eval)

class LSTMModel(nn.Module):
    def __init__(self, embedding_dim=4, hidden_dim=16, layers_number=1, char_to_idx={}, idx_to_char={}):
        super().__init__()
        vocab_size = len(char_to_idx.keys())
        self.char_to_idx = char_to_idx
        self.idx_to_char = idx_to_char
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, layers_number, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    # The forward function is the one getting called everytime
    # the model created by an instance of this class is called
    # model(x, hidden) == model.forward(x, hidden)
    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size=1):
        return (torch.zeros(layers_number, batch_size , hidden_dim),
                torch.zeros(layers_number, batch_size , hidden_dim))

# Example usage
embedding_dim = 8
hidden_dim = 128
layers_number = 1
char_to_idx = dataset.char_to_idx
idx_to_char = dataset.idx_to_char

model = LSTMModel(embedding_dim, hidden_dim, layers_number, char_to_idx, idx_to_char)

total_params = sum(p.numel() for p in model.parameters())
print(f'Model ready! Total number of parameters: {total_params}')

Data loaders ready:
 <torch.utils.data.dataloader.DataLoader object at 0x149665eb0> 
 <torch.utils.data.dataloader.DataLoader object at 0x1222d7350>
Model ready! Total number of parameters: 75314


# 2.3 Training
After defining a couple of hyperparameters, we are ready to train our model.


In [22]:
import torch.optim as optim
from tqdm import tqdm

# Hyperparameters
num_epochs = 10
learning_rate = 0.002
vocab_size = len(char_to_idx)

# Loss function and optimizer
cross_entropy = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in tqdm(range(num_epochs)):
    
    random.shuffle(lemmas)
    percent_len = len(lemmas)//1000
    sequences = ["\n" + "\n".join(lemmas[(n-1)*percent_len:n*percent_len])+ "\n" for n in range(1, 1001)]
    seq_training = sequences[:850]
    seq_validating = sequences[850:]
    vocab = sorted(set("".join(sequences)))
    dataset = CharDataset(seq_training, vocab, "\n")
    dataset_eval = CharDataset(seq_validating, vocab, "\n")
    dataloader = DataLoader(dataset, shuffle=True)
    dataloader_eval = DataLoader(dataset_eval, shuffle=True)
    
    # first, train the model
    model.train()
    hidden = model.init_hidden()
    training_loss = 0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs, hidden = model.forward(inputs, hidden)
        loss = cross_entropy(outputs.view(-1, vocab_size), targets.squeeze(0))
        loss.backward()
        optimizer.step()
        training_loss = loss.item()
        hidden = (hidden[0].detach(), hidden[1].detach())
        
    # second, evaluate the model to avoid overfitting
    model.eval()
    total_loss = 0
    for inputs, targets in dataloader_eval:
        hidden = model.init_hidden()

        # forward pass
        outputs, hidden = model.forward(inputs, hidden)
        loss = cross_entropy(outputs.view(-1, vocab_size), targets.squeeze(0))
        total_loss += loss.item()

    avg_loss_eval = total_loss / len(dataloader_eval)
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_loss_eval:.4f}, Validation Loss: {training_loss:.4f}')


 10%|██▊                         | 1/10 [00:02<00:21,  2.41s/it]

Epoch [1/10], Training Loss: 1.7166, Validation Loss: 1.4276


 20%|█████▌                      | 2/10 [00:04<00:19,  2.41s/it]

Epoch [2/10], Training Loss: 1.6570, Validation Loss: 1.4876


 30%|████████▍                   | 3/10 [00:07<00:16,  2.35s/it]

Epoch [3/10], Training Loss: 1.6350, Validation Loss: 1.5481


 40%|███████████▏                | 4/10 [00:09<00:14,  2.40s/it]

Epoch [4/10], Training Loss: 1.6117, Validation Loss: 1.6668


 50%|██████████████              | 5/10 [00:11<00:11,  2.39s/it]

Epoch [5/10], Training Loss: 1.6033, Validation Loss: 1.3737


 60%|████████████████▊           | 6/10 [00:14<00:09,  2.37s/it]

Epoch [6/10], Training Loss: 1.5905, Validation Loss: 1.8814


 70%|███████████████████▌        | 7/10 [00:16<00:07,  2.35s/it]

Epoch [7/10], Training Loss: 1.5743, Validation Loss: 1.3325


 80%|██████████████████████▍     | 8/10 [00:18<00:04,  2.35s/it]

Epoch [8/10], Training Loss: 1.5710, Validation Loss: 1.9198


 90%|█████████████████████████▏  | 9/10 [00:21<00:02,  2.34s/it]

Epoch [9/10], Training Loss: 1.5608, Validation Loss: 1.5755


100%|███████████████████████████| 10/10 [00:23<00:00,  2.35s/it]

Epoch [10/10], Training Loss: 1.5772, Validation Loss: 1.6323





## 2.4 Generating the Pseudo-words


In [23]:
# import torch
import torch.nn.functional as F
from spylls.hunspell import Dictionary
import sys
dictionary = Dictionary.from_files(f"hunspell/{locale}")


def generate_pseudoword(model, length=15, temperature=0.87):
    model.eval()
    hidden = model.init_hidden(1)
    start_seq = [0]
    inputs = torch.tensor(start_seq).unsqueeze(0)  # Shape: (1, seq_len)
    generated_seq = []
    words_generated = set([])

    with torch.no_grad():
        while len(words_generated) < length:
            outputs, hidden = model(inputs, hidden)

            # outputs shape: (1, seq_len, vocab_size)
            # We need the last time step's output for the next prediction
            last_output = outputs[:, -1]  # Shape: (1, vocab_size)

            # Apply temperature scaling
            last_output = last_output / temperature
            probs = F.softmax(last_output, dim=-1).squeeze(0)  # the multinomial accepts only one order tensors

            # Ensure all the probabilities are valid
            if torch.isnan(probs).any() or torch.isinf(probs).any() or (probs < 0).any():
                print("Invalid probabilities detected. Resetting to uniform distribution.")
                probs = torch.ones_like(probs) / probs.size(0)

            # Sample the next character
            predicted_idx = torch.multinomial(probs, 1).item()
            generated_seq.append(predicted_idx)
            inputs = torch.tensor([[predicted_idx]])  # Shape: (1, 1)

            if vocab[predicted_idx] == "\n":
                new_word = ''.join([vocab[i] for i in generated_seq[:-1]])
                generated_seq = []
                if not dictionary.lookup(new_word.capitalize()) and new_word not in lemmas:
                    words_generated.add(new_word)
                sys.stdout.write(f"\r{len(words_generated)} words so far")

    return list(sorted(words_generated))

# Example usage
generated_pseudoword = generate_pseudoword(model, 1000)
print()
print("\n".join(generated_pseudoword[:10]))
print("\n".join(generated_pseudoword[-40:]))
print(len("\n".join(generated_pseudoword))/len(generated_pseudoword))

1000 words so far
abamseriad
achlas
addiain
addoes
addolch
addoledd
addun
addurnad
addysgiad
adnat
ungarthedd
uniamaniad
unigolwr
uns
unsid
uwchmog
uwl
wncoed
wysyn
wythpeuol
ymachyd
ymadlog
ymadraidd
ymadrwydd
ymafat
ymbelinol
ymdarod
ymddiddorol
ymddifwy
ymddisglas
ymdrif
ymgeidyn
ymguddwr
ymgyferfyn
ymgyrrwn
ymgyru
ymhodi
ymhylu
ymlusg
ymrapiad
ynghyfranadl
ynghyfynbent
ysberbynnu
ysboleb
ysgadlwydd
ysgafnydd
ysgarol
ysgori
ysgrifenyddyd
ŷrwyf
8.507


# 5 Saving and loading our results

If you are happy with the results, like the loss, especially against the validation set, and the words generated, you can run the following block to save the model's weights.

In [24]:
# Save the best model you've trained so far
torch.save(model, f'lstm_model-{locale}.pth')

In [None]:
# generate words from the the last version of the model you saved
model = torch.load(f'locales/{locale}/lstm_model-{locale}.pth')

We can now generate our pseudo-lexicon. To find it, look out for the pseudo-lemmas.json file in the dictionary folder of your source dictionary.

In [26]:
# Dump the lemmas to a json file
import json
import time
start_time = time.time()

# Define the output file path
output_file_path = f"pseudo-lemmae.json"

generated_pseudoword = generate_pseudoword(model, 10000)

# Write the lemmas list to the JSON file
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    json.dump(generated_pseudoword, outfile, ensure_ascii=False, indent=2)

print()
time = time.time() - start_time

print(f"{len(generated_pseudoword)} pseudo words successfully generated and loaded in {time//60:.0f}:{(time%60):.3f}")

10000 words so far
10000 pseudo words successfully generated and loaded in 1:11.085
