# Training the model
Before starting, we will load a list of lemmas from the previous part.

In [399]:
# Dump the lemmas to a json file
import json
import random

# Change this variable to load another list of lemmas
locale = "cy_GB"

# Define the file path
file_path = f"dictionaries/{locale}/lemmas.json"

# Write the lemmas list to the JSON file
try:
    with open(file_path) as f:
        content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        lemmas = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    lemmas = []
except ValueError as e:
    print(f"Error: {e}")
    lemmas = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON content in {file_path}")
    lemmas = []

print(f"{len(lemmas)} items loaded from {file_path}")

58953 items loaded from dictionaries/cy_GB/lemmas.json


## 1 Data Preparation
Now we can start tokenizing our data.

In [None]:
# TODO: use bite pair encoding tokenization before

In [105]:
# ensure you have the necessary library
%pip install 'numpy<2', torch

Note: you may need to restart the kernel to use updated packages.


In [361]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharDataset(Dataset):
    def __init__(self, sequences, vocab):
        self.sequences = sequences
        self.vocab = vocab
        self.char_to_idx = {char: idx for idx, char in enumerate(vocab)}
        self.idx_to_char = {idx: char for idx, char in enumerate(vocab)}

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        input_seq = [self.char_to_idx[char] for char in sequence[:-1]]
        target_seq = [self.char_to_idx[char] for char in sequence[1:]]
        return torch.tensor(input_seq), torch.tensor(target_seq)

# In this case "vocab" is literally the latin alphabet
vocab = sorted(set("".join(lemmas)))
dataset = CharDataset(lemmas, vocab)

This loaded the lemmas in a dataset in a format that torch can understand. Each word is turned in a pair of sequences, an input (missing the last character) and a target (missing the first character). In this case, because the input sequences start with an added "start of sequence" special token, the target sequence is the full word. In plain English, this means that we also want our model to learn what is the most likely first letter of a word, not only the next most likely character based on the beginning of the sequence. 

All the characters are converted to numbers, each being the index of the input neuron that will be activated during the training. The system has as many inputs neurons, or input dimension, as there are items in the vocabulary. This is a reasonable number that allows the model to train on any computer, but imagine the size of a model when the vocabulary contains hundred of thousands of words (from different languages), and that each one needs its own input neuron... 

Run the following block to see how your data will be processed by the neural network.

In [362]:
from random import randrange
n = randrange(len(lemmas))

print(f"== {lemmas[n]} == \nbecomes the sequences:\n{dataset[n][0]} (input)\nand {dataset[n][1]} (target)")

== throatiness == 
becomes the sequences:
tensor([19,  7, 17, 14,  0, 19,  8, 13,  4, 18]) (input)
and tensor([ 7, 17, 14,  0, 19,  8, 13,  4, 18, 18]) (target)


For convenience during both training and generation, we'll group the words in lists of a percent of the total number of words and separate each word by a special newline character "\n". We also extract five sequences for validation.

In [363]:
import random

random.shuffle(lemmas)
percent_len = len(lemmas)//100
sequences = ["\n" + "\n".join(lemmas[(n-1)*percent_len:n*percent_len])+ "\n" for n in range(1, 101)]
seq_training = sequences[:95]
seq_validating = sequences[95:]
vocab = sorted(set("".join(sequences)))
dataset = CharDataset(seq_training, vocab)
dataset_eval = CharDataset(seq_validating, vocab)
dataloader = DataLoader(dataset, shuffle=True)
dataloader_eval = DataLoader(dataset_eval, shuffle=True)

## 2 Defining the Model

In this part we design our network. We first initialize a PyTorch [module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) by defining the different parts of the network: an embedding layer to turn each character in a 64 dimensional vector (an array of 64 numbers), two LSTM cell that will do the actual pattern recognition and prediction work and the linear fully connected (self.fc) layer converts these predictions in a simple discrete value, i.e. the index of the next character.

The forward function defines the order in which the input data will go through the network. It outputs the prediction and the updated hidden layer of the LSTM cells (these hidden states are updated even during the forward pass). And finally we have a function initializing the these hidden states with empty tensors of the good shape.

In [404]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size=1):
        return (torch.zeros(num_layers, batch_size , hidden_dim),
                torch.zeros(num_layers, batch_size , hidden_dim))

# Example usage
vocab_size = len(vocab)
embedding_dim = 16
hidden_dim = 32
num_layers = 1
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers)

total_params = sum(p.numel() for p in model.parameters())
print(f'Model ready! Total number of parameters: {total_params}')

Model ready! Total number of parameters: 7723


# 3 Training
After defining a couple of hyperparameters, we are ready to train our model.

In [405]:
import torch.optim as optim
from tqdm import tqdm

# Hyperparameters
num_epochs = 20
learning_rate = 0.005

# Loss function and optimizer
cross_entropy = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in tqdm(range(num_epochs)):
    model.train()
    hidden = model.init_hidden()
    training_loss = 0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs, hidden = model(inputs, hidden)
        loss = cross_entropy(outputs.view(-1, vocab_size), targets.squeeze(0))
        loss.backward()
        optimizer.step()
        training_loss = loss.item()
        hidden = (hidden[0].detach(), hidden[1].detach())
    
    model.eval()
    total_loss = 0
    for inputs, targets in dataloader_eval:
        hidden = model.init_hidden()

        # forward pass
        outputs, hidden = model(inputs, hidden)
        loss = cross_entropy(outputs.view(-1, vocab_size), targets.squeeze(0))
        total_loss += loss.item()

    avg_loss_eval = total_loss / len(dataloader_eval)
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_loss_eval:.4f}, Validation Loss: {training_loss:.4f}')


  5%|▉                 | 1/20 [00:01<00:22,  1.19s/it]

Epoch [1/20], Training Loss: 2.4454, Validation Loss: 2.4621


 10%|█▊                | 2/20 [00:02<00:20,  1.12s/it]

Epoch [2/20], Training Loss: 2.3104, Validation Loss: 2.3279


 15%|██▋               | 3/20 [00:03<00:18,  1.10s/it]

Epoch [3/20], Training Loss: 2.2489, Validation Loss: 2.2388


 20%|███▌              | 4/20 [00:04<00:17,  1.09s/it]

Epoch [4/20], Training Loss: 2.2085, Validation Loss: 2.1689


 25%|████▌             | 5/20 [00:05<00:17,  1.15s/it]

Epoch [5/20], Training Loss: 2.1868, Validation Loss: 2.1740


 30%|█████▍            | 6/20 [00:06<00:16,  1.16s/it]

Epoch [6/20], Training Loss: 2.1679, Validation Loss: 2.1557


 35%|██████▎           | 7/20 [00:08<00:15,  1.17s/it]

Epoch [7/20], Training Loss: 2.1568, Validation Loss: 2.1400


 40%|███████▏          | 8/20 [00:09<00:14,  1.17s/it]

Epoch [8/20], Training Loss: 2.1454, Validation Loss: 2.1591


 45%|████████          | 9/20 [00:10<00:12,  1.17s/it]

Epoch [9/20], Training Loss: 2.1374, Validation Loss: 2.1326


 50%|████████▌        | 10/20 [00:11<00:11,  1.15s/it]

Epoch [10/20], Training Loss: 2.1309, Validation Loss: 2.1157


 55%|█████████▎       | 11/20 [00:12<00:10,  1.14s/it]

Epoch [11/20], Training Loss: 2.1264, Validation Loss: 2.1310


 60%|██████████▏      | 12/20 [00:13<00:09,  1.13s/it]

Epoch [12/20], Training Loss: 2.1203, Validation Loss: 2.1190


 65%|███████████      | 13/20 [00:14<00:07,  1.13s/it]

Epoch [13/20], Training Loss: 2.1182, Validation Loss: 2.0851


 70%|███████████▉     | 14/20 [00:15<00:06,  1.13s/it]

Epoch [14/20], Training Loss: 2.1125, Validation Loss: 2.0696


 75%|████████████▊    | 15/20 [00:17<00:05,  1.10s/it]

Epoch [15/20], Training Loss: 2.1086, Validation Loss: 2.1229


 80%|█████████████▌   | 16/20 [00:18<00:04,  1.07s/it]

Epoch [16/20], Training Loss: 2.1048, Validation Loss: 2.0687


 85%|██████████████▍  | 17/20 [00:19<00:03,  1.06s/it]

Epoch [17/20], Training Loss: 2.1052, Validation Loss: 2.0477


 90%|███████████████▎ | 18/20 [00:20<00:02,  1.05s/it]

Epoch [18/20], Training Loss: 2.0981, Validation Loss: 2.0810


 95%|████████████████▏| 19/20 [00:21<00:01,  1.04s/it]

Epoch [19/20], Training Loss: 2.0982, Validation Loss: 2.0927


100%|█████████████████| 20/20 [00:22<00:00,  1.11s/it]

Epoch [20/20], Training Loss: 2.0942, Validation Loss: 2.0655





If you are happy with the loss obtained, especially against the validation set, you can run the following block to save the model's weights.

In [375]:
# Save the best model you've trained so far
torch.save(model.state_dict(), f'lstm_model-{locale}.pth')

In [377]:
# generate words from the the last version of the model you saved
model.load_state_dict(torch.load(f'lstm_model-{locale}.pth'))

<All keys matched successfully>

In [398]:
import torch
import torch.nn.functional as F

def generate_pseudoword(model, length=1000, temperature=0.4, top_k=None, top_p=None):
    model.eval()
    hidden = model.init_hidden(1)
    start_seq = [0]
    inputs = torch.tensor(start_seq).unsqueeze(0)  # Shape: (1, seq_len)
    generated_seq = start_seq

    with torch.no_grad():
        for _ in range(length):
            outputs, hidden = model(inputs, hidden)
            # outputs shape: (1, seq_len, vocab_size)
            # We need the last time step's output for the next prediction
            last_output = outputs[:, -1, :]  # Shape: (1, vocab_size)

            # Apply temperature scaling
            last_output = last_output / temperature
            probs = F.softmax(last_output, dim=-1).squeeze(0)  # Shape: (vocab_size)

            # Ensure the probabilities are valid
            if torch.isnan(probs).any() or torch.isinf(probs).any() or (probs < 0).any():
                print("Invalid probabilities detected. Resetting to uniform distribution.")
                probs = torch.ones_like(probs) / probs.size(0)

            # Apply top-k sampling
            if top_k is not None:
                top_k_probs, top_k_idx = probs.topk(top_k)
                mask = torch.zeros_like(probs)
                mask[top_k_idx] = 1
                probs = probs * mask
                probs = probs / probs.sum()

            # Apply nucleus sampling (top-p sampling)
            if top_p is not None:
                sorted_probs, sorted_idx = probs.sort(descending=True)
                cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
                nucleus = cumulative_probs <= top_p
                mask = torch.zeros_like(probs)
                mask[sorted_idx[:nucleus.sum()]] = 1
                probs = probs * mask
                probs = probs / probs.sum()

            # Ensure the probabilities are valid after sampling
            if torch.isnan(probs).any() or torch.isinf(probs).any() or (probs < 0).any():
                print("Invalid probabilities detected after sampling. Resetting to uniform distribution.")
                probs = torch.ones_like(probs) / probs.size(0)

            # Sample the next character
            predicted_idx = torch.multinomial(probs, 1).item()
            generated_seq.append(predicted_idx)
            inputs = torch.tensor([[predicted_idx]])  # Shape: (1, 1)

    return "".join([vocab[idx] for idx in generated_seq[1:]])

# Example usage
generated_pseudoword = generate_pseudoword(model)
print(generated_pseudoword)


bolding
pentic
deconting
stand
podite
shagger
pressicious
shimper
constallion
tiner
probite
prover
stranger
inviness
appologe
therrapher
stricking
insector
suppoding
state
pack
splin
stonophect
carnour
fing
solling
rescrap
portan
ance
batter
tender
singer
monel
speriation
bastter
conconium
prominate
polricate
suppricise
pending
faster
screating
conviture
substinate
cland
consection
perter
rick
sharth
arricting
content
spead
linger
ching
substression
thepention
skyhen
reproten
concommant
deconting
mone
confulness
consurate
grange
partan
descombiness
censor
pack
consorting
contine
scate
consection
constite
conset
ancention
ascressing
mange
substony
consperogen
granter
houndless
corrate
bushness
resenting
bater
strance
serper
condround
spressing
shamp
poleston
intranist
stand
constiverate
strate
consection
sasting
part
paronald
melerous
constant
birect
basting
collous
porter
paction
trinessing
sabban
carter
forerate
calloger
blatter
porminess
ensolity
conention
monophene
fine
borset
const