# Training the model
Before starting, we will load a list of lemmas from the previous part.

In [170]:
# Dump the lemmas to a json file
import json
import random

# Change this variable to load another list of lemmas
locale = "cy_GB"

# Define the file path
file_path = f"dictionaries/{locale}/lemmas.json"

# Write the lemmas list to the JSON file
try:
    with open(file_path) as f:
        content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        lemmas = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    lemmas = []
except ValueError as e:
    print(f"Error: {e}")
    lemmas = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON content in {file_path}")
    lemmas = []

print(f"{len(lemmas)} items loaded from {file_path}")

58953 items loaded from dictionaries/cy_GB/lemmas.json


## 1 Data Preparation
Now we can start tokenizing our data.

In [None]:
# TODO: use bite pair encoding tokenization before

In [105]:
# ensure you have the necessary library
%pip install 'numpy<2', torch

Note: you may need to restart the kernel to use updated packages.


In [171]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharDataset(Dataset):
    def __init__(self, sequences, vocab):
        self.sequences = sequences
        self.vocab = vocab
        self.char_to_idx = {char: idx for idx, char in enumerate(vocab)}
        self.idx_to_char = {idx: char for idx, char in enumerate(vocab)}
        # add a start, end and padding of sequence tags
        self.sos_token = self.char_to_idx['<SoS>']

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        input_seq = [self.sos_token] + [self.char_to_idx[char] for char in sequence[:-1]]
        target_seq = [self.char_to_idx[char] for char in sequence]
        return torch.tensor(input_seq), torch.tensor(target_seq)

# In this case "vocab" is literally the latin alphabet
vocab = sorted(set("".join(lemmas)) | {'<SoS>'})
dataset = CharDataset(lemmas, vocab)

This loaded the lemmas in a dataset in a format that torch can understand. Each word is turned in a pair of sequences, an input (missing the last character) and a target (missing the first character). In this case, because the input sequences start with an added "start of sequence" special token, the target sequence is the full word. In plain English, this means that we also want our model to learn what is the most likely first letter of a word, not only the next most likely character based on the beginning of the sequence. 

All the characters are converted to numbers, each being the index of the input neuron that will be activated during the training. The system has as many inputs neurons, or input dimension, as there are items in the vocabulary. This is a reasonable number that allows the model to train on any computer, but imagine the size of a model when the vocabulary contains hundred of thousands of words (from different languages), and that each one needs its own input neuron... 

Run the following block to see how your data will be processed by the neural network.

In [172]:
from random import randrange
n = randrange(len(lemmas))

print(f"== {lemmas[n]} == \nbecomes the sequences:\n{dataset[n][0]} (input)\nand {dataset[n][1]} (target)")

== tagellog == 
becomes the sequences:
tensor([ 0, 20,  1,  7,  5, 12, 12, 15]) (input)
and tensor([20,  1,  7,  5, 12, 12, 15,  7]) (target)


For convenience during both training and generation, we'll group the words in lists of a percent of the total number of words and separate each word by a special newline character "\n". We also extract five sequences for validation.

In [271]:
import random

random.shuffle(lemmas)
percent_len = len(lemmas)//100
sequences = ["\n".join(lemmas[(n-1)*percent_len:n*percent_len])+ "\n" for n in range(1, 101)]
seq_training = sequences[:95]
seq_validating = sequences[95:]
vocab = sorted(set("".join(sequences)) | {'<SoS>'})
dataset = CharDataset(seq_training, vocab)
dataset_eval = CharDataset(seq_validating, vocab)
dataloader = DataLoader(dataset, shuffle=True)
dataloader_eval = DataLoader(dataset_eval, shuffle=True)

## 2 Defining the Model

In this part we design our network. We first initialize a PyTorch [module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) by defining the different parts of the network: an embedding layer to turn each character in a 64 dimensional vector (an array of 64 numbers), two LSTM cell that will do the actual pattern recognition and prediction work and the linear fully connected (self.fc) layer converts these predictions in a simple discrete value, i.e. the index of the next character.

The forward function defines the order in which the input data will go through the network. It outputs the prediction and the updated hidden layer of the LSTM cells (these hidden states are updated even during the forward pass). And finally we have a function initializing the these hidden states with empty tensors of the good shape.

In [316]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size=1):
        return (torch.zeros(num_layers, batch_size , hidden_dim),
                torch.zeros(num_layers, batch_size , hidden_dim))

# Example usage
vocab_size = len(vocab)
embedding_dim = 64
hidden_dim = 64
num_layers = 1
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers)
print("Model ready!")

Model ready!


# 3 Training
After defining a couple of hyperparameters, we are ready to train our model.

In [277]:
print(list(dataloader)[0][0].unsqueeze(1))

tensor([[[ 1, 10,  6,  ..., 24,  7, 19]]])


In [321]:
import torch.optim as optim
from tqdm import tqdm

# Hyperparameters
num_epochs = 10
learning_rate = 0.005

# Loss function and optimizer
cross_entropy = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in tqdm(range(num_epochs)):
    model.train()
    hidden = model.init_hidden()
    training_loss = 0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs, hidden = model(inputs, hidden)
        loss = cross_entropy(outputs.view(-1, vocab_size), targets.squeeze(0))
        loss.backward()
        optimizer.step()
        training_loss = loss.item()
        hidden = (hidden[0].detach(), hidden[1].detach())
    
    model.eval()
    total_loss = 0
    for inputs, targets in dataloader_eval:
        hidden = model.init_hidden()

        # forward pass
        outputs, hidden = model(inputs, hidden)
        loss = cross_entropy(outputs.view(-1, vocab_size), targets.squeeze(0))
        total_loss += loss.item()

    avg_loss_eval = total_loss / len(dataloader_eval)
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_loss_eval:.4f}, Validation Loss: {training_loss:.4f}')

# Save the trained model
torch.save(model.state_dict(), 'lstm_model.pth')

 10%|█▊                | 1/10 [00:04<00:37,  4.15s/it]

Epoch [1/10], Training Loss: 1.9391, Validation Loss: 1.9349


 20%|███▌              | 2/10 [00:08<00:32,  4.05s/it]

Epoch [2/10], Training Loss: 1.9307, Validation Loss: 1.9151


 30%|█████▍            | 3/10 [00:11<00:27,  3.89s/it]

Epoch [3/10], Training Loss: 1.9263, Validation Loss: 1.9051


 40%|███████▏          | 4/10 [00:15<00:22,  3.78s/it]

Epoch [4/10], Training Loss: 1.9195, Validation Loss: 1.8902


 50%|█████████         | 5/10 [00:19<00:18,  3.71s/it]

Epoch [5/10], Training Loss: 1.9127, Validation Loss: 1.8961


 60%|██████████▊       | 6/10 [00:22<00:14,  3.69s/it]

Epoch [6/10], Training Loss: 1.9092, Validation Loss: 1.8945


 70%|████████████▌     | 7/10 [00:26<00:10,  3.66s/it]

Epoch [7/10], Training Loss: 1.9029, Validation Loss: 1.8759


 80%|██████████████▍   | 8/10 [00:29<00:07,  3.68s/it]

Epoch [8/10], Training Loss: 1.8989, Validation Loss: 1.8884


 90%|████████████████▏ | 9/10 [00:33<00:03,  3.66s/it]

Epoch [9/10], Training Loss: 1.8935, Validation Loss: 1.8415


100%|█████████████████| 10/10 [00:37<00:00,  3.72s/it]

Epoch [10/10], Training Loss: 1.8910, Validation Loss: 1.8623





In [322]:
import torch
import torch.nn.functional as F

def generate_pseudoword(model, length=1000, temperature=0.5, top_k=None, top_p=None):
    model.eval()
    hidden = model.init_hidden(1)
    start_seq = [0]
    inputs = torch.tensor(start_seq).unsqueeze(0)  # Shape: (1, seq_len)
    generated_seq = start_seq

    with torch.no_grad():
        for _ in range(length):
            outputs, hidden = model(inputs, hidden)
            # outputs shape: (1, seq_len, vocab_size)
            # We need the last time step's output for the next prediction
            last_output = outputs[:, -1, :]  # Shape: (1, vocab_size)

            # Apply temperature scaling
            last_output = last_output / temperature
            probs = F.softmax(last_output, dim=-1).squeeze(0)  # Shape: (vocab_size)

            # Ensure the probabilities are valid
            if torch.isnan(probs).any() or torch.isinf(probs).any() or (probs < 0).any():
                print("Invalid probabilities detected. Resetting to uniform distribution.")
                probs = torch.ones_like(probs) / probs.size(0)

            # Apply top-k sampling
            if top_k is not None:
                top_k_probs, top_k_idx = probs.topk(top_k)
                mask = torch.zeros_like(probs)
                mask[top_k_idx] = 1
                probs = probs * mask
                probs = probs / probs.sum()

            # Apply nucleus sampling (top-p sampling)
            if top_p is not None:
                sorted_probs, sorted_idx = probs.sort(descending=True)
                cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
                nucleus = cumulative_probs <= top_p
                mask = torch.zeros_like(probs)
                mask[sorted_idx[:nucleus.sum()]] = 1
                probs = probs * mask
                probs = probs / probs.sum()

            # Ensure the probabilities are valid after sampling
            if torch.isnan(probs).any() or torch.isinf(probs).any() or (probs < 0).any():
                print("Invalid probabilities detected after sampling. Resetting to uniform distribution.")
                probs = torch.ones_like(probs) / probs.size(0)

            # Sample the next character
            predicted_idx = torch.multinomial(probs, 1).item()
            generated_seq.append(predicted_idx)
            inputs = torch.tensor([[predicted_idx]])  # Shape: (1, 1)

    return "".join([vocab[idx] for idx in generated_seq[1:]])

# Example usage
generated_pseudoword = generate_pseudoword(model)
print(generated_pseudoword)


safan
gwyrdd
ailadwyweli
carchfa
cymateg
cydferthyn
anghyfeirio
camwynai
cyselli
tydwr
cydgyfarfyddai
argheuddai
arddadledd
corffori
cydgyfarfyddog
gleiriau
anwerth
corddef
cyfferthyn
ysgali
cyfarfoddol
lledasai
gwrthddwyn
marchddaliad
gwrthelwyr
cyfreddasai
anghyfarfyddon
carnod
troestrol
bactoriai
amlantiai
cyfranni
twymiad
corechredd
sban
cynnod
trawsloddiad
cyfrannai
aranddiledd
trugaf
dyfala
cyfarfyddau
cyfanfudd
cyflonder
adfynno
clercia
cyffrwydda
cloriad
proffist
cyfeiriasai
cyfreithiaid
cambarth
difartho
ailgyfarfyddasai
anghyfreintiai
clorianni
cydgyfarfyddo
dylin
ailbrwydrown
dialltyn
anghyfanhedd
cyfrif
terusasai
clyfa
cynlladenni
addardduriad
anfydlanu
cydgenai
cyfrannol
pendyliai
darfyddai
trawslyffyrdd
ymgyfeiriau
dyfniad
awtocth
cyfrannod
cyfryngiad
cydgymwys
rhiaethol
petol
cyfathradwy
chwilcasai
symyngai
anrheddo
ymgyrchasai
trosglodiad
ymddatganfa
cyfarfyddo
cortiasai
diwynno
distwydd
cyfeiriasai
bonodai
anwedda
cyfferth
morhanno
cydnabydded
crynnod
bystroffynna
croe