# Training the model
Before starting, we will load a list of lemmas from the previous part.

In [170]:
# Dump the lemmas to a json file
import json
import random

# Change this variable to load another list of lemmas
locale = "cy_GB"

# Define the file path
file_path = f"dictionaries/{locale}/lemmas.json"

# Write the lemmas list to the JSON file
try:
    with open(file_path) as f:
        content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        lemmas = json.loads(content)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    lemmas = []
except ValueError as e:
    print(f"Error: {e}")
    lemmas = []
except json.JSONDecodeError:
    print(f"Error: Invalid JSON content in {file_path}")
    lemmas = []

print(f"{len(lemmas)} items loaded from {file_path}")

58953 items loaded from dictionaries/cy_GB/lemmas.json


## 1 Data Preparation
Now we can start tokenizing our data.

In [None]:
# TODO: use bite pair encoding tokenization before

In [105]:
# ensure you have the necessary library
%pip install 'numpy<2', torch

Note: you may need to restart the kernel to use updated packages.


In [171]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharDataset(Dataset):
    def __init__(self, sequences, vocab):
        self.sequences = sequences
        self.vocab = vocab
        self.char_to_idx = {char: idx for idx, char in enumerate(vocab)}
        self.idx_to_char = {idx: char for idx, char in enumerate(vocab)}
        # add a start, end and padding of sequence tags
        self.sos_token = self.char_to_idx['<SoS>']

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        input_seq = [self.sos_token] + [self.char_to_idx[char] for char in sequence[:-1]]
        target_seq = [self.char_to_idx[char] for char in sequence]
        return torch.tensor(input_seq), torch.tensor(target_seq)

# In this case "vocab" is literally the latin alphabet
vocab = sorted(set("".join(lemmas)) | {'<SoS>'})
dataset = CharDataset(lemmas, vocab)

This loaded the lemmas in a dataset in a format that torch can understand. Each word is turned in a pair of sequences, an input (missing the last character) and a target (missing the first character). In this case, because the input sequences start with an added "start of sequence" special token, the target sequence is the full word. In plain English, this means that we also want our model to learn what is the most likely first letter of a word, not only the next most likely character based on the beginning of the sequence. 

All the characters are converted to numbers, each being the index of the input neuron that will be activated during the training. The system has as many inputs neurons, or input dimension, as there are items in the vocabulary. This is a reasonable number that allows the model to train on any computer, but imagine the size of a model when the vocabulary contains hundred of thousands of words (from different languages), and that each one needs its own input neuron... 

Run the following block to see how your data will be processed by the neural network.

In [172]:
from random import randrange
n = randrange(len(lemmas))

print(f"== {lemmas[n]} == \nbecomes the sequences:\n{dataset[n][0]} (input)\nand {dataset[n][1]} (target)")

== tagellog == 
becomes the sequences:
tensor([ 0, 20,  1,  7,  5, 12, 12, 15]) (input)
and tensor([20,  1,  7,  5, 12, 12, 15,  7]) (target)


For convenience during both training and generation, we'll group the words in lists of a percent of the total number of words and separate each word by a special newline character "\n". We also extract five sequences for validation.

In [271]:
import random

random.shuffle(lemmas)
percent_len = len(lemmas)//100
sequences = ["\n".join(lemmas[(n-1)*percent_len:n*percent_len])+ "\n" for n in range(1, 101)]
seq_training = sequences[:95]
seq_validating = sequences[95:]
vocab = sorted(set("".join(sequences)) | {'<SoS>'})
dataset = CharDataset(seq_training, vocab)
dataset_eval = CharDataset(seq_validating, vocab)
dataloader = DataLoader(dataset, shuffle=True)
dataloader_eval = DataLoader(dataset_eval, shuffle=True)

## 2 Defining the Model

In this part we design our network. We first initialize a PyTorch [module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) by defining the different parts of the network: an embedding layer to turn each character in a 64 dimensional vector (an array of 64 numbers), two LSTM cell that will do the actual pattern recognition and prediction work and the linear fully connected (self.fc) layer converts these predictions in a simple discrete value, i.e. the index of the next character.

The forward function defines the order in which the input data will go through the network. It outputs the prediction and the updated hidden layer of the LSTM cells (these hidden states are updated even during the forward pass). And finally we have a function initializing the these hidden states with empty tensors of the good shape.

In [272]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size=1):
        return (torch.zeros(num_layers, batch_size , hidden_dim),
                torch.zeros(num_layers, batch_size , hidden_dim))

# Example usage
vocab_size = len(vocab)
embedding_dim = 16
hidden_dim = 16
num_layers = 2
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers)
print("Model ready!")

Model ready!


# 3 Training
After defining a couple of hyperparameters, we are ready to train our model.

In [210]:
print(list(dataloader)[0][0].unsqueeze(1))

tensor([[[ 1, 13, 13,  ..., 15,  6, 20]]])


In [274]:
import torch.optim as optim
from tqdm import tqdm

# Hyperparameters
num_epochs = 20
learning_rate = 0.1

# Loss function and optimizer
cross_entropy = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in tqdm(range(num_epochs)):
    model.train()
    hidden = model.init_hidden()
    training_loss = 0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs, hidden = model(inputs, hidden)
        loss = cross_entropy(outputs.view(-1, vocab_size), targets.squeeze(0))
        loss.backward()
        optimizer.step()
        training_loss = loss.item()
        hidden = (hidden[0].detach(), hidden[1].detach())
    
    model.eval()
    total_loss = 0
    for inputs, targets in dataloader_eval:
        hidden = model.init_hidden()

        # forward pass
        outputs, hidden = model(inputs, hidden)
        loss = cross_entropy(outputs.view(-1, vocab_size), targets.squeeze(0))
        total_loss += loss.item()

    avg_loss_eval = total_loss / len(dataloader_eval)
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_loss_eval:.4f}, Validation Loss: {training_loss:.4f}')

# Save the trained model
torch.save(model.state_dict(), 'lstm_model.pth')

  5%|▉                 | 1/20 [00:03<01:14,  3.90s/it]

Epoch [1/20], Training Loss: 2.1876, Validation Loss: 2.1628


 10%|█▊                | 2/20 [00:07<01:07,  3.76s/it]

Epoch [2/20], Training Loss: 2.1419, Validation Loss: 2.1248


 15%|██▋               | 3/20 [00:11<01:03,  3.73s/it]

Epoch [3/20], Training Loss: 2.1263, Validation Loss: 2.1120


 20%|███▌              | 4/20 [00:14<00:59,  3.73s/it]

Epoch [4/20], Training Loss: 2.1195, Validation Loss: 2.0908


 25%|████▌             | 5/20 [00:18<00:55,  3.71s/it]

Epoch [5/20], Training Loss: 2.1099, Validation Loss: 2.0546


 30%|█████▍            | 6/20 [00:22<00:51,  3.71s/it]

Epoch [6/20], Training Loss: 2.1065, Validation Loss: 2.1037


 35%|██████▎           | 7/20 [00:26<00:48,  3.75s/it]

Epoch [7/20], Training Loss: 2.1022, Validation Loss: 2.1059


 40%|███████▏          | 8/20 [00:29<00:44,  3.73s/it]

Epoch [8/20], Training Loss: 2.0948, Validation Loss: 2.0743


 45%|████████          | 9/20 [00:33<00:41,  3.74s/it]

Epoch [9/20], Training Loss: 2.0971, Validation Loss: 2.1097


 50%|████████▌        | 10/20 [00:37<00:37,  3.74s/it]

Epoch [10/20], Training Loss: 2.0957, Validation Loss: 2.0943


 55%|█████████▎       | 11/20 [00:41<00:33,  3.77s/it]

Epoch [11/20], Training Loss: 2.0967, Validation Loss: 2.0784


 60%|██████████▏      | 12/20 [00:44<00:30,  3.75s/it]

Epoch [12/20], Training Loss: 2.0921, Validation Loss: 2.0898


 65%|███████████      | 13/20 [00:48<00:26,  3.75s/it]

Epoch [13/20], Training Loss: 2.0870, Validation Loss: 2.0755


 70%|███████████▉     | 14/20 [00:52<00:22,  3.76s/it]

Epoch [14/20], Training Loss: 2.0917, Validation Loss: 2.0768


 75%|████████████▊    | 15/20 [00:56<00:18,  3.76s/it]

Epoch [15/20], Training Loss: 2.0954, Validation Loss: 2.0929


 80%|█████████████▌   | 16/20 [01:00<00:15,  3.81s/it]

Epoch [16/20], Training Loss: 2.0848, Validation Loss: 2.0897


 85%|██████████████▍  | 17/20 [01:03<00:11,  3.79s/it]

Epoch [17/20], Training Loss: 2.0905, Validation Loss: 2.0594


 90%|███████████████▎ | 18/20 [01:07<00:07,  3.80s/it]

Epoch [18/20], Training Loss: 2.0924, Validation Loss: 2.0920


 95%|████████████████▏| 19/20 [01:11<00:03,  3.81s/it]

Epoch [19/20], Training Loss: 2.0910, Validation Loss: 2.0899


100%|█████████████████| 20/20 [01:15<00:00,  3.77s/it]

Epoch [20/20], Training Loss: 2.0911, Validation Loss: 2.0738





In [275]:
import torch
import torch.nn.functional as F

def generate_pseudoword(model, length=1000, temperature=.5, top_k=None, top_p=None):
    model.eval()
    hidden = model.init_hidden(1)
    start_seq = [0]
    inputs = torch.tensor(start_seq).unsqueeze(0)  # Shape: (1, seq_len)
    generated_seq = start_seq

    with torch.no_grad():
        for _ in range(length):
            outputs, hidden = model(inputs, hidden)
            # outputs shape: (1, seq_len, vocab_size)
            # We need the last time step's output for the next prediction
            last_output = outputs[:, -1, :]  # Shape: (1, vocab_size)

            # Apply temperature scaling
            last_output = last_output / temperature
            probs = F.softmax(last_output, dim=-1).squeeze(0)  # Shape: (vocab_size)

            # Ensure the probabilities are valid
            if torch.isnan(probs).any() or torch.isinf(probs).any() or (probs < 0).any():
                print("Invalid probabilities detected. Resetting to uniform distribution.")
                probs = torch.ones_like(probs) / probs.size(0)

            # Apply top-k sampling
            if top_k is not None:
                top_k_probs, top_k_idx = probs.topk(top_k)
                mask = torch.zeros_like(probs)
                mask[top_k_idx] = 1
                probs = probs * mask
                probs = probs / probs.sum()

            # Apply nucleus sampling (top-p sampling)
            if top_p is not None:
                sorted_probs, sorted_idx = probs.sort(descending=True)
                cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
                nucleus = cumulative_probs <= top_p
                mask = torch.zeros_like(probs)
                mask[sorted_idx[:nucleus.sum()]] = 1
                probs = probs * mask
                probs = probs / probs.sum()

            # Ensure the probabilities are valid after sampling
            if torch.isnan(probs).any() or torch.isinf(probs).any() or (probs < 0).any():
                print("Invalid probabilities detected after sampling. Resetting to uniform distribution.")
                probs = torch.ones_like(probs) / probs.size(0)

            # Sample the next character
            predicted_idx = torch.multinomial(probs, 1).item()
            generated_seq.append(predicted_idx)
            inputs = torch.tensor([[predicted_idx]])  # Shape: (1, 1)

    return "".join([vocab[idx] for idx in generated_seq[1:]])

# Example usage
generated_pseudoword = generate_pseudoword(model)
print(generated_pseudoword)


yglifasai
perasai
fferwarasai
dilynni
prysgrydiai
tarddasai
gwarediad
dirch
gwrthgygoch
corchwynt
dadwyddol
cyfryfarasai
cydhreswyr
tynguri
cylthwyddio
diweiriasai
cydymddiweich
stesiasai
eheswyrasai
delydr
diasgau
cydryddydd
cydresynasai
bledd
cyfarnon
ffalisg
brigwyr
fflogyddi
rhochi
cyfarsasai
annerchol
ailgynni
branen
prysolyd
distychi
cyfeddydd
cynysgio
brifna
gwrthgwysig
comasai
rhastiwn
gwrthaid
gorgyddai
crasai
gweten
diyffyr
cydlesau
afnaws
barchi
cyffeddio
diweidiau
dydden
cyfandrasai
cynseddol
carsian
prydla
ymddiomathwydd
mygnod
metrasai
cystyrchodau
mensau
camwybydd
cyfluwi
cydymddernai
adlonus
sani
cyforddyddo
difrwyr
difrwys
difrodia
rhodwydd
bradwisio
cydheniasai
cymwyswr
cyffeddasai
ciliasai
diharfwydd
diddon
cystarchiasai
cydrenni
cydgyngwedd
rhyfydd
diwasnin
dioddoli
cyfeiriasai
trapffeddasai
cyfarthodasai
cyfateiddi
carfyria
disganwyr
cyflannwydd
cywilyn
ampeirforasai
cydrasgi
cyflysg
darsynasai
brastio
herferi
anghrywio
cyfranna
fforiasai
gwarddur
arwynnol
gwyrfydd