# Training the model
Before starting, we will load a list of lemmas from the previous part.

In [2]:
# Load the lemmas from a JSON file
import json

# Change this variable to load another list of lemmas
locale = "en-GB"

# Define the file path
file_path = f"dictionaries/{locale}/lemmas.json"


def load_lemmas(path):
    """Read the lemmas stored as JSON at ``path``.

    Args:
        path: location of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty list when the file is
        missing, empty, or does not contain valid JSON. In those cases an
        error message is printed instead of raising.
    """
    try:
        # Read as UTF-8 explicitly rather than relying on the platform's
        # default encoding.
        with open(path, encoding="utf-8") as f:
            content = f.read()
        if not content.strip():
            raise ValueError("The JSON file is empty.")
        return json.loads(content)
    except FileNotFoundError:
        print(f"Error: File not found at {path}")
    except json.JSONDecodeError:
        # Must be listed before ValueError: JSONDecodeError is a subclass of
        # ValueError, so the generic handler would otherwise shadow this one.
        print(f"Error: Invalid JSON content in {path}")
    except ValueError as e:
        print(f"Error: {e}")
    return []


# Read the lemmas list from the JSON file
lemmas = load_lemmas(file_path)

print(f"{len(lemmas)} items loaded from {file_path}")

30614 items loaded from dictionaries/en-GB/lemmas.json


## 1 Data Preparation
Now we can start tokenizing our data.

In [None]:
# TODO: apply byte-pair encoding (BPE) tokenization first

In [25]:
# ensure you have the necessary library
%pip install 'numpy<2', torch

Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class CharDataset(Dataset):
    """Character-level dataset producing next-character prediction pairs.

    Each item is built from one sequence: the input is every character but
    the last, the target is every character but the first, both encoded as
    indices into ``vocab``.

    Args:
        sequences: indexable collection of strings, one per sample.
        vocab: ordered collection of symbols; a symbol's position is its
            index. Must contain the ``'<SoS>'`` tag.

    Raises:
        KeyError: at construction if ``'<SoS>'`` is missing from ``vocab``,
            or on item access if a character is not part of ``vocab``.
    """

    def __init__(self, sequences, vocab):
        self.sequences = sequences
        self.vocab = vocab
        self.char_to_idx = {char: idx for idx, char in enumerate(vocab)}
        self.idx_to_char = {idx: char for idx, char in enumerate(vocab)}
        # Index of the start-of-sequence tag. It is only looked up here;
        # __getitem__ does not insert it into the returned sequences.
        self.sos_token = self.char_to_idx['<SoS>']

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(input, target)`` index tensors for sequence ``idx``."""
        sequence = self.sequences[idx]
        input_seq = [self.char_to_idx[char] for char in sequence[:-1]]
        target_seq = [self.char_to_idx[char] for char in sequence[1:]]
        # Force an integer dtype: torch.tensor([]) defaults to float32, so a
        # one-character sequence would otherwise yield float tensors while
        # every longer sequence yields int64 ones.
        return (torch.tensor(input_seq, dtype=torch.long),
                torch.tensor(target_seq, dtype=torch.long))

# The vocabulary is every distinct character found in the lemmas plus the
# '<SoS>' start-of-sequence tag, in sorted order.
vocab = sorted({'<SoS>', *"".join(lemmas)})
dataset = CharDataset(sequences=lemmas, vocab=vocab)
# NOTE(review): no collate_fn is passed, so default collation applies; lemmas
# of different lengths in one batch will presumably fail to stack. Confirm
# whether padding is handled elsewhere.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

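This loads the lemmas into a dataset in a format that torch can understand. Each word is turned into a pair of sequences: an input (the word without its last character) and a target (the word without its first character), so that at every position the target holds the character that follows the corresponding input character.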

In [13]:
import torch.nn as nn

class LSTMModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: number of distinct symbols (embedding rows and output
            logits).
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        # Kept on the instance so init_hidden does not depend on notebook
        # globals that merely happen to share these names.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Run the model on a batch of index sequences.

        Args:
            x: integer tensor of symbol indices, batch dimension first
                (the LSTM is built with ``batch_first=True``).
            hidden: ``(h_0, c_0)`` tuple as returned by ``init_hidden``, or
                ``None`` to let the LSTM start from a zero state.

        Returns:
            ``(out, hidden)``: per-position logits over the vocabulary and
            the final ``(h_n, c_n)`` LSTM state.
        """
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h_0, c_0)`` state for ``batch_size`` sequences.

        Both tensors have shape ``(num_layers, batch_size, hidden_dim)`` and
        are created with the dtype and device of the model's parameters.
        """
        reference = self.fc.weight
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return reference.new_zeros(shape), reference.new_zeros(shape)

# Hyperparameters and instantiation of the character-level model
embedding_dim, hidden_dim, num_layers = 128, 256, 2
vocab_size = len(vocab)
model = LSTMModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
)

(tensor([ 0,  1,  0, 11, 14, 13]), tensor([ 1,  0, 11, 14, 13,  4]))