# Name Encoder-Decoder

Now try an encoder decoder. Rather than classifying the country, let's encode the name; and feed a decoder a '<sos>' token and see if it can generate the country sequentially.

## Libraries

In [1]:
import os
import numpy as np

from common.utils import (
    load_language,
)

import torch
import lightning as L
from lightning.pytorch.loggers import TensorBoardLogger
from pytorch_lightning import seed_everything

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Globals

In [2]:
seed_everything(2718)

Seed set to 2718


2718

In [3]:
# File paths
ENGLISH_INPUT_PATH = "../data/processed/english.txt"
ENGLISH_INDICES_INPUT_PATH = "../data/processed/english_indices.txt"
FRENCH_INPUT_PATH = "../data/processed/french.txt"
FRENCH_INDICES_INPUT_PATH = "../data/processed/french_indices.txt"

# Language model paths
ENGLISH_MODEL_PATH = "../models/english_model.pkl"
FRENCH_MODEL_PATH = "../models/french_model.pkl"


# Training params
EPOCHS = 100
BATCH_SIZE = 128
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 256
UNIT_TYPE = "GRU"
ACCELERATOR = "gpu"

# CPUS to give each dataloader
NUM_WORKERS = 4

In [4]:
# Load the language models
source_language = load_language(ENGLISH_MODEL_PATH)
target_language = load_language(FRENCH_MODEL_PATH)

SOURCE_VOCAB_SIZE = len(source_language.vocabulary)
TARGET_VOCAB_SIZE = len(target_language.vocabulary)

print(f"English vocab size: {SOURCE_VOCAB_SIZE}")
print(f"French vocab size: {TARGET_VOCAB_SIZE}")

INFO:common.utils:Loading from ../models/english_model.pkl
INFO:common.utils:Creating a language object for english


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


INFO:common.utils:Loading from ../models/french_model.pkl
INFO:common.utils:Creating a language object for french


Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
English vocab size: 7504
French vocab size: 7504


## Utilities

In [5]:
class LanguageTranslationDataset(torch.utils.data.Dataset):
    """
    Dataset for the names dataset.
    """

    def __init__(
        self,
        source_label_path,
        target_label_path,
        source_indices_path,
        target_indices_path,
    ):
        self.source_label_path = source_label_path
        self.target_label_path = target_label_path
        self.source_indices_path = source_indices_path
        self.target_indices_path = target_indices_path

        self.source = []
        self.target = []
        self.source_indices = []
        self.target_indices = []

        # Load files
        self._load_data()

        return None

    def _load_label_data(self, path):
        """
        Load in a file where each line is a sentence.
        """
        with open(path, "r") as f:
            data = f.readlines()

        data = [x.strip() for x in data]

        return data

    def _load_index_data(self, path):
        """
        Load in a file where each line is a list of indices.
        """
        with open(path, "r") as f:
            data = f.readlines()

        data = [x.strip().split(" ") for x in data]
        data = [[int(x) for x in y] for y in data]

        return data

    def _load_data(self):
        self.source = self._load_label_data(self.source_label_path)
        self.target = self._load_label_data(self.target_label_path)
        self.source_indices = self._load_index_data(self.source_indices_path)
        self.target_indices = self._load_index_data(self.target_indices_path)

        assert len(self.source) == len(self.target)
        return None

    def __len__(self):
        return len(self.source)

    def __getitem__(self, idx):
        source = self.source[idx]
        target = self.target[idx]

        source_indices = self.source_indices[idx]
        target_indices = self.target_indices[idx]

        # Convert to tensors
        source_indices = torch.tensor(source_indices).long()
        target_indices = torch.tensor(target_indices).long()

        return source_indices, target_indices, source, target

In [6]:
def custom_collate_fn(batch):
    """
    We receive a list of tuples 4 long.
    Each tuple is a tokenized name, a tokenized country (which is one-hot),
    the name, and the country as strings.

    We want to pad and stack them.
    """

    x = [item[0] for item in batch]
    y = [item[1] for item in batch]
    source = [item[2] for item in batch]
    target = [item[3] for item in batch]

    # Lengths to pass the pack and pad sequence function
    x_len = [len(item) for item in x]
    y_len = [len(item) for item in y]

    # Pad the sequences
    x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True)
    y = torch.nn.utils.rnn.pad_sequence(y, batch_first=True)

    # Now since we're teacher forcing we need two versions of y. One with
    # missing the start token, y_target, and one with missing the end token,
    # y_input
    y_target = y[:, 1:]
    y_input = y[:, :-1]

    return x, y_input, y_target

In [7]:
dataset = LanguageTranslationDataset(
    source_label_path=ENGLISH_INPUT_PATH,
    target_label_path=FRENCH_INPUT_PATH,
    source_indices_path=ENGLISH_INDICES_INPUT_PATH,
    target_indices_path=FRENCH_INDICES_INPUT_PATH,
)

In [8]:
# Inspect a random sample
# Show random example
x, y, source, target = dataset[np.random.randint(0, len(dataset))]
print("Input: ", x)
print("Target: ", y)
print("source: ", source)
print("target: ", target)

# And our language model can invert the indices
print("\nInput: ", source_language.index_to_token(x))
print("Target: ", target_language.index_to_token(y))

Input:  tensor([   2,   18,   76,  528,    7, 3379,    7,  109,    4,    3])
Target:  tensor([   2,   26,   16,  656,    6,  490,  250,   25, 1036,    4,    3])
source:  Tom has decided to propose to Mary.
target:  Tom a décidé de demander Marie en mariage.

Input:  ['<SOS>', 'Tom', 'has', 'decided', 'to', 'propose', 'to', 'Mary', '.', '<EOS>']
Target:  ['<SOS>', 'Tom', 'a', 'décidé', 'de', 'demander', 'Marie', 'en', 'mariage', '.', '<EOS>']


In [9]:
train, val = torch.utils.data.random_split(dataset, [0.7, 0.3])

In [10]:
# Make a dataloader that pulls 1 batch at a time. Note that more than 1 batch
# will throw an error since we have variable length sequences. We'd need to pass
# a custom collation function for that, which we'll do in the next notebook.
train_dataloader = torch.utils.data.DataLoader(
    train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    collate_fn=custom_collate_fn,
)

val_dataloader = torch.utils.data.DataLoader(
    val,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=custom_collate_fn,
)

In [11]:
# Inspect the data loader.

x, y_input, y_target = next(iter(train_dataloader))
print("X shape: ", x.shape)
print("Y input shape: ", y_input.shape)
print("Y target shape: ", y_target.shape)

# Just to be sure, decode the first row
print("\nDecode the first row")
print("X: ", source_language.index_to_token(x[0]))
print("Y input: ", target_language.index_to_token(y_input[0]))
print("Y target: ", target_language.index_to_token(y_target[0]))

X shape:  torch.Size([128, 19])
Y input shape:  torch.Size([128, 18])
Y target shape:  torch.Size([128, 18])

Decode the first row
X:  ['<SOS>', 'I', 'met', 'a', 'friend', 'while', 'I', 'was', 'waiting', 'for', 'a', 'bus', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
Y input:  ['<SOS>', "J'", 'ai', 'rencontré', 'un', 'ami', 'tandis', 'que', "j'", 'attendais', 'un', 'bus', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
Y target:  ["J'", 'ai', 'rencontré', 'un', 'ami', 'tandis', 'que', "j'", 'attendais', 'un', 'bus', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


## Model

In [19]:
class EncoderDecoder(L.LightningModule):
    def __init__(
        self, input_vocab_len, output_vocab_len, embedding_size, hidden_size
    ):
        super().__init__()

        self.input_vocab_len = input_vocab_len
        self.output_vocab_len = output_vocab_len
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        self.criterion = torch.nn.NLLLoss()

        # Embedding
        self.input_embedding = torch.nn.Embedding(
            self.input_vocab_len, self.embedding_size
        )
        self.output_embedding = torch.nn.Embedding(
            self.output_vocab_len, self.embedding_size
        )

        # Encoder
        self.encoder = torch.nn.GRU(
            self.embedding_size,
            self.hidden_size,
            batch_first=True,
            dropout=0.2,
        )

        # Decoder
        # The input to the decoder will be a concat of the
        self.decoder = torch.nn.GRU(
            self.embedding_size + self.hidden_size,
            self.hidden_size,
            batch_first=True,
            dropout=0.2,
        )

        self.dense = torch.nn.Linear(self.hidden_size, self.output_vocab_len)

        self.log_softmax = torch.nn.LogSoftmax(dim=-1)

    def encoder_step(self, x):
        """
        Push inputs through encoder.
        """

        # Shapes are:
        # x: (batch_size, seq_len)
        # decoder_input: (batch_size, seq_len-1)
        # decoder_target: (batch_size, seq_len-1)

        # Construct embeddings
        x = self.input_embedding(x)
        # x: (batch_size, seq_len, embedding_size)

        # Run the encoder
        encoder_output, context_vector = self.encoder(x)
        # output: (batch_size, seq_len, hidden_size)
        # context_vector: (1, batch_size, hidden_size)

        return context_vector

    def decoder_step(self, decoder_input, context_vector, decoder_state=None):
        """
        Push inputs through decoder.
        """

        # Embed target outputs
        decoder_input = self.output_embedding(decoder_input)
        # decoder_input: (batch_size, seq_len-1, embedding_size)

        # Currently the context vector is (1, batch_size, hidden_size)
        # decoder_input is (batch_size, seq_len-1, embedding_size)
        # Rather than construct a special RNN module that can handle two inputs,
        # we're simply going to concatenate the context vector to the decoder input.

        # Permute the dimensions of context vector to be conformable with decoder_input
        context_vector = context_vector.permute(1, 0, 2)
        # context_vector: (batch_size, 1, hidden_size)

        # Make copies of the context vector along the sequence length demiension
        context_vector = context_vector.repeat(1, decoder_input.shape[1], 1)
        # context_vector: (batch_size, seq_len-1, hidden_size)

        decoder_input = torch.cat([decoder_input, context_vector], dim=2)
        # decoder_input: (batch_size, seq_len-1, embedding_size + hidden_size)

        # Now run the decoder_input through the decoder
        decoder_output, decoder_state = self.decoder(
            decoder_input, decoder_state
        )
        # decoder_output: (batch_size, seq_len-1, hidden_size)

        # decoder_output is (batch_size, seq_len-1, hidden_size)
        # Add a dense layer to convert it to the decoder_output vocab size
        decoder_output = self.dense(decoder_output)
        # decoder_output: (batch_size, seq_len-1, decoder_output_vocab_size)

        # Now log softmax the decoder_output along last dimension
        decoder_output = self.log_softmax(decoder_output)
        # output: (batch_size, seq_len-1, output_vocab_size)

        return decoder_output, decoder_state

    def training_step(self, batch, batch_idx):
        x, decoder_input, decoder_target = batch

        # Get context vector
        context_vector = self.encoder_step(x)

        # Get decoder output
        decoder_output, _ = self.decoder_step(decoder_input, context_vector)
        # decoder_output: (batch_size, seq_len-1, output_vocab_size)

        # To use our NLL loss, we need to reshape the output and target
        # The outputs need to be (N, class_size) and the targets (N)
        # So flatten the batch and sequence dimensions.
        decoder_output = decoder_output.reshape(-1, self.output_vocab_len)
        decoder_target = decoder_target.reshape(-1)

        # output: (batch_size * seq_len-1, output_vocab_size)
        # decoder_target: (batch_size * seq_len-1, )

        loss = self.criterion(decoder_output, decoder_target)

        self.log("train_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        """
        Same as training step. We have the option to add more metrics here.
        """
        x, decoder_input, decoder_target = batch
        context_vector = self.encoder_step(x)

        decoder_output, _ = self.decoder_step(decoder_input, context_vector)
        decoder_output = decoder_output.reshape(-1, self.output_vocab_len)
        decoder_target = decoder_target.reshape(-1)

        loss = self.criterion(decoder_output, decoder_target)

        self.log("validation_loss", loss)

        return None

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)

        # One cycle learning rate scheduler
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=0.01,
            steps_per_epoch=len(train_dataloader),
            epochs=EPOCHS,
        )

        # # Reduce on plateau
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        #     optimizer,
        #     mode="min",
        #     factor=0.5,
        #     patience=3,
        #     verbose=True,
        # )

        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "validation_loss",
        }

    def inference(self, name: str):
        """
        Run inference on a single example.
        """
        assert isinstance(name, str)

        # Tokenize the input
        input_tokens = source_language.tokenizer(name)

        # Add <SOS> and <EOS> tokens
        input_tokens = ["<SOS>"] + input_tokens + ["<EOS>"]

        input_indices = source_language.token_to_index(input_tokens)

        x = torch.tensor(input_indices).unsqueeze(0).long()
        # x should have shape (1, seq_len)
        assert x.shape[0] == 1

        # Get context vector
        context_vector = self.encoder_step(x)

        # Now we need to run the decoder
        # We'll start with a start token and no state
        decoder_input = torch.tensor(
            [target_language.token_to_index("<SOS>")]
        ).unsqueeze(0)
        decoder_state = None
        # decoder_input: (1, 1) and it's a long

        # We'll keep track of the output
        output = []

        reached_eos = False
        for i in range(10):
            # Run the decoder
            decoder_output, decoder_state = self.decoder_step(
                decoder_input, context_vector, decoder_state
            )
            # decoder_output: (1, 1, output_vocab_size)

            # Get the most likely token
            token = torch.argmax(decoder_output, dim=-1)
            # token: (1, 1)

            # Append to the output
            output.append(token.squeeze(0).item())

            # If we've reached the end of the sentence, break
            if token.item() == target_language.token_to_index("<EOS>"):
                reached_eos = True
                break

            # Update the decoder input
            decoder_input = token

        if not reached_eos:
            logger.info("Failed to reach EOS token")

        # Convert the output to a string
        output = [target_language.index_to_token(x) for x in output]
        # If the last token is <EOS> remove it
        if output[-1] == "<EOS>":
            output = output[:-1]

        output = " ".join(output)

        return output

In [20]:
model = EncoderDecoder(
    input_vocab_len=SOURCE_VOCAB_SIZE,
    output_vocab_len=TARGET_VOCAB_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
)

cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)

tensorboard_logger = TensorBoardLogger(
    save_dir=parent_dir,
    name="logs/english_french_encoder_decoder",
    version=f"UNIT={UNIT_TYPE}_BATCH_SIZE={BATCH_SIZE}_EMBEDDING_SIZE={EMBEDDING_SIZE}_HIDDEN_SIZE={HIDDEN_SIZE}",
)

In [14]:
model.train()
trainer = L.Trainer(
    devices=1,
    accelerator=ACCELERATOR,
    logger=tensorboard_logger,
    max_epochs=EPOCHS,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [15]:
trainer.fit(model, train_dataloader, val_dataloader)
model.freeze()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type       | Params | Mode 
--------------------------------------------------------
0 | criterion        | NLLLoss    | 0      | train
1 | input_embedding  | Embedding  | 960 K  | train
2 | output_embedding | Embedding  | 960 K  | train
3 | encoder          | GRU        | 296 K  | train
4 | decoder          | GRU        | 493 K  | train
5 | dense            | Linear     | 1.9 M  | train
6 | log_softmax      | LogSoftmax | 0      | train
--------------------------------------------------------
4.6 M     Trainable params
0         Non-trainable params
4.6 M     Total params
18.556    Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode


Epoch 99: 100%|██████████| 743/743 [00:13<00:00, 54.61it/s, v_num==256]    

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 743/743 [00:13<00:00, 53.54it/s, v_num==256]


In [16]:
# Save the model
if not os.path.exists("../models"):
    os.makedirs("../models")

# torch.save(model.state_dict(), "../models/english_french_encoder_decoder.pth")

In [21]:
# Load the model
model = EncoderDecoder(
    input_vocab_len=SOURCE_VOCAB_SIZE,
    output_vocab_len=TARGET_VOCAB_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
)

model.load_state_dict(
    torch.load("../models/english_french_encoder_decoder.pth")
)

  model.load_state_dict(torch.load("../models/english_french_encoder_decoder.pth"))


<All keys matched successfully>

In [22]:
sentence = "Hello, how are you?"
model.inference(sentence)

'Bonjour , comment allez -vous \u202f ?'

In [39]:
# Run it over the dataset

for i in range(3):
    x, y, source, target = dataset[np.random.randint(0, len(dataset))]
    print("source: \t", source)
    print("target: \t", target)
    print("Predicted: \t", model.inference(source))
    print("\n")

source: 	 I've already apologized for that.
target: 	 J'ai déjà présenté mes excuses pour cela.
Predicted: 	 J' ai déjà présenté mes excuses pour ça .


source: 	 The army had plenty of weapons.
target: 	 L'armée disposait de tas d'armes.
Predicted: 	 L' armée disposait de tas d' armes .


source: 	 You're forgiven.
target: 	 Vous êtes pardonnées.
Predicted: 	 Vous êtes <UNK> .


