# English-to-French Translation

Okay, this is a harder task and a bigger data set. It'll take a little while longer to train this one.

## Libraries

In [1]:
import os
import numpy as np
import yaml

import torch
import lightning as L
from lightning.pytorch.loggers import TensorBoardLogger
from pytorch_lightning import seed_everything
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.utilities.model_summary import ModelSummary
from lightning.pytorch.callbacks import LearningRateMonitor

from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

import matplotlib.pyplot as plt

from common.language import load_language

from common.dataloaders import (
    TranslationDataset,
    padding_collator,
)

from common.models import (
    EncoderDecoder,
)

from common.utils import (
    get_logger,
    Timer,
)

# Notebook-wide logger; used further down to report the working directory.
logger = get_logger("english-to-french-translation")

## Parameters

In [None]:
# Seed through the same `lightning` package that provides the Trainer,
# loggers and callbacks in this notebook, instead of the separately
# installed `pytorch_lightning` distribution.
L.seed_everything(2718)

# Set the cwd to the root of the project.
# The guard makes this safe to re-run: once we have stepped out of "src"
# the condition is false, so we never climb more than one level.
if os.getcwd().endswith("src"):
    os.chdir("..")

logger.info(f"Current working directory: {os.getcwd()}")

In [3]:
# Load config.yaml. This contains all of our paths and constants.
# The encoding is pinned so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# --- Training hyperparameters ---
EPOCHS = 30
BATCH_SIZE = 128
EMBEDDING_SIZE = 128
HIDDEN_SIZE = 1024
SCHEDULER = "onecycle"  # "onecycle" or "reduceonplateau"

# The recurrent unit type is not configurable here: GRUs only, and no
# multiple layers yet.

# --- Trainer settings ---
ACCELERATOR = "gpu"  # "cpu" or "gpu"

# CPUs handed to each dataloader
NUM_WORKERS = 3

# Tag attached to the logs of every training run.
# Re-running with an identical tag overwrites the earlier log.
TAG = "_".join(
    [
        "en-to-fr",
        SCHEDULER,
        f"BATCH_SIZE={BATCH_SIZE}",
        f"EMBEDDING_SIZE={EMBEDDING_SIZE}",
        f"HIDDEN_SIZE={HIDDEN_SIZE}",
    ]
)

In [4]:
# Load the English and French language objects from the paths recorded in
# the config (English first, then French).
english_language, french_language = (
    load_language(config[path_key])
    for path_key in (
        "ENGLISH_LANGUAGE_MODEL_PATH",
        "FRENCH_LANGUAGE_MODEL_PATH",
    )
)

In [None]:
# Vocabulary sizes for the source (English) and target (French) languages.
ENGLISH_VOCAB_SIZE = len(english_language.vocabulary)
FRENCH_VOCAB_SIZE = len(french_language.vocabulary)

# Legacy aliases, kept so any cell that refers to the old names keeps working.
NAMES_VOCAB_SIZE = ENGLISH_VOCAB_SIZE
COUNTRIES_VOCAB_SIZE = FRENCH_VOCAB_SIZE

print(f"English vocab size: {ENGLISH_VOCAB_SIZE}")
print(f"French vocab size: {FRENCH_VOCAB_SIZE}")

# Vocabulary attributes are a dictionary with the token being the
# key and the value being how frequently the token appears in the corpus.
# Note that since we've added the special tokens ourselves, they will
# have frequency 0.
english_tokens = list(english_language.vocabulary.keys())
french_tokens = list(french_language.vocabulary.keys())

# NOTE(review): this prints the first 10 keys; that is the "top 10" only if
# the vocabulary is stored in descending frequency order — confirm.
# Slicing (rather than indexing 0..9) keeps the cell working when a
# vocabulary holds fewer than 10 tokens.
print("Top 10 most common tokens in English vocabulary:")
for token in english_tokens[:10]:
    print(f"{token}: {english_language.vocabulary[token]}")

print("\nTop 10 most common tokens in French vocabulary:")
for token in french_tokens[:10]:
    print(f"{token}: {french_language.vocabulary[token]}")

## Dataset and Dataloader

In [6]:
# Every path the dataset needs comes from the config: English is the source
# side, French is the target side.
dataset_paths = {
    "source_labels_path": config["ENGLISH_INPUT_PATH"],
    "target_labels_path": config["FRENCH_INPUT_PATH"],
    "source_indices_path": config["ENGLISH_OUTPUT_PATH"],
    "target_indices_path": config["FRENCH_OUTPUT_PATH"],
}
dataset = TranslationDataset(**dataset_paths)

In [None]:
# Pick one example at random and print each of the four fields it carries.
example_index = np.random.randint(low=0, high=len(dataset))
x, y, english, french = dataset[example_index]
for label, value in (
    ("Input: ", x),
    ("Target: ", y),
    ("english: ", english),
    ("french: ", french),
):
    print(label, value)

# Note that both source and target languages have <SOS> and <EOS> tokens now.

In [8]:
# Randomly split the dataset 80% / 20% into training and validation subsets.
train_data, val_data = torch.utils.data.random_split(
    dataset=dataset,
    lengths=[0.8, 0.2],
)

In [9]:
# Build the training and validation dataloaders.
# Both share the batch size, the worker count and the collate function,
# which pads the sequences to the same length.
loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    collate_fn=padding_collator,
)

# Only the training split is shuffled.
train_dataloader = torch.utils.data.DataLoader(
    train_data, shuffle=True, **loader_options
)
val_dataloader = torch.utils.data.DataLoader(
    val_data, shuffle=False, **loader_options
)

In [None]:
# Pull a single batch from the training loader and look at it.
first_batch = next(iter(train_dataloader))
x, y, english, french = first_batch
print("Source shape: ", x.shape)
print("Target shape: ", y.shape)

# Sanity check: map the first row of each tensor back to tokens.
first_source_row, first_target_row = x[0], y[0]
print("\nDetokenize the first row")
print("Source: ", english_language.index_to_token(first_source_row))
print("Target: ", french_language.index_to_token(first_target_row))

## Model

In [11]:
# The model's input and output sizes are the English and French vocabulary
# sizes respectively, read straight from the loaded language objects.
source_vocab_size = len(english_language.vocabulary)
target_vocab_size = len(french_language.vocabulary)

model = EncoderDecoder(
    source_language=english_language,
    target_language=french_language,
    # Output tokens are joined back into a sentence with single spaces.
    detokenizer=lambda s: " ".join(s),
    input_size=source_vocab_size,
    output_size=target_vocab_size,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    epochs=EPOCHS,
    data_length=len(train_data),
    max_output_length=30,
    scheduler=SCHEDULER,
)

In [None]:
# Show a summary of the model as this cell's output.
# NOTE(review): ModelSummary is imported from `pytorch_lightning`, while the
# Trainer used for fitting comes from `lightning` — confirm the two packages
# are meant to be mixed here.
ModelSummary(model)

In [None]:
# Check that we can forward pass with the x,y generated above.
# This is only a shape check, so it runs without autograd: the resulting
# tensors stay alive as notebook globals and would otherwise keep a
# computation graph in memory for the rest of the session.
with torch.no_grad():
    context_vector = model.encoder_step(x)
    print(f"Context vector shape: {context_vector.shape}")

    # The decoder receives the target with its last position dropped, and the
    # context vector for both of its remaining arguments.
    output, hidden = model.decoder_step(y[:, :-1], context_vector, context_vector)
    print(f"Output shape: {output.shape}")
    print(f"Hidden shape: {hidden.shape}")

In [None]:
# Count only the parameters that have requires_grad set, then report the total.
trainable_parameter_count = sum(
    parameter.numel()
    for parameter in model.parameters()
    if parameter.requires_grad
)
print(f"Model has {trainable_parameter_count:,} parameters.")

Okay, so that's a little bigger than before. Most of these parameters are in the dense layer.

## Training

In [15]:
# TensorBoard logging for this run. The log root comes from the config and
# the run's TAG is used as the version.
tensorboard_root = config["TENSORBOARD_LOGS_PATH"]

tensorboard_logger = TensorBoardLogger(
    save_dir=tensorboard_root,
    name="english-to-french-translation/",
    version=TAG,
)

In [16]:
# Callbacks: early stopping on the minimum of "validation_loss" with a
# patience of 3, plus per-step learning-rate logging.
early_stopping = EarlyStopping(monitor="validation_loss", patience=3, mode="min")
learning_rate_monitor = LearningRateMonitor(logging_interval="step")

# Train on a single device of the nominated accelerator, logging to tensorboard.
trainer = L.Trainer(
    devices=1,
    accelerator=ACCELERATOR,
    logger=tensorboard_logger,
    max_epochs=EPOCHS,
    callbacks=[early_stopping, learning_rate_monitor],
)

In [None]:
# Fit the model, then report what the project's Timer measured.
timer = Timer()
trainer.fit(
    model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)
elapsed = timer.toc()
print(f"Elapsed time: {elapsed}")

In [18]:
# Save the model's state_dict to the path given in the config.
model_path = config["ENGLISH_TO_FRENCH_TRANSLATION_MODEL_PATH"]

# torch.save does not create missing directories. Make sure the parent
# exists so a finished training run is not lost to a missing output folder.
# A bare filename has no parent component, hence the guard.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

torch.save(model.state_dict(), model_path)

## Let's try it out!

In [None]:
# Freeze the model before running inference.
model.freeze()

# Hand-written English inputs ...
english_sentences = [
    "Do you speak English?",
    "Where is the library?",
    "I think, therefore I am.",
    "Where is my wine and my cheese? Do you have it?",
]

# ... and the French we would hope to get back, in the same order.
expected_french_sentences = [
    "Parlez-vous anglais?",
    "Où est la bibliothèque?",
    "Je pense, donc je suis.",
    "Où est mon vin et mon fromage? L'avez-vous?",
]

# Print input, expected translation and model output, followed by a blank line.
for sentence, target in zip(english_sentences, expected_french_sentences):
    translated = model.inference(sentence)
    print(
        f"Input: {sentence}",
        f"Target: {target}",
        f"Output: {translated}",
        "",
        sep="\n",
    )

In [None]:
# Translate the first five examples of the training split and compare them
# with their reference translations.
for sample_index in range(5):
    x, y, english, french = train_data[sample_index]
    translated = model.inference(english)
    print(
        f"Input: {english}",
        f"Target: {french}",
        f"Output: {translated}",
        "",
        sep="\n",
    )

In [None]:
# Same comparison for the first five examples of the validation split.
for sample_index in range(5):
    x, y, english, french = val_data[sample_index]
    translated = model.inference(english)
    print(
        f"Input: {english}",
        f"Target: {french}",
        f"Output: {translated}",
        "",
        sep="\n",
    )