# Setup

In [None]:
%pip install datasets transformers evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import tqdm
import evaluate
from transformers import AutoTokenizer

In [None]:
# One seed for every random number generator used in this notebook.
seed = 1234

# Seed Python, NumPy, and PyTorch (CPU and current CUDA device) in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)

# Ask cuDNN to use deterministic algorithms only.
torch.backends.cudnn.deterministic = True

# Dataset

In [None]:
# Fetch the English-Vietnamese sentence pairs from the Hugging Face Hub.
# The returned object is indexed by split name ("train", "test", "valid").
dataset = datasets.load_dataset("harouzie/vi_en-translation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/665 [00:00<?, ?B/s]

(…)-00000-of-00001-8fc21cb8e80d3a2d.parquet:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

(…)-00000-of-00001-858c0e989d9c5637.parquet:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

(…)-00000-of-00001-99e7e50144d1c164.parquet:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203272 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25409 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/25409 [00:00<?, ? examples/s]

In [None]:
# Bind each split of the loaded dataset to its own name.
train_data = dataset["train"]
test_data = dataset["test"]
valid_data = dataset["valid"]

In [None]:
# Peek at a single training record to see the column names and text format.
train_data[1]

{'English': 'The pharmacy is on Fresno Street',
 'Vietnamese': 'hiệu thuốc nằm trên đường fresno'}

# Tokenizer


In [None]:
# Special-token strings handed to the tokenizer: unknown, padding,
# beginning-of-sequence and end-of-sequence, in that order.
unk_token, pad_token, bos_token, eos_token = "<unk>", "<pad>", "<s>", "</s>"


In [None]:
from transformers import AutoTokenizer

# Load the pretrained mBART-50 tokenizer, passing the special-token strings
# defined for this notebook as keyword overrides.
special_token_overrides = {
    "unk_token": unk_token,
    "pad_token": pad_token,
    "bos_token": bos_token,
    "eos_token": eos_token,
}

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt",
    **special_token_overrides,
)

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [None]:
def convert_to_ids(example, tokenizer, max_length=None):
    """Tokenize one sentence pair into token-id lists.

    No padding is requested here: sequences keep their natural length and are
    padded per batch by the collate function. Previously ``padding="max_length"``
    and ``truncation=True`` were passed without a ``max_length``, which the
    tokenizer ignored with a warning; the default output is therefore unchanged.

    Args:
        example: mapping with "English" and "Vietnamese" text fields.
        tokenizer: callable that accepts a string (plus keyword options) and
            returns a mapping containing "input_ids".
        max_length: optional cap on the number of tokens per sentence. When
            given, truncation to this length is enabled; when ``None``
            (default) sentences are never truncated.

    Returns:
        dict with "en_ids" and "vi_ids", the token ids of the two sentences.
    """
    # Only ask for truncation when there is an actual limit to truncate to.
    options = {} if max_length is None else {"truncation": True, "max_length": max_length}
    en_encoding = tokenizer(example["English"], **options)
    vi_encoding = tokenizer(example["Vietnamese"], **options)
    return {"en_ids": en_encoding["input_ids"], "vi_ids": vi_encoding["input_ids"]}

In [None]:
# Extra keyword arguments forwarded to convert_to_ids for every example.
fn_kwargs = {"tokenizer": tokenizer}

# Tokenize the three splits in the same order as before: train, valid, test.
train_data, valid_data, test_data = [
    split.map(convert_to_ids, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
]

Map:   0%|          | 0/203272 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/25409 [00:00<?, ? examples/s]

Map:   0%|          | 0/25409 [00:00<?, ? examples/s]

In [None]:
data_type = "torch"
format_columns = ["en_ids", "vi_ids"]

# Return the id columns as torch tensors while still exposing the other columns.
format_options = {
    "type": data_type,
    "columns": format_columns,
    "output_all_columns": True,
}

train_data = train_data.with_format(**format_options)
valid_data = valid_data.with_format(**format_options)
test_data = test_data.with_format(**format_options)

In [None]:
def get_collate_fn(pad_index):
    """Build a collate function that pads a batch of examples with `pad_index`.

    The returned function takes a list of examples (each holding "en_ids" and
    "vi_ids" tensors) and returns a dict with the same two keys, where each
    value is the padded stack produced by `pad_sequence` with its default
    layout (sequence dimension first).
    """

    def collate_fn(batch):
        padded = {}
        for key in ("en_ids", "vi_ids"):
            sequences = [example[key] for example in batch]
            padded[key] = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)
        return padded

    return collate_fn

In [None]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap `dataset` in a DataLoader whose batches are padded with `pad_index`.

    Args:
        dataset: dataset to iterate over.
        batch_size: number of examples per batch.
        pad_index: padding value handed to the collate function.
        shuffle: whether the loader reshuffles the data (off by default).

    Returns:
        The configured `torch.utils.data.DataLoader`.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )

In [None]:
# Vocabulary ids of the padding and unknown tokens, as reported by the tokenizer.
pad_index = tokenizer.pad_token_id
unk_index = tokenizer.unk_token_id
# The original, misspelled name is kept as an alias in case other cells use it.
unk_inedx = unk_index

In [None]:
batch_size = 32

# Only the training loader shuffles; validation and test keep dataset order.
train_data_loader = get_data_loader(
    dataset=train_data, batch_size=batch_size, pad_index=pad_index, shuffle=True
)
valid_data_loader = get_data_loader(
    dataset=valid_data, batch_size=batch_size, pad_index=pad_index
)
test_data_loader = get_data_loader(
    dataset=test_data, batch_size=batch_size, pad_index=pad_index
)

In [None]:
# Drop the notebook-level names of the splits; from here on the data is only
# reached through the data loaders that were built from them.
del train_data, valid_data, test_data

# The Model

In [None]:
class Encoder(nn.Module):
    """Source-side network: embedding, stacked LSTM, then single-head self-attention.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        embedding_dim: width of each token embedding.
        hidden_dim: LSTM hidden size; also the attention embedding size.
        n_layers: number of stacked LSTM layers.
        dropout: probability shared by the embedding dropout, the LSTM's
            inter-layer dropout and the attention dropout.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Kept as attributes so Seq2Seq can compare them with the decoder's.
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        # Sub-modules are created in a fixed order: it fixes the parameter
        # registration order and how the seeded RNG is consumed.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=1, dropout=dropout)

    def forward(self, src):
        """Encode a batch of source token ids.

        Args:
            src: integer token ids; expected as [src length, batch size] by
                the caller's convention.

        Returns:
            Tuple ``(attn_output, hidden, cell)``: the self-attended LSTM
            outputs and the LSTM's final hidden and cell states.
        """
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        outputs, final_state = self.rnn(embedded)
        hidden, cell = final_state
        # Self-attention: the LSTM outputs act as query, key and value.
        attended, _ = self.attention(query=outputs, key=outputs, value=outputs)
        return attended, hidden, cell





In [None]:
class Decoder(nn.Module):
    """Single-step target-side network: embedding, stacked LSTM, attention, projection.

    Args:
        output_dim: target vocabulary size (embedding rows and logits width).
        embedding_dim: width of each token embedding.
        hidden_dim: LSTM hidden size and input width of the output projection.
        n_layers: number of stacked LSTM layers.
        dropout: probability used for the embedding dropout and the LSTM's
            inter-layer dropout.
        attention: attention module, called as ``attention(query, key, value)``
            and expected to return ``(output, weights)``.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim, self.hidden_dim, self.n_layers = output_dim, hidden_dim, n_layers
        # Creation order is kept fixed; it determines parameter registration order.
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)
        # The attention module is supplied by the caller rather than built here.
        self.attention = attention

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode one time step.

        Expected shapes (caller's convention):
            input: [batch size]
            hidden, cell: [n layers, batch size, hidden dim]
            encoder_outputs: [src length, batch size, hidden dim]

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` is
            [batch size, output dim] and the states are the updated LSTM states.
        """
        # Add a length-1 sequence dimension in front: [1, batch size].
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))

        rnn_output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # The LSTM output queries the encoder outputs; the attended result
        # replaces the LSTM output as input to the projection.
        context, _ = self.attention(query=rnn_output, key=encoder_outputs, value=encoder_outputs)

        # Remove the length-1 sequence dimension before projecting to the vocabulary.
        prediction = self.fc_out(context.squeeze(0))
        return prediction, hidden, cell



In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module returning ``(encoder_outputs, hidden, cell)`` for a
            source batch; must expose ``hidden_dim`` and ``n_layers``.
        decoder: module called as ``decoder(input, hidden, cell, encoder_outputs)``
            returning ``(prediction, hidden, cell)``; must expose
            ``hidden_dim``, ``n_layers`` and ``output_dim``.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # The encoder's final states are fed directly to the decoder's LSTM,
        # so both sides must agree on state shape.
        assert encoder.hidden_dim == decoder.hidden_dim, (
            "Hidden dimensions of encoder and decoder must be equal!"
        )
        assert encoder.n_layers == decoder.n_layers, (
            "Encoder and decoder must have equal number of layers!"
        )

    def forward(self, src, trg, teacher_forcing_ratio):
        """Produce per-step vocabulary scores for the target sequence.

        Args:
            src: source ids, [src length, batch size].
            trg: target ids, [trg length, batch size].
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token (instead of the model's own prediction)
                as the next decoder input.

        Returns:
            Tensor [trg length, batch size, vocab size]; row 0 is left at zero
            because decoding starts from the first target token.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]

        # Buffer for the decoder scores of every time step.
        outputs = torch.zeros(
            trg_length, batch_size, self.decoder.output_dim, device=self.device
        )

        encoder_outputs, hidden, cell = self.encoder(src)

        # Decoding is primed with the first token of each target sequence.
        decoder_input = trg[0, :]

        for t in range(1, trg_length):
            step_scores, hidden, cell = self.decoder(
                decoder_input, hidden, cell, encoder_outputs
            )
            outputs[t] = step_scores

            # One random draw per step decides between ground truth and prediction.
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[t]
            else:
                decoder_input = step_scores.argmax(1)

        return outputs



# Training

In [None]:
# Source and target share one tokenizer, hence one vocabulary size.
input_dim = len(tokenizer)
output_dim = len(tokenizer)

# Model hyperparameters.
encoder_embedding_dim, decoder_embedding_dim = 256, 512
hidden_dim = 512
n_layers = 2
encoder_dropout = decoder_dropout = 0.5

# Attention module handed to the decoder (the encoder builds its own).
attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=1, dropout=decoder_dropout)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim=input_dim,
    embedding_dim=encoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=encoder_dropout,
)

decoder = Decoder(
    output_dim=output_dim,
    embedding_dim=decoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=decoder_dropout,
    attention=attention,
)

model = Seq2Seq(encoder=encoder, decoder=decoder, device=device).to(device)

In [None]:
def init_weights(m):
    """Fill every parameter reachable from `m` with samples from U(-0.08, 0.08).

    The iteration is recursive, so when this is used through `Module.apply`
    nested parameters are re-sampled once per enclosing module; the resulting
    distribution is the same.
    """
    for param in m.parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


# Re-initialise the freshly built model. `Module.apply` calls `init_weights` on
# every sub-module and on the model itself, then returns the model, which is
# why the architecture is displayed as this cell's output.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(250054, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (attention): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
  )
  (decoder): Decoder(
    (embedding): Embedding(250054, 512)
    (rnn): LSTM(512, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=250054, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (attention): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of scalar values in `model`'s trainable parameters.

    Parameters with `requires_grad` set to False are not counted.
    """
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total


# Report the size of the model that is about to be trained.
print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 330,301,126 trainable parameters


In [None]:
# Cross-entropy over the target vocabulary. Target positions that hold the
# padding id are excluded, so the loss (and the perplexity derived from it)
# only reflects real tokens instead of the padding added when batching.
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

# Adam with PyTorch's default hyperparameters.
optimizer = optim.Adam(model.parameters())


In [None]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    """Train `model` for one pass over `data_loader`.

    Args:
        model: called as ``model(src, trg, teacher_forcing_ratio)``; expected
            to return scores shaped [trg length, batch size, vocab size].
        data_loader: iterable of batches with "en_ids" (source) and "vi_ids"
            (target) tensors; must support ``len``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking (scores, target ids).
        clip: maximum gradient norm applied before each optimizer step.
        teacher_forcing_ratio: forwarded to the model unchanged.
        device: device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses over the epoch.
    """
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        source_ids = batch["en_ids"].to(device)
        target_ids = batch["vi_ids"].to(device)

        optimizer.zero_grad()
        scores = model(source_ids, target_ids, teacher_forcing_ratio)

        # Skip time step 0 (never predicted by the model) and flatten the
        # remaining steps so every row is one token prediction.
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = target_ids[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        # Bound the gradient norm before updating the weights.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        # Periodic progress line with the running mean loss.
        if (i + 1) % 100 == 0:
            print(f"Batch: {i + 1}/ {len(data_loader)}: Loss {epoch_loss / (i+1)}")
    return epoch_loss / len(data_loader)

In [None]:
def evaluate_fn(model, data_loader, criterion, device):
    """Compute the mean per-batch loss of `model` over `data_loader` without training.

    The model is switched to eval mode, gradients are disabled, and the model
    is called with a teacher-forcing ratio of 0 so it always feeds back its
    own predictions.

    Args:
        model: called as ``model(src, trg, 0)``; expected to return scores
            shaped [trg length, batch size, vocab size].
        data_loader: iterable of batches with "en_ids" and "vi_ids" tensors;
            must support ``len``.
        criterion: loss taking (scores, target ids).
        device: device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in data_loader:
            source_ids = batch["en_ids"].to(device)
            target_ids = batch["vi_ids"].to(device)

            scores = model(source_ids, target_ids, 0)

            # Drop time step 0 and flatten to one row per predicted token.
            flat_scores = scores[1:].view(-1, scores.shape[-1])
            flat_targets = target_ids[1:].view(-1)

            batch_losses.append(criterion(flat_scores, flat_targets).item())

    return sum(batch_losses) / len(data_loader)

In [None]:
n_epochs = 1
clip = 1.0
teacher_forcing_ratio = 0.5

# Lowest validation loss seen so far; any first result beats infinity.
best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model=model,
        data_loader=train_data_loader,
        optimizer=optimizer,
        criterion=criterion,
        clip=clip,
        teacher_forcing_ratio=teacher_forcing_ratio,
        device=device,
    )
    valid_loss = evaluate_fn(
        model=model,
        data_loader=valid_data_loader,
        criterion=criterion,
        device=device,
    )

    # Checkpoint the weights whenever validation loss improves.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "model.pt")

    # Perplexity is the exponential of the mean cross-entropy loss.
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  0%|          | 0/1 [00:00<?, ?it/s]

Batch: 100/ 6353: Loss 4.959028408527375
Batch: 200/ 6353: Loss 4.238363156914711
Batch: 300/ 6353: Loss 3.973370914061864
Batch: 400/ 6353: Loss 3.795057913661003
Batch: 500/ 6353: Loss 3.6533816764354707
Batch: 600/ 6353: Loss 3.569396867553393
Batch: 700/ 6353: Loss 3.505464609009879
Batch: 800/ 6353: Loss 3.441346311867237
Batch: 900/ 6353: Loss 3.3855546351273853
Batch: 1000/ 6353: Loss 3.3366445498466493
Batch: 1100/ 6353: Loss 3.2971754184636204
Batch: 1200/ 6353: Loss 3.2604738440116248
Batch: 1300/ 6353: Loss 3.2346497413745294
Batch: 1400/ 6353: Loss 3.2014908987283706
Batch: 1500/ 6353: Loss 3.1769421736399335
Batch: 1600/ 6353: Loss 3.1546530497074126
Batch: 1700/ 6353: Loss 3.13155200418304
Batch: 1800/ 6353: Loss 3.111148479845789
Batch: 1900/ 6353: Loss 3.089326001405716
Batch: 2000/ 6353: Loss 3.067261937439442
Batch: 2100/ 6353: Loss 3.0527945419152576
Batch: 2200/ 6353: Loss 3.0386882524056866
Batch: 2300/ 6353: Loss 3.0216537302473316
Batch: 2400/ 6353: Loss 3.005790

100%|██████████| 1/1 [2:34:09<00:00, 9249.27s/it]

	Train Loss:   2.661 | Train PPL:  14.308
	Valid Loss:   5.039 | Valid PPL: 154.350





In [None]:
# Restore the checkpoint with the best validation loss.
# `weights_only=True` restricts unpickling to tensors and plain containers,
# which is all a state dict needs and avoids running arbitrary pickled code.
# `map_location` lets a checkpoint saved on GPU load on a CPU-only runtime.
best_state_dict = torch.load("model.pt", map_location=device, weights_only=True)
model.load_state_dict(best_state_dict)

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

  model.load_state_dict(torch.load("model.pt"))


| Test Loss: 5.049 | Test PPL: 155.870 |
