# Machine Translation using Attention

In [10]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import spacy
import numpy as np
import pandas as pd
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import random

In [11]:
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 29.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
[K     |████████████████████████████████| 16.3 MB 34.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [12]:
# Load the spaCy pipelines once at module level; their tokenizers are used by
# EngVocabulary.tokenizer and FrVocabulary.tokenizer respectively.
spacy_en = spacy.load('en_core_web_sm')
spacy_fr = spacy.load('fr_core_news_sm')


class Vocabulary:
    """Bidirectional token <-> index mapping with four reserved tokens.

    Indices 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and
    ``<UNK>``. Subclasses override :meth:`tokenizer` to provide the actual
    tokenization; the base implementation produces no tokens.
    """

    def __init__(self, frequency_threshold):
        """Create a vocabulary that contains only the special tokens.

        Args:
            frequency_threshold: Number of occurrences a token needs in the
                corpus given to :meth:`build_vocabulary` before it receives
                its own index.
        """
        self.itos = {
            0: '<PAD>',
            1: '<SOS>',
            2: '<EOS>',
            3: '<UNK>'
        }

        self.stoi = {
            '<PAD>': 0,
            '<SOS>': 1,
            '<EOS>': 2,
            '<UNK>': 3
        }

        self.frequency_threshold = frequency_threshold

    def __len__(self):
        """Return the number of known tokens, special tokens included."""
        return len(self.itos)

    def __getitem__(self, value):
        """Look up a token by index (``int``) or an index by token.

        Raises:
            KeyError: If ``value`` is not present in the vocabulary.
        """
        if isinstance(value, int):
            return self.itos[value]
        return self.stoi[value]

    @staticmethod
    def tokenizer(text):
        """Split ``text`` into tokens; the base class yields none."""
        return []

    def build_vocabulary(self, sentence_list):
        """Add every sufficiently frequent token of the corpus.

        A token is assigned the next free index the first time its count in
        ``sentence_list`` reaches ``frequency_threshold``. Tokens that are
        already known (including the reserved ones) keep their index, so the
        method may be called repeatedly without corrupting the mapping.

        Args:
            sentence_list: Iterable of raw sentences (strings).
        """
        frequencies = {}
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1
                if word in self.stoi:
                    continue
                if frequencies[word] < self.frequency_threshold:
                    continue
                # Indices are contiguous, so the next free one is the size.
                idx = len(self.itos)
                self.itos[idx] = word
                self.stoi[word] = idx

    def numericalize(self, text):
        """Convert ``text`` to a list of indices, unknown tokens -> ``<UNK>``."""
        unknown_index = self.stoi['<UNK>']
        return [
            self.stoi.get(token, unknown_index)
            for token in self.tokenizer(text)
        ]

    def un_numericalize(self, encoding):
        """Convert indices back into a space-separated string.

        Args:
            encoding: Iterable of indices. Elements may be plain ints or
                single-element tensors; each one is converted with ``int``.

        Returns:
            The tokens joined by single spaces; indices that are not in the
            vocabulary are rendered as ``<UNK>``.
        """
        unknown_token = self.itos[3]
        return " ".join(
            self.itos.get(int(token), unknown_token) for token in encoding
        )


class EngVocabulary(Vocabulary):
    """Vocabulary that tokenizes with the English spaCy pipeline."""

    def __init__(self, frequency_threshold):
        super().__init__(frequency_threshold)

    @staticmethod
    def tokenizer(text):
        """Tokenize ``text`` with ``spacy_en`` and lower-case each token."""
        lowered_tokens = []
        for token in spacy_en.tokenizer(text):
            lowered_tokens.append(token.text.lower())
        return lowered_tokens


class FrVocabulary(Vocabulary):
    """Vocabulary that tokenizes with the French spaCy pipeline."""

    def __init__(self, frequency_threshold):
        super().__init__(frequency_threshold)

    @staticmethod
    def tokenizer(text):
        """Tokenize ``text`` with ``spacy_fr`` and lower-case each token."""
        lowered_tokens = []
        for token in spacy_fr.tokenizer(text):
            lowered_tokens.append(token.text.lower())
        return lowered_tokens

In [13]:
class CustomDataset(Dataset):
    """Parallel English/French sentence dataset backed by two text files.

    ``root_dir`` must contain ``english.txt`` and ``french.txt`` with one
    sentence per line; line ``i`` of one file is paired with line ``i`` of
    the other. Each item is returned as ``(french, english)`` index tensors
    wrapped in ``<SOS>`` ... ``<EOS>``.
    """

    def __init__(self, root_dir, frequency_threshold_en=2, frequency_threshold_fr=1, vocab=None):
        """Read both corpora and build (or reuse) the vocabularies.

        Args:
            root_dir: Directory holding ``english.txt`` and ``french.txt``.
            frequency_threshold_en: Threshold for a newly built English
                vocabulary.
            frequency_threshold_fr: Threshold for a newly built French
                vocabulary.
            vocab: Optional ``(english_vocab, french_vocab)`` pair to reuse
                instead of building vocabularies from this dataset's text.
        """
        super().__init__()
        self.root_dir = root_dir
        self.english = self._read_lines(os.path.join(root_dir, "english.txt"))
        self.french = self._read_lines(os.path.join(root_dir, "french.txt"))

        if vocab is None:
            self.vocab_en = EngVocabulary(frequency_threshold_en)
            self.vocab_fr = FrVocabulary(frequency_threshold_fr)
            self.vocab_en.build_vocabulary(self.english)
            self.vocab_fr.build_vocabulary(self.french)
        else:
            self.vocab_en = vocab[0]
            self.vocab_fr = vocab[1]

    @staticmethod
    def _read_lines(path):
        """Return the lines of a UTF-8 text file, closing the file afterwards.

        Only the empty string produced by a trailing newline is dropped, so
        a final sentence without a newline terminator is kept.
        """
        with open(path, encoding="utf-8") as handle:
            lines = handle.read().split("\n")
        if lines and lines[-1] == "":
            lines.pop()
        return lines

    @staticmethod
    def _encode(vocab, sentence):
        """Numericalize ``sentence`` and wrap it in ``<SOS>`` / ``<EOS>``."""
        indices = [vocab.stoi['<SOS>']]
        indices += vocab.numericalize(sentence)
        indices.append(vocab.stoi['<EOS>'])
        return torch.tensor(indices)

    def __len__(self):
        """Return the number of English sentences."""
        return len(self.english)

    def __getitem__(self, index):
        """Return the ``(french, english)`` index tensors for pair ``index``."""
        numericalized_en = self._encode(self.vocab_en, self.english[index])
        numericalized_fr = self._encode(self.vocab_fr, self.french[index])
        return numericalized_fr, numericalized_en


class MyCollate:
    """Collate function that pads a batch of (french, english) tensor pairs."""

    def __init__(self, pad_idx_fr, pad_idx_en):
        """Store the padding index to use for each language."""
        self.pad_idx_fr, self.pad_idx_en = pad_idx_fr, pad_idx_en

    def __call__(self, batch):
        """Pad the batch and report the original French lengths.

        Returns:
            ``((padded_french, lengths), padded_english)`` where ``lengths``
            holds the unpadded length of every French sequence.
        """
        french_seqs = []
        english_seqs = []
        for pair in batch:
            french_seqs.append(pair[0])
            english_seqs.append(pair[1])

        # Lengths must be taken before padding; the encoder packs with them.
        lengths = torch.tensor([seq.shape[0] for seq in french_seqs])
        padded_fr = pad_sequence(french_seqs, padding_value=self.pad_idx_fr)
        padded_en = pad_sequence(english_seqs, padding_value=self.pad_idx_en)
        return (padded_fr, lengths), padded_en


def get_loader(root_dir, batch_size, shuffle, vocab=None):
    """Build a ``CustomDataset`` for ``root_dir`` and a ``DataLoader`` over it.

    The loader pads each batch with the ``<PAD>`` index of the dataset's own
    French and English vocabularies.

    Returns:
        ``(dataset, loader)``.
    """
    dataset = CustomDataset(root_dir, vocab=vocab)
    collate = MyCollate(
        dataset.vocab_fr.stoi['<PAD>'],
        dataset.vocab_en.stoi['<PAD>'],
    )
    return dataset, DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate,
    )

In [14]:
!wget https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/019/853/original/data.zip

import zipfile
# Unpack data.zip into the current working directory.
# NOTE(review): the wget log in this cell's output shows the download saved as
# 'data.zip.1', so this extracts whatever 'data.zip' already existed — confirm
# that is the intended archive.
with zipfile.ZipFile("data.zip", 'r') as zip_ref:
    zip_ref.extractall()

--2022-11-17 11:23:57--  https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/019/853/original/data.zip
Resolving d2beiqkhq929f0.cloudfront.net (d2beiqkhq929f0.cloudfront.net)... 18.164.115.154, 18.164.115.106, 18.164.115.84, ...
Connecting to d2beiqkhq929f0.cloudfront.net (d2beiqkhq929f0.cloudfront.net)|18.164.115.154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1269302 (1.2M) [application/zip]
Saving to: ‘data.zip.1’


2022-11-17 11:23:57 (29.0 MB/s) - ‘data.zip.1’ saved [1269302/1269302]



In [73]:
class Encoder(nn.Module):
    """LSTM encoder that also projects its final state for the decoder.

    The final hidden and cell states of all directions of each layer are
    concatenated and projected back to ``hidden_size`` so that they can
    initialise a unidirectional decoder LSTM.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, bidirectional=True):
        """
        Args:
            input_size: Size of the source vocabulary.
            embedding_size: Dimension of the token embeddings.
            hidden_size: Hidden size of the LSTM (per direction).
            num_layers: Number of stacked LSTM layers.
            p: Dropout probability applied to the embeddings.
            bidirectional: Whether the LSTM runs in both directions.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        # Dropout has no parameters, so the state_dict layout is unchanged.
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=bidirectional)
        self.fc_hidden = nn.Linear(hidden_size * self.num_directions, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * self.num_directions, hidden_size)

    def _merge_directions(self, state):
        """Concatenate the per-direction final states of every layer.

        ``state`` is laid out as (num_layers * num_directions, N, hidden_size)
        with the directions of one layer adjacent; the result has shape
        (num_layers, N, num_directions * hidden_size).
        """
        batch_size = state.shape[1]
        per_layer = state.reshape(
            self.num_layers, self.num_directions, batch_size, self.hidden_size)
        return per_layer.permute(0, 2, 1, 3).reshape(
            self.num_layers, batch_size, self.num_directions * self.hidden_size)

    def forward(self, source):
        """Encode a padded batch.

        Args:
            source: ``(x, lengths)`` where ``x`` holds token indices with
                shape (seq_len, N) and ``lengths`` the unpadded length of
                each of the N sequences.

        Returns:
            ``(encoder_states, hidden, cell)``: ``encoder_states`` contains
            the LSTM output for every time step, while ``hidden`` and
            ``cell`` are the projected final states with shape
            (num_layers, N, hidden_size).
        """
        x, lengths = source
        embedding = self.dropout(self.embedding(x))
        # pack_padded_sequence requires the lengths on the CPU.
        packed_embeds = pack_padded_sequence(embedding, lengths.to('cpu'), enforce_sorted=False)

        encoder_packed_states, (hidden, cell) = self.lstm(packed_embeds)
        encoder_states, _ = pad_packed_sequence(encoder_packed_states)

        hidden = self.fc_hidden(self._merge_directions(hidden))
        cell = self.fc_cell(self._merge_directions(cell))

        return encoder_states, hidden, cell


class Decoder(nn.Module):
    """Single-step LSTM decoder with additive attention over encoder states.

    Expects encoder states whose feature size is ``hidden_size * 2`` (the
    attention layer takes ``hidden_size * 3`` inputs: the decoder hidden
    state concatenated with one encoder state).
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        """
        Args:
            input_size: Size of the target vocabulary (embedding rows).
            embedding_size: Dimension of the token embeddings.
            hidden_size: Hidden size of the decoder LSTM.
            output_size: Number of output classes (logits per step).
            num_layers: Number of stacked LSTM layers.
            p: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(hidden_size*2 + embedding_size, hidden_size, num_layers)

        self.energy = nn.Linear(hidden_size*3, 1)  # small NN to compute attention scores (alpha)
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, encoder_states, hidden, cell):
        """Decode one time step.

        Args:
            x: Token indices of the previous step, shape (N,).
            encoder_states: Encoder outputs, shape (seq_len, N, hidden_size*2).
            hidden: Decoder hidden state, shape (num_layers, N, hidden_size).
            cell: Decoder cell state, same shape as ``hidden``.

        Returns:
            ``(predictions, hidden, cell)`` where ``predictions`` has shape
            (N, output_size) and the states are the updated LSTM states.
        """
        # x holds a single token per sentence, so add the time dimension:
        # (N,) -> (1, N)
        x = x.unsqueeze(0)
        # (1, N, embedding_size); dropout is a no-op in eval mode.
        embedding = self.dropout(self.embedding(x))

        sequence_length = encoder_states.shape[0]
        # Use the top layer's hidden state as the attention query so that the
        # shapes line up for any num_layers: (seq_len, N, hidden_size)
        h_reshaped = hidden[-1:].repeat(sequence_length, 1, 1)

        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
        # Normalise over the source positions (dim=0): (seq_len, N, 1)
        # NOTE: padded source positions are not masked out here.
        attention = self.softmax(energy)
        # (seq_len, N, 1) -> (N, 1, seq_len)
        attention = attention.permute(1, 2, 0)
        # (seq_len, N, hidden_size*2) -> (N, seq_len, hidden_size*2)
        encoder_states = encoder_states.permute(1, 0, 2)

        # (N, 1, hidden_size*2) -> (1, N, hidden_size*2)
        context_vector = torch.bmm(attention, encoder_states).permute(1, 0, 2)

        lstm_input = torch.cat((context_vector, embedding), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        # (1, N, output_size) -> (N, output_size)
        predictions = self.fc(output).squeeze(0)
        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper for training and greedy decoding."""

    def __init__(self, encoder,  decoder, eng_vocab_size, device):
        """
        Args:
            encoder: Module mapping ``(tokens, lengths)`` to
                ``(encoder_states, hidden, cell)``.
            decoder: Module mapping ``(x, encoder_states, hidden, cell)`` to
                ``(logits, hidden, cell)``.
            eng_vocab_size: Number of target classes (size of the logits).
            device: Device on which output tensors are allocated.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.eng_vocab_size = eng_vocab_size
        self.device = device

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Run the decoder over the whole target sequence.

        Args:
            source: ``(tokens, lengths)`` as expected by the encoder, with
                ``tokens`` of shape (src_len, N).
            target: Target token indices, shape (target_len, N).
            teacher_force_ratio: Probability, per time step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Logits of shape (target_len, N, eng_vocab_size). Row 0 is left at
            zero because nothing is predicted for the ``<SOS>`` position.
        """
        batch_size = source[0].shape[1]
        target_len = target.shape[0]
        outputs = torch.zeros(
            (target_len, batch_size, self.eng_vocab_size), device=self.device)

        encoder_states, hidden, cell = self.encoder(source)

        # First decoder input: the <SOS> token of every sentence, shape (N,)
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)  # index of the highest logit
            x = target[t] if random.random() < teacher_force_ratio else best_guess
        return outputs

    def predict(self, source, max_len=100, sos_idx=1, eos_idx=2):
        """Greedily decode a single source sentence.

        Call ``model.eval()`` beforehand if the sub-modules use dropout;
        this method does not change the training mode.

        Args:
            source: Either ``(tokens, lengths)`` as expected by the encoder,
                or just the token tensor of one sentence with shape
                (src_len,) or (src_len, 1).
            max_len: Upper bound on the number of generated positions,
                counting the leading ``<SOS>``.
            sos_idx: Index of the start-of-sentence token.
            eos_idx: Index of the end-of-sentence token.

        Returns:
            1-D tensor of token indices that starts with ``sos_idx`` and
            ends with ``eos_idx`` (appended if the model never produced it).
        """
        if isinstance(source, (tuple, list)):
            tokens, lengths = source
        else:
            tokens, lengths = source, None
        if tokens.dim() == 1:
            tokens = tokens.unsqueeze(1)  # add the batch dimension
        if lengths is None:
            lengths = torch.tensor([tokens.shape[0]])
        tokens = tokens.to(self.device)

        result = [sos_idx]
        with torch.no_grad():
            encoder_states, hidden, cell = self.encoder((tokens, lengths))
            x = torch.tensor([sos_idx], device=self.device)
            for _ in range(1, max_len):
                output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
                # Feed the prediction back in as the next decoder input.
                x = output.argmax(1)
                token = x.item()
                result.append(token)
                if token == eos_idx:
                    return torch.tensor(result, device=self.device)
        result.append(eos_idx)

        return torch.tensor(result, device=self.device)

In [48]:
# Build the training set first: its vocabularies are created from the training
# text and then reused for the validation set.
train_set, train_loader = get_loader("data/train", batch_size=128, shuffle=True)
# vocab is passed as [English, French], the order CustomDataset unpacks it in.
val_set, val_loader = get_loader("data/val", batch_size=128, shuffle=True,
                                  vocab=[train_set.vocab_en, train_set.vocab_fr])

In [37]:
# Training hyperparameters.
num_epochs = 1000
learning_rate = 0.001
# NOTE(review): the get_loader calls in this file pass 128 literally instead of
# this variable, so changing it here has no effect on the loaders.
batch_size = 128

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The model translates French (encoder input) into English (decoder output),
# so the encoder is sized by the French vocabulary and the decoder by the
# English one.
input_size_encoder = len(train_set.vocab_fr)
input_size_decoder = len(train_set.vocab_en)
output_size = len(train_set.vocab_en)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
encoder_dropout = 0.5
decoder_dropout = 0.5
# Index of the English padding token; used as ignore_index of the loss.
pad_idx = train_set.vocab_en.stoi["<PAD>"]

In [74]:
# Instantiate the bidirectional encoder and the attention decoder on `device`.
encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, encoder_dropout, True).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size,
                      hidden_size, output_size, num_layers, decoder_dropout).to(device)

# Seq2Seq needs the English vocabulary size to allocate its output logits.
model = Seq2Seq(encoder_net, decoder_net, len(train_set.vocab_en), device).to(device)

In [31]:
# Padded target positions are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [35]:
# Pull one batch to inspect its shapes: f = padded French tokens,
# l = unpadded French lengths, e = padded English tokens.
(f,l),e = next(iter(train_loader))
# The lengths tensor `l` is not moved; Encoder.forward converts it to CPU itself.
f,e = f.to(device), e.to(device)
f.shape, e.shape

(torch.Size([35, 128]), torch.Size([28, 128]))

In [76]:
# Smoke test: one forward pass on the sample batch; the result is
# (target_len, batch_size, English vocabulary size).
op = model((f,l),e)
op.shape

torch.Size([28, 128, 5893])

In [None]:
# Summed validation loss of the previous epoch, shown in the training progress bar.
total_val_loss = 0
for epoch in range(1, num_epochs+1):
    # ---- training ----
    train_loop = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
    train_loss = 0
    model.train()
    for batch_idx, ((french, lengths), english) in train_loop:
        french = french.to(device)
        english = english.to(device)
        output = model((french, lengths), english)
        # Drop the <SOS> position and flatten (time, batch) for the loss.
        output = output[1:].reshape(-1, output.shape[2])
        english = english[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, english)
        loss.backward()
        # Clip gradients to keep the LSTM updates stable.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        batch_loss = loss.item()
        train_loss += batch_loss
        train_loop.set_description(f"Epoch: {epoch}/{num_epochs}")
        train_loop.set_postfix({"batch_loss": batch_loss, "train_loss":train_loss, "val_loss": total_val_loss})

    # Checkpoint once per qualifying epoch, after all of its batches,
    # rather than rewriting the file on every batch of that epoch.
    if epoch % 100 == 0:
        torch.save(model.state_dict(), "checkpoint.pt")
        print("model saving done")

    # ---- validation ----
    model.eval()
    with torch.inference_mode():
        val_loss = 0
        val_loop = tqdm(val_loader, total=len(val_loader), leave=False)
        for (french, lengths), english in val_loop:
            french = french.to(device)
            english = english.to(device)
            # Disable teacher forcing so the validation loss is deterministic
            # and reflects the model's own predictions.
            output = model((french, lengths), english, teacher_force_ratio=0)
            output = output[1:].reshape(-1, output.shape[2])
            english = english[1:].reshape(-1)
            loss = criterion(output, english).item()
            val_loss += loss

            val_loop.set_description("Validating")
            val_loop.set_postfix({'val loss': loss})
        total_val_loss = val_loss

Epoch: 1/1000:  29%|██▊       | 65/227 [00:24<01:03,  2.57it/s, batch_loss=8.68, train_loss=564, val_loss=0]