# Deep Learning

## Assignment 3

### Question 3
### Encoder Decoder with Attention

In this question of the assignment, we will be implementing a Encoder Decoder model with Attention. Let us begin by loading the data give to us

In [5]:
import torch
import torch.nn as nn
import numpy as np
import random
import torch.functional as F
import re
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
def separate_words_and_special_chars(input_string):
    # Define a regular expression pattern to match words and special characters
    pattern = r'([^\w\s])|(\s+)'
    
    # Use re.split() to separate words and special characters
    result = re.split(pattern, input_string)
    
    # Remove empty strings from the result list
    result = [item for item in result if item]

    return result

def separate_words_and_special_chars_1(input_string):
    # Define a regular expression pattern to match words and special characters
    pattern = r'([^\w\s])|(\s+)'
    
    # Use re.split() to separate words and special characters
    result = re.split(pattern, input_string)
    
    # Remove empty strings from the result list
    result = [item for item in result if item]
    result = ['<SRT>'] + result + ['<END>']

    return result

In [8]:
with open("/kaggle/input/deep-leanring3/train.sources", "r") as file:
    X_train = file.readlines()
    X_train = [separate_words_and_special_chars(string) for string in X_train]

with open("/kaggle/input/deep-leanring3/train.targets", "r") as file:
    Y_train = file.readlines()
    Y_train = [separate_words_and_special_chars_1(string) for string in Y_train]


with open("/kaggle/input/deep-leanring3/dev.sources", "r") as file:
    X_val = file.readlines()
    X_val = [separate_words_and_special_chars(string) for string in X_val]

with open("/kaggle/input/deep-leanring3/dev.targets", "r") as file:
    Y_val = file.readlines()
    Y_val = [separate_words_and_special_chars_1(string) for string in Y_val]

with open("/kaggle/input/deep-leanring3/test.sources", "r") as file:
    X_test = file.readlines()
    X_test = [separate_words_and_special_chars(string) for string in X_test]

with open("/kaggle/input/deep-leanring3/test.targets", "r") as file:
    Y_test = file.readlines()
    Y_test = [separate_words_and_special_chars_1(string) for string in Y_test]


In [21]:
input_vocab = {token: idx + 3 for idx, (token, _) in enumerate(Counter([item for sublist in X_train for item in sublist]).most_common())}
output_vocab = {token: idx + 1 for idx, (token, _) in enumerate(Counter([item for sublist in Y_train for item in sublist]).most_common())}

output_vocab['<PAD>'] = 0
input_vocab['<PAD>'] = 0
input_vocab['<SRT>'] = 1
input_vocab['<END>'] = 2

In [22]:
X_train_numerical = [[input_vocab[token] for token in X] for X in X_train]
Y_train_numerical = [[output_vocab[token] for token in X] for X in Y_train]

In [23]:
max_length = 500
X_padded = pad_sequence([torch.tensor(X_train_numerical[i][:max_length]) for i in range(len(X_train_numerical))], padding_value=0).T
Y_padded = pad_sequence([torch.tensor(Y_train_numerical[i][:max_length]) for i in range(len(Y_train_numerical))], padding_value=0).T

In [28]:
train_dataset = TensorDataset(X_padded[:128].to(device), Y_padded[:128].to(device))

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

Now Let us start implementing the model class

In [29]:
class Encoder(nn.Module):
    def __init__(self, input_vocab_size, emb_dim, enc_hid_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_vocab_size, emb_dim)
        self.encoder = nn.LSTM(emb_dim, enc_hid_dim, num_layers=num_layers, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src): # Shape of src is (sequence_length, batch_size)
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.encoder(embedded)
        return outputs, hidden, cell #Hidden shape is (sequence_length, batch_size, 2*hidden_size)

class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + (2*dec_hid_dim), dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs): #encoder outputs (sequence_length, batch_size, 2*hidden_size)
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden.permute(1, 0, 2), encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        return torch.softmax(attention, dim=1) #(src_len, batch_size)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, num_layers, dropout, attention):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM((enc_hid_dim * 2) + emb_dim, dec_hid_dim, num_layers=num_layers)
        self.output_dim = output_dim
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = attention

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(0) #(1, batch_size)
        embedded = self.dropout(self.embedding(input)) #(1, batch_size, emb_dim)
        a = self.attention(hidden.view(batch_size, -1), encoder_outputs)
        a = a.unsqueeze(1).permute(2, 1, 0) #(batch_size, 1, src_len)
        encoder_outputs = encoder_outputs.permute(1, 0, 2) #(batch_size, seq_len, 2*enc_hid_dim)
        weighted = torch.bmm(a, encoder_outputs)#(batch_size, 1, 2*enc_hid_dim)
        weighted = weighted.permute(1, 0, 2)#(1, batch_size, 2*enc_hid_dim)
        rnn_input = torch.cat((embedded, weighted), dim=2) #(1, batch_size, 2*hid_dim + emb_dim)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        embedded = embedded.squeeze(0) #(batch_size, emb_dim)
        output = output.squeeze(0) #(batch_size, dec_hid_dim)
        weighted = weighted.squeeze(0) #(batch_size, 2*enc_hid_dim)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1)) #(batch_size, output_dim)
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, beam_width):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.beam_width = beam_width

    def forward(self, src, trg, teacher_forcing=True):
        trg_len = trg.shape[0] - 1 # trg shape = (seq_len, batch_size)
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden, cell = self.encoder(src)
        hidden = hidden[:2]
        cell = cell[:2]
        if teacher_forcing:
            for t in range(0, trg_len):
                input = trg[t]
                output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
                outputs[t] = output

        else:
            beam_search = BeamSearch(self.decoder, hidden, cell, self.beam_width, trg_vocab_size)
            for t in range(0, trg_len):
                beam_search.step(hidden, cell, encoder_outputs)
            outputs = beam_search.get_best_sequence()
        return outputs

class BeamSearch(nn.Module):
    def __init__(self, decoder, initial_hidden, initial_cell, beam_width, vocab_size):
        self.decoder = decoder
        self.beam_width = beam_width
        self.vocab_size = vocab_size
        self.topk = 1 

        self.finished_sequences = []
        self.topk_sequences = [{"sequence": torch.zeros(initial_hidden.shape[1], 1), "score": 0.0,
                                "hidden": initial_hidden, "cell": initial_cell} for _ in range(beam_width)]

    def step(self, hidden, cell, encoder_outputs):
        candidates = []
        for sequence in self.topk_sequences:
            prev_output = sequence["sequence"][:, -1]
            output, hidden, cell = self.decoder(prev_output, sequence["hidden"], sequence["cell"], encoder_outputs)
            log_probs = F.log_softmax(output, dim=1)
            topk_probs, topk_indices = torch.topk(log_probs, self.beam_width, dim=1)

            for i in range(self.beam_width):
                candidate = {
                    "sequence": torch.cat((sequence["sequence"], topk_indices[:, i]), dim=1),
                    "score": sequence["score"] + topk_probs[:, i].item(),
                    "hidden": hidden,
                    "cell": cell
                }
                candidates.append(candidate)

        candidates.sort(key=lambda x: x["score"], reverse=True)
        self.topk_sequences = candidates[:self.beam_width]

        return self.topk_sequences[0]["sequence"][:, -1].unsqueeze(0), self.topk_sequences[0]["hidden"], self.topk_sequences[0]["cell"]

    def get_best_sequence(self):
        return self.topk_sequences[0]["sequence"]

Now, let us define the hyperparameters.

In [30]:
INPUT_DIM = len(input_vocab)
OUTPUT_DIM = len(output_vocab)
EMB_DIM = 512
ENC_HID_DIM = 512
DEC_HID_DIM = 512
NUM_LAYERS = 2
DROPOUT = 0.5
BEAM_WIDTH = 15
BATCH_SIZE = 32
NUM_LAYERS = 2
NUM_EPOCHS = 4

enc = Encoder(INPUT_DIM, EMB_DIM, ENC_HID_DIM, NUM_LAYERS, DROPOUT)
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
dec = Decoder(OUTPUT_DIM, EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, NUM_LAYERS, DROPOUT, attn)

model = Seq2Seq(enc, dec, device, BEAM_WIDTH).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0)

Now let us implement the training process and see how it does.

In [31]:
for epoch in range(NUM_EPOCHS):
    for i, batch in enumerate(train_dataloader):
        src, trg = batch
        src = src.T
        print(src.shape)
        trg = trg.T
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output.view(-1, output_dim)
        
        trg = trg[1:].reshape(-1)
        optimizer.zero_grad()
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()

        if (i+1)%2 == 0:
            print(f'epoch {epoch+1}/{NUM_EPOCHS}, step {i+1}/{len(train_dataloader)}, loss  {loss.item():.4f}')

torch.Size([242, 32])
torch.Size([242, 32])
epoch 1/4, step 2/4, loss  3.9854
torch.Size([242, 32])
torch.Size([242, 32])
epoch 1/4, step 4/4, loss  3.3517
torch.Size([242, 32])
torch.Size([242, 32])
epoch 2/4, step 2/4, loss  2.9959
torch.Size([242, 32])
torch.Size([242, 32])
epoch 2/4, step 4/4, loss  2.7250
torch.Size([242, 32])
torch.Size([242, 32])
epoch 3/4, step 2/4, loss  2.5456
torch.Size([242, 32])
torch.Size([242, 32])
epoch 3/4, step 4/4, loss  2.4135
torch.Size([242, 32])
torch.Size([242, 32])
epoch 4/4, step 2/4, loss  2.3549
torch.Size([242, 32])
torch.Size([242, 32])
epoch 4/4, step 4/4, loss  2.2913
