In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

from transformers import BertTokenizer
from torch.utils.data import DataLoader
from datasets import load_from_disk

### Step 2: Model Implementation

In [56]:
class Encoder(nn.Module):
    """LSTM encoder that condenses a source sequence into a final LSTM state.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Width of each token embedding.
        hid_dim: Width of the LSTM hidden and cell states.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used on the embeddings and passed to the
            LSTM for use between its stacked layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        # Kept as plain attributes so other modules can inspect the geometry.
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Embed ``src``, run it through the LSTM and return ``(hidden, cell)``.

        Args:
            src: Integer token ids. The LSTM is built with the default
                ``batch_first=False``, so a batched input is expected as
                ``(src_len, batch)``.

        Returns:
            The final hidden and cell states of the LSTM; the per-step outputs
            are discarded.
        """
        token_vectors = self.embedding(src)
        _, final_state = self.lstm(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell

In [57]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next token over the vocabulary.

    Args:
        output_dim: Size of the target vocabulary; also the width of the
            prediction returned by ``forward``.
        emb_dim: Width of each token embedding.
        hid_dim: Width of the LSTM hidden and cell states.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used on the embeddings and passed to the
            LSTM for use between its stacked layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Kept as plain attributes so other modules can inspect the geometry.
        self.output_dim, self.hid_dim, self.n_layers = output_dim, hid_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            input: Token ids for the current step, one per batch element
                (a leading length-1 time axis is added here).
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            ``(prediction, hidden, cell)``: un-normalised vocabulary scores for
            this step (time axis removed again) and the updated LSTM state.
        """
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        lstm_out, (hidden, cell) = self.lstm(step_vectors, (hidden, cell))
        prediction = self.out(lstm_out.squeeze(0))
        return prediction, hidden, cell

In [58]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one time step at a time.

    Args:
        encoder: Module whose forward pass returns ``(hidden, cell)`` and that
            exposes ``hid_dim`` and ``n_layers``.
        decoder: Module called as ``decoder(input, hidden, cell)`` that returns
            ``(prediction, hidden, cell)`` and exposes ``hid_dim``,
            ``n_layers`` and ``output_dim``.
        device: Device on which the output buffer is allocated.

    Raises:
        AssertionError: If encoder and decoder disagree on ``hid_dim`` or
            ``n_layers`` (the encoder state is fed straight into the decoder).
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[0] - 1`` steps and collect the per-step predictions.

        Args:
            src: Source batch, passed unchanged to the encoder.
            trg: Target token ids indexed as ``[time, batch]``; ``trg[0]`` is
                the first decoder input.
            teacher_forcing_ratio: Probability, drawn independently at every
                step, of feeding the ground-truth token ``trg[t]`` as the next
                input instead of the decoder's own highest-scoring token.

        Returns:
            Tensor of shape ``(trg_len, batch, decoder.output_dim)``. Index 0
            is never written and stays all zeros because decoding starts at
            step 1.
        """
        max_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate on the target device directly rather than creating the
        # buffer on the CPU and copying it across on every forward pass.
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size, device=self.device)

        hidden, cell = self.encoder(src)

        decoder_input = trg[0, :]

        for t in range(1, max_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = output
            # Exactly one random draw per step, whatever the outcome.
            teacher_force = random.random() < teacher_forcing_ratio
            # The greedy prediction is only computed when it is actually used.
            decoder_input = trg[t] if teacher_force else output.argmax(1)
        return outputs

### Step 3: Model Initialization

In [59]:
# The model is sized from the tokenizer's vocabulary, so the tokenizer must
# exist before this cell runs. It is created here so the cell also works after
# Restart Kernel -> Run All instead of relying on a tokenizer left over in the
# kernel; the data-preparation section further down builds an identical one.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the dimensions
# Source and target share one vocabulary, so its size is computed only once.
vocab_size = len(tokenizer.get_vocab())
input_dim = vocab_size
output_dim = vocab_size
emb_dim = 256
hid_dim = 512
n_layers = 2
dropout = 0.5

# Initialize the encoder and decoder
enc = Encoder(input_dim, emb_dim, hid_dim, n_layers, dropout)
dec = Decoder(output_dim, emb_dim, hid_dim, n_layers, dropout)

# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the seq2seq model and move its parameters to the chosen device
model = Seq2Seq(enc, dec, device).to(device)

### Step 1: Data Preparation

In [60]:
# Directory containing the stored dataset; the path is relative to the
# notebook's working directory.
folder_path = '../../data/seq2seq'
# Load the dataset from disk (the next cell reads its 'train', 'test' and 'dev' splits)
seq2seq_dataset = load_from_disk(folder_path)

In [61]:
# Pull the three splits out of the loaded dataset in one go; note that the
# validation split is stored under the key 'dev'.
train_dataset, test_dataset, val_dataset = (
    seq2seq_dataset[split_name] for split_name in ('train', 'test', 'dev')
)

In [62]:
# Initialize the tokenizer
# Loaded from the 'bert-base-uncased' checkpoint. The same tokenizer encodes
# both the source and the target text, and the notebook also takes the model's
# vocabulary size and the padding id from it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [23]:
def _tokenize_pair(example):
    """Tokenize the ``source`` and ``target`` text of one dataset example.

    Both fields are padded to the tokenizer's maximum length. ``truncation=True``
    additionally caps longer texts at that length, so every example ends up
    with the same number of token ids and batches can be collated.

    Args:
        example: One dataset row with ``'source'`` and ``'target'`` text fields.

    Returns:
        Dict with the same two keys, each holding the tokenizer output
        (including ``input_ids``) for that field.
    """
    return {
        field: tokenizer(example[field], padding='max_length', truncation=True)
        for field in ('source', 'target')
    }


# Tokenize the source and target sentences (one shared helper for all splits)
train_set = train_dataset.map(_tokenize_pair)
test_set = test_dataset.map(_tokenize_pair)
val_set = val_dataset.map(_tokenize_pair)

# Convert the tokenized sentences into tensors and organize them into batches.
# Only the training data is shuffled, so each epoch sees the examples in a new
# order; evaluation order is kept fixed.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
test_loader = DataLoader(test_set, batch_size=32)
val_loader = DataLoader(val_set, batch_size=32)

Map:   0%|          | 0/51961 [00:00<?, ? examples/s]

Map:   0%|          | 0/963 [00:00<?, ? examples/s]

Map:   0%|          | 0/672 [00:00<?, ? examples/s]

### Step 4: Define the Loss Function and Optimizer

In [63]:
# Loss: cross-entropy over the vocabulary. Targets equal to the tokenizer's
# padding id are ignored so padded positions do not contribute to the loss.
PAD_IDX = tokenizer.pad_token_id
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Optimizer: Adam with its default hyper-parameters over all model parameters.
optimizer = optim.Adam(model.parameters())

### Step 5: Train the Model

In [52]:
def initHiddenCell(self, batch_size):
    """Build an all-zero ``(hidden, cell)`` state pair.

    Args:
        self: Object providing ``num_layers``, ``hidden_size`` and ``device``.
        batch_size: Size of the middle (batch) dimension of each state tensor.

    Returns:
        Tuple of two separate zero tensors, each shaped
        ``(num_layers, batch_size, hidden_size)`` and placed on ``self.device``.

    NOTE(review): this is a module-level function that takes ``self``, and no
    visible cell calls it. The Encoder/Decoder classes in this notebook expose
    ``n_layers`` / ``hid_dim`` rather than ``num_layers`` / ``hidden_size``, so
    it looks like a leftover; confirm before relying on it.
    """
    state_shape = (self.num_layers, batch_size, self.hidden_size)
    hidden, cell = (torch.zeros(state_shape).to(self.device) for _ in range(2))
    return hidden, cell

In [64]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Module called as ``model(src, trg)`` that returns scores indexed
            ``[time, batch, vocab]``. It must own at least one parameter,
            because the batch tensors are moved to that parameter's device.
        iterator: Sized iterable of batches. Each batch is indexed as
            ``batch['source']['input_ids']`` / ``batch['target']['input_ids']``
            and must hold a sequence of equally-shaped tensors. This is assumed
            to be one tensor per token position (as produced by default
            DataLoader collation), so stacking gives ``[time, batch]``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(scores, targets)`` on the
            flattened scores and targets.
        clip: Maximum total gradient norm applied before each optimizer step.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    # Use the device the model actually lives on instead of a notebook-level
    # global, so the function does not depend on hidden kernel state.
    model_device = next(model.parameters()).device
    epoch_loss = 0
    for batch in iterator:
        src = torch.stack(batch['source']['input_ids']).to(model_device)
        trg = torch.stack(batch['target']['input_ids']).to(model_device)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        # Drop time step 0 (the first target token is only ever an input, and
        # output[0] is never predicted), then flatten time and batch together.
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

### Step 6: Evaluate the Model

In [65]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch without updating the model.

    The model is put in eval mode, gradients are disabled and teacher forcing
    is switched off by passing ``0`` as the third argument of ``model``.

    Args:
        model: Module called as ``model(src, trg, 0)`` that returns scores
            indexed ``[time, batch, vocab]``. It must own at least one
            parameter, because the batch tensors are moved to that parameter's
            device.
        iterator: Sized iterable of batches. Each batch is indexed as
            ``batch['source']['input_ids']`` / ``batch['target']['input_ids']``
            and must hold a sequence of equally-shaped tensors. This is assumed
            to be one tensor per token position (as produced by default
            DataLoader collation), so stacking gives ``[time, batch]``.
        criterion: Loss called as ``criterion(scores, targets)`` on the
            flattened scores and targets.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    # Use the device the model actually lives on instead of a notebook-level global.
    model_device = next(model.parameters()).device
    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            # Batches are nested dicts, not objects with .source/.target
            # attributes; build the tensors the same way the training step does.
            src = torch.stack(batch['source']['input_ids']).to(model_device)
            trg = torch.stack(batch['target']['input_ids']).to(model_device)
            output = model(src, trg, 0)  # turn off teacher forcing
            output_dim = output.shape[-1]
            # Drop time step 0 and flatten time and batch together.
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

### Step 7: Running the Training and Evaluation Loop

In [66]:
import os

N_EPOCHS = 10
CLIP = 1  # maximum gradient norm handed to train()
SAVE_DIR = '../../models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'baseline_model.pt')

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first save is attempted.
os.makedirs(SAVE_DIR, exist_ok=True)

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_loader, criterion)

    # Keep only the weights of the epoch with the lowest validation loss so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')