In [4]:
%load_ext autoreload
%autoreload 2

In [96]:
# Device-independent code
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

In [260]:
# Common Variables
MAX_LENGTH = 15
MIN_LENGTH = 1

# Load Dataset

In [261]:
import pickle
with open('sentencespairs.pkl', 'rb') as f:
    pairs = pickle.load(f)

# Preprocess

In [262]:
len(pairs[0][0].split())

13

In [263]:
# filter
pairs = [pair for pair in pairs if (len(pair[0].split()) < MAX_LENGTH and len(pair[1].split(' ')) < MAX_LENGTH)]
pairs = [pair for pair in pairs if (len(pair[0].split()) > MIN_LENGTH and len(pair[1].split(' ')) > MIN_LENGTH)]
print(len(pairs))

746


In [264]:
pairs

[["Why don't you just cook the breakfast, and try not to burn anything.",
  'Yes, Aunt Petunia.'],
 ['Hurry up! Bring my coffee, boy!', 'Yes, Uncle Vernon.'],
 ['Caveat Smeltona. Proudest moment of my life.',
  'Will I have to wear that, too?'],
 ["Dad! Look! Harry's got a letter!", "Hey, give it back! It's mine!"],
 ['Give me that! Give me that letter!', 'Get off! Ahh!'],
 ["Daddy's gone mad, hasn't he?!", 'Make a wish, Harry.'],
 ["I-I-I'm not Harry.", 'I-I am.'],
 ["It's not every day that your young man turns eleven, now is it?",
  'Excuse me, who are you?'],
 ["You're a wizard, Harry.", "I-I'm a what?"],
 ['Essential bit of equipment, Harry.',
  'One pair of dragon-hide gloves.  Hagrid, do they mean for a real dragon?'],
 ['Vault 713.', "What's in there, Hagrid?"],
 ['Best not mention this to anyone, Harry.', 'I still need...a wand.'],
 ["Right on your left, ma'am.",
  'Excuse me, sir. Can you tell me where I might find Platform 9&¾?'],
 ["I'm only joking. I am Fred.", 'Excuse me!

## Check for multiple sentences

In [265]:
from nltk.tokenize import sent_tokenize
print(pairs[0][0])
print(sent_tokenize(pairs[0][0]))

Why don't you just cook the breakfast, and try not to burn anything.
["Why don't you just cook the breakfast, and try not to burn anything."]


In [266]:
import re

def preprocess(text):
    # remove punctuation and numbers with a regular expression
    text = re.sub(r'[.?!,:;\'()\-\n\d]','', text.lower().strip())
    return text

In [267]:
# tag input sequence with START and END for decoder
def tag_sequence(text):
    bos = "<START> "
    eos = " <END>"
    tagged_text = bos + text + eos
    return tagged_text

In [268]:
for index, pair in enumerate(pairs):
    # preprocess
    pair[0] = preprocess(pair[0])
    pair[1] = preprocess(pair[1])
    # tag sequences
    #pair[0] = tag_sequence(pair[0])


In [327]:
pairs[50:55]

[['i shouldnt said that i should not have said that',
  'nicholas flamel  whos nicholas flamel'],
 ['merry christmas merry christmas ring the hogwart bell merry christmas merry christmas',
  'knight to e'],
 ['oh my mum made it  it looks like youve got one too', 'iive got presents'],
 ['what is it', 'its some kind of cloak'],
 ['well lets see then put it on whoa', 'my bodys gone']]

## Create Vocab

- Add each unique word to vocab set

In [270]:
from nltk import word_tokenize

def create_vocab(doc):
    vocab = {}

    for pair in doc:
        sent1 = pair[0]
        sent2 = pair[1]

        for t in sent1.split():
            if t in vocab:
                vocab[t] += 1
            else:
                vocab[t] = 1
        
        for t in sent2.split():
            if t in vocab:
                vocab[t] += 1
            else:
                vocab[t] = 1
        
    return vocab

In [271]:
vocab = create_vocab(pairs)

In [272]:
PAD_TOKEN = "<PAD>"
END_TOKEN = "<EOS>"
START_TOKEN = "<SOS>"

if PAD_TOKEN not in vocab:
    vocab[PAD_TOKEN] = 1
if START_TOKEN not in vocab:
    vocab[START_TOKEN] = 1
if END_TOKEN not in vocab:
    vocab[END_TOKEN] = 1

In [273]:
word2idx = {}
idx2word = {}

for i, word in enumerate(list(vocab.keys())):
    word2idx[word] = i
    idx2word[i] = word

In [274]:
print(pairs[:5])

[['why dont you just cook the breakfast and try not to burn anything', 'yes aunt petunia'], ['hurry up bring my coffee boy', 'yes uncle vernon'], ['caveat smeltona proudest moment of my life', 'will i have to wear that too'], ['dad look harrys got a letter', 'hey give it back its mine'], ['give me that give me that letter', 'get off ahh']]


In [275]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

In [276]:
# Encoding: convert words -> numbers using word2idx
converted_pairs = []

for pair in pairs:
    sent1 = pair[0]
    sent2 = pair[1]
    sent1_converted = []
    sent2_converted = []

    for word in sent1.split(' '):
        if len(word) > 0:
            sent1_converted.append(word2idx[word])
    for word in sent2.split(' '):
        if len(word) > 0:
            sent2_converted.append(word2idx[word])

    converted_pairs.append([sent1_converted, sent2_converted])

print(converted_pairs[0:5])

[[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [13, 14, 15]], [[16, 17, 18, 19, 20, 21], [13, 22, 23]], [[24, 25, 26, 27, 28, 19, 29], [30, 31, 32, 10, 33, 34, 35]], [[36, 37, 38, 39, 40, 41], [42, 43, 44, 45, 46, 47]], [[43, 48, 34, 43, 48, 34, 41], [49, 50, 51]]]


In [277]:
class TranslationDataset(Dataset):
    def __init__(self, pairs, word2idx):
        self.pairs = pairs
        self.word2idx = word2idx

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        eng, ita = self.pairs[idx]
        phrase = torch.tensor([self.word2idx[word] for word in eng.split() if len(word) > 1]
                                  + [self.word2idx[END_TOKEN]], dtype=torch.long)
        response = torch.tensor([self.word2idx[word] for word in ita.split() if len(word) > 1]
                                  + [self.word2idx[END_TOKEN]], dtype=torch.long)
        return phrase, response

# Custom collate function to handle padding
def collate_fn(batch):
    eng_batch, ita_batch = zip(*batch)
    eng_batch_padded = pad_sequence(eng_batch, batch_first=True, padding_value=word2idx[PAD_TOKEN])
    ita_batch_padded = pad_sequence(ita_batch, batch_first=True, padding_value=word2idx[PAD_TOKEN])
    return eng_batch_padded, ita_batch_padded

In [278]:
# Create the dataset and DataLoader
sequence_pair_dataset = TranslationDataset(pairs, word2idx)
batch_size = 16
sequence_pair_dataloader = DataLoader(sequence_pair_dataset, batch_size=batch_size,
                                    shuffle=True,  drop_last=True, collate_fn=collate_fn)

print("samples: ", len(sequence_pair_dataset))
print("batches: ", len(sequence_pair_dataloader))

samples:  746
batches:  46


In [279]:
# Example: iterating over the DataLoader
for encoder_input, decoder_output in sequence_pair_dataloader:
    print("Encoder batch:", encoder_input)
    print("Decoder batch:", decoder_output)
    break # remove this to iterate over the whole dataset

Encoder batch: tensor([[  59,  560,  105,  141,   19,  563,  560,  105, 1749, 1747, 1747, 1747,
         1747],
        [ 157,  720,  267, 1749, 1747, 1747, 1747, 1747, 1747, 1747, 1747, 1747,
         1747],
        [  94,  574, 1749, 1747, 1747, 1747, 1747, 1747, 1747, 1747, 1747, 1747,
         1747],
        [1387,  145,  408, 1749, 1747, 1747, 1747, 1747, 1747, 1747, 1747, 1747,
         1747],
        [ 111,  213,  273,  127, 1749, 1747, 1747, 1747, 1747, 1747, 1747, 1747,
         1747],
        [   9,  130,  244,  245,   74,  136,  246,  247,  248,  249,  250,   78,
         1749],
        [   2,  211,   32,  228,   44,   59, 1749, 1747, 1747, 1747, 1747, 1747,
         1747],
        [ 502,  213,  977,   19,  978,  979,  117,   90,  980, 1749, 1747, 1747,
         1747],
        [ 116,  129,  130,    5,  131,  129,  132, 1749, 1747, 1747, 1747, 1747,
         1747],
        [ 140,  203,    8, 1273, 1749, 1747, 1747, 1747, 1747, 1747, 1747, 1747,
         1747],
        [1020, 

# Seq2Seq Model using LSTM

In [280]:
import torch
import torch.nn as nn

In [281]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers,
                            batch_first=True)

    def forward(self, x):
        # Reversing the sequence of indices
        x = torch.flip(x, [1])
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

In [282]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x)
        out, (hidden, cell) = self.lstm(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

In [283]:
# Hyperparameters
vocab_size = len(vocab)
embed_size = 256
hidden_size = 512
num_layers = 1

# Initialize the models
encoder = Encoder(vocab_size, embed_size, hidden_size, num_layers).to(DEVICE)
decoder = Decoder(vocab_size, embed_size, hidden_size, num_layers).to(DEVICE)

# Train Model

In [288]:
len(sequence_pair_dataloader)

46

In [289]:
import torch.optim as optim
import torch.nn as nn
import random

# Loss Function (exclude padding)
loss_fn = nn.CrossEntropyLoss(ignore_index=word2idx[PAD_TOKEN])

# Optimizers
encoder_optimizer = optim.AdamW(encoder.parameters())
decoder_optimizer = optim.AdamW(decoder.parameters())

# Number of epochs
num_epochs = 10

# Training Loop
encoder.train()
decoder.train()

for epoch in range(num_epochs):
    print(f'epoch: {epoch}')
    for i, (input_tensor, target_tensor) in enumerate(sequence_pair_dataloader):
        print(f'batch: {i}')
        input_tensor, target_tensor = input_tensor.to(DEVICE), target_tensor.to(DEVICE)
        # Zero gradients of both optimizers
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        target_length = target_tensor.size(1)

        # Encoder
        _, encoder_hidden, encoder_cell = encoder(input_tensor)

        # Decoder
        decoder_input = torch.full((batch_size, 1), word2idx[START_TOKEN], dtype=torch.long).to(DEVICE)
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        # Randomly select a word index from the target sequence
        random_word_index = random.randint(0, target_length - 1)

        loss = 0

        for di in range(target_length):
            logits, decoder_hidden, decoder_cell  = decoder(decoder_input, decoder_hidden, decoder_cell)
            loss += loss_fn(logits, target_tensor[:,di])
            decoder_input = target_tensor[:, di].reshape(batch_size, 1)  # Teacher forcing

        # Backpropagation
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        if i % 100 == 0:  # Print loss every 10 batches
            print(f'Epoch {epoch}, Batch {i}, Loss: {loss.item() / target_length:.4f}')

epoch: 0
batch: 0
Epoch 0, Batch 0, Loss: 0.8843
batch: 1
batch: 2
batch: 3
batch: 4
batch: 5
batch: 6
batch: 7
batch: 8
batch: 9
batch: 10
batch: 11
batch: 12
batch: 13
batch: 14
batch: 15
batch: 16
batch: 17
batch: 18
batch: 19
batch: 20
batch: 21
batch: 22
batch: 23
batch: 24
batch: 25
batch: 26
batch: 27
batch: 28
batch: 29
batch: 30
batch: 31
batch: 32
batch: 33
batch: 34
batch: 35
batch: 36
batch: 37
batch: 38
batch: 39
batch: 40
batch: 41
batch: 42
batch: 43
batch: 44
batch: 45
epoch: 1
batch: 0
Epoch 1, Batch 0, Loss: 0.8398
batch: 1
batch: 2
batch: 3
batch: 4
batch: 5
batch: 6
batch: 7
batch: 8
batch: 9
batch: 10
batch: 11
batch: 12
batch: 13
batch: 14
batch: 15
batch: 16
batch: 17
batch: 18
batch: 19
batch: 20
batch: 21
batch: 22
batch: 23
batch: 24
batch: 25
batch: 26
batch: 27
batch: 28
batch: 29
batch: 30
batch: 31
batch: 32
batch: 33
batch: 34
batch: 35
batch: 36
batch: 37
batch: 38
batch: 39
batch: 40
batch: 41
batch: 42
batch: 43
batch: 44
batch: 45
epoch: 2
batch: 0
Ep

In [312]:
# Test model
def translate(encoder, decoder, sentence, word2idx, max_length=15):
    encoder.eval()
    decoder.eval()
    with torch.inference_mode():
        # Tokenize and encode the sentence
        input_tensor = torch.tensor([word2idx[word] for word in sentence.split() if len(word) > 1]
                                    + [word2idx[END_TOKEN]], dtype=torch.long)
        input_tensor = input_tensor.view(1, -1).to(DEVICE)  # batch_first=True

        # Pass the input through the encoder
        _, encoder_hidden, encoder_cell = encoder(input_tensor)

        # Initialize the decoder input with the SOS token
        decoder_input = torch.tensor([[word2idx[START_TOKEN]]], dtype=torch.long)  # SOS
        # Initialize the hidden state of the decoder with the encoder's hidden state
        decoder_hidden, decoder_cell = encoder_hidden, encoder_cell

        # Decoding the sentence
        decoded_words = []
        last_word = torch.tensor([[word2idx[START_TOKEN]]]).to(DEVICE)
        for di in range(max_length):
            logits, decoder_hidden, decoder_cell = decoder(last_word, decoder_hidden, decoder_cell)
            next_token = logits.argmax(dim=1) # greedy
            last_word = torch.tensor([[next_token]]).to(DEVICE)
            if next_token.item() == word2idx[END_TOKEN]:
                break
            else:
                decoded_words.append(idx2word.get(next_token.item()))
        return ' '.join(decoded_words)

In [336]:
sentence = "why are you bad"
translated_sentence = translate(encoder, decoder, sentence, word2idx)
print("Response:", translated_sentence)

Response: its meant to the snake
