In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
#import tqdm
import evaluate

# One seed shared by every random number generator the notebook uses.
seed = 1234

# Seed Python's RNG, NumPy, and PyTorch (CPU and CUDA) with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)

# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import json

corpus_name = "movie-corpus"
corpus = os.path.join("data", corpus_name)

def printLines(file, n=10):
    """Print the first ``n`` raw lines of ``file``.

    The file is opened in binary mode, so every line is printed as a
    ``bytes`` literal, including its trailing newline byte.

    Args:
        file: Path of the file to preview.
        n: Maximum number of lines to print (default 10). Must be
            non-negative.
    """
    with open(file, 'rb') as datafile:
        # Stream only the requested lines instead of loading the whole file
        # into memory just to show a short preview.
        for line in itertools.islice(datafile, n):
            print(line)

printLines(os.path.join(corpus, "utterances.jsonl"))

b'{"id": "L1045", "conversation_id": "L1044", "text": "They do not!", "speaker": "u0", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "not", "tag": "RB", "dep": "neg", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": "L1044", "timestamp": null, "vectors": []}\n'
b'{"id": "L1044", "conversation_id": "L1044", "text": "They do to!", "speaker": "u2", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "to", "tag": "TO", "dep": "dobj", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": null, "timestamp": null, "vectors": []}\n'
b'{"id": "L985", "conversation_id": "L984", "text": "I hope so.", "speaker": "u0", "meta": {

In [3]:
# Splits each line of the file to create lines and conversations
def loadLinesAndConversations(fileName, n):
    """Parse a JSON-lines utterance file into line and conversation lookups.

    Each line of the file is decoded as one JSON object. Its "id", "speaker"
    and "text" fields become a line record; records are grouped by
    "conversation_id". Later records of a conversation are inserted at the
    front of that conversation's list, i.e. the list ends up in the reverse
    of the file order.

    Args:
        fileName: Path of the JSON-lines file (read as iso-8859-1).
        n: Number of entries of each dictionary to print as a preview.

    Returns:
        A ``(lines, conversations)`` tuple: ``lines`` maps line id to a dict
        with keys "lineID", "characterID" and "text"; ``conversations`` maps
        conversation id to a dict with keys "conversationID", "movieID" and
        "lines" (a list of the line dicts).
    """
    lines = {}
    conversations = {}
    with open(fileName, 'r', encoding='iso-8859-1') as handle:
        for raw in handle:
            record = json.loads(raw)

            # Line record, e.g. lineID "L1045", characterID "u0"
            utterance = {
                "lineID": record["id"],
                "characterID": record["speaker"],
                "text": record["text"],
            }
            lines[utterance["lineID"]] = utterance

            conv_id = record["conversation_id"]
            known = conversations.get(conv_id)
            if known is None:
                # First utterance seen for this conversation
                conversations[conv_id] = {
                    "conversationID": conv_id,
                    "movieID": record["meta"]["movie_id"],
                    "lines": [utterance],
                }
            else:
                # Prepend so the list is the reverse of the file order
                known["lines"].insert(0, utterance)

    # Preview the first n entries of both dictionaries
    print(dict(itertools.islice(lines.items(), n)))
    print("Line dictionary ends!! \n")
    print(dict(itertools.islice(conversations.items(), n)))

    return lines, conversations


# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    """Build [input, target] text pairs from consecutive conversation lines.

    Args:
        conversations: Mapping whose values are dicts with a "lines" list;
            every element of that list is a dict with a "text" string.

    Returns:
        A list of two-element lists ``[inputLine, targetLine]`` of stripped
        texts. A pair is dropped when either text is empty after stripping.
    """
    qa_pairs = []
    for conversation in conversations.values():
        utterances = conversation["lines"]
        # Pair each line with the one that follows it; the last line has no reply.
        for prompt, reply in zip(utterances, utterances[1:]):
            question = prompt["text"].strip()
            answer = reply["text"].strip()
            if not question or not answer:
                continue
            qa_pairs.append([question, answer])
    return qa_pairs

In [4]:
import codecs
import itertools
# Path of the CSV file that holds the extracted (Input, Output) sentence pairs
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

delimiter = ','
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Load lines and conversations (also prints a 10-entry preview of each)
print("\nProcessing corpus into lines and conversations...")
lines, conversations = loadLinesAndConversations(os.path.join(corpus, "utterances.jsonl"), 10)

# The next cell reads `datafile` with pandas and accesses the "Input" and
# "Output" columns, so the file must exist with that header row. Create it
# only when it is missing: an existing file is left untouched, and a fresh
# kernel can still run the notebook top to bottom.
if not os.path.exists(datafile):
    print("\nWriting newly formatted file...")
    # newline='' lets csv.writer control line endings itself
    with open(datafile, 'w', encoding='utf-8', newline='') as outputfile:
        writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
        writer.writerow(["Input", "Output"])
        for pair in extractSentencePairs(conversations):
            writer.writerow(pair)


Processing corpus into lines and conversations...
{'L1045': {'lineID': 'L1045', 'characterID': 'u0', 'text': 'They do not!'}, 'L1044': {'lineID': 'L1044', 'characterID': 'u2', 'text': 'They do to!'}, 'L985': {'lineID': 'L985', 'characterID': 'u0', 'text': 'I hope so.'}, 'L984': {'lineID': 'L984', 'characterID': 'u2', 'text': 'She okay?'}, 'L925': {'lineID': 'L925', 'characterID': 'u0', 'text': "Let's go."}, 'L924': {'lineID': 'L924', 'characterID': 'u2', 'text': 'Wow'}, 'L872': {'lineID': 'L872', 'characterID': 'u0', 'text': "Okay -- you're gonna need to learn how to lie."}, 'L871': {'lineID': 'L871', 'characterID': 'u2', 'text': 'No'}, 'L870': {'lineID': 'L870', 'characterID': 'u0', 'text': 'I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?'}, 'L869': {'lineID': 'L869', 'characterID': 'u0', 'text': 'Like my fear of wearing pastels?'}}
Line dictionary ends!! 

{'L1044': {'conversationID': 'L1044', 'movieID': 'm0', 'lines': [{'lineI

In [5]:
import pandas as pd
df = pd.read_csv("./data/movie-corpus/formatted_movie_lines.txt")
# Define the text to add
import re
def clean_text(txt):
    """Lower-case ``txt``, expand some contractions and strip punctuation.

    The substitutions are applied one after another in the order listed
    below; the final one removes every character that is neither a word
    character nor whitespace.

    Args:
        txt: The string to normalise.

    Returns:
        The normalised string.
    """
    # (pattern, replacement) pairs; order matters because each rule sees
    # the output of the previous ones.
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"what's", "what is"),
        (r"where's", "where is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"[^\w\s]", ""),
    )
    cleaned = txt.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
# Normalise both text columns of the pairs frame with clean_text
for column in ('Input', 'Output'):
    df[column] = df[column].apply(clean_text)

In [6]:
df

Unnamed: 0,Input,Output
0,they do to,they do not
1,she okay,i hope so
2,wow,lets go
3,i am kidding you know how sometimes you just ...,no
4,no,okay you are gonna need to learn how to lie
...,...,...
221277,and i assure you you do not in fact i would be...,so far only their scouts but we have had repor...
221278,your orders mr vereker,i am to take the sikali with the main column t...
221279,i am to take the sikali with the main column t...,lord chelmsford seems to want me to stay back ...
221280,lord chelmsford seems to want me to stay back ...,i think chelmsford wants a good man on the bor...


In [7]:
df.isnull().sum()

Input     0
Output    0
dtype: int64

In [8]:
#pip install scikit-learn

In [9]:
from sklearn.model_selection import train_test_split
X_training, X_testing, y_training, y_testing = train_test_split(df['Input'], df['Output'], test_size = 0.1, random_state = 42)

In [10]:
X_main_training, X_validation, y_main_training, y_validation = train_test_split(X_training, y_training, test_size = 0.1, random_state = 42)

In [11]:
# Training frame: inputs plus their index-aligned outputs, renumbered from 0
train_df = (
    X_main_training.to_frame()
    .assign(Output=y_main_training)
    .reset_index(drop=True)
)

In [12]:
# Validation frame: inputs plus their index-aligned outputs, renumbered from 0
validation_df = (
    X_validation.to_frame()
    .assign(Output=y_validation)
    .reset_index(drop=True)
)

In [13]:
# Test frame: inputs plus their index-aligned outputs, renumbered from 0
test_df = (
    X_testing.to_frame()
    .assign(Output=y_testing)
    .reset_index(drop=True)
)

In [14]:
#train_df = train_df.reset_index()
#train_df = train_df.drop(columns = ['index'])
print(train_df.isnull().sum())
print(validation_df.isnull().sum())
print(test_df.isnull().sum())
#train_df

Input     0
Output    0
dtype: int64
Input     0
Output    0
dtype: int64
Input     0
Output    0
dtype: int64


In [15]:
print(f'Shape of validation_df: {validation_df.shape}')
print(f'Shape of test_df: {test_df.shape}')
print(f'Shape of train_df: {train_df.shape}')
#print(train_df.shape[0] + validation_df.shape[0] + test_df.shape[0])

Shape of validation_df: (19916, 2)
Shape of test_df: (22129, 2)
Shape of train_df: (179237, 2)


In [16]:
import spacy
en_nlp = spacy.load("en_core_web_sm")

In [17]:
def tokenize_inputs(x, en_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize an input sentence and wrap it in start/end markers.

    Args:
        x: The sentence to tokenize.
        en_nlp: Object whose ``tokenizer(x)`` yields items with a ``.text``
            attribute (the notebook passes a loaded spaCy pipeline).
        max_length: Keep at most this many tokens (markers not counted).
        lower: When truthy, lower-case every kept token.
        sos_token: Marker placed before the tokens.
        eos_token: Marker placed after the tokens.

    Returns:
        ``[sos_token, token_1, ..., token_k, eos_token]``.
    """
    pieces = [tok.text for tok in en_nlp.tokenizer(x)][:max_length]
    body = [piece.lower() for piece in pieces] if lower else pieces
    return [sos_token, *body, eos_token]

def tokenize_outputs(x, en_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize an output sentence and wrap it in start/end markers.

    Performs the same steps as ``tokenize_inputs``: tokenize, truncate to
    ``max_length`` tokens, optionally lower-case, then add the markers.

    Args:
        x: The sentence to tokenize.
        en_nlp: Object whose ``tokenizer(x)`` yields items with a ``.text``
            attribute.
        max_length: Keep at most this many tokens (markers not counted).
        lower: When truthy, lower-case every kept token.
        sos_token: Marker placed before the tokens.
        eos_token: Marker placed after the tokens.

    Returns:
        ``[sos_token, token_1, ..., token_k, eos_token]``.
    """
    kept = []
    for tok in en_nlp.tokenizer(x):
        kept.append(tok.text)
    kept = kept[:max_length]
    if lower:
        kept = [word.lower() for word in kept]
    return [sos_token] + kept + [eos_token]

In [18]:
# max_length = 1_000
# en_tokens = [token.text for token in en_nlp.tokenizer(train_df["Input"][0])][:max_length]
# en_tokens

In [19]:
# Tokenize the Input and Output columns of every split: at most 1000 tokens,
# lower-cased, wrapped in <sos>/<eos>.
for split_df in (train_df, validation_df, test_df):
    split_df['input_token'] = split_df['Input'].map(
        lambda x: tokenize_inputs(x, en_nlp, 1000, True, '<sos>', '<eos>')
    )
    split_df['output_token'] = split_df['Output'].map(
        lambda x: tokenize_outputs(x, en_nlp, 1000, True, '<sos>', '<eos>')
    )

In [20]:
# Length of the longest tokenized training output (the <sos>/<eos> markers
# are part of each token list). The result is stored under its own name so
# the builtin max() is not overwritten for the rest of the notebook.
# default=0 keeps the previous result for an empty frame.
max_output_len = max(map(len, train_df['output_token']), default=0)
print(max_output_len)

562


In [21]:
train_df

Unnamed: 0,Input,Output,input_token,output_token
0,sir i have got an overload in disposal unit four,you better check on it mr dunn i will stay he...,"[<sos>, sir, i, have, got, an, overload, in, d...","[<sos>, you, better, check, on, it, mr, dunn, ..."
1,is he italian,no why,"[<sos>, is, he, italian, <eos>]","[<sos>, no, why, <eos>]"
2,is this my fault do you think this is what i am,what,"[<sos>, is, this, my, fault, do, you, think, t...","[<sos>, what, <eos>]"
3,no no thank you,its a real good chocolate cake duncan hines de...,"[<sos>, no, no, thank, you, <eos>]","[<sos>, its, a, real, good, chocolate, cake, d..."
4,the premiere was the first time i have convinc...,viktor you should have said something,"[<sos>, the, premiere, was, the, first, time, ...","[<sos>, viktor, you, should, have, said, somet..."
...,...,...,...,...
179232,look you make this little run for me i will bu...,last year this was a new rig,"[<sos>, look, you, make, this, little, run, fo...","[<sos>, last, year, this, was, a, new, rig, <e..."
179233,actually the only thing i gotta give that guy ...,yeah well that pizza could feed a family of fo...,"[<sos>, actually, the, only, thing, i, got, ta...","[<sos>, yeah, well, that, pizza, could, feed, ..."
179234,i dont care what nobody says this bum creed wo...,hey how ya feelin mickey,"[<sos>, i, do, nt, care, what, nobody, says, t...","[<sos>, hey, how, ya, feelin, mickey, <eos>]"
179235,well what did he say,he said he thinks we have paid him and he want...,"[<sos>, well, what, did, he, say, <eos>]","[<sos>, he, said, he, thinks, we, have, paid, ..."


In [22]:
#type(train_data)

In [23]:
#pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0.tar.gz

In [24]:
from torchtext.vocab import build_vocab_from_iterator



In [25]:
concat_data = pd.concat([train_df['input_token'], train_df['output_token']])
concat_data.shape

(358474,)

In [26]:
# Tokens seen fewer than min_freq times are left out of the vocabulary
min_freq = 4
unk_token = "<unk>"
pad_token = "<pad>"

# Reserved tokens handed to the vocabulary builder as `specials`
special_tokens = [unk_token, pad_token, '<sos>', '<eos>']

# Single vocabulary built from the training input and output token lists
en_vocab = torchtext.vocab.build_vocab_from_iterator(
    concat_data, min_freq=min_freq, specials=special_tokens
)

# de_vocab = torchtext.vocab.build_vocab_from_iterator(
#     train_data["de_tokens"],
#     min_freq=min_freq,
#     specials=special_tokens,
# )

In [27]:
print(en_vocab['<sos>'])
print(en_vocab['<eos>'])
print(en_vocab['<unk>'])
print(en_vocab['<pad>'])

2
3
0
1


In [28]:
en_vocab.set_default_index(en_vocab['<unk>'])
#de_vocab.set_default_index(de_vocab['<unk>'])

In [29]:
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[5, 137, 806, 1009, 1385]

In [30]:
def numericalize_english(example, en_vocab):
    """Map a list of tokens to their vocabulary indices.

    Args:
        example: List of token strings.
        en_vocab: Object exposing ``lookup_indices(tokens)``.

    Returns:
        Whatever ``en_vocab.lookup_indices(example)`` returns (a list of
        integer ids for the torchtext vocabulary used in this notebook).
    """
    return en_vocab.lookup_indices(example)

In [31]:
# Convert the token lists of every split into vocabulary ids
for split_df in (train_df, validation_df, test_df):
    split_df['input_ids'] = split_df['input_token'].map(
        lambda x: numericalize_english(x, en_vocab)
    )
    split_df['output_ids'] = split_df['output_token'].map(
        lambda x: numericalize_english(x, en_vocab)
    )

In [32]:
train_df

Unnamed: 0,Input,Output,input_token,output_token,input_ids,output_ids
0,sir i have got an overload in disposal unit four,you better check on it mr dunn i will stay he...,"[<sos>, sir, i, have, got, an, overload, in, d...","[<sos>, you, better, check, on, it, mr, dunn, ...","[2, 145, 5, 20, 50, 78, 13872, 19, 6082, 2236,...","[2, 4, 161, 448, 35, 11, 131, 12512, 6, 5, 28,..."
1,is he italian,no why,"[<sos>, is, he, italian, <eos>]","[<sos>, no, why, <eos>]","[2, 10, 24, 1938, 3]","[2, 33, 67, 3]"
2,is this my fault do you think this is what i am,what,"[<sos>, is, this, my, fault, do, you, think, t...","[<sos>, what, <eos>]","[2, 10, 26, 32, 708, 12, 4, 59, 26, 10, 18, 5,...","[2, 18, 3]"
3,no no thank you,its a real good chocolate cake duncan hines de...,"[<sos>, no, no, thank, you, <eos>]","[<sos>, its, a, real, good, chocolate, cake, d...","[2, 33, 33, 211, 4, 3]","[2, 36, 9, 222, 79, 2545, 2276, 7294, 21090, 3..."
4,the premiere was the first time i have convinc...,viktor you should have said something,"[<sos>, the, premiere, was, the, first, time, ...","[<sos>, viktor, you, should, have, said, somet...","[2, 7, 11236, 30, 7, 158, 85, 5, 20, 2814, 66,...","[2, 2775, 4, 115, 20, 116, 101, 3]"
...,...,...,...,...,...,...
179232,look you make this little run for me i will bu...,last year this was a new rig,"[<sos>, look, you, make, this, little, run, fo...","[<sos>, last, year, this, was, a, new, rig, <e...","[2, 95, 4, 105, 26, 108, 293, 27, 21, 5, 28, 4...","[2, 162, 375, 26, 30, 9, 184, 4030, 3]"
179233,actually the only thing i gotta give that guy ...,yeah well that pizza could feed a family of fo...,"[<sos>, actually, the, only, thing, i, got, ta...","[<sos>, yeah, well, that, pizza, could, feed, ...","[2, 376, 7, 121, 118, 5, 50, 216, 132, 14, 170...","[2, 76, 62, 14, 2988, 75, 1511, 9, 370, 16, 32..."
179234,i dont care what nobody says this bum creed wo...,hey how ya feelin mickey,"[<sos>, i, do, nt, care, what, nobody, says, t...","[<sos>, hey, how, ya, feelin, mickey, <eos>]","[2, 5, 12, 13, 236, 18, 365, 282, 26, 2409, 56...","[2, 180, 56, 274, 3272, 1754, 3]"
179235,well what did he say,he said he thinks we have paid him and he want...,"[<sos>, well, what, did, he, say, <eos>]","[<sos>, he, said, he, thinks, we, have, paid, ...","[2, 62, 18, 45, 24, 89, 3]","[2, 24, 116, 24, 581, 23, 20, 695, 55, 15, 24,..."


In [33]:
from datasets import Dataset
train_data = Dataset.from_pandas(train_df)
test_data =  Dataset.from_pandas(test_df)
validation_data = Dataset.from_pandas(validation_df)

In [34]:
train_data

Dataset({
    features: ['Input', 'Output', 'input_token', 'output_token', 'input_ids', 'output_ids'],
    num_rows: 179237
})

In [35]:
data_type = "torch"
format_columns = ["input_ids", "output_ids"]

# Apply the same formatting to all three splits: the id columns use the
# "torch" format and the remaining columns are still returned.
train_data, validation_data, test_data = [
    split.with_format(
        type=data_type,
        columns=format_columns,
        output_all_columns=True,
    )
    for split in (train_data, validation_data, test_data)
]

In [36]:
train_data[0]

{'input_ids': tensor([    2,   145,     5,    20,    50,    78, 13872,    19,  6082,  2236,
           323,     3]),
 'output_ids': tensor([    2,     4,   161,   448,    35,    11,   131, 12512,     6,     5,
            28,   237,    54,    15,   992,     7,   567,     3]),
 'Input': 'sir i have got an overload in disposal unit four',
 'Output': 'you better check on it mr dunn  i will stay here and fly the ship',
 'input_token': ['<sos>',
  'sir',
  'i',
  'have',
  'got',
  'an',
  'overload',
  'in',
  'disposal',
  'unit',
  'four',
  '<eos>'],
 'output_token': ['<sos>',
  'you',
  'better',
  'check',
  'on',
  'it',
  'mr',
  'dunn',
  ' ',
  'i',
  'will',
  'stay',
  'here',
  'and',
  'fly',
  'the',
  'ship',
  '<eos>']}

In [37]:
# Padding index looked up from the vocabulary instead of being hard-coded;
# it resolves to 1 for the vocabulary built above.
pad_index = en_vocab[pad_token]
def get_collate_fn(pad_index):
    """Create a collate function that pads every batch with ``pad_index``.

    Args:
        pad_index: Value used to fill the shorter sequences.

    Returns:
        A function taking a list of examples (each with "input_ids" and
        "output_ids" tensors) and returning a dict with the same two keys,
        each holding the result of ``pad_sequence`` on that field.
    """
    def collate_fn(batch):
        padded = {}
        for key in ("input_ids", "output_ids"):
            sequences = [example[key] for example in batch]
            # pad_sequence is called with its default layout (sequence dimension first)
            padded[key] = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)
        return padded

    return collate_fn

In [38]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a DataLoader that pads each batch.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of examples per batch.
        pad_index: Padding value forwarded to ``get_collate_fn``.
        shuffle: Whether the loader shuffles the data (default False).

    Returns:
        A ``torch.utils.data.DataLoader`` using the padding collate function.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )

In [39]:
batch_size = 8

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
validation_data_loader = get_data_loader(validation_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [40]:
# sum = 0
# count = 0
# batch_number = 0
# for batch in train_data_loader:
#   batch_number +=1
#   # inputs = batch['en_ids']
#   # output = batch['de_ids']
#   #print(f'This is the batch: {batch}')
#   for input in batch['input_ids']:
#     #print(len(input))
#   # print(f'This is the input {inputs}')
#   # print(f'This is the output: {output}')
#     sum +=1
#   print(f'This is the size of each batch:{sum}')
#   sum = 0
#   count +=1

# print(batch_number)

In [41]:
class Encoder(nn.Module):
    """LSTM encoder: embeds the source tokens and returns the final LSTM state."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """
        Args:
            input_dim: Number of rows of the embedding table (vocabulary size).
            embedding_dim: Size of each embedding vector.
            hidden_dim: Hidden size of the LSTM.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability, used on the embeddings and passed
                to the LSTM.
        """
        super().__init__()
        # Exposed as attributes; Seq2Seq compares them with the decoder's.
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return ``(hidden, cell)``.

        src    = [src length, batch size]
        hidden = [n layers, batch size, hidden dim]
        cell   = [n layers, batch size, hidden dim]
        """
        token_vectors = self.dropout(self.embedding(src))
        # token_vectors = [src length, batch size, embedding dim]
        # The per-step outputs are not needed, only the final state.
        _, final_state = self.rnn(token_vectors)
        hidden, cell = final_state
        return hidden, cell

In [42]:
class Decoder(nn.Module):
    """LSTM decoder that predicts one target token per call."""

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        """
        Args:
            output_dim: Vocabulary size; also the width of the prediction.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Hidden size of the LSTM.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability, used on the embeddings and passed
                to the LSTM.
        """
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        input  = [batch size]
        hidden = [n layers, batch size, hidden dim]
        cell   = [n layers, batch size, hidden dim]

        Returns ``(prediction, hidden, cell)`` where
        prediction = [batch size, output dim] and hidden/cell are the
        updated LSTM state.
        """
        # Add a sequence dimension of length 1: [1, batch size]
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        # step_vectors = [1, batch size, embedding dim]
        step_output, next_state = self.rnn(step_vectors, (hidden, cell))
        hidden, cell = next_state
        # step_output = [1, batch size, hidden dim]; drop the length-1 dimension
        prediction = self.fc_out(step_output.squeeze(0))
        return prediction, hidden, cell

In [43]:
import torch
import torch.nn as nn

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step."""

    def __init__(self, encoder, decoder, device):
        """
        Args:
            encoder: Module returning ``(hidden, cell)`` for a source batch;
                must expose ``hidden_dim`` and ``n_layers``.
            decoder: Module mapping ``(input, hidden, cell)`` to
                ``(prediction, hidden, cell)``; must expose ``hidden_dim``,
                ``n_layers`` and ``output_dim``.
            device: Device on which the output buffer is allocated.

        Raises:
            AssertionError: If encoder and decoder disagree on hidden size
                or number of layers.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        """Decode ``trg`` conditioned on ``src``.

        src = [src length, batch size]
        trg = [trg length, batch size]

        Args:
            src: Source token ids.
            trg: Target token ids; row 0 is fed as the first decoder input.
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of decoder predictions (logits),
            [trg length, batch size, trg vocab size]. Row 0 is never written
            and stays zero.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        # Allocate directly on the target device rather than on CPU followed by a copy.
        outputs = torch.zeros(
            trg_length, batch_size, trg_vocab_size, device=self.device
        )
        hidden, cell = self.encoder(src)
        input = trg[0, :]

        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            # softmax is monotonic, so the most probable token is simply the
            # arg-max of the raw logits; no need to normalise the full
            # vocabulary at every step.
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1

        return outputs


In [44]:
# Source and target share one vocabulary, so both sizes are identical
input_dim = output_dim = len(en_vocab)
encoder_embedding_dim = decoder_embedding_dim = 128
hidden_dim = 256
n_layers = 2
encoder_dropout = decoder_dropout = 0.5
# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim=input_dim,
    embedding_dim=encoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=encoder_dropout,
)

decoder = Decoder(
    output_dim=output_dim,
    embedding_dim=decoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=decoder_dropout,
)

model = Seq2Seq(encoder=encoder, decoder=decoder, device=device).to(device)

In [45]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(23068, 128)
    (rnn): LSTM(128, 256, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(23068, 128)
    (rnn): LSTM(128, 256, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=256, out_features=23068, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [46]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [47]:
def train_fn(model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Sized iterable of batches with "input_ids" and
            "output_ids" tensors.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss taking ``(predictions, targets)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        teacher_forcing_ratio: Forwarded unchanged to the model.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        src = batch["input_ids"].to(device)  # [src length, batch size]
        trg = batch["output_ids"].to(device)  # [trg length, batch size]

        optimizer.zero_grad()
        logits = model(src, trg, teacher_forcing_ratio)
        # logits = [trg length, batch size, trg vocab size]
        vocab_size = logits.shape[-1]
        # Drop position 0 and flatten time and batch into one dimension
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(data_loader)

In [48]:
def evaluate_fn(model, data_loader, criterion, device):
    """Compute the mean loss of ``model`` over ``data_loader`` without training.

    The model is switched to eval mode, gradients are disabled, and the
    model is called with a teacher-forcing ratio of 0.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Sized iterable of batches with "input_ids" and
            "output_ids" tensors.
        criterion: Loss taking ``(predictions, targets)``.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            src = batch["input_ids"].to(device)  # [src length, batch size]
            trg = batch["output_ids"].to(device)  # [trg length, batch size]

            logits = model(src, trg, 0)  # turn off teacher forcing
            # logits = [trg length, batch size, trg vocab size]
            vocab_size = logits.shape[-1]
            # Drop position 0 and flatten time and batch into one dimension
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_logits, flat_targets).item()
    return running_loss / len(data_loader)

In [49]:
import tqdm

In [50]:
n_epochs = 10
clip = 1.0  # maximum gradient norm
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model,
        train_data_loader,
        optimizer,
        criterion,
        clip,
        teacher_forcing_ratio,
        device,
    )
    # The validation loader is named `validation_data_loader` where it is
    # created; the previous name `valid_data_loader` was never defined.
    valid_loss = evaluate_fn(
        model,
        validation_data_loader,
        criterion,
        device,
    )
    # Save the weights whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  0%|                                                                                         | 0/10 [1:03:27<?, ?it/s]


KeyboardInterrupt: 