In [0]:
import ast
import codecs
import csv
import itertools
import os
import random
import re
import unicodedata

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [494]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
CUDA = torch.cuda.is_available()
device = torch.device("cuda") if CUDA else torch.device("cpu")
print(device)

cuda


In [20]:
# Colab-only step: opens the upload widget used to bring movie_conversations.txt
# and movie_lines.txt into the runtime. `files` is reused by the download cell.
from google.colab import files
uploaded = files.upload()

Saving movie_conversations.txt to movie_conversations.txt
Saving movie_lines.txt to movie_lines.txt


In [0]:
# Import data
# NOTE(review): `io` is not referenced in the visible part of the notebook — confirm before removing.
import io
lines_path = 'movie_lines.txt'  # one utterance per row, fields separated by " +++$+++ "
conv_path = 'movie_conversations.txt'  # one conversation per row; the last field lists its utterance IDs


In [495]:
# Visualizing the data: read the raw lines file (ISO-8859-1 encoded) and print
# its first 8 rows with surrounding whitespace removed.
with open(lines_path, 'r', encoding='iso-8859-1') as file:
    lines = file.readlines()
for line in lines[0:8]:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [0]:
# Data preparation Part 1
# Build a dictionary keyed by line ID whose values hold every field of that row:
#   lines["L1"] = {"lineID": "L1", "characterID": ..., "movieID": ...,
#                  "character": ..., "text": "Something"}

line_fields = ["lineID", "characterID", "movieID", "character", "text"]
lines = {}

with open(lines_path, 'r', encoding='iso-8859-1') as f:
    for raw_row in f:
        columns = raw_row.split(" +++$+++ ")
        # Pair each field name with the column found at the same position.
        record = {name: columns[pos] for pos, name in enumerate(line_fields)}
        lines[record["lineID"]] = record

In [497]:
# Spot-check one parsed record; note that the text still carries its trailing newline.
lines['L1000']

{'character': 'WALTER',
 'characterID': 'u11',
 'lineID': 'L1000',
 'movieID': 'm0',
 'text': "Oh, Christ.  Don't tell me you've changed your mind.  I already sent 'em a check.\n"}

In [0]:
# Data preparation Part 2
# Build a list of conversations. Each entry holds the row's metadata plus a
# "lines" list with the full line records (from `lines`) in spoken order.

# c1 and c2 talking in movie M with dialogues - l1,l2,l3,l4 (lineIDs)
conv_fields = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
conversations = []

with open(conv_path, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")

        conv_object = {field: values[i] for i, field in enumerate(conv_fields)}

        # The field is a Python-style list literal such as "['L198', 'L199']\n".
        # literal_eval parses it without executing arbitrary code from the file.
        lineIDs = ast.literal_eval(conv_object["utteranceIDs"])

        # Resolve every utterance ID to its parsed line record.
        conv_object["lines"] = [lines[lineID] for lineID in lineIDs]
        conversations.append(conv_object)


In [499]:
# Spot-check one conversation: its metadata plus the resolved line records.
conversations[1]

{'character1ID': 'u0',
 'character2ID': 'u2',
 'lines': [{'character': 'BIANCA',
   'characterID': 'u0',
   'lineID': 'L198',
   'movieID': 'm0',
   'text': "You're asking me out.  That's so cute. What's your name again?\n"},
  {'character': 'CAMERON',
   'characterID': 'u2',
   'lineID': 'L199',
   'movieID': 'm0',
   'text': 'Forget it.\n'}],
 'movieID': 'm0',
 'utteranceIDs': "['L198', 'L199']\n"}

In [0]:
# Extract pairs of sentences from conversations
# Every utterance is paired with the one that follows it in the same
# conversation; pairs with an empty side are dropped.
qa_pairs = []

for conversation in conversations:
    utterances = [entry["text"].strip() for entry in conversation["lines"]]
    for input_line, target_line in zip(utterances, utterances[1:]):
        if input_line and target_line:
            qa_pairs.append([input_line, target_line])



In [501]:
# Preview the first ten (input, reply) pairs.
qa_pairs[0:10]

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you."],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ['Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"],
 ["You're asking me out.  That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.'],
 ['Cameron.',
  "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does."],
 ["The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
  'Seems like she could get a date easy enough...'],
 [

In [0]:
# Saving the formatted conversation pairs as tab-separated rows.
# newline='' lets the csv module control line endings itself, as its
# documentation requires; otherwise extra carriage returns appear on
# platforms that translate newlines.
with open("formattedLines.txt", 'w', encoding='utf-8', newline='') as outputfile:
    writer = csv.writer(outputfile, delimiter='\t')
    writer.writerows(qa_pairs)
        

In [102]:
# Download the saved file
# Relies on `files` imported from google.colab in the upload cell, so this only works on Colab.
files.download('formattedLines.txt')

KeyboardInterrupt: ignored

In [503]:
# Loading in the saved formatted file
# The file was written as UTF-8, so read it back with the same encoding
# instead of relying on the platform default.
with open("formattedLines.txt", 'r', encoding='utf-8') as file:
    lines = file.readlines()
for line in lines[0:8]:
    print(line)

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.	Well, I thought we'd start with pronunciation, if that's okay with you.

Well, I thought we'd start with pronunciation, if that's okay with you.	Not the hacking and gagging and spitting part.  Please.

Not the hacking and gagging and spitting part.  Please.	Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

You're asking me out.  That's so cute. What's your name again?	Forget it.

No, no, it's my fault -- we didn't have a proper introduction ---	Cameron.

Cameron.	The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.

The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.	Seems like she could get a date easy enough...

Why?	Unsolved mystery.  She used to be really popular when she started h

In [0]:
# Word Processing and vocabulary
# Reserved token ids; real words are numbered from 3 upwards.
PAD_TOKEN = 0
SOS_TOKEN = 1
EOS_TOKEN = 2

class Vocabulary():
    """Bidirectional word <-> index mapping with per-word occurrence counts.

    Attributes:
        name: label given to this vocabulary.
        word2index: maps each known word to its integer id.
        word2count: maps each known word to how often it was added.
        index2word: maps ids back to words; ids 0-2 are PAD/SOS/EOS.
        numwords: number of ids in use, including the three reserved tokens.
    """

    def __init__(self, name):
        self.name = name
        self._reset()

    def _reset(self):
        """Drop every word and restore the three reserved tokens."""
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_TOKEN: "PAD", SOS_TOKEN: "SOS", EOS_TOKEN: "EOS"}
        self.numwords = 3

    def addSentence(self, sentence):
        """Add every space-separated word of `sentence` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register `word`, assigning the next free id on first sight, and count it."""
        if word not in self.word2index:
            self.word2index[word] = self.numwords
            self.word2count[word] = 1
            self.index2word[self.numwords] = word
            self.numwords += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        """Keep only words seen at least `min_count` times and renumber them.

        The mappings are rebuilt from scratch, so ids change and every kept
        word's count restarts at 1.
        """
        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        # Guard the ratio so trimming an empty vocabulary does not divide by zero.
        total = len(self.word2index)
        ratio = len(keep_words) / total if total else 0.0
        print("Keep words {} / {} = {:.4f}".format(len(keep_words), total, ratio))

        # Reinitialize the dictionaries with the new set of words to be kept
        self._reset()
        for word in keep_words:
            self.addWord(word)


In [0]:
def unicodeToAscii(s):
    """Strip accents: decompose `s` (NFD) and drop the combining marks (category 'Mn')."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [230]:
# Quick check: the accent on the "é" should be stripped.
unicodeToAscii('Montréal')

'Montreal'

In [0]:
def normalizeString(s):
    """Lowercase and clean `s` for tokenisation.

    Accents are removed, a space is inserted before '.', '!' and '?', every
    run of other non-letter characters becomes a single space, and repeated
    whitespace is collapsed.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),        # split sentence punctuation off the preceding word
        (r"[^a-zA-Z.!?]+", r" "),    # anything that is not a letter or . ! ? becomes a space
        (r"\s+", r" "),              # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [232]:
# Quick check: digits and apostrophes turn into spaces, and ! / ? get a space in front of them.
normalizeString("aa123aa!s's   dd?")

'aa aa !s s dd ?'

In [233]:
# Peek at the raw tab-separated rows read back from formattedLines.txt (not yet normalized).
lines[0:10]

["Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n",
 "Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n",
 "Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n",
 "You're asking me out.  That's so cute. What's your name again?\tForget it.\n",
 "No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n",
 "Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n",
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\n",
 'Why?\tUnsolved mystery.  She

In [234]:
# Opening up the formatted text file
print("Reading and processing text... Please wait.")
# `lines` becomes a list of rows, each row holding one tab-separated pair of
# dialogue lines. The `with` block makes sure the file handle is closed.
with open("formattedLines.txt", encoding='utf-8') as formatted_file:
    lines = formatted_file.read().strip().split('\n')
# Normalize both sides of every pair.
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines]
print("Processing complete!")

Reading and processing text... Please wait.
Processing complete!


In [0]:
# Start with an empty vocabulary (only PAD/SOS/EOS); words are added in a later cell.
vocab = Vocabulary("Cornell")

In [236]:
# Inspect the first normalized pair.
pairs[0]

['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
 'well i thought we d start with pronunciation if that s okay with you .']

In [0]:
#### Further processing the text ####

# Keep only pairs whose two sentences each have fewer than MAX_LEN words

MAX_LEN = 10
def filterPair(p):
    """Return True when both sentences of pair `p` have fewer than MAX_LEN words."""
    question_words = p[0].split()
    reply_words = p[1].split()
    if len(question_words) >= MAX_LEN:
        return False
    return len(reply_words) < MAX_LEN

def filterPairs(pairs):
    """Return the pairs accepted by filterPair, in their original order."""
    return list(filter(filterPair, pairs))


In [238]:
# Drop the pairs rejected by filterPair.
# NOTE: `pairs` is overwritten, so the unfiltered list is no longer available after this cell.
pairs = filterPairs(pairs)
print("Filtered Pairs = {}".format(len(pairs)))

Filtered Pairs = 64271


In [239]:
# Add the question and reply sentence pairs to the vocabulary
for pair in pairs:
    for sentence in (pair[0], pair[1]):
        vocab.addSentence(sentence)
print("Unique word count = {}".format(vocab.numwords))

Unique word count = 18008


In [240]:
# Preview the first ten pairs that survived the length filter.
for pair in pairs[0:10]:
    print(pair)

['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [241]:
# Keep only those words which occur at least MIN_COUNT times :)
# (Vocabulary.trim keeps a word when its count is >= min_count.)
MIN_COUNT = 3

def trimRareWords(vocab, pairs, MIN_COUNT):
    """Trim rare words from `vocab` and drop every pair that uses one of them.

    Args:
        vocab: object exposing `trim(min_count)` and a `word2index` mapping.
        pairs: list of [input_sentence, output_sentence] pairs.
        MIN_COUNT: threshold forwarded to `vocab.trim`.

    Returns:
        A new list with the pairs whose two sentences contain only words
        still present in `vocab.word2index` after trimming.
    """
    vocab.trim(MIN_COUNT)

    def all_known(sentence):
        # A sentence survives only if every one of its words is still in the vocabulary.
        return all(word in vocab.word2index for word in sentence.split(' '))

    keep_pairs = [pair for pair in pairs if all_known(pair[0]) and all_known(pair[1])]

    # Guard the ratio so an empty input list does not divide by zero.
    ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
    print("Trimmed from {} pairs to {} pairs, {:.4f} of total".format(len(pairs), len(keep_pairs), ratio))
    return keep_pairs

# Trims `vocab` in place and replaces `pairs` with the pairs that only use kept words.
pairs = trimRareWords(vocab, pairs, MIN_COUNT)


Keep words 7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 53165 pairs, 0.8272 of total


In [0]:
################################################### Data Preparation ##########################################################

In [0]:
# Encode a sentence into a list of integers

def indexesFromSentence(vocab, sentence):
    """Map each space-separated word of `sentence` to its id and append EOS_TOKEN.

    Raises KeyError when a word is missing from `vocab.word2index`.
    """
    token_ids = []
    for word in sentence.split(' '):
        token_ids.append(vocab.word2index[word])
    token_ids.append(EOS_TOKEN)
    return token_ids

In [244]:
# Example: encode one question; the final id in the result is EOS_TOKEN.
print(pairs[1][0])
indexesFromSentence(vocab, pairs[1][0])

you have my word . as a gentleman


[7, 8, 9, 10, 4, 11, 12, 13, 2]

In [245]:
# Padding the sentences for equal length
def zeroPadding(l, fillvalue = 0):
    """Transpose the sequences in `l`, filling short ones with `fillvalue`.

    Returns a list of tuples: tuple t holds the t-th element of every sequence.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [step for step in columns]

# Defining a sample for testing: the first ten questions and replies
sample_pairs = pairs[:10]
inp = [pair[0] for pair in sample_pairs]
out = [pair[1] for pair in sample_pairs]
print(inp)
print(len(inp))
indexes = [indexesFromSentence(vocab, sentence) for sentence in inp]
indexes

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']
10


[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [58, 2]]

In [246]:
# Length of the longest encoded sample sentence (EOS included).
leng = [len(ind) for ind in indexes]
max(leng)

10

In [247]:
# Shows the padded sample as a (longest sentence length) x (number of sentences) matrix.
# Each column is one sentence; each row is one time step.
test_result = zeroPadding(indexes)
test_result

[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 0),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 0),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [0]:
def binaryMatrix(l, value=0):
    """Build a 0/1 mask with the same layout as `l`: 0 for PAD_TOKEN, 1 otherwise.

    `l` is a list of sequences such as the output of zeroPadding. The
    `value` argument is accepted but not used.
    """
    return [[0 if token == PAD_TOKEN else 1 for token in seq] for seq in l]

In [249]:
# Mask for the padded sample: 1 where there is a real token, 0 where there is padding.
binaryResult = binaryMatrix(test_result)
binaryResult

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [0]:
# Turning batches of real sentences into tensors

def inputVar(l, vocab):
    """Encode and pad a batch of input sentences.

    Returns:
        padVar: LongTensor built from the zero-padded, transposed id lists
            (one column per sentence).
        lengths: tensor with the encoded length of every sentence (EOS included).
    """
    encoded = [indexesFromSentence(vocab, sentence) for sentence in l]
    lengths = torch.tensor(list(map(len, encoded)))
    padVar = torch.LongTensor(zeroPadding(encoded))
    return padVar, lengths

def outputVar(l, vocab):
    """Encode and pad a batch of target sentences and build their padding mask.

    Returns:
        padVar: LongTensor built from the zero-padded, transposed id lists.
        mask: BoolTensor of the same layout, False at padded positions.
        max_target_len: encoded length of the longest target (EOS included).
    """
    encoded = [indexesFromSentence(vocab, sentence) for sentence in l]
    max_target_len = max(len(seq) for seq in encoded)
    padded = zeroPadding(encoded)
    padVar = torch.LongTensor(padded)
    mask = torch.BoolTensor(binaryMatrix(padded))
    return padVar, mask, max_target_len


In [0]:
def batch2TrainData(vocab, pair_batch):
    """Convert a batch of [question, reply] pairs into padded training tensors.

    The pairs are ordered by question length (in words), longest first,
    before being encoded. The caller's list is left untouched; a sorted copy
    is used instead.

    Returns:
        (inp, lengths, out, mask, max_target_len), i.e. the results of
        inputVar on the questions followed by those of outputVar on the replies.
    """
    # Sort by QUESTION LENGTH in DESCENDING order (stable, on a copy)
    ordered = sorted(pair_batch, key=lambda x: len(x[0].split(' ')), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp, lengths = inputVar(input_batch, vocab)
    out, mask, max_target_len = outputVar(output_batch, vocab)
    return inp, lengths, out, mask, max_target_len

In [297]:
# Example: build one small random batch and print every tensor it produces.
small_batch_size = 5
# Pairs are drawn with random.choice, so the same pair can appear more than once.
batches = batch2TrainData(vocab, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("Input Variable:")
print(input_variable)
print("Lengths of each sentence:")
print(lengths)
print("Target Variable:")
print(target_variable)
print("Mask:")
print(mask)
print("Max target length : ", max_target_len)

Input Variable:
tensor([[ 270,   51,   50,  124,   23],
        [  83, 5060,  368,  601,    6],
        [   6, 5061,   40,    4,    2],
        [ 379,   12,   53,    4,    0],
        [  25,   51, 6153,    4,    0],
        [  41, 3426, 2741,    2,    0],
        [ 319,    4,    6,    0,    0],
        [   6,    2,    2,    0,    0],
        [   2,    0,    0,    0,    0]])
Lengths of each sentence:
tensor([9, 8, 8, 6, 3])
Target Variable:
tensor([[  56,   25,  467,  598,  716],
        [ 827,  450,    4, 4065,  109],
        [   4,   24,    2, 1204,  460],
        [   2,   86,    0,    4,  111],
        [   0,   56,    0,    2,    6],
        [   0,    7,    0,    0,    2],
        [   0,    9,    0,    0,    0],
        [   0, 1082,    0,    0,    0],
        [   0,    4,    0,    0,    0],
        [   0,    2,    0,    0,    0]])
Mask:
tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True

In [0]:
######################################################### DEFINING THE MODEL ##################################################################

In [0]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder over an embedded, padded batch of sentences.

    Args:
        hidden_size: number of units in the GRU hidden layer (NOT time steps);
            also used as the GRU input size, so the embedding must produce
            vectors of this size.
        embedding: embedding module applied to the input token ids.
        n_layers: number of stacked GRU layers.
        dropout: dropout between GRU layers (forced to 0 for a single layer).
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers==1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a batch.

        Args:
            input_seq: token ids laid out as (max_len, batch), the layout
                pack_padded_sequence expects with its default batch_first=False.
            input_lengths: true length of every sentence, longest first.
            hidden: optional initial GRU hidden state.

        Returns:
            outputs: the forward and backward GRU outputs summed together.
            hidden: final GRU hidden state.
        """
        embedded = self.embedding(input_seq)

        # pack_padded_sequence requires the lengths on the CPU, even when the
        # data itself lives on the GPU.
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()

        # Packing flattens the batch into one sequence of real tokens plus a
        # record of how many sentences are still active at each time step, so
        # the GRU never processes padding.
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)

        outputs, hidden = self.gru(packed, hidden)

        # Back to a regular padded tensor.
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)

        # The last dimension holds [forward | backward] outputs; add the halves.
        outputs = outputs[:,:,:self.hidden_size] + outputs[:,:,self.hidden_size:]

        return outputs, hidden

In [0]:
class Attention(nn.Module):
    """Dot-product attention over the encoder outputs.

    `method` and `hidden_size` are stored on the instance; the score
    computed here is always the dot product.
    """

    def __init__(self, method, hidden_size):
        super().__init__()
        self.method = method
        self.hidden_size = hidden_size

    def dot_score(self, hidden, encoder_output):
        """Dot product of the decoder state with every encoder output (sum over dim 2)."""
        return (hidden * encoder_output).sum(dim=2)

    def forward(self, hidden, encoder_output):
        """Return attention weights shaped (batch_size, 1, max_length).

        `hidden` is the decoder GRU output for one time step,
        (1, batch_size, hidden_size); `encoder_output` is
        (max_seq_len, batch_size, hidden_size). Their element-wise product
        broadcasts to the encoder shape and is summed over the hidden dimension.
        """
        energies = self.dot_score(hidden, encoder_output)  # (max_length, batch_size)
        energies = energies.t()                            # (batch_size, max_length)
        weights = F.softmax(energies, dim=1)               # normalise over the source positions
        return weights.unsqueeze(1)

In [0]:
class Decoder(nn.Module):
    """GRU decoder with attention that predicts one target word per call.

    Args:
        attention_model: attention method name forwarded to Attention.
        embedding: embedding module applied to the input token ids.
        hidden_size: GRU input and hidden size.
        output_size: size of the output distribution (the vocabulary size).
        n_layers: number of stacked GRU layers.
        dropout: dropout on the embedding and between GRU layers
            (GRU dropout is forced to 0 for a single layer).
    """

    def __init__(self, attention_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()
        self.attention_model = attention_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers==1 else dropout))
        self.concat = nn.Linear(hidden_size*2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attention = Attention(attention_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_output):
        """Run one decoding step for the whole batch.

        Args:
            input_step: (1, batch_size) ids of the current input word.
            last_hidden: previous GRU hidden state (for the first step, the
                state handed over from the encoder).
            encoder_output: all encoder outputs, (sentence_len, batch_size, hidden_size).

        Returns:
            output: (batch_size, output_size) softmax probabilities over the vocabulary.
            hidden: updated GRU hidden state.
        """
        # Embed the current word and apply dropout.
        embedded = self.embedding_dropout(self.embedding(input_step))

        # One GRU step: rnn_output is (1, batch_size, hidden_size).
        rnn_output, hidden = self.gru(embedded, last_hidden)

        # Attention weights come back as (batch_size, 1, max_length).
        attention_weights = self.attention(rnn_output, encoder_output)

        # Weighted sum of the encoder outputs:
        # (batch, 1, max_length) x (batch, max_length, hidden) -> (batch, 1, hidden),
        # then squeezed to (batch, hidden).
        context = torch.bmm(attention_weights, encoder_output.transpose(0, 1)).squeeze(1)

        # Join the GRU output (squeezed to 2-D) with the context -> (batch, hidden * 2),
        # then project back to hidden size.
        combined = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = torch.tanh(self.concat(combined))

        # Project to vocabulary size and turn each row into probabilities.
        probabilities = F.softmax(self.out(concat_output), dim=1)
        return probabilities, hidden
    

In [0]:
def maskNLLLoss(decoder_out, target, mask): # To NOT calculate loss for padded spaces
    """Average negative log-likelihood over the non-padded targets of one time step.

    Args:
        decoder_out: (batch_size, vocab_size) probabilities from the decoder.
        target: (batch_size,) ids of the correct words for this time step.
        mask: (batch_size,) boolean tensor, False where the target is padding.

    Returns:
        loss: mean of -log p(target) over the positions where mask is True.
        nTotal: number of positions that were counted.
    """
    nTotal = mask.sum() # Number of elements to consider
    target = target.view(-1,1)

    # Probability assigned to the correct word of every sentence. The squeeze
    # brings it back to shape (batch_size,) so it lines up with `mask`;
    # without it masked_select would broadcast (batch, 1) against (batch,)
    # and the padded positions would leak into the mean.
    gathered_tensor = torch.gather(decoder_out, 1, target).squeeze(1)

    crossEntropy = -torch.log(gathered_tensor)

    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)

    return loss, nTotal.item()

In [0]:
################################################################# TRAINING ########################################################################

In [469]:
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< This is only for visualization >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Walks through one training iteration on a tiny batch, printing shapes and
# losses at every decoder time step. The module-level names created here
# (hidden_size, embedding, encoder, decoder, the optimizers, ...) stay
# defined after the cell runs.
small_batch_size = 5
batches = batch2TrainData(vocab, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

### One time step is one batch of words ###

print("Input Variable:")
print(input_variable)
print("Lengths of each sentence:")
print(lengths)
print("Target Variable:")
print(target_variable)
print("Mask:")
print(mask)

print("Input Variable Shape:")
print(input_variable.shape)
print("Lengths Shape:")
print(lengths.shape)
print("Target Variable Shape:")
print(target_variable.shape)
print("Mask Shape:")
print(mask.shape)
print("Max target length : ", max_target_len)

# Defining the parameters
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
attention_model = 'dot'
embedding = nn.Embedding(vocab.numwords, hidden_size)

# Defining the encoder and decoder (they share the same embedding module)
encoder = Encoder(hidden_size, embedding, encoder_n_layers, dropout)
decoder = Decoder(attention_model, embedding, hidden_size, vocab.numwords, decoder_n_layers,dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)

encoder.train()
decoder.train()

# parameters() hands each optimizer the weights it is responsible for updating
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

input_variable = input_variable.to(device)
# The lengths are only used for packing, which requires them on the CPU.
lengths = lengths.to("cpu")
target_variable = target_variable.to(device)
mask = mask.to(device)

loss = 0
print_losses = []
n_totals = 0

encoder_output, encoder_hidden = encoder(input_variable, lengths)
print("Encoder Output Shape = ",encoder_output.shape)
print("Last Encoder Hidden State Shape = ",encoder_hidden.shape)

decoder_input = torch.LongTensor([[SOS_TOKEN for _ in range(small_batch_size)]])
decoder_input = decoder_input.to(device)
print("Initial Decoder Input Shape = ",decoder_input.shape)
print(decoder_input)

# Last encoder hidden state is passed to the decoder as the initial hidden state
decoder_hidden = encoder_hidden[:decoder.n_layers]
print("Initial decoder hidden state shape = ",decoder_hidden.shape)
print("\n")
print("----------------------------------------------------------")
print("THIS IS WHAT HAPPENS AT EVERY TIME STEP OF THE GRU!")
print("----------------------------------------------------------")
print("\n")

for t in range(max_target_len):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)
    print("Decoder Output Shape = ", decoder_output.shape)    
    print("Decoder Hidden State Shape = ", decoder_hidden.shape)

    decoder_input = target_variable[t].view(1,-1) # Cause Teacher Forcing
    print("Target Variable now = ", target_variable[t])
    print("Target Variable Shape now = ", target_variable[t].shape)
    print("Decoder input shape after reshaping = ", decoder_input.shape)

    # Loss
    print("Mask for current timestep", mask[t])
    print("Mask shape for current timestep", mask[t].shape)
    mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
    print("Mask Loss = ", mask_loss)
    print("Total = ", nTotal)

    loss += mask_loss
    print_losses.append(mask_loss.item()*nTotal)
    print(print_losses)
    n_totals += nTotal
    print(nTotal)

    # Running average of the per-token loss over the steps seen so far
    returned_loss = sum(print_losses)/n_totals
    print("Returned Loss = ", returned_loss)
    print("\n")
    print("----------------------------------------DONE ONE STEP-----------------------------------")
    print("\n")

# Gradients exist only after backward(), so the optimizers are stepped once
# here, after the whole sequence has been decoded, as train() does.
loss.backward()
encoder_optimizer.step()
decoder_optimizer.step()
 

Input Variable:
tensor([[  42,   50,  252,   34,  318],
        [  76,   25,  387, 4160,    7],
        [  37,   47,   25,  916,   94],
        [  67,   40,    4,    4,    4],
        [ 325, 5640,    2,    2,    2],
        [ 115,    6,    0,    0,    0],
        [  76,    2,    0,    0,    0],
        [   6,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
Lengths of each sentence:
tensor([9, 7, 5, 5, 5])
Target Variable:
tensor([[  50,  318,   62,    7,  383],
        [   6,  614,   50,  389,    7],
        [   2,   83,   94, 4012,    4],
        [   0, 5640,   27,  276,    2],
        [   0,    4, 1589,    4,    0],
        [   0,    2,    6,    2,    0],
        [   0,    0,    2,    0,    0]])
Mask:
tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [False,  True,  True,  True,  True],
        [False,  True,  True,  True, False],
        [False,  True,  True,  True, False],

In [0]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder,
          embedding, encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LEN):
    """Run one training iteration on a single batch and update both models.

    Args:
        input_variable, lengths, target_variable, mask, max_target_len:
            one batch as produced by batch2TrainData.
        encoder, decoder: the models to train.
        embedding: accepted but not used inside this function.
        encoder_optimizer, decoder_optimizer: optimizers stepped once each.
        batch_size: number of sentences in the batch.
        clip: maximum gradient norm applied to each model.
        max_length: accepted but not used inside this function.

    Returns:
        The average loss per non-padded target token of this batch.

    Reads `device`, `SOS_TOKEN` and `teacher_forcing_ratio` from module scope.
    NOTE(review): `teacher_forcing_ratio` is not defined in the visible part
    of the notebook — confirm it is set before this function is called.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # The lengths are only used for packing, which requires them on the CPU.
    lengths = lengths.to("cpu")

    loss = 0
    print_losses = []
    n_totals = 0

    encoder_output, encoder_hidden = encoder(input_variable, lengths)
    decoder_input = torch.LongTensor([[SOS_TOKEN for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    # The first decoder.n_layers slices of the encoder state seed the decoder.
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decide once per batch whether the decoder is fed the true targets.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_output)

        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)

        # Calculate and accumulate loss
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    loss.backward()

    # Clip gradients in place to keep them from exploding.
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses)/n_totals


In [0]:
def trainIters(model_name, vocab, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
               embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, 
               print_every, save_every, clip, corpus_name, loadFilename):
    """Train for `n_iteration` iterations with periodic logging and checkpoints.

    One random batch of `batch_size` pairs is prepared up front for every
    iteration. Every `print_every` iterations the average loss is printed;
    every `save_every` iterations a checkpoint is written under
    save_dir/model_name/corpus_name/<enc layers>-<dec layers>_<hidden_size>/.

    When `loadFilename` is truthy, training resumes after the iteration
    stored in the module-level `checkpoint` object; this function does not
    load the file itself. `hidden_size` is also read from module scope.
    """
    training_batches = [batch2TrainData(vocab, [random.choice(pairs) for _ in range(batch_size)]) for _ in range(n_iteration)]

    # Initializations
    print('Initializing...')
    start_iteration = 1
    print_loss = 0

    if loadFilename:
        # NOTE(review): relies on a global `checkpoint` loaded elsewhere — confirm.
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration-1]
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(directory, exist_ok=True)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': vocab.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))


In [0]:
######################################################### TALKING WITH THE BOT ####################################################################

In [0]:
# For reading in user input and responding

class GreedySearchDecoder(nn.Module):
    """Greedy decoding wrapper around an encoder/decoder pair.

    At every decoding step the highest-scoring entry of the decoder output is
    selected and fed back to the decoder as its next input.
    """

    def __init__(self, encoder, decoder):
        """Store the encoder and decoder modules used by ``forward``."""
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Run ``max_length`` greedy decoding steps for one input sequence.

        Args:
            input_seq: tensor of token indexes, passed unchanged to the encoder
                (presumably shaped (seq_len, 1) -- confirm against the encoder).
            input_length: lengths tensor, passed unchanged to the encoder.
            max_length: number of decoding steps to run; decoding does not
                stop early when an end-of-sentence token is produced.

        Returns:
            Tuple ``(all_tokens, all_scores)``: the chosen token indexes and
            their scores, concatenated along dim 0 in decoding order.
        """
        # Build new tensors on the same device as the input rather than relying
        # on a notebook-level global.
        run_device = input_seq.device

        # Encode the input sequence through the encoder model
        encoder_output, encoder_hidden = self.encoder(input_seq, input_length)

        # Encoder's last hidden state is decoder's first hidden state.
        # Use the decoder owned by this module, not a global named `decoder`.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]

        # Decoder input starts with SOS_TOKEN
        decoder_input = torch.ones(1, 1, device=run_device, dtype=torch.long) * SOS_TOKEN

        # Initialize tensors where the words will be appended after they're found
        all_tokens = torch.zeros([0], device=run_device, dtype=torch.long)
        all_scores = torch.zeros([0], device=run_device)

        # Decode one word at a time
        for _ in range(max_length):

            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_output)

            # Get most likely word and its score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)

            # Store the word and score in the tensors
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)

            # Re-add the leading dimension so the chosen word can be the next input
            decoder_input = torch.unsqueeze(decoder_input, 0)

        return all_tokens, all_scores


In [0]:
# Make input sentence fit for answering, gives it to searcher, gets back the answer, and makes it fit for reading
def evaluate(encoder, decoder, searcher, vocab, sentence, max_length = MAX_LEN):
    """Turn one sentence into the list of words produced by ``searcher``.

    Args:
        encoder, decoder: accepted for interface symmetry; not used directly
            here (the searcher is what gets called).
        searcher: callable invoked as ``searcher(input_batch, lengths, max_length)``
            and returning ``(tokens, scores)``.
        vocab: vocabulary object providing ``index2word`` and consumed by
            ``indexesFromSentence``.
        sentence: the input sentence to answer.
        max_length: number of decoding steps forwarded to the searcher.

    Returns:
        List of words looked up in ``vocab.index2word`` for every returned token.
    """
    # Single-sentence batch: one row of word indexes
    # (assumes indexesFromSentence returns a flat list of ints -- TODO confirm).
    sentence_indexes = indexesFromSentence(vocab, sentence)

    # One length entry per batch row, moved to the working device
    seq_lengths = torch.tensor([len(sentence_indexes)]).to(device)

    # Swap the two axes of the (1, n) batch before handing it to the searcher
    input_batch = torch.LongTensor([sentence_indexes]).transpose(0, 1).to(device)

    # Only the token indexes are needed; the scores are discarded
    tokens, _ = searcher(input_batch, seq_lengths, max_length)

    return [vocab.index2word[token.item()] for token in tokens]


In [0]:
# Talking
def talk(encoder, decoder, searcher, vocab):
    """Interactive chat loop: read a line, print the bot's reply, repeat.

    Typing ``q`` or ``quit`` ends the loop. A ``KeyError`` raised while
    handling a sentence is reported with a message and the loop continues.
    """
    hidden_tokens = ('EOS', 'PAD')

    while True:
        try:
            input_sentence = input('> ')
            if input_sentence in ('q', 'quit'):
                break

            # Clean the raw text the same way as the rest of the notebook does
            cleaned_sentence = normalizeString(input_sentence)

            reply_words = evaluate(encoder, decoder, searcher, vocab, cleaned_sentence)

            # Drop the special end/padding markers before printing
            visible_words = [word for word in reply_words if word not in hidden_tokens]
            print('Summer:', ' '.join(visible_words))

        except KeyError:
            print("Error: Huh. Haven't seen that before.")

In [485]:
# Model configuration
model_name = 'Summer'
corpus_name = 'Cornell'
attention_model = 'dot'      # first argument handed to Decoder below
hidden_size = 500            # embedding width; also passed to Encoder and Decoder
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Checkpoint configuration
save_dir = os.getcwd()
loadFilename = None          # set to a checkpoint path to resume from it
checkpoint_iter = 4000       # only used to build the example path below

#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                          '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                          '{}_checkpoint.tar'.format(checkpoint_iter))

# Load model if a loadFilename is provided
if loadFilename:
    # map_location=device remaps the stored tensors onto the active device, so a
    # checkpoint saved on a GPU machine also loads on a CPU-only one (and vice versa).
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary exactly as it was when the checkpoint was written
    vocab.__dict__ = checkpoint['voc_dict']

# FINAL FINALLY
print('Building the Encoder and Decoder...')

# One embedding instance is passed to both the encoder and the decoder
embedding = nn.Embedding(vocab.numwords, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)

encoder = Encoder(hidden_size, embedding, encoder_n_layers, dropout)
decoder = Decoder(attention_model, embedding, hidden_size, vocab.numwords, decoder_n_layers, dropout)

if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

# Move both models to the selected device
encoder = encoder.to(device)
decoder = decoder.to(device)

print("We're ready to go!")


Building the Encoder and Decoder...
We're ready to go!


In [487]:
# Training, FINAL FINAL FINALLYYYYY
clip = 50.0
teacher_forcing_ratio = 1.0      # not referenced in this cell; presumably read as a global during training -- verify
learning_rate = 0.0001
decoder_learning_ratio = 5.0     # decoder learning rate = learning_rate * this factor
n_iteration = 4000
print_every = 1
save_every = 1000

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Keep any optimizer state tensors on the same device as the models.
# `.to(device)` works on both GPU and CPU-only machines, whereas an
# unconditional `.cuda()` fails when CUDA is unavailable.
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)
print("Starting Training!")

trainIters(model_name, vocab, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name,loadFilename)

print("Trained.")



Building optimizers ...
Starting Training!
Initializing...
Training...
Trained.


In [0]:
# Switch both networks to evaluation mode before chatting
decoder.eval()
encoder.eval()

# Wrap the two networks in the greedy decoding module used by talk()/evaluate()
searcher = GreedySearchDecoder(encoder, decoder)

In [505]:
# Start the interactive chat session (type 'q' or 'quit' to stop)
talk(encoder=encoder, decoder=decoder, searcher=searcher, vocab=vocab)

> hi
Summer: hi .
> get me coffee
Summer: you re a stuttering !
> no i am not?
Summer: you are .
> what the hell
Summer: i m sorry i m sorry .
> okay I'm going
Summer: okay .
> sure?
Summer: i m sure .
> why do you say that
Summer: i m sorry .
> but why
Summer: because it was a compliment .
> really
Summer: you know what i m saying ?
> I think so
Summer: i m sorry .
> never mind
Summer: you ll never be able to do it .
> i will do it
Summer: you can t .
> goodnight
Summer: goodnight .
> quit
