## Importing Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

In [2]:
device

device(type='cpu')

## Data Preprocessing

In [3]:
lines_filepath = os.path.join("cornell movie-dialogs corpus", "movie_lines.txt")
conv_filepath = os.path.join("cornell movie-dialogs corpus", "movie_conversations.txt")

In [4]:
#Visualize Lines
with open(lines_filepath, 'r') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [5]:
#Categorizing
line_fields = ["lineID","characterID", "movieID", "character", "text"]
lines = {}
with open(lines_filepath, 'r', encoding ='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")
        lineObj = {}
        for i, field in enumerate(line_fields):
            lineObj[field] = values[i]
        lines[lineObj['lineID']] = lineObj
    

In [6]:
#Categorizing Conversation
conv_fields = ["character1ID","character2ID", "movieID", "utteranceID"]
conversation = []
with open(conv_filepath, 'r', encoding ='iso-8859-1') as f:
     for line in f:
        values = line.split(" +++$+++ ")
        convObj = {}
        for i, field in enumerate(conv_fields):
            convObj[field] =  values[i]
        lineIDs = eval(convObj["utteranceID"])
        convObj["lines"] = []
        for lineID in lineIDs:
            convObj["lines"].append(lines[lineID])
        conversation.append(convObj)

In [7]:
#Pairs of sentences
qa_pairs = []
for conv in conversation:
    for i in range(len(conv["lines"])-1):
        inputline  = conv["lines"][i]["text"].strip()
        targetline = conv["lines"][i+1]["text"].strip()
        if inputline and targetline:
            qa_pairs.append([inputline, targetline])
        
    

In [8]:
#Writing to a file

datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
delimiter = "\t"
print("\nWriting into output file..")
with open(datafile , 'w', encoding = 'utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter =  delimiter)
    for pair in qa_pairs:        
        writer.writerow(pair)
print("\nDone")


Writing into output file..

Done


In [9]:
datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
with open(datafile ,'rb') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\r\n"
b'Why?\tU

In [10]:
PAD_token = 0 #For padding short sentences
SOS_token = 1 #For start of a sentence
EOS_token = 2 #For the end of a sentence

class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 #Count EOS SOS PAD
        
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addword(word)
    
    def addword(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words+=1
        else:
            self.word2count[word] += 1

            
    
    def trim(self, min_count):
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)))
        
        #Reinitialize 
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 #Count EOS SOS PAD
        
        for word in keep_words:
            self.addword(word)

In [11]:
#Unicode to ASCII
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c)!='Mn')
    

In [12]:
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1",s)
    s = re.sub(r"[^a-zA-z.!?]+", r" " ,s)
    s = re.sub(r"\s", r" ", s)
    return s

In [13]:
normalizeString("aa12 ?11")

'aa ? '

In [14]:
datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
print("Reading into the file..")
lines = open(datafile, encoding = 'utf-8').read().strip().split('\n')
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines]
print('Done Reading')
voc = Vocabulary("cornell movie-dialogs corpus")

Reading into the file..
Done Reading


In [15]:
#Filter
MAXLEN = 10
def filterPair(p):
    return len(p[0].split()) < MAXLEN and len(p[1].split())  < MAXLEN

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [16]:
pairs  = [pair for pair in pairs if len(pair)>1]
print(" There are {} pairs/conversation before filering".format(len(pairs)))
pairs = filterPairs(pairs)
print(" After filtering, there are {} pairs/conversations".format(len(pairs)))

 There are 221282 pairs/conversation before filering
 After filtering, there are 64266 pairs/conversations


In [17]:
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print("Counted words:", voc.num_words)
for pair in pairs[:10]:
    print(pair)
save_dir = os.path.join("data", "save")
corpus_name = "cornell movie-dialogs corpus"

Counted words: 18077
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', ' the real you . ']


In [18]:
MINLEN = 3

def trimwords(voc, pairs, MINLEN):
    voc.trim(MINLEN)
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs)/len(pairs)))
    return keep_pairs       
pairs = trimwords(voc, pairs, MINLEN)

keep_words 7837 / 18074 = 0.4336
Trimmed from 64266 pairs to 53115, 0.8265 of total


In [19]:
def indexFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

In [20]:
indexFromSentence(voc, pairs[1][0])

[7, 8, 9, 10, 4, 11, 12, 13, 2]

In [21]:
inp = []
out = []
for pair in pairs[:10]:
    inp.append(pair[0])
    out.append(pair[1])
print(inp)
print(len(inp))
indexes = [indexFromSentence(voc, sentence) for sentence in inp]
indexes

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']
10


[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [59, 2]]

In [22]:
def zeroPadding( l, fillvalue = 0):
    return list(itertools.zip_longest(*l, fillvalue = fillvalue))

In [23]:
length = [len(ind) for ind in indexes]
max(length)

10

In [24]:
#Test the function
test_result  = zeroPadding(indexes)
print(len(test_result))
test_result

10


[(3, 7, 16, 8, 33, 35, 42, 47, 50, 59),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 0),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 0),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [25]:
def binaryMatrix( l , value = 0):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

In [26]:
binary_result =  binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [27]:
def inputVar( l, voc):
    indexes_batch = [indexFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(index) for index in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

In [28]:
def outputVar(l,voc):
    indexes_batch = [indexFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(index) for index in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask  = binaryMatrix(padList)
    mask  = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

In [29]:
def trainDataBatch(voc, pair_batch):
    pair_batch.sort(key = lambda x: len(x[0].split(' ')), reverse = True)
    input_batch, output_batch = [],[]
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, mask_max_leng = outputVar(output_batch, voc)
    return inp, lengths, output, mask, mask_max_leng

In [30]:
batch_size = 5
batches = trainDataBatch(voc , [random.choice(pairs) for _ in range(batch_size)] )
input_var , length, target_var, mask , max_target = batches
print("Input_Var")
print(input_var)
print("Target_Var")
print(target_var)

print("mask")
print(mask)
print("max_target")
print(max_target)

Input_Var
tensor([[  25,   76,   25, 1904,  319],
        [ 114,  158,  411,    4,    6],
        [  40,  157,    4,    2,    2],
        [2205,    4,    2,    0,    0],
        [   4,    2,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
Target_Var
tensor([[   7,   63,    7,   50, 3020],
        [  90,    4,   24,    6, 5580],
        [ 402,    2,    5,    2,    4],
        [  40,    0,   54,    0,    2],
        [  54,    0,  593,    0,    0],
        [ 267,    0,   93,    0,    0],
        [ 200,    0,    6,    0,    0],
        [   4,    0,    2,    0,    0],
        [   2,    0,    0,    0,    0]])
mask
tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True, False,  True, False,  True],
        [ True, False,  True, False, False],
        [ True, False,  True, False, False],
        [ True, False,  True, False, False],
        [ True, False,  True, False, False],
        [ Tr

In [31]:
#Encoder
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers = 1, dropout =0):
        super(EncoderRNN,self).__init__()
        self.n_layers = n_layers
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size , n_layers , dropout=( 0 if n_layers == 1 else dropout), bidirectional = True)
        
    def forward(self, input_seq, input_lengths,  hidden = None):
        embedded = self.embedding(input_seq)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:,:, :self.hidden_size]+ outputs[:,:, self.hidden_size:]
        return outputs,hidden

In [32]:
#Attention Model
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        
    def dot_score(self, hidden, encoder_outputs):
        return torch.sum(hidden*encoder_outputs, dim = 2)
    
    def forward(self, hidden, encoder_outputs):
        attn_energies = self.dot_score(hidden, encoder_outputs)
        attn_energies = attn_energies.t()
        return F.softmax(attn_energies , dim = 1).unsqueeze(1)

In [33]:
#Attention Decoder
class AttnDecoder(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers = 1, dropout =0.1):
        super(AttnDecoder, self).__init__()
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size , n_layers , dropout=( 0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size*2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)
        
    def forward(self, input_step, last_hidden, encoder_outputs):
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output, hidden = self.gru(embedded, last_hidden)
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0,1))
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output,context),1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        output = F.softmax(output, dim = 1)
        return output, hidden
    
            

In [34]:
def maskLoss(decoder_output, target, mask):
    nTotal = mask.sum()
    target = target.view(-1,1)
    gathered_tensor = torch.gather(decoder_output, 1, target)
    crossEntropy = -torch.log(gathered_tensor)
    loss = crossEntropy.masked_select(mask)
    loss = loss.mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [35]:
#Visualizing for one iteration
small_batch_size = 5
batches = trainDataBatch(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_var, lengths, target_var, mask, max_target_len = batches

print("Input Variable Shape:", input_var.shape)
print("Lengths Shape:", lengths.shape)
print("Target Variable Shape:", target_var.shape)
print("Mask Shape:", mask.shape)
print("Max Target Length :", max_target_len)

#Parameters
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
attn_model = 'dot'
embedding = nn.Embedding(voc.num_words, hidden_size)

#Defining encoder and decoder
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = AttnDecoder(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)

encoder.train()
decoder.train()

#Optimizer
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr =0.0001)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr =0.0001)
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

input_var = input_var.to(device)
lengths = lengths.to(device)
target_var = target_var.to(device)
mask = mask.to(device)

loss = 0
print_losses = []
n_totals = 0

encoder_outputs, encoder_hidden = encoder(input_var, lengths)
print("Encoder Outputs Shape:", encoder_outputs.shape)
print("Last Encoder Hidden Shape", encoder_hidden.shape)

decoder_input = torch.LongTensor([[SOS_token for _ in range(small_batch_size)]])
decoder_input = decoder_input.to(device)
print("Encoder Decoder Input Shape:", decoder_input.shape)
print(decoder_input)

#Set Initial decoder hidden state to encoder's final hidden state
decoder_hidden = encoder_hidden[:decoder.n_layers]
print("Initial Decoder Hidden State Shape:", decoder_input.shape)
print("\n")
print("-------------------------")
print("Every timestep of GRU")
print("-------------------------")
print("\n")

#Teacher Forcing
for t in range(max_target_len):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
    print("Decoder Output Shape:", decoder_output.shape)
    print("Decoder Hidden Shape:", decoder_hidden.shape)
    #Teacher forcing
    decoder_input = target_var[t].view(1,-1)
    print("Target Variable before reshaping: ", target_var[t])
    print("Shape of Target Variable before reshaping", target_var[t].shape)
    print("Decoder input shape:", decoder_input.shape)
    
    #Calculaye and accumulate loss
    print("Mask at the current timestep", mask[t])
    print("Mask at the current timestep shape", mask[t].shape)
    mask_loss, nTotal = maskLoss(decoder_output, target_var[t], mask[t])
    print("Mask Loss:", mask_loss)
    print("Total:", nTotal)
    loss += mask_loss
    print_losses.append(mask_loss.item()*nTotal)
    print(print_losses)
    n_totals += nTotal
    print(n_totals)
    encoder_optimizer.step()
    decoder_optimizer.step()
    returned_loss = sum(print_losses)/n_totals
    print("Returned Loss", returned_loss)
    print("\n")
    print("---------END OF TIMESTEP---------")
    print("\n")

Input Variable Shape: torch.Size([10, 5])
Lengths Shape: torch.Size([5])
Target Variable Shape: torch.Size([9, 5])
Mask Shape: torch.Size([9, 5])
Max Target Length : 9
Encoder Outputs Shape: torch.Size([10, 5, 500])
Last Encoder Hidden Shape torch.Size([4, 5, 500])
Encoder Decoder Input Shape: torch.Size([1, 5])
tensor([[1, 1, 1, 1, 1]])
Initial Decoder Hidden State Shape: torch.Size([1, 5])


-------------------------
Every timestep of GRU
-------------------------


Decoder Output Shape: torch.Size([5, 7840])
Decoder Hidden Shape: torch.Size([2, 5, 500])
Target Variable before reshaping:  tensor([1380,  319,  116,  319,   50])
Shape of Target Variable before reshaping torch.Size([5])
Decoder input shape: torch.Size([1, 5])
Mask at the current timestep tensor([True, True, True, True, True])
Mask at the current timestep shape torch.Size([5])
Mask Loss: tensor(8.9716, grad_fn=<MeanBackward0>)
Total: 5
[44.858155250549316]
5
Returned Loss 8.971631050109863


---------END OF TIMESTEP-----

In [36]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAXLEN):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [37]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [trainDataBatch(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [38]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores


In [39]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAXLEN):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [40]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = AttnDecoder(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [41]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have cuda, configure cuda to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 8.9798
Iteration: 2; Percent complete: 0.1%; Average loss: 8.7711
Iteration: 3; Percent complete: 0.1%; Average loss: 8.5078
Iteration: 4; Percent complete: 0.1%; Average loss: 8.2358
Iteration: 5; Percent complete: 0.1%; Average loss: 7.5063
Iteration: 6; Percent complete: 0.1%; Average loss: 7.1147
Iteration: 7; Percent complete: 0.2%; Average loss: 6.6519
Iteration: 8; Percent complete: 0.2%; Average loss: 6.7473
Iteration: 9; Percent complete: 0.2%; Average loss: 6.6091
Iteration: 10; Percent complete: 0.2%; Average loss: 6.4117
Iteration: 11; Percent complete: 0.3%; Average loss: 6.0093
Iteration: 12; Percent complete: 0.3%; Average loss: 5.8189
Iteration: 13; Percent complete: 0.3%; Average loss: 5.9135
Iteration: 14; Percent complete: 0.4%; Average loss: 5.3747
Iteration: 15; Percent complete: 0.4%; Average loss: 4.8896
Iteration: 16; Percent complete: 0.4%

Iteration: 135; Percent complete: 3.4%; Average loss: 3.7163
Iteration: 136; Percent complete: 3.4%; Average loss: 3.6504
Iteration: 137; Percent complete: 3.4%; Average loss: 3.5947
Iteration: 138; Percent complete: 3.5%; Average loss: 3.5209
Iteration: 139; Percent complete: 3.5%; Average loss: 3.5598
Iteration: 140; Percent complete: 3.5%; Average loss: 3.4452
Iteration: 141; Percent complete: 3.5%; Average loss: 3.8005
Iteration: 142; Percent complete: 3.5%; Average loss: 3.6790
Iteration: 143; Percent complete: 3.6%; Average loss: 3.6409
Iteration: 144; Percent complete: 3.6%; Average loss: 3.7007
Iteration: 145; Percent complete: 3.6%; Average loss: 3.6218
Iteration: 146; Percent complete: 3.6%; Average loss: 3.6183
Iteration: 147; Percent complete: 3.7%; Average loss: 3.5353
Iteration: 148; Percent complete: 3.7%; Average loss: 3.3380
Iteration: 149; Percent complete: 3.7%; Average loss: 3.5804
Iteration: 150; Percent complete: 3.8%; Average loss: 3.6486
Iteration: 151; Percent 

KeyboardInterrupt: 