In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.22.2-py3-none-a

In [2]:
from __future__ import unicode_literals # to print Unicode characters

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

import numpy as np
import pandas as pd

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

In [4]:
# Preview helper: report how many lines a file has, then echo its first `n` lines.
def printlines(file_path, n=10):
    """Print the line count of ``file_path`` followed by its first ``n`` lines.

    The file is opened in binary mode, so each line is printed as a ``bytes``
    repr (including its trailing newline byte).

    Args:
        file_path: Path of the file to preview.
        n: Number of leading lines to print (used as a slice bound).
    """
    with open(file_path, 'rb') as handle:
        raw_lines = handle.readlines()
    # The count is printed first, followed by a blank line.
    print('Shape of file is {}\n'.format(len(raw_lines)))
    head = raw_lines[:n]
    for raw in head:
        print(raw)


In [5]:
# File locations for the Cornell movie-dialog corpus (the second dataset used in this notebook).
corpus_name = 'cornell-moviedialog-corpus'
movie_lines_path = 'movie_lines.txt'
movie_conversations_path = 'movie_conversations.txt'
movie_titles_path = 'movie_titles_metadata.txt'
movie_charaters_metadata = 'movie_characters_metadata.txt'
# Preview the first ten raw lines of the conversations file.
printlines(movie_conversations_path)

Shape of file is 83097

b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']\n"


In [6]:
# Parse the movie-lines file into a dictionary of records keyed by line ID.
def loadLines(filename , fields):
    """Read a ' +++$+++ '-separated file into ``{lineID: record}``.

    Each input line is split on the literal separator ' +++$+++ ' and the
    first ``len(fields)`` values are stored under the given field names. The
    values are kept verbatim, so the last field retains its trailing newline.

    Args:
        filename: Path of the file to read (decoded as ISO-8859-1).
        fields: Field names, in the order the values appear on each line.
            Must contain 'lineID'.

    Returns:
        dict mapping each record's 'lineID' value to its record dict.

    Raises:
        IndexError: If a line has fewer values than ``fields``.
        KeyError: If 'lineID' is not one of ``fields``.
    """
    lines = {}
    with open(filename, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(' +++$+++ ')
            lineobj = {field: values[i] for i, field in enumerate(fields)}
            # Store the record once it is complete, so 'lineID' may sit at any
            # position in `fields` rather than only the first.
            lines[lineobj['lineID']] = lineobj
    return lines
# Field names for movie_lines.txt, in the order the values appear on each line.
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
lines = loadLines("movie_lines.txt", MOVIE_LINES_FIELDS)
# Spot-check one parsed record.
print(lines['L1045'])

{'lineID': 'L1045', 'characterID': 'u0', 'movieID': 'm0', 'character': 'BIANCA', 'text': 'They do not!\n'}


In [7]:
 # Loading movie_conversations to Structure the Conversations
def loadConversations(filename , lines , fields):
    """Read the conversations file and attach the referenced line records.

    Each input line is split on ' +++$+++ ' into ``fields``. The line IDs
    (tokens matching ``L[0-9]+``) found in the 'utteranceIDs' field are looked
    up in ``lines`` and stored, in order, under the extra key 'lines'.

    Args:
        filename: Path of the conversations file (decoded as ISO-8859-1).
        lines: Mapping from line ID to line record, as built by ``loadLines``.
        fields: Field names, in file order. Must contain 'utteranceIDs'.

    Returns:
        list of conversation dicts, one per input line.

    Raises:
        IndexError: If a line has fewer values than ``fields``.
        KeyError: If 'utteranceIDs' is not in ``fields`` or a referenced line
            ID is missing from ``lines``.
    """
    # Compiled once up front; the pattern does not depend on the input line.
    utterance_id_pattern = re.compile('L[0-9]+')
    conversations = []
    with open(filename, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(' +++$+++ ')
            convObj = {field: values[i] for i, field in enumerate(fields)}
            lineIds = utterance_id_pattern.findall(convObj['utteranceIDs'])
            convObj['lines'] = [lines[lineId] for lineId in lineIds]
            conversations.append(convObj)
    return conversations
# Field names for movie_conversations.txt, in the order the values appear on each line.
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

# Parse the conversations and attach the already-loaded line records to each one.
conversations = loadConversations("movie_conversations.txt",lines, MOVIE_CONVERSATIONS_FIELDS)
# Display the first ten conversations as a sanity check.
conversations[:10]

[{'character1ID': 'u0',
  'character2ID': 'u2',
  'movieID': 'm0',
  'utteranceIDs': "['L194', 'L195', 'L196', 'L197']\n",
  'lines': [{'lineID': 'L194',
    'characterID': 'u0',
    'movieID': 'm0',
    'character': 'BIANCA',
    'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
   {'lineID': 'L195',
    'characterID': 'u2',
    'movieID': 'm0',
    'character': 'CAMERON',
    'text': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
   {'lineID': 'L196',
    'characterID': 'u0',
    'movieID': 'm0',
    'character': 'BIANCA',
    'text': 'Not the hacking and gagging and spitting part.  Please.\n'},
   {'lineID': 'L197',
    'characterID': 'u2',
    'movieID': 'm0',
    'character': 'CAMERON',
    'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]},
 {'character1ID': 'u0',
  'character2ID': 'u2',
  'movieID': 'm0',
  'uttera

In [100]:
# Build (utterance, reply) pairs from consecutive lines of every conversation.
def extractSentencePairs(conversations):
    """Return ``[input, target]`` pairs of adjacent utterances.

    For each conversation, every line is paired with the line that follows
    it. Both texts are stripped of surrounding whitespace, and a pair is kept
    only when neither side is empty after stripping.

    Args:
        conversations: Iterable of dicts whose 'lines' entry is a list of
            records carrying a 'text' string.

    Returns:
        list of two-element lists ``[input_text, target_text]``.
    """
    qa_pairs = []
    for conversation in conversations:
        utterances = conversation['lines']
        for current, following in zip(utterances, utterances[1:]):
            prompt = current['text'].strip()
            reply = following['text'].strip()
            if not (prompt and reply):
                continue
            qa_pairs.append([prompt, reply])
    return qa_pairs
# Write every extracted pair to formatted_movie_lines.txt, one pair per row.
with open('formatted_movie_lines.txt' , 'w' , encoding = 'utf-8') as outputfile:
    # The delimiter expression evaluates to a plain tab character, so each row is "input<TAB>reply".
    writer = csv.writer(outputfile ,lineterminator = '\n' ,  delimiter = str(codecs.decode('\t' , 'unicode_escape')))
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)
# Preview the first ten rows of the file that was just written.
printlines('formatted_movie_lines.txt' )

Shape of file is 221282

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\n"
b'Why?\tUnso

In [124]:
# Keep only the first 500 rows of formatted_movie_lines.txt in use.txt, because this
# corpus is less useful than the other datasets.
MOVIE_PAIR_LIMIT = 500

# The source file was written as UTF-8, so read (and re-write) it with the same
# encoding instead of the platform default. islice streams just the rows that
# are needed, and no global named `lines` is rebound here, so the dictionary of
# movie lines built earlier stays intact.
with open('formatted_movie_lines.txt', 'r', encoding='utf-8') as file, \
        open('use.txt', 'w', encoding='utf-8') as new_file:
    new_file.writelines(itertools.islice(file, MOVIE_PAIR_LIMIT))


In [125]:
# Print the last row of use.txt.
with open('use.txt') as f:
    lines = list(f)
last_line = lines[-1]
print(last_line)


I'm not asking you to swear to anything.	I don't want you to wait for me.



In [126]:
# Open all three dataset files in read mode
with open('new.txt', 'r') as file1, open('use.txt', 'r') as file2, open('dialogs.txt', 'r') as file3:
    # Read the full contents of each file
    content1 = file1.read()
    content2 = file2.read()
    content3 = file3.read()

# Concatenate the contents. A non-empty part that does not end with a newline
# gets one appended, so the last row of one file can never fuse with the first
# row of the next (which would produce a row with two tabs).
concatenated_content = ''.join(
    part if (not part or part.endswith('\n')) else part + '\n'
    for part in (content1, content2, content3)
)

# Write the concatenated contents to a new file
with open('concatenated.txt', 'w') as output_file:
    output_file.write(concatenated_content)


In [127]:
# Load concatenated.txt into `data` and split it into question/answer columns.

with open('concatenated.txt', 'r') as f:
  data = f.read()

# Split the data into questions and answers.
# `data` is one big string, so it must be walked row by row; iterating the
# string directly would yield single characters and never a real pair.
questions, answers = [], []
for item in data.splitlines():
    parts = item.split('\t')
    # Only rows with exactly one tab (question<TAB>answer) are kept.
    if len(parts) == 2:
        questions.append(parts[0])
        answers.append(parts[1])


# Create a DataFrame
df = pd.DataFrame({
    'Question': questions,
    'Answer': answers
})

# Check for missing values in the 'Answer' column.
# NOTE(review): values produced by str.split are never null, so an empty answer
# shows up as '' and is not counted here.
missing_answers = df['Answer'].isnull().sum()

print(f'There are {missing_answers} missing answers in the dataset.')


There are 0 missing answers in the dataset.


In [128]:
# Indices reserved for the three special tokens; regular words start at 3.
PAD_token = 0
SOS_token = 1
EOS_token = 2
class Voc:
    """Word <-> index vocabulary with per-word occurrence counts.

    Attributes are plain dictionaries:
      * ``word2index``: word -> integer index
      * ``word2count``: word -> number of times the word was added
      * ``index2word``: index -> word, pre-seeded with PAD/SOS/EOS
      * ``num_words``: next free index (3 when no regular word is stored)
    """

    def __init__(self , name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token:"PAD", SOS_token:"SOS" , EOS_token : 'EOS'}
        self.num_words = 3

    def addSentence(self,sentence):
        """Add every space-separated token of ``sentence`` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self , word):
        """Register ``word`` (assigning the next index) or bump its count."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self , min_count):
        """Drop every word seen fewer than ``min_count`` times and re-index.

        Kept words are re-indexed from 3 in their original insertion order.
        Their occurrence counts are carried over, so calling ``trim`` again
        with the same threshold leaves the vocabulary unchanged.

        Prints a one-line summary of how many words were kept.
        """
        self.trimmed = True
        previous_counts = self.word2count
        keep_words = [word for word, count in previous_counts.items() if count >= min_count]
        total_words = len(self.word2index)
        # Guard the ratio so trimming an empty vocabulary does not divide by zero.
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), total_words, kept_ratio))

        # Rebuild the dictionaries from scratch with only the kept words.
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token:"PAD", SOS_token:"SOS" , EOS_token : 'EOS'}
        self.num_words = 3

        for word in keep_words:
            self.addWord(word)
            # addWord starts the count at 1; restore the real frequency.
            self.word2count[word] = previous_counts[word]

In [129]:
MAX_LENGTH = 20
def unicodeToAscii(s):
    """Return ``s`` with combining marks removed.

    The string is NFD-decomposed and every character in Unicode category
    'Mn' is dropped; all other characters are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lower-case, strip accents and reduce ``s`` to letters plus ``. ! ?``.

    Steps, in order: lower-case and trim, remove combining marks via
    ``unicodeToAscii``, put a space before each of ``.``, ``!``, ``?``,
    replace every run of other non-letter characters with one space, then
    collapse whitespace and trim again.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def readVocs(datafile , corpus_name):
    """Read a tab-separated pair file and return an empty ``Voc`` plus the pairs.

    Each row of ``datafile`` (UTF-8) is split on tabs and every piece is
    passed through ``normalizeString``. The returned vocabulary is created
    but not populated here.

    Args:
        datafile: Path of the tab-separated text file.
        corpus_name: Name given to the new ``Voc``.

    Returns:
        ``(voc, pairs)`` where ``pairs`` is a list of lists of normalized strings.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc , pairs
def filterPair(p):
    """Return True when both sentences of pair ``p`` are under ``MAX_LENGTH`` words.

    Length is measured as the number of space-separated tokens. A malformed
    pair with fewer than two elements (e.g. a row that contained no tab) is
    rejected instead of raising IndexError.
    """
    if len(p) < 2:
        return False
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

In [132]:
def filterPairs(pairs):
    """Return the list of pairs for which ``filterPair`` is truthy, in order."""
    return list(filter(filterPair, pairs))
def loadPrepareData(corpus , corpus_name , datafile , save_dir):
    """Load ``datafile``, filter the pairs by length and fill a vocabulary.

    ``corpus`` and ``save_dir`` are accepted but not used by this function.
    Prints the resulting vocabulary size (``voc.num_words``).

    Returns:
        ``(voc, pairs)`` with the populated vocabulary and the filtered pairs.
    """
    voc, raw_pairs = readVocs(datafile, corpus_name)
    pairs = filterPairs(raw_pairs)
    for pair in pairs:
        # Index 0 is the input sentence, index 1 the reply.
        for position in (0, 1):
            voc.addSentence(pair[position])
    print(voc.num_words)
    return voc, pairs
# Build the vocabulary and the length-filtered training pairs.
# NOTE(review): this reads 'formatted_movie_lines.txt', not the 'concatenated.txt'
# file assembled in an earlier cell — confirm which corpus is intended.
voc , pairs = loadPrepareData('' , '' , 'formatted_movie_lines.txt' , '')

33027


In [48]:
# Display the first ten normalized pairs as a sanity check.
pairs[:10]

[['well i thought we d start with pronunciation if that s okay with you .',
  'not the hacking and gagging and spitting part . please .'],
 ['not the hacking and gagging and spitting part . please .',
  'okay . . . then how bout we try out some french cuisine . saturday ? night ?'],
 ['you re asking me out . that s so cute . what s your name again ?',
  'forget it .'],
 ['no no it s my fault we didn t have a proper introduction', 'cameron .'],
 ['gosh if only we could find kat a boyfriend . . .',
  'let me see what i can do .'],
 ['c esc ma tete . this is my head',
  'right . see ? you re ready for the quiz .'],
 ['that s because it s such a nice one .', 'forget french .'],
 ['how is our little find the wench a date plan progressing ?',
  'well there s someone i think might be'],
 ['there .', 'where ?'],
 ['you have my word . as a gentleman', 'you re sweet .']]

In [49]:
# Words seen fewer than this many times are dropped from the vocabulary.
MIN_COUNT = 3
def trimRareWords(voc , pairs , MIN_COUNT):
    """Trim rare words from ``voc`` and drop pairs that use any trimmed word.

    ``voc.trim(MIN_COUNT)`` is called first (mutating ``voc``). A pair is
    kept only if every space-separated token of both its sentences is still
    in ``voc.word2index``. Prints a one-line summary of how many pairs remain.

    Args:
        voc: Vocabulary object exposing ``trim`` and ``word2index``.
        pairs: List of ``[input_sentence, output_sentence]`` pairs.
        MIN_COUNT: Minimum occurrence count forwarded to ``voc.trim``.

    Returns:
        New list containing the kept pairs, in their original order.
    """
    voc.trim(MIN_COUNT)
    known_words = voc.word2index

    def _all_known(sentence):
        # True when every token of the sentence survived trimming.
        return all(word in known_words for word in sentence.split(' '))

    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]
    # Guard the ratio so an empty input does not raise ZeroDivisionError.
    kept_ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
    print('Trimmed from {} pairs to {} , {:.4f} of Total'.format(len(pairs) , len(keep_pairs) , kept_ratio))
    return keep_pairs
# Rebinds `pairs` to the surviving pairs; `voc` is trimmed in place by the call.
pairs = trimRareWords(voc, pairs , MIN_COUNT)

keep_words 1570 / 2848 = 0.5513
Trimmed from 4104 pairs to 2703 , 0.6586 of Total


In [50]:
def indexesFromSentence(voc , sentence):
    """Map each space-separated word of ``sentence`` to its index and append ``EOS_token``.

    Raises:
        KeyError: If a word is not present in ``voc.word2index``.
    """
    token_ids = [voc.word2index[word] for word in sentence.split(' ')]
    token_ids.append(EOS_token)
    return token_ids
def zeroPadding(l , fill_value = PAD_token):
    """Transpose a batch of index lists, padding short ones with ``fill_value``.

    Returns a list of tuples: entry ``t`` holds the ``t``-th token of every
    sequence in ``l``, with ``fill_value`` where a sequence has already ended.
    """
    columns = itertools.zip_longest(*l, fillvalue=fill_value)
    return list(columns)
def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same nesting as ``l``.

    Each token equal to ``value`` maps to 0 and every other token maps to 1.
    ``value`` defaults to ``PAD_token``; the argument is now honoured instead
    of the comparison being hard-wired to ``PAD_token``.

    Args:
        l: Iterable of token sequences.
        value: Token to be masked out.

    Returns:
        list of lists of ints (0 or 1).
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

def inputVar(l,voc):
    """Convert a list of input sentences into a padded index tensor.

    Returns:
        ``(padVar, lengths)``: ``padVar`` is a LongTensor built from the
        transposed, PAD-filled index lists (one row per time step, one column
        per sentence); ``lengths`` holds each sentence's token count including
        the appended EOS token.
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    # Sequence lengths before padding.
    lengths = torch.tensor(list(map(len, indexes_batch)))
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths

def outputVar(l ,voc):
    """Convert a list of target sentences into a padded tensor plus a padding mask.

    Returns:
        ``(padVar, mask, max_length)``: ``padVar`` is the padded LongTensor of
        indices, ``mask`` is a BoolTensor of the same layout that is False at
        padded positions, and ``max_length`` is the longest sequence length
        (including the appended EOS token).
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_length = max(len(indexes) for indexes in indexes_batch)
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    mask = torch.BoolTensor(binaryMatrix(padList))
    return padVar, mask, max_length
def batch2TrainData(voc , pair_batch):
    """Turn a batch of sentence pairs into the tensors used by one training step.

    The pairs are ordered by input length (number of space-separated words),
    longest first. A sorted copy is used, so the caller's ``pair_batch`` list
    is left untouched.

    Args:
        voc: Vocabulary used to index the sentences.
        pair_batch: List of ``[input_sentence, output_sentence]`` pairs.

    Returns:
        ``(inp, lengths, output, mask, max_target_len)`` as produced by
        ``inputVar`` and ``outputVar``.
    """
    ordered = sorted(pair_batch, key=lambda pair: len(pair[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]
    inp , lengths = inputVar(input_batch , voc)
    output , mask , max_target_len = outputVar(output_batch , voc)
    return inp , lengths , output , mask , max_target_len
# Smoke test: build one small batch from randomly chosen pairs and print its tensors.
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[ 167,   21,   11,   35,   40],
        [  45,    6,  638,  196,  343],
        [1076,  179,   99,   92,   45],
        [  36,  154,   47,  125,   11],
        [1014,  234,   67,   99, 1022],
        [  91,   47,   71,   28,   15],
        [  36,  516,   15,    2,    2],
        [1077,  861,    2,    0,    0],
        [  15,   15,    0,    0,    0],
        [   2,    2,    0,    0,    0]])
lengths: tensor([10, 10,  8,  7,  7])
target_variable: tensor([[  86,   18,  492,    3,    4],
        [   4,   21,   40,   92,  243],
        [ 167,    6,  114,   12,  331],
        [  45,   56,  471,  678,   42],
        [  46,  861,   15,  286, 1014],
        [  47,   62,    2,   15,  648],
        [1078,  111,    0,    2,   64],
        [  15,   17,    0,    0,   14],
        [   2,  668,    0,    0,   18],
        [   0,   15,    0,    0,   14],
        [   0,    2,    0,    0,  179],
        [   0,    0,    0,    0,   55],
        [   0,    0,    0,    0,   15],
        

In [51]:
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded, padded input sequences.

    Args:
        hidden_size: Size of the GRU input and hidden state; the embedding
            is expected to produce vectors of this size.
        embedding: Embedding module applied to the input indices.
        n_layers: Number of stacked GRU layers.
        dropout: Dropout between GRU layers. Ignored (set to 0) when
            ``n_layers == 1``, because nn.GRU only applies dropout between
            layers and warns about a non-zero value otherwise.
    """

    def __init__(self, hidden_size , embedding , n_layers=1 , dropout = 0):
        super(EncoderRNN , self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size , hidden_size , n_layers ,
                          dropout = (0 if n_layers == 1 else dropout) , bidirectional = True)

    def forward(self , input_seq , input_lengths , hidden = None):
        """Encode a padded batch.

        Args:
            input_seq: Index tensor fed to the embedding (time step first, as
                required by the default ``batch_first=False`` GRU).
            input_lengths: Tensor of true sequence lengths; moved to the CPU
                for packing. Sequences must be sorted longest first, since
                packing uses the default ``enforce_sorted=True``.
            hidden: Optional initial hidden state for the GRU.

        Returns:
            ``(outputs, hidden)``: ``outputs`` is the sum of the forward and
            backward GRU outputs (last dimension ``hidden_size``); ``hidden``
            is the GRU's final hidden state.
        """
        embedded = self.embedding(input_seq)
        # Pack so the GRU skips padded positions.
        packed = nn.utils.rnn.pack_padded_sequence(embedded , input_lengths.cpu())
        outputs , hidden = self.gru(packed , hidden)
        outputs , _ =  nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum the two directions, which are concatenated on the last dimension.
        outputs = outputs[: , : , :self.hidden_size] + outputs[: , : , self.hidden_size:]
        return outputs , hidden

In [52]:
# Attention Layer
class Attn(nn.Module):
    """Attention scoring layer supporting the 'dot', 'general' and 'concat' methods.

    Args:
        method: One of 'dot', 'general' or 'concat'.
        hidden_size: Size of the last dimension of the decoder state and the
            encoder outputs.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """

    def __init__(self, method , hidden_size):
        super(Attn , self).__init__()
        self.method = method
        if self.method not in ['dot' , 'general' , 'concat']:
            raise ValueError(self.method , "is not defined")
        # Stored before use: the linear layers below are sized from it.
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size , hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size*2 , hidden_size)
            # Learnable scoring vector; initialised explicitly so it never
            # starts from uninitialised memory.
            self.v = nn.Parameter(torch.empty(hidden_size))
            bound = hidden_size ** -0.5
            nn.init.uniform_(self.v, -bound, bound)

    def dot_score(self , hidden , encoder_output):
        """Element-wise product of ``hidden`` and ``encoder_output`` summed over dim 2."""
        return torch.sum(hidden*encoder_output , dim= 2)

    def general_score(self, hidden, encoder_output):
        """Dot score against a linear projection of ``encoder_output``."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score from a tanh-activated linear layer over ``[hidden; encoder_output]``."""
        # Repeat the decoder state along dim 0 so it lines up with every encoder step.
        expanded = hidden.expand(encoder_output.size(0), -1, -1)
        energy = self.attn(torch.cat((expanded, encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the encoder steps.

        The scores (indexed step-first) are transposed, soft-maxed over the
        encoder steps and given an extra middle dimension of size 1.
        """
        # Calculate attention energies based on the method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Apply softmax to normalize energies to weights in range 0 to 1, add extra dimension
        return F.softmax(attn_energies, dim=1).unsqueeze(1)




In [53]:
class LuongAttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        attn_model: Attention method name passed to ``Attn``.
        embedding: Embedding module applied to the input token indices.
        hidden_size: Size of the GRU input and hidden state.
        output_size: Number of output classes (size of the final projection).
        n_layers: Number of stacked GRU layers.
        dropout: Dropout probability applied to the embedded input.
    """

    def __init__(self , attn_model , embedding , hidden_size , output_size , n_layers = 1 , dropout = 0.1):
        super().__init__()

        # Keep the configuration around for later reference.
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers.
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        # NOTE(review): the GRU's inter-layer dropout is fixed at 0 here, so the
        # `dropout` argument only affects the embedding — confirm this is intended.
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=0)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)

    def forward(self , input_step , last_hidden , encoder_outputs):
        """Run one decoding step.

        Returns:
            ``(output, hidden)``: ``output`` is a softmax distribution over the
            ``output_size`` classes; ``hidden`` is the updated GRU state.
        """
        embedded = self.embedding_dropout(self.embedding(input_step))
        rnn_output, hidden = self.gru(embedded, last_hidden)

        # Weight the encoder outputs by attention (batch matrix-matrix product).
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)

        # Combine the GRU output with the attended context.
        concat_input = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        return F.softmax(self.out(concat_output), dim=1), hidden

In [54]:
# Masked negative log-likelihood: average -log p(target) over the unmasked positions.
def maskNLLLoss(inp , target , mask):
    """Compute the mean negative log likelihood over positions where ``mask`` is True.

    Args:
        inp: Tensor of probabilities; dim 1 is indexed by the target ids.
        target: Tensor of target ids, one per row of ``inp``.
        mask: Boolean tensor selecting which rows contribute to the loss.

    Returns:
        ``(loss, n_total)``: the mean loss tensor, moved to the module-level
        ``device``, and the number of True entries in ``mask`` as a Python number.
    """
    nTotal = mask.sum()

    # Pick, for every row, the probability assigned to its target id.
    target_probs = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    crossEntropy = -torch.log(target_probs)

    # Ignore padded positions, then average what is left.
    loss = crossEntropy.masked_select(mask).mean().to(device)

    return loss, nTotal.item()


In [55]:
def train(input_variable , lengths , target_variable , mask ,
          max_target_len , encoder , decoder , embedding , encoder_optimizer ,
          decoder_optimizer , batch_size , clip , max_length = MAX_LENGTH):
    """Run one optimisation step on a single batch and return its average loss.

    The batch is encoded, then decoded one time step at a time. With
    probability ``teacher_forcing_ratio`` (module-level) the whole batch is
    decoded with teacher forcing; otherwise the decoder is fed its own top
    prediction, replaced by random token ids 10% of the time.

    Relies on the module-level names ``device``, ``SOS_token``,
    ``teacher_forcing_ratio``, ``voc`` and ``maskNLLLoss``. ``embedding`` and
    ``max_length`` are accepted but not used here.

    Returns:
        Loss summed over all unmasked target tokens divided by their count.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    lengths = lengths.to(device)
    mask = mask.to(device)

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0
    encoder_outputs , encoder_hidden = encoder(input_variable , lengths)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Forward batch of sequences through decoder one time step at a time
    for t in range(max_target_len):
        decoder_output , decoder_hidden = decoder(decoder_input , decoder_hidden , encoder_outputs)
        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1,-1)
        else:
            # No teacher forcing: next input is decoder's own current output
            _ , topi = decoder_output.topk(1)
            # Add some randomness to next input selection
            if random.random() < 0.1:
                decoder_input = torch.randint(high=voc.num_words, size=(1, batch_size), dtype=torch.long, device=device)
            else:
                decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
        # Calculate and accumulate loss
        mask_loss , nTotal = maskNLLLoss(decoder_output , target_variable[t] , mask[t])
        loss += mask_loss
        # Weight the step's mean loss by its token count so the final average is per token.
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters() , clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters() , clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()
    return sum(print_losses)/n_totals

In [56]:
# This function is used to train a sequence to sequence model
def trainIters(model_name , voc , pairs , encoder , decoder , encoder_optimizer , decoder_optimizer
               , embedding , encoder_n_layers , decoder_n_layers , save_dir , n_iterations ,
              batch_size , print_every,save_every , clip , corpur_name , loadFilename):
    """Train for ``n_iterations`` steps on randomly sampled batches.

    One batch of ``batch_size`` random pairs is prepared per iteration up
    front. The running loss is printed every ``print_every`` iterations and a
    checkpoint is written every ``save_every`` iterations under
    ``save_dir/model_name/corpur_name/<enc_layers>-<dec_layers>_<hidden_size>/``.

    The iteration count and corpus name come from the arguments
    (``n_iterations``, ``corpur_name``) and the hidden size from
    ``encoder.hidden_size``, rather than from same-valued module-level globals.

    When ``loadFilename`` is truthy, training resumes after the iteration
    stored in the module-level ``checkpoint`` dictionary.
    """
    # Prepare training data
    training_batches = [batch2TrainData(voc , [random.choice(pairs) for _ in range(batch_size)])
                       for _ in range(n_iterations)]

    print('Initializing ...')
    start_iterations = 1
    print_loss = 0

    # If a checkpoint is loaded, we adjust the start iterations
    if loadFilename:
        start_iterations = checkpoint['iteration'] + 1

    print('Training ...')

    # Training loop
    for iteration in range(start_iterations , n_iterations + 1):
        # Get training batch
        training_batch = training_batches[iteration - 1]
        input_variable , lengths , target_variable , mask , max_target_len = training_batch

        # Perform a training step
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)

        # Accumulate loss
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iterations * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir , model_name , corpur_name , '{}-{}_{}'.format(encoder_n_layers , decoder_n_layers , encoder.hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            } , os.path.join(directory, '{}_{}.tar'.format(iteration , 'checkpoint')))


In [57]:
# Evaluation
class GreedySearchDecoder(nn.Module):
    """Greedy decoder: at each step, feed back the highest-scoring token.

    Args:
        encoder: Module called as ``encoder(input_seq, input_length)`` and
            returning ``(encoder_outputs, encoder_hidden)``.
        decoder: Module exposing ``n_layers`` and called as
            ``decoder(decoder_input, decoder_hidden, encoder_outputs)``.
    """

    def __init__(self,encoder,decoder):
        super(GreedySearchDecoder , self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self , input_seq , input_length , max_length):
        """Decode exactly ``max_length`` tokens for a single input sequence.

        Returns:
            ``(all_tokens, all_scores)``: 1-D tensors holding the chosen token
            id and its score for every step. Decoding does not stop early at
            an end-of-sentence token.
        """
        encoder_outputs , encoder_hidden = self.encoder(input_seq , input_length)
        # Use the decoder owned by this module, not whatever global `decoder` exists.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Start decoding from the start-of-sentence token.
        decoder_input = torch.ones(1,1,device = device , dtype = torch.long)*SOS_token
        all_tokens = torch.zeros([0] , device = device , dtype = torch.long)
        all_scores = torch.zeros([0] , device = device)
        for _ in range(max_length):
            decoder_output , decoder_hidden = self.decoder(decoder_input , decoder_hidden , encoder_outputs)
            # Highest-scoring token and its score.
            decoder_scores , decoder_input = torch.max(decoder_output , dim = 1)
            all_tokens = torch.cat((all_tokens, decoder_input) , dim = 0)
            all_scores = torch.cat((all_scores, decoder_scores) , dim = 0)
            # Restore the leading dimension expected by the decoder.
            decoder_input = torch.unsqueeze(decoder_input , 0)
        return all_tokens , all_scores

In [59]:
def evaluate(encoder , decoder , searcher , voc , sentence , max_length = MAX_LENGTH):
    """Decode a reply for one normalized ``sentence`` and return it as a list of words.

    ``encoder`` and ``decoder`` are accepted but not used directly; decoding
    is delegated to ``searcher``. The search runs under ``torch.no_grad()``
    because no gradients are needed at inference time.

    Raises:
        KeyError: If ``sentence`` contains a word missing from ``voc.word2index``.
    """
    # A batch holding the single sentence.
    indexes_batch = [indexesFromSentence(voc , sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose so the time step is the leading dimension.
    input_batch = torch.LongTensor(indexes_batch).transpose(0,1)
    input_batch = input_batch.to(device)
    with torch.no_grad():
        tokens , scores = searcher(input_batch , lengths , max_length)
    decoder_words = [voc.index2word[token.item()] for token in tokens]
    return decoder_words

def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop: read a line, print the bot's reply, repeat.

    Typing ``q`` or ``quit`` ends the loop. When evaluation raises KeyError,
    one of a few canned fallback replies is printed at random instead.
    """
    input_sentence = ''
    responses = ["I'm sorry, I did not understand you.",
                 "Sorry, I can't help you with that.",
                 "Can you provide me with more information to help you?"]
    while True:
        try:
            input_sentence = input('> ')
            if input_sentence in ('q', 'quit'):
                break
            input_sentence = normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Drop the special tokens before printing the reply.
            output_words[:] = [word for word in output_words if word not in ('EOS', 'PAD')]
            print('Bot:', ' '.join(output_words))
        except KeyError:
            # Fall back to a random canned reply rather than surfacing the error.
            print(random.choice(responses))



In [60]:
# Model configuration
model_name = 'cb_model'
attn_model = 'dot'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64
# NOTE(review): '/' is the filesystem root, so checkpoints are written under /cb_model/... — confirm.
save_dir = '/'
# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))
# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Replace the in-memory vocabulary with the one stored in the checkpoint.
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings (the same embedding module is passed to both encoder and decoder)
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [61]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.00005
decoder_learning_ratio = 5.0
n_iteration = 2000
print_every = 1
save_every = 2000

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any optimizer state tensors (present after loading a checkpoint) onto the
# same device as the models. Using `device` rather than an unconditional
# .cuda() keeps this cell working on CPU-only machines.
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training ...
Iteration: 1; Percent complete: 0.1%; Average loss: 7.3678
Iteration: 2; Percent complete: 0.1%; Average loss: 7.3345
Iteration: 3; Percent complete: 0.1%; Average loss: 7.2948
Iteration: 4; Percent complete: 0.2%; Average loss: 7.2628
Iteration: 5; Percent complete: 0.2%; Average loss: 7.2031
Iteration: 6; Percent complete: 0.3%; Average loss: 7.1232
Iteration: 7; Percent complete: 0.4%; Average loss: 7.0707
Iteration: 8; Percent complete: 0.4%; Average loss: 6.9422
Iteration: 9; Percent complete: 0.4%; Average loss: 6.8076
Iteration: 10; Percent complete: 0.5%; Average loss: 6.6971
Iteration: 11; Percent complete: 0.5%; Average loss: 6.4928
Iteration: 12; Percent complete: 0.6%; Average loss: 6.3162
Iteration: 13; Percent complete: 0.7%; Average loss: 6.1177
Iteration: 14; Percent complete: 0.7%; Average loss: 6.1057
Iteration: 15; Percent complete: 0.8%; Average loss: 6.0389
Iteration: 16; Percent complete: 0.8

In [62]:
# Put both models in evaluation mode before generating replies.
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting; type 'q' or 'quit' at the prompt to stop.
evaluateInput(encoder, decoder, searcher, voc)

> hi
Bot: well you re going to make a good deal . . . . . .
> how are you
Bot: i m throwing a party on friday . s all you . . . .
> am I invited?
Bot: i don t have a long time . . . . . . .
> okay then
Bot: you re right . i don t want to go back in . . . .
> it will be so much fun
Bot: it wouldn t be nice if it wasn t be more . . . . .
> yes
Bot: you can t even say that ? . . . . . .
> no
Bot: you might need to tell me . you don t want to come over ? . .
> no, i want to
Bot: that s a good idea . . . . . . . .
> quit
