In [None]:
!nvidia-smi

Fri Oct 29 08:46:09 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.29.05    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    34W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive; the data files used by the cells below
# are read from and written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Importing Packages

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
import csv
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet=WordNetLemmatizer()

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import os
import unicodedata
import codecs
from io import open
import itertools
import math

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Loading and Preprocessing Data for NMF

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data/presidential_speeches.csv')
# Cleaning the text: join every transcript into one string, then split it into sentences.
a = list(df['Transcript'])
b = ' '.join(str(e) for e in a)
sentences=nltk.sent_tokenize(b)

# Build the stop-word set once. Rebuilding it for every word of every sentence
# (as this cell used to do) re-reads the NLTK word list each time and dominates
# the running time of the loop.
english_stopwords = set(stopwords.words('english'))

corpus=[]
for sentence in sentences:
  # Keep letters only, lowercase, tokenize on whitespace.
  review=re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
  # Drop stop words and lemmatize what is left.
  review=[wordnet.lemmatize(word) for word in review if word not in english_stopwords]
  corpus.append(' '.join(review))

# Cache the cleaned sentences on Drive, one sentence per line.
with open('/content/drive/MyDrive/Colab Notebooks/Data/speeches.txt', 'w') as filehandle:
    for listitem in corpus:
        filehandle.write('%s\n' % listitem)

# Reload the corpus from the cache file so later cells work from what is on disk.
corpus=[]
with open('/content/drive/MyDrive/Colab Notebooks/Data/speeches.txt', 'r') as filehandle:
    for line in filehandle:
        # Every line was written with a trailing '\n'; strip exactly that character.
        currentPlace = line[:-1]
        corpus.append(currentPlace)

### Training and Saving Topic model

In [None]:
def save_model(dictionary, feature_names, filename):
    """Write the topic dictionary and its feature names into one ``.npz`` archive.

    The two arrays are stored under the keys ``dictionary`` and
    ``feature_names``; ``load_model`` reads them back.
    """
    arrays = {"dictionary": dictionary, "feature_names": feature_names}
    with open(filename, "wb") as handle:
        np.savez(handle, **arrays)

def load_model(filename):
    """Read back a model written by ``save_model``.

    Returns a tuple ``(dictionary, feature_names)``.
    """
    with open(filename, "rb") as handle:
        archive = np.load(handle)
        # Index the archive while the file is still open: arrays are read on access.
        dictionary = archive["dictionary"]
        feature_names = archive["feature_names"]
    return dictionary, feature_names
    

def print_top_words(dictionary, feature_names, n_top_words):
    """Print one line per topic listing its ``n_top_words`` highest-weighted words.

    ``dictionary`` is iterated row by row (one row per topic); each row is
    ranked with ``argsort`` and the winning column indices are looked up in
    ``feature_names``. A blank line is printed after the last topic.
    """
    for topic_idx, topic in enumerate(dictionary):
        # Slice the ascending argsort from the end to get the largest weights first.
        ranked = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in ranked]
        print("Topic #%d: " % topic_idx + " ".join(top_words))
    print()


def train_model(data, n_features, n_components):
    """Fit a tf-idf + NMF topic model on ``data``.

    ``data`` is a list of strings. They are turned into a tf-idf
    bag-of-words representation limited to ``n_features`` words, which is
    then factorized by NMF into ``n_components`` topics.

    Returns ``(dictionary, features)``: the NMF components matrix
    (np array, one row per topic) and the list of words that label its
    columns.
    """
    # NOTE(review): scikit-learn deprecated NMF's `alpha` in 1.0 in favour of
    # `alpha_W` / `alpha_H`, whose regularization is scaled differently. It is
    # deliberately not migrated here because that would change the fitted
    # topics; pick explicit values before upgrading scikit-learn.
    model = Pipeline(steps=[("tfidf", TfidfVectorizer(max_df=0.95, min_df=2,
                                                      max_features=n_features, stop_words='english')),
                            ("nmf", NMF(n_components=n_components, random_state=1,
                                       alpha=.1, l1_ratio=.5))])
    model.fit(data)

    vectorizer = model["tfidf"]
    # `get_feature_names` was replaced by `get_feature_names_out` (added in
    # scikit-learn 1.0, old name removed in 1.2). Prefer the new accessor and
    # fall back for older installs; keep returning a plain list either way.
    if hasattr(vectorizer, "get_feature_names_out"):
        features = list(vectorizer.get_feature_names_out())
    else:
        features = vectorizer.get_feature_names()
    dictionary = model["nmf"].components_
    return dictionary, features


def main():
    """Train the NMF topic model on the cleaned speech corpus and save it to Drive."""
    vocabulary_size = 1000  # number of top words to use in tfidf representation of data
    topic_count = 10        # number of topics in NMF

    # Presidential Speech data
    print("Loading, training, and saving Presidential Speech model")
    topics, vocabulary = train_model(corpus, vocabulary_size, topic_count)
    save_model(topics, vocabulary,
               "/content/drive/MyDrive/Colab Notebooks/Data/presidential_speeches.npz")

if __name__ == "__main__":
    main()

# Sanity-check the saved model. Open the archive once (instead of once per
# array) and close it as soon as both arrays have been read.
with np.load("/content/drive/MyDrive/Colab Notebooks/Data/presidential_speeches.npz") as saved_model:
    c = saved_model["dictionary"]
    d = saved_model["feature_names"]
print(len(c[0]),len(c))
print(len(d))
print(c)
print(d)

Loading, training, and saving Presidential Speech model
1000 10
1000
[[0.03340312 0.05376837 0.01175347 ... 0.02996114 0.         0.38284078]
 [0.00677903 0.         0.         ... 0.02037617 0.         0.00327087]
 [0.         0.01365533 0.02552903 ... 0.         0.06069489 0.        ]
 ...
 [0.01520393 0.05684332 0.11559798 ... 0.01968985 0.00964287 0.12716392]
 [0.00605393 0.04835156 0.01882106 ... 0.02170889 0.00267111 0.03239904]
 [0.03159277 0.06173521 0.05452766 ... 0.         0.00503898 0.06749094]]
['ability' 'able' 'abroad' 'abuse' 'accept' 'accomplished' 'according'
 'account' 'achieve' 'act' 'action' 'activity' 'actual' 'actually' 'add'
 'added' 'addition' 'additional' 'address' 'adequate' 'adjustment'
 'administration' 'adopted' 'adoption' 'advance' 'advantage' 'affair'
 'afford' 'africa' 'age' 'agency' 'agent' 'aggression' 'ago' 'agree'
 'agreed' 'agreement' 'agricultural' 'agriculture' 'ahead' 'aid' 'air'
 'allowed' 'ally' 'amendment' 'america' 'american' 'annual' 'answe

### Printing top 10 words of each topic categorised by NMF

In [None]:
def print_top_words(dictionary, feature_names, n_top_words):
    """Print the ``n_top_words`` strongest words of every topic, one topic per line.

    NOTE(review): this cell re-defines a function with the same name as the
    one in the training cell; whichever cell ran last is the one in effect.
    """
    for topic_idx, weights in enumerate(dictionary):
        # Reverse the ascending argsort, then keep the leading indices.
        strongest_first = weights.argsort()[::-1][:n_top_words]
        words = []
        for column in strongest_first:
            words.append(feature_names[column])
        message = "Topic #%d: " % topic_idx
        message += " ".join(words)
        print(message)
    print()



# Read both arrays from a single open of the archive and close it afterwards.
with np.load("/content/drive/MyDrive/Colab Notebooks/Data/presidential_speeches.npz") as saved_model:
    c = saved_model["dictionary"]
    d = saved_model["feature_names"]
print_top_words(c, d, 10)

Topic #0: people want think know work going say young million free
Topic #1: president vice thank office said congress question house senate say
Topic #2: state united union territory treaty citizen constitution relation republic th
Topic #3: year ago fiscal past increase million billion expenditure june tax
Topic #4: nation world america war peace know new great freedom today
Topic #5: government congress law right power public act shall duty general
Topic #6: mr thank question governor member said secretary distinguished general fellow
Topic #7: american life citizen million fellow job want today work family
Topic #8: time come long think change work history going period short
Topic #9: country great foreign citizen business good trade prosperity think policy



### Preprocessing Data for Encoder Decoder

In [None]:
def loadLines(fileName, fields):
    """Parse a `` +++$+++ ``-separated file into a dict of records keyed by line ID.

    Each input line is split on the separator and the i-th piece is stored
    under ``fields[i]``. Records are indexed by their ``'lineID'`` value, so
    ``fields`` must contain ``'lineID'``. A line with fewer pieces than
    ``fields`` raises ``IndexError``. Values are kept verbatim (the last
    field still carries the line's trailing newline).
    """
    lines = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for raw in f:
            parts = raw.split(" +++$+++ ")
            record = {field: parts[position] for position, field in enumerate(fields)}
            lines[record['lineID']] = record
    return lines

def printLines(file, n=10):
    """Print the first ``n`` lines of ``file`` as raw bytes objects."""
    with open(file, 'rb') as datafile:
        head = datafile.readlines()[:n]
    for raw_line in head:
        print(raw_line)

def loadConversations(fileName, lines, fields):
    """Group utterances into conversations.

    Each line of ``fileName`` is split on `` +++$+++ `` and the i-th piece is
    stored under ``fields[i]``. The ``"utteranceIDs"`` field is scanned for
    IDs of the form ``L<digits>``; every ID is looked up in ``lines`` (a
    mapping from line ID to line record) and the records are collected, in
    order, under the ``"lines"`` key of the conversation.

    Returns the list of conversation dicts. Raises ``KeyError`` if an
    utterance ID is missing from ``lines`` and ``IndexError`` if a line has
    fewer pieces than ``fields``.
    """
    # Compile once: the pattern does not depend on the line being parsed.
    utterance_id_pattern = re.compile('L[0-9]+')
    conversations = []
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(" +++$+++ ")
            convObj = {field: values[i] for i, field in enumerate(fields)}
            lineIds = utterance_id_pattern.findall(convObj["utteranceIDs"])
            convObj["lines"] = [lines[lineId] for lineId in lineIds]
            conversations.append(convObj)
    return conversations

def extractSentencePairs(conversations):
    """Turn conversations into ``[query, response]`` pairs of consecutive utterances.

    For every conversation, each utterance is paired with the one that
    follows it. Both texts are stripped of surrounding whitespace and the
    pair is dropped if either side ends up empty.
    """
    qa_pairs = []
    for conversation in conversations:
        utterances = conversation["lines"]
        # Walk adjacent utterances; the last one has no reply, so it is never a query.
        for current, following in zip(utterances, utterances[1:]):
            inputLine = current["text"].strip()
            targetLine = following["text"].strip()
            if not inputLine or not targetLine:
                continue
            qa_pairs.append([inputLine, targetLine])
    return qa_pairs

datafile = "/content/drive/MyDrive/Colab Notebooks/Data/formatted_movie_lines.txt"
datafile_validation = r"/content/drive/MyDrive/Colab Notebooks/Data/formatted_movie_lines_validation.txt"
delimiter = '\t'
delimiter = str(codecs.decode(delimiter, "unicode_escape"))
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
lines = loadLines("/content/drive/MyDrive/Colab Notebooks/Data/movie_lines.txt", MOVIE_LINES_FIELDS)
conversations = loadConversations("/content/drive/MyDrive/Colab Notebooks/Data/movie_conversations.txt", lines, MOVIE_CONVERSATIONS_FIELDS)

# The first N_VALIDATION_PAIRS pairs go to the validation file, the rest to the
# training file. Extract the pairs once and slice, instead of re-extracting
# them for each output file and counting rows by hand.
N_VALIDATION_PAIRS = 60000
qa_pairs = extractSentencePairs(conversations)

print("\nWriting newly formatted file...")
with open(datafile_validation, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    writer.writerows(qa_pairs[:N_VALIDATION_PAIRS])

with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    writer.writerows(qa_pairs[N_VALIDATION_PAIRS:])


print("\nLines from formatted_movies_lines with query sentence and response sentence pair separated by tab:")
printLines(datafile)

    # printLines("/content/drive/MyDrive/Colab Notebooks/Data/movie_lines.txt")


Writing newly formatted file...

Lines from formatted_movies_lines with query sentence and response sentence pair separated by tab:
b'Out of the mouths of babes, Louis.\tThis babe has said enough.\n'
b'Poor Missy. God, I was sorry to hear. I remember when she was no older\'n Ellen there, walking down to the store with her Raggedy Anne doll draggin\' behind her in the dust. I don\'t know why God takes someone like her, who should have a bunch of years still in front of them, and lets an old shit like me just go on and on.\t"My father used to have a saying, Jud-- ""God sees the truth, but waits."""\n'
b'"My father used to have a saying, Jud-- ""God sees the truth, but waits."""\tAyuh...how is your cat, Louis?\n'
b"Ayuh...how is your cat, Louis?\tIt's Ellie's cat.\n"
b"It's Ellie's cat.\tNope. He's your cat now.\n"
b"Your father-in-law packs a wallop, for an old guy. He and his wife gone back to Chicago?\tNo...squatting out there at the Holiday Inn like a couple of vultures. He really th

### Creating a vocabulary, loading query/response sentence pairs into memory, and preprocessing the data

In [None]:
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    """Vocabulary that maps words to integer indexes and back, with usage counts.

    Indexes 0-2 are reserved for the PAD, SOS and EOS tokens; real words are
    numbered from 3 in the order they are first added.
    """

    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        """Add every space-separated token of ``sentence`` to the vocabulary."""
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word``: assign the next free index on first sight, else bump its count."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        """Keep only words seen at least ``min_count`` times and re-index them.

        Runs at most once per vocabulary (later calls return immediately).
        The kept words are re-added from scratch, so their indexes are
        reassigned and their counts restart at 1.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        # Guard the ratio: an empty vocabulary used to raise ZeroDivisionError here.
        total_words = len(self.word2index)
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), total_words, kept_ratio
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

corpus_name="movie-dialogs"
MAX_LENGTH = 10  

def unicodeToAscii(s):
    """Strip combining marks: NFD-decompose ``s`` and drop every 'Mn' character."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalizeString(s):
    """Lowercase and simplify ``s`` to letters plus space-separated ``.``, ``!``, ``?``.

    Steps, in order: lowercase and strip, remove combining marks, put a space
    in front of each ``.``/``!``/``?``, replace every run of other
    non-letter characters with one space, collapse whitespace and strip.
    """
    text = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    """Read tab-separated sentence pairs from ``datafile`` and create an empty ``Voc``.

    Each line is split on tabs and every piece is normalized with
    ``normalizeString``. Returns ``(voc, pairs)`` where ``voc`` is a fresh
    ``Voc(corpus_name)`` with no words added yet.
    """
    print("Reading lines...")
    # Use a context manager so the file handle is closed once it has been read.
    with open(datafile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    """Return True when both sentences of pair ``p`` have fewer than MAX_LENGTH tokens.

    Tokens are counted by splitting on single spaces. The limit is strict so
    that one position stays free for the EOS token.
    """
    query_tokens = p[0].split(' ')
    if not len(query_tokens) < MAX_LENGTH:
        return False
    response_tokens = p[1].split(' ')
    return len(response_tokens) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    """Return the pairs that pass ``filterPair``, in their original order."""
    return list(filter(filterPair, pairs))

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus_name, datafile, save_dir):
    """Read ``datafile``, drop over-long pairs and count the remaining words.

    Returns ``(voc, pairs)``: a ``Voc`` populated from the first two
    sentences of every kept pair, and the kept pairs themselves.
    ``save_dir`` is accepted but not used here.
    """
    print("Start preparing training data ...")
    voc, all_pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(all_pairs)))
    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to {!s} sentence pairs".format(len(kept_pairs)))
    print("Counting words...")
    for kept in kept_pairs:
        for sentence in (kept[0], kept[1]):
            voc.addSentence(sentence)
    print("Counted words:", voc.num_words)
    return voc, kept_pairs


# Load/Assemble voc and pairs: one vocabulary and one pair list per formatted file.
# NOTE(review): `voc_validation` is built independently of `voc`, and each Voc
# numbers words in order of first appearance, so the same word can have a
# different index in the two vocabularies. Only `voc`/`pairs` are trimmed below.
save_dir = "/content/drive/MyDrive/Colab Notebooks/Data/save"
voc_validation, pairs_validation = loadPrepareData(corpus_name, datafile_validation, save_dir)
voc, pairs = loadPrepareData( corpus_name, datafile, save_dir)

MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from ``voc`` and drop every pair that uses one of them.

    Calls ``voc.trim(MIN_COUNT)`` and then keeps only the pairs whose input
    (``pair[0]``) and output (``pair[1]``) sentences consist entirely of
    words still present in ``voc.word2index``. Returns the kept pairs.
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)

    def _all_words_known(sentence):
        # True when every space-separated token survived the trim.
        return all(word in voc.word2index for word in sentence.split(' '))

    # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
    keep_pairs = [pair for pair in pairs
                  if _all_words_known(pair[0]) and _all_words_known(pair[1])]

    # Guard the ratio: an empty `pairs` list used to raise ZeroDivisionError here.
    kept_ratio = len(keep_pairs) / len(pairs) if len(pairs) else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), kept_ratio))
    return keep_pairs


# Trim voc and pairs: `pairs` is overwritten with the filtered list that
# trimRareWords returns.
pairs = trimRareWords(voc, pairs, MIN_COUNT)


# Preview the first ten remaining pairs.
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 60000 sentence pairs
Trimmed to 17442 sentence pairs
Counting words...
Counted words: 8378
Start preparing training data ...
Reading lines...
Read 161282 sentence pairs
Trimmed to 46829 sentence pairs
Counting words...
Counted words: 15035
keep_words 6373 / 15032 = 0.4240
Trimmed from 46829 pairs to 37530, 0.8014 of total

pairs:
['it s ellie s cat .', 'nope . he s your cat now .']
['leave me alone !', 'remember .']
['let her go . it s cool .', 'louis the house is beautiful .']
['and buckaroo banzai .', 'come on let s parole em .']
['my god !', 'it s beautiful !']
['that s enough of that kind of talk !', 'i just said']
['getting there .', 'i got eggs down here !']
['i got eggs down here !', 'good d']
['i m going to try to do better .', 'you re doing fine .']
['you better get going hon .', 'oh louis i just don t know about this']


### Preparing Data for model

In [None]:
def indexesFromSentence(voc, sentence):
    """Map each space-separated word of ``sentence`` to its index and append EOS_token.

    Raises ``KeyError`` for a word that is not in ``voc.word2index``.
    """
    indexes = []
    for word in sentence.split(' '):
        indexes.append(voc.word2index[word])
    indexes.append(EOS_token)
    return indexes


def zeroPadding(l, fillvalue=PAD_token):
    """Transpose a list of sequences, padding the shorter ones with ``fillvalue``.

    Returns a list of tuples: the i-th tuple holds the i-th element of every
    sequence in ``l``.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [column for column in columns]

def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same nested shape as ``l``.

    An entry is 0 where the token equals ``value`` (the padding token by
    default) and 1 everywhere else. ``value`` is now actually used in the
    comparison; it was previously ignored in favour of a hard-coded
    PAD_token, which gives the same result for the default call.
    """
    return [[0 if token == value else 1 for token in seq] for seq in l]

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    """Convert a list of sentences into a padded index tensor plus their lengths.

    Returns ``(padVar, lengths)``: ``padVar`` is a LongTensor built from the
    zero-padded, transposed index lists; ``lengths`` holds the length of each
    index list (EOS included) before padding.
    """
    indexes_batch = []
    sentence_lengths = []
    for sentence in l:
        indexes = indexesFromSentence(voc, sentence)
        indexes_batch.append(indexes)
        sentence_lengths.append(len(indexes))
    lengths = torch.tensor(sentence_lengths)
    padVar = torch.LongTensor(zeroPadding(indexes_batch))
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    """Convert a list of target sentences into a padded tensor, its mask and max length.

    Returns ``(padVar, mask, max_target_len)``: the padded LongTensor of
    indexes, a BoolTensor that is False at padding positions, and the length
    of the longest index list (EOS included).
    """
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max(len(indexes) for indexes in indexes_batch)
    padList = zeroPadding(indexes_batch)
    mask = torch.BoolTensor(binaryMatrix(padList))
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    """Turn a batch of ``[query, response]`` pairs into model-ready tensors.

    The pairs are ordered by query length (in space-separated tokens),
    longest first, before conversion. The caller's list is left untouched:
    the ordering is done on a sorted copy rather than in place.

    Returns ``(inp, lengths, output, mask, max_target_len)`` as produced by
    ``inputVar`` and ``outputVar``.
    """
    ordered_batch = sorted(pair_batch, key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch = [pair[0] for pair in ordered_batch]
    output_batch = [pair[1] for pair in ordered_batch]
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation
# small_batch_size = 5
# batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
# input_variable, lengths, target_variable, mask, max_target_len = batches

# print("input_variable:", input_variable)
# print("lengths:", lengths)
# print("target_variable:", target_variable)
# print("mask:", mask)
# print("max_target_len:", max_target_len)

### Preparing Model

In [None]:
def calculate_codes(topic_for_code, input_seq_for_code, voc, feature_path, batch_size):
    """Project each input sentence onto the topics via a bag-of-features vector.

    For each of the first ``batch_size`` rows of ``input_seq_for_code``, a 0/1
    vector over the saved NMF feature names is built (1 where the sentence
    contains that feature word). The topic matrix is then multiplied with
    that vector for every row.

    Args:
        topic_for_code: topic matrix; assumed to be a 2-D tensor of shape
            (n_topics, n_features) - TODO confirm against the caller.
        input_seq_for_code: tensor of word indexes, one sentence per row.
        voc: vocabulary whose ``index2word`` maps indexes back to words.
        feature_path: path of the ``.npz`` archive holding ``feature_names``.
        batch_size: number of rows to process (64 for training, 1 for chatting).

    Returns:
        The result of ``torch.bmm`` on the repeated topic matrix and the
        per-sentence feature vectors, on ``device``.
    """
    # Close the archive as soon as the feature names have been read.
    with np.load(feature_path) as archive:
        nmfdict = archive["feature_names"]

    # word -> column positions. A dictionary lookup replaces the linear scan
    # over every feature name that used to run for every token.
    feature_positions = {}
    for position, feature in enumerate(nmfdict):
        feature_positions.setdefault(str(feature), []).append(position)

    new_input_seq = torch.zeros(batch_size, len(nmfdict))
    for i in range(batch_size):
        for token_id in input_seq_for_code[i].tolist():
            word = voc.index2word.get(token_id, 0)
            if word == 0:
                # Index not present in the vocabulary.
                continue
            for position in feature_positions.get(word, ()):
                new_input_seq[i][position] = 1

    three_d_topic = topic_for_code.repeat(batch_size, 1, 1).to(device)
    three_d_q = new_input_seq.repeat(1, 1, 1).permute(1, 2, 0).to(device)

    return torch.bmm(three_d_topic, three_d_q)



class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder that also returns topic codes for the input batch.

    Args:
        hidden_size: size of the word embeddings and of the GRU hidden state.
        embedding: embedding module applied to the input word indexes.
        topics: topic matrix handed to ``calculate_codes``.
        n_layers: number of GRU layers.
        dropout: GRU dropout (forced to 0 when ``n_layers == 1``).
        batch_size: kept for backward compatibility and stored on the module;
            the topic codes are now sized from the actual input batch.
        voc: vocabulary used to map indexes back to words. When omitted, the
            module-level ``voc`` is used, as before.
        feature_path: ``.npz`` archive with the NMF feature names. Defaults
            to the path that used to be hard-coded in ``forward``.
    """

    def __init__(self,
                 hidden_size,
                 embedding,
                 topics,
                 n_layers=1,
                 dropout=0,
                 batch_size=64,
                 voc=None,
                 feature_path=r"/content/drive/MyDrive/Colab Notebooks/Data/presidential_speeches.npz"):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.topics = topics
        self.batch_size = batch_size
        self.voc = voc
        self.feature_path = feature_path
        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size,
                          hidden_size,
                          n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch and compute its topic codes.

        ``input_seq`` is assumed to be time-major, i.e. (max_len, batch): it
        is transposed to get one sentence per row and packed without
        ``batch_first``. Returns ``(outputs, hidden, codes)`` where
        ``outputs`` is the sum of the forward and backward GRU outputs.
        """
        # One sentence per row for the topic-code computation.
        input_seq_for_code = input_seq.transpose(0, 1)
        # Size the codes from the batch actually received. Relying on the
        # constructor's batch_size broke as soon as a different batch size was
        # fed in (e.g. a single sentence when chatting).
        actual_batch_size = input_seq_for_code.size(0)
        # Fall back to the module-level vocabulary when none was injected.
        vocabulary = self.voc if self.voc is not None else voc
        codes = calculate_codes(self.topics, input_seq_for_code, vocabulary,
                                self.feature_path, actual_batch_size)
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.cpu())
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden, codes

class Attn(nn.Module):
    """Luong-style attention scoring with a ``dot``, ``general`` or ``concat`` method.

    Raises ``ValueError`` if ``method`` is not one of the three supported names.
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError("{} is not an appropriate attention method.".format(self.method))
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            # torch.FloatTensor(hidden_size) returns uninitialised memory, so
            # the parameter could start out holding arbitrary values (including
            # inf/NaN). Draw it from a small uniform range instead.
            bound = hidden_size ** -0.5
            self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Score = elementwise product of hidden and encoder output, summed over dim 2."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score = hidden times a linear projection of the encoder output, summed over dim 2."""
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score = v times tanh(linear([hidden; encoder_output])), summed over dim 2."""
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights: the transposed scores, softmaxed over dim 1, with an extra dim at 1."""
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

# Message attention
class LuongAttnDecoderRNN(nn.Module):
    """Single-step GRU decoder with attention over the encoder outputs."""

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Hyper-parameters, kept for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers (created in the same order as before so initialisation is unchanged)
        gru_dropout = 0 if n_layers == 1 else dropout
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode one time step.

        Returns ``(output, hidden)``: the softmax distribution over the
        output vocabulary for this step and the updated GRU hidden state.
        """
        # Embed the current input word and apply dropout
        step_embedding = self.embedding_dropout(self.embedding(input_step))
        # One step through the unidirectional GRU
        rnn_output, hidden = self.gru(step_embedding, last_hidden)
        # Attention weights computed from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Weighted sum of the encoder outputs gives the context vector
        context = torch.bmm(attn_weights, encoder_outputs.transpose(0, 1)).squeeze(1)
        # Combine GRU output and context (Luong eq. 5)
        combined = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = torch.tanh(self.concat(combined))
        # Project to the vocabulary and normalise (Luong eq. 6)
        output = F.softmax(self.out(concat_output), dim=1)
        return output, hidden


def maskNLLLoss(inp, target, mask):
    """Mean negative log-likelihood of the target words, ignoring masked-out positions.

    ``inp`` holds probabilities (not logits): the probability of each target
    index is gathered along dim 1 and its negative log is averaged over the
    positions where ``mask`` is True.

    Returns ``(loss, n_total)`` where ``n_total`` is the number of True mask entries.
    """
    nTotal = mask.sum()
    target_probs = torch.gather(inp, 1, target.view(-1, 1)).squeeze(1)
    crossEntropy = -torch.log(target_probs)
    loss = crossEntropy.masked_select(mask).mean().to(device)
    return loss, nTotal.item()


def validation(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Compute the average masked loss of one batch without updating any weights.

    Mirrors the forward pass of ``train`` but runs under ``torch.no_grad()``,
    so no autograd graph is built for a loss that is never back-propagated.
    ``embedding``, the two optimizers, ``clip`` and ``max_length`` are
    accepted for signature parity with ``train`` and are not used.

    NOTE(review): the encoder/decoder are left in whatever train/eval mode the
    caller set, so dropout may still be active here - confirm that is intended.

    Returns the per-token loss: sum of (step loss * tokens in step) divided
    by the total token count.
    """
    # Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialize variables
    print_losses = []
    n_totals = 0

    with torch.no_grad():
        # Forward pass through encoder
        encoder_outputs, encoder_hidden, codes = encoder(input_variable, lengths)

        # Create initial decoder input (start with SOS tokens for each sentence)
        decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
        decoder_input = decoder_input.to(device)

        # Set initial decoder hidden state to the encoder's final hidden state
        decoder_hidden = encoder_hidden[:decoder.n_layers]

        # Determine if we are using teacher forcing this iteration
        teacher_forcing_ratio = 1.0
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        # Forward batch of sequences through decoder one time step at a time
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs, codes, batch_size
            )
            if use_teacher_forcing:
                # Teacher forcing: next input is current target
                decoder_input = target_variable[t].view(1, -1)
            else:
                # No teacher forcing: next input is decoder's own current output
                _, topi = decoder_output.topk(1)
                decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
                decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    return sum(print_losses) / n_totals

def train(input_variable,lengths,target_variable,mask,max_target_len,encoder,decoder,embedding,encoder_optimizer,decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one optimisation step on a single batch and return its per-token loss.

    The batch is encoded, decoded one time step at a time, the masked loss of
    every step is summed and back-propagated, gradients of both models are
    clipped to ``clip`` and both optimizers take a step. ``embedding`` and
    ``max_length`` are accepted but not used in the body.
    """
    # Start from clean gradients
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()

    # Move the batch to the active device
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Running totals
    loss = 0
    print_losses = []
    n_totals = 0

    # Encode the whole batch
    encoder_outputs, encoder_hidden, codes = encoder(input_variable, lengths)

    # Every sentence starts decoding from the SOS token
    decoder_input = torch.LongTensor([[SOS_token] * batch_size]).to(device)

    # The decoder starts from the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Decide whether this iteration uses teacher forcing
    teacher_forcing_ratio = 1.0
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Decode one time step at a time
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs, codes, batch_size
        )
        if use_teacher_forcing:
            # Teacher forcing: feed the ground-truth token as the next input
            decoder_input = target_variable[t].view(1, -1)
        else:
            # Otherwise feed back the decoder's own most likely token
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
        # Accumulate the masked loss of this step
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Backpropagate the summed loss
    loss.backward()

    # Clip gradients in place, encoder first
    for module in (encoder, decoder):
        _ = nn.utils.clip_grad_norm_(module.parameters(), clip)

    # Update the weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals


def trainIters(model_name,voc,voc_validation,pairs,pairs_validation,encoder,decoder,encoder_optimizer,decoder_optimizer,embedding,encoder_n_layers,decoder_n_layers,save_dir,n_iteration,batch_size,print_every,save_every,clip,corpus_name,loadFilename,DICT_NAME,checkpoint):
    """Run the training loop with a validation pass after every training step.

    For each iteration one pre-sampled training batch is passed to ``train`` and
    one pre-sampled validation batch is passed to ``validation`` (both defined
    elsewhere in the notebook; each must return a number, since the results are
    summed and averaged here). Every ``print_every`` iterations the averaged
    losses are printed and appended to the history file as
    ``"<train_loss> <validation_loss>"``. Every ``save_every`` iterations the
    iteration counter, model/optimizer/embedding state dicts, the last training
    loss and ``voc.__dict__`` are written as separate ``.tar`` files.

    Args:
        model_name, corpus_name, save_dir: path components of the checkpoint
            directory.
        voc, pairs: vocabulary and sentence pairs used to build training batches.
        voc_validation, pairs_validation: same, for the validation batches.
        encoder, decoder, embedding: modules whose state dicts are checkpointed.
        encoder_optimizer, decoder_optimizer: optimizers forwarded to
            ``train``/``validation`` and checkpointed.
        encoder_n_layers, decoder_n_layers: only used in the checkpoint
            directory name.
        n_iteration: last iteration number (inclusive).
        batch_size: number of pairs sampled (with replacement) per batch.
        print_every, save_every: logging / checkpoint periods in iterations.
        clip: gradient clipping threshold forwarded to ``train``/``validation``.
        loadFilename: truthy when resuming; ``checkpoint['iteration']`` is then
            read, so ``checkpoint`` must be a mapping in that case.
        DICT_NAME: suffix used in every checkpoint file name.
        checkpoint: previously loaded checkpoint mapping, or None.

    Returns:
        None.
    """
    # The context manager guarantees the history log is flushed and closed even
    # when a training step raises part-way through the run (the original handle
    # was only closed on the success path).
    with open(r'/content/drive/MyDrive/Colab Notebooks/Data/history.txt', 'w') as history_file:

        # Load batches for each iteration (sampled up front, with replacement)
        training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                          for _ in range(n_iteration)]
        # NOTE(review): validation batches are indexed with voc_validation, not
        # voc; confirm both vocabularies assign the same ids to the same words,
        # otherwise the shared embedding sees mismatched (or out-of-range) ids.
        training_batches_validation = [batch2TrainData(voc_validation, [random.choice(pairs_validation) for _ in range(batch_size)])
                            for _ in range(n_iteration)]

        # Initializations
        print('Initializing ...')
        start_iteration = 1
        print_loss = 0
        print_loss_validation = 0
        if loadFilename:
            start_iteration = checkpoint['iteration'] + 1

        # Training loop
        print("Training...")
        for iteration in range(start_iteration, n_iteration + 1):
            training_batch = training_batches[iteration - 1]
            training_batch_validation = training_batches_validation[iteration - 1]

            # Extract fields from batch
            input_variable, lengths, target_variable, mask, max_target_len = training_batch
            input_variable_validation, lengths_validation, target_variable_validation, mask_validation, max_target_len_validation = training_batch_validation

            # Run a training iteration with batch
            loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                         decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
            print_loss += loss
            loss_validation = validation(input_variable_validation, lengths_validation, target_variable_validation, mask_validation, max_target_len_validation, encoder,
                         decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
            print_loss_validation += loss_validation

            # Print progress
            if iteration % print_every == 0:
                print_loss_avg = print_loss / print_every
                print_loss_avg_validation = print_loss_validation / print_every
                print("Iteration: {}; Percent complete: {:.1f}%; Training loss: {:.4f}; Validation loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg, print_loss_avg_validation))
                history_file.write('{} {}\n'.format(print_loss_avg, print_loss_avg_validation))

                print_loss = 0
                print_loss_validation = 0

            # Save checkpoint
            if iteration % save_every == 0:
                # NOTE(review): the directory name hard-codes 500 instead of
                # taking the hidden size from the caller; kept so existing
                # checkpoint paths stay valid.
                hidden_size = 500
                directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
                os.makedirs(directory, exist_ok=True)

                # One file per component: (file-name prefix, payload). Prefixes
                # and dictionary keys are unchanged from the original calls.
                checkpoint_parts = (
                    ('checkpointiteration_', {'iteration': iteration}),
                    ('checkpointencoder_', {'en': encoder.state_dict()}),
                    ('checkpointdecoder_', {'de': decoder.state_dict()}),
                    ('checkpointenopt_', {'en_opt': encoder_optimizer.state_dict()}),
                    ('checkpointdeopt_', {'de_opt': decoder_optimizer.state_dict()}),
                    ('checkpointloss_', {'loss': loss}),
                    ('checkpointvocdict_', {'voc_dict': voc.__dict__}),
                    ('checkpointembbedding_', {'embedding': embedding.state_dict()}),
                )
                for prefix, payload in checkpoint_parts:
                    torch.save(payload, os.path.join(directory, '{}_{}.tar'.format(iteration, prefix + DICT_NAME)))

class GreedySearchDecoder(nn.Module):
    """Decode a reply by always taking the highest-scoring token at each step.

    Wraps an encoder and a decoder. The encoder is called as
    ``encoder(input_seq, input_length)`` and must return
    ``(encoder_outputs, encoder_hidden, codes)``; the decoder is called as
    ``decoder(decoder_input, decoder_hidden, encoder_outputs, codes, batch_size)``
    and must return ``(decoder_output, decoder_hidden)``.
    """

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length, batch_size=1):
        """Greedily decode ``max_length`` tokens for one input sequence.

        Args:
            input_seq: input token tensor forwarded to the encoder.
            input_length: lengths tensor forwarded to the encoder.
            max_length: number of decoding steps to run.
            batch_size: forwarded to the decoder. Defaults to 1 so the searcher
                can be called as ``searcher(input_batch, lengths, max_length)``,
                which is how ``evaluate`` invokes it.

        Returns:
            ``(all_tokens, all_scores)``: 1-D tensors holding the chosen token
            index and its decoder score for every step.
        """
        # Forward input through encoder model
        encoder_outputs, encoder_hidden, codes = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input,
                                                          decoder_hidden,
                                                          encoder_outputs,
                                                          codes,
                                                          batch_size)
            # Obtain most likely word token and its score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

def my_random_pick(probabilities):
    """Sample one index from ``probabilities[0]`` by inverse-CDF sampling.

    A single uniform draw ``x`` from ``random.uniform(0, 1)`` is compared with
    the running sum of ``probabilities[0]``; the first index whose cumulative
    mass exceeds ``x`` is selected. If the cumulative mass never exceeds ``x``
    (e.g. the row sums to slightly less than 1), the last index is selected.

    Args:
        probabilities: tensor indexed as ``probabilities[0][i]``; only row 0 is
            read. # assumes row 0 is 1-D (one score per vocabulary entry) — TODO confirm

    Returns:
        ``(item_list, score)``: a 1-element long tensor with the chosen index
        and a 1-element float tensor with its probability, both on ``device``.

    Raises:
        ValueError: if ``probabilities[0]`` has no entries.
    """
    x = random.uniform(0, 1)
    row = probabilities[0]
    if row.numel() == 0:
        raise ValueError("probabilities[0] must contain at least one entry")

    # One vectorized cumulative sum replaces a Python loop that issued a tensor
    # add and a tensor comparison for every vocabulary entry.
    cumulative = torch.cumsum(row.detach(), dim=0)
    hits = torch.nonzero(cumulative > x)
    index = int(hits[0, 0]) if hits.numel() > 0 else row.numel() - 1

    item_list = torch.zeros([1], device=device, dtype=torch.long)
    item_list[0] = index
    score = torch.zeros([1], device=device)
    score[0] = row[index]

    return item_list, score

class ProbabilitySearchDecoder(nn.Module):
    """Decode a reply by sampling each token with ``my_random_pick``.

    The encoder is called as ``encoder(input_seq, input_length)`` and must
    return ``(encoder_outputs, encoder_hidden, codes)``; the decoder is called
    as ``decoder(step_input, hidden, encoder_outputs, codes, batch_size)`` and
    must return ``(output, hidden)``. ``batch_size`` is fixed at construction.
    """

    def __init__(self, encoder, decoder, batch_size=1):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.batch_size = batch_size

    def forward(self, input_seq, input_length, max_length):
        """Sample ``max_length`` tokens; return ``(tokens, scores)`` as 1-D tensors."""
        # Encode once; every decoding step reuses these results.
        encoder_outputs, encoder_hidden, codes = self.encoder(input_seq, input_length)

        # Decoder state starts from the first n_layers slices of the encoder state.
        state = encoder_hidden[:self.decoder.n_layers]
        # First decoder input is the start-of-sentence token.
        step_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token

        # Seed both collections with an empty tensor so zero steps still
        # yields empty 1-D results of the right dtype/device.
        picked_tokens = [torch.zeros([0], device=device, dtype=torch.long)]
        picked_scores = [torch.zeros([0], device=device)]

        for _ in range(max_length):
            distribution, state = self.decoder(
                step_input, state, encoder_outputs, codes, self.batch_size
            )
            # Draw the next token from the decoder's output distribution.
            token, token_score = my_random_pick(distribution)
            picked_tokens.append(token)
            picked_scores.append(token_score)
            # The sampled token (with a leading dimension) feeds the next step.
            step_input = token.unsqueeze(0)

        return torch.cat(picked_tokens, dim=0), torch.cat(picked_scores, dim=0)


def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode a reply for one already-normalized sentence.

    Args:
        encoder, decoder: unused here; the models are reached through
            ``searcher``. Kept for interface compatibility.
        searcher: callable invoked as ``searcher(input_batch, lengths,
            max_length)`` that returns ``(tokens, scores)``.
        voc: vocabulary; used by ``indexesFromSentence`` and for the
            ``index2word`` lookup.
        sentence: input sentence to convert to word indexes.
        max_length: number of decoding steps requested from the searcher.

    Returns:
        List of decoded words, one per token returned by the searcher.
    """
    ### Format input sentence as a batch
    # words -> indexes (a batch containing a single sentence)
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher. Only the token ids are used afterwards,
    # so skip autograd bookkeeping for the whole decoding loop.
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop: read a sentence, print the bot's reply and timing.

    Reads lines with ``input('> ')`` until the user enters ``q`` or ``quit``.
    Each line is normalized with ``normalizeString``, decoded with
    ``evaluate``, stripped of ``'EOS'``/``'PAD'`` tokens and printed together
    with the elapsed wall-clock time. A ``KeyError`` raised while handling a
    line is reported as an unknown word and the loop continues.
    """
    # Imported locally so the function is self-sufficient: it calls
    # time.time(), but the notebook's main import cell does not import `time`.
    import time

    input_sentence = ''
    while True:
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit':
                break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            start_time = time.time()
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))
            end_time = time.time()
            print('Responding time:', end_time - start_time)
        except KeyError:
            print("Error: Encountered unknown word.")

def multi_evaluateInput(encoder1, decoder1, encoder2, decoder2, searcher1, searcher2, voc):
    """Interactive chat loop that prints one reply per model for every input.

    Reads lines with ``input('> ')`` until ``q`` or ``quit``. Each normalized
    line is decoded first with model 1 and then with model 2; ``'EOS'`` and
    ``'PAD'`` tokens are dropped before printing. A ``KeyError`` raised while
    handling a line is reported and the loop continues.
    """
    # Decoding order matters: model 1 answers (and is printed) before model 2.
    bots = (
        (encoder1, decoder1, searcher1),
        (encoder2, decoder2, searcher2),
    )
    hidden_tokens = ('EOS', 'PAD')

    input_sentence = ''
    while True:
        try:
            input_sentence = input('> ')
            if input_sentence in ('q', 'quit'):
                break
            input_sentence = normalizeString(input_sentence)
            for bot_encoder, bot_decoder, bot_searcher in bots:
                reply = evaluate(bot_encoder, bot_decoder, bot_searcher, voc, input_sentence)
                # Filter in place, as the original slice assignment did.
                reply[:] = [word for word in reply if word not in hidden_tokens]
                print('Bot:', ' '.join(reply))
        except KeyError:
            print("Error: Encountered unknown word.")

### Training

In [None]:
class TopicAttention(nn.Module):
    """Additive attention over the rows of a topic dictionary.

    For every (batch element, topic) pair the decoder hidden state, the topic
    row and an encoder vector are concatenated, passed through a linear layer
    with ``tanh``, and scored against the learned vector ``v``. Scores are
    softmax-normalized across topics.
    """

    def __init__(self, topic_vocab_size, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        # Input is the concatenation [hidden | topic row | encoder vector].
        self.attn = nn.Linear(topic_vocab_size + dec_hid_dim + enc_hid_dim, dec_hid_dim)
        self.v = nn.Parameter(torch.rand(dec_hid_dim))

    def forward(self, hidden, topic_dict, enc_hidden):
        """Return topic attention weights of shape ``[batch, 1, num_topics]``.

        Args:
            hidden: decoder state, ``[batch, dec_hid_dim]`` or
                ``[1, batch, dec_hid_dim]`` (both tile to the same shape).
            topic_dict: ``[num_topics, topic_vocab_size]``.
            enc_hidden: ``[batch, enc_hid_dim]``.
        """
        n_batch, n_topics = enc_hidden.shape[0], topic_dict.shape[0]

        # Tile every operand to [batch, num_topics, features].
        tiled_hidden = hidden.repeat(n_topics, 1, 1).transpose(0, 1)
        tiled_enc = enc_hidden.repeat(n_topics, 1, 1).transpose(0, 1)
        tiled_topics = topic_dict.repeat(n_batch, 1, 1)

        features = torch.cat((tiled_hidden, tiled_topics, tiled_enc), dim=2)

        # [batch, num_topics, dec_hid_dim] -> [batch, dec_hid_dim, num_topics]
        energy = torch.tanh(self.attn(features)).transpose(1, 2)

        # [batch, 1, dec_hid_dim] x [batch, dec_hid_dim, num_topics] -> [batch, num_topics]
        scoring = self.v.repeat(n_batch, 1).unsqueeze(1)
        scores = torch.bmm(scoring, energy).squeeze(1)

        return F.softmax(scores, dim=1).unsqueeze(1)


class TopicDecoder(nn.Module):
    """Single-step GRU decoder with encoder attention and topic attention.

    Each call decodes one time step. The GRU output is combined with three
    context vectors before the output projection:

    * ``context``: attention-weighted sum of the encoder outputs,
    * ``topic_context``: topic-attention-weighted sum of the topic rows,
    * ``Pk_context``: ``topics^T @ codes``.

    Args:
        attn_model: attention type forwarded to ``Attn``.
        embedding: embedding module applied to the input token.
        hidden_size: GRU input/hidden size.
        output_size: size of the output distribution.
        enc_hid_dim, dec_hid_dim: forwarded to ``TopicAttention``.
        topics: topic matrix, ``[num_topics, topic_vocab_size]``. Stored as a
            plain attribute, so it is not moved by ``.to(device)`` and is not
            part of ``state_dict()``.
        topic_vocab_size: number of columns of ``topics``.
        n_layers: number of GRU layers.
        dropout: dropout probability (embedding dropout; GRU dropout when
            ``n_layers > 1``).
        batch_size: stored on the instance only; ``forward`` takes its own
            ``batch_size`` argument.
    """

    def __init__(self,
                 attn_model,
                 embedding,
                 hidden_size,
                 output_size,
                 enc_hid_dim,
                 dec_hid_dim,
                 topics,
                 topic_vocab_size,
                 n_layers=1,
                 dropout=0.1,
                 batch_size=1):
        super(TopicDecoder, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.topics = topics
        self.topic_vocab_size = topic_vocab_size
        self.batch_size = batch_size

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        # Input of `concat` is [rnn_output | context | topic_context | Pk_context].
        # Both topic_context and Pk_context have topic_vocab_size features (see
        # forward), so the layer is sized from topic_vocab_size instead of the
        # previously hard-coded 1000; the shape is identical when
        # topic_vocab_size == 1000.
        self.concat = nn.Linear(hidden_size * 2 + topic_vocab_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)
        self.topic_attn = TopicAttention(topic_vocab_size, enc_hid_dim, dec_hid_dim)

    def forward(self,
                input_step,
                last_hidden,
                encoder_outputs,
                codes,
                batch_size=1):
        """Decode one time step.

        Args:
            input_step: token indices for this step.
                # assumes shape [1, batch] — TODO confirm against caller
            last_hidden: previous GRU hidden state.
            encoder_outputs: encoder outputs, indexed ``[time, batch, features]``
                (the last time step is used for topic attention and the tensor
                is permuted to batch-first for ``bmm``).
            codes: multiplied as ``bmm(topics^T, codes)`` and then
                ``squeeze(2)``, so it must be ``[batch, num_topics, 1]`` for the
                concatenation below to line up.
            batch_size: number of copies of ``topics`` to tile; must equal the
                batch dimension of the other inputs.

        Returns:
            ``(output, hidden)``: softmax distribution ``[batch, output_size]``
            and the new GRU hidden state.
        """
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)

        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Topic attention is driven by the previous hidden state and the last
        # encoder output.
        topic_attn_weights = self.topic_attn(last_hidden, self.topics, encoder_outputs[-1])

        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        encoder_outputs = encoder_outputs.permute(1,0,2)
        context = torch.bmm(attn_weights, encoder_outputs)

        # Weighted sum of topic rows: [batch, 1, topic_vocab_size]
        topic_context = torch.bmm(topic_attn_weights, self.topics.repeat(batch_size, 1, 1))

        # topics^T @ codes: [batch, topic_vocab_size, num_topics] x codes
        topic_for_Pk = self.topics.repeat(batch_size,1,1).permute(0,2,1)
        Pk_context = torch.bmm(topic_for_Pk, codes)

        # Concatenate weighted context vectors and GRU output (Luong eq. 5,
        # extended with the two topic contexts)
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        topic_context = topic_context.squeeze(1)
        Pk_context = Pk_context.squeeze(2)
        concat_input = torch.cat((rnn_output, context, topic_context, Pk_context), 1)
        concat_output = torch.tanh(self.concat(concat_input))

        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden




class error_TopicDecoder(nn.Module):
    """Alternative topic decoder that feeds the context vectors into the GRU.

    The embedded input word, the attention-weighted encoder context and the
    topic-attention-weighted topic vector are concatenated and used as the GRU
    input; the output projection sees the GRU output, the encoder context and
    the embedding.

    Args:
        output_dim: size of the output layer.
        emb_dim: embedding width.
        enc_hid_dim: encoder hidden size; encoder outputs are expected to have
            ``2 * enc_hid_dim`` features.
        dec_hid_dim: GRU hidden size.
        dropout: dropout probability applied to the embedding.
        attention: module called as ``attention(hidden, enc_out)``; its result
            is unsqueezed at dim 1 and used in ``bmm``, so it must be
            ``[batch, sent_len]``.
        topic_attention: module called as
            ``topic_attention(hidden, topics, enc_out[-1])``; must return
            ``[batch, num_topics]`` for the same reason.
            NOTE(review): ``TopicAttention`` in this notebook already returns
            ``[batch, 1, num_topics]`` and would not fit here unchanged.
        topics: topic matrix ``[num_topics, topic_dim]``.
        embedding: embedding module.
    """

    def __init__(self,
                 output_dim,
                 emb_dim,
                 enc_hid_dim,
                 dec_hid_dim,
                 dropout,
                 attention,
                 topic_attention,
                 topics,
                 embedding):
        # Must name this class: super(TopicDecoder, self) raised TypeError
        # because this class does not derive from TopicDecoder.
        super(error_TopicDecoder, self).__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.topic_dim = topics.shape[1]
        self.attention = attention
        self.topic_attention = topic_attention
        self.topics = topics

        self.embedding = embedding

        self.rnn = nn.GRU(emb_dim + 2 * enc_hid_dim + self.topic_dim, dec_hid_dim)
        self.out = nn.Linear(emb_dim + dec_hid_dim + 2 * enc_hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, word, hidden, enc_out):
        """Decode one step.

        Args:
            word: ``[batch_size]`` token indices.
            hidden: ``[batch_size, dec_hid_dim]`` previous decoder state.
            enc_out: ``[sent_len, batch_size, 2 * enc_hid_dim]`` encoder outputs.

        Returns:
            ``(output, hidden)``: unnormalized scores ``[batch_size, output_dim]``
            and the new state ``[batch_size, dec_hid_dim]``.
        """
        batch_size = word.shape[0]

        word = word.unsqueeze(0)
        # The module defines a single dropout layer (self.dropout); the former
        # self.embedding_dropout call referenced an attribute that never
        # existed and its result was overwritten by this line anyway.
        embedded = self.dropout(self.embedding(word))

        # embedded = [1, batch_size, emb_dim]
        a = self.attention(hidden, enc_out)
        ta = self.topic_attention(hidden, self.topics, enc_out[-1])

        # a = [batch_size, sent_len]
        # ta = [batch_size, num_topics]
        a = a.unsqueeze(1)
        ta = ta.unsqueeze(1)

        enc_out = enc_out.permute(1, 0, 2)

        # enc_out = [batch_size, sent_len, 2 * enc_hid_dim]
        weighted = torch.bmm(a, enc_out)
        topics_weighted = torch.bmm(ta, self.topics.repeat(batch_size, 1, 1))

        # weighted = [batch_size, 1, 2 * enc_hid_dim]
        # topics_weighted = [batch_size, 1,  topic_dim]
        # leave these unsqueezed so the RNN treats it as a seq of len 1
        weighted = weighted.permute(1, 0, 2)
        topics_weighted = topics_weighted.permute(1, 0, 2)

        rnn_input = torch.cat((embedded, weighted, topics_weighted), dim=2)

        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        # topics_weighted is not part of the final projection, only of the GRU input
        output = self.out(torch.cat((output, weighted, embedded), dim=1))

        return output, hidden.squeeze(0)


In [None]:
# Display the vocabulary size; the model-configuration cell reads voc.num_words
# when building the embedding table and the decoder's output layer.
voc.num_words

6376

In [None]:
# Configure models
model_name = 'topiccb_model'
attn_model = 'dot'  # other options: 'general', 'concat'
hidden_size = 500
encoder_n_layers = 1
decoder_n_layers = 1
dropout = 0.1
batch_size = 64 # 64 for training, 1 for chatting


DICT_NAME = 'presidential_speeches'

DICT_PATH = "/content/drive/MyDrive/Colab Notebooks/Data/presidential_speeches.npz"
topic_dict = torch.tensor(np.load(DICT_PATH)["dictionary"], dtype=torch.float).to(device)

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 1000
# loadFilename = os.path.join(save_dir, model_name, corpus_name,
                          #  '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
                          #  '{}_checkpoint.tar'.format(checkpoint_iter))

# Stays None for a fresh run; holds the loaded checkpoint when resuming so that
# trainIters can read checkpoint['iteration'].
checkpoint = None

# Load model if a loadFilename is provided
if loadFilename:
    # map_location=device also covers loading a GPU-trained model on a CPU-only machine
    checkpoint = torch.load(loadFilename, map_location=device)
    # NOTE(review): trainIters writes each of these entries to its own .tar
    # file, while this code expects all keys in a single file; reconcile the
    # two before relying on resume.
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings. The table needs one row per vocabulary index
# (0 .. voc.num_words - 1), matching the decoder's output size below; sizing it
# with voc.num_words - 1 makes the highest word index an out-of-range lookup.
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)

# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size=hidden_size,
                             embedding=embedding,
                             topics=topic_dict,
                             n_layers=encoder_n_layers,
                             dropout=dropout,
                             batch_size=batch_size)


enc_hid_dim, dec_hid_dim, emb_dim = hidden_size, hidden_size, hidden_size
decoder = TopicDecoder(attn_model,
                                  embedding,
                                  hidden_size,
                                  voc.num_words,
                                  enc_hid_dim,
                                  dec_hid_dim,
                                  topic_dict,
                                  topic_dict.shape[1],
                                  decoder_n_layers,dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')


#start training
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 1000
print_every = 1
save_every = 1000

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any restored optimizer state to the active device. Using `device`
# instead of an unconditional .cuda() keeps this cell runnable on CPU-only hosts.
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Run training iterations
print("Starting Training!")
trainIters(model_name=model_name,
                   voc=voc,
                   voc_validation=voc_validation,
                   pairs=pairs,
                   pairs_validation=pairs_validation,
                   encoder=encoder,
                   decoder=decoder,
                   encoder_optimizer=encoder_optimizer,
                   decoder_optimizer=decoder_optimizer,
                   embedding=embedding,
                   encoder_n_layers=encoder_n_layers,
                   decoder_n_layers=decoder_n_layers,
                   save_dir=save_dir,
                   n_iteration=n_iteration,
                   batch_size=batch_size,
                   print_every=print_every,
                   save_every=save_every,
                   clip=clip,
                   corpus_name=corpus_name,
                   loadFilename=loadFilename,
                   DICT_NAME=DICT_NAME,
                   checkpoint=checkpoint)

Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...


IndexError: ignored