












# MTA-LSTM-PyTorch

In [1]:
# ! pip install matplotlib==3.1.1

In [2]:
# Mount Google Drive (Colab only); later cells read data and write checkpoints under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import gensim
from gensim.models import KeyedVectors
import numpy as np
import torch
from torch import nn, autograd, optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn import Parameter, LayerNorm
from torch.autograd import Variable
import torch.jit as jit
import torch.nn.functional as F
import time
import os
import math
from tqdm import tqdm
import collections
from collections import namedtuple
import random

In [4]:
# Record the library versions this notebook was executed with (one per line).
print(gensim.__version__, torch.__version__, sep='\n')


3.6.0
1.8.1+cu101


In [5]:
# Expose GPUs 0 and 1 to this process, then select GPU 0 when CUDA is usable
# and fall back to the CPU otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
print('Available cuda:', torch.cuda.device_count())
if not torch.cuda.is_available():
    deviceName = "cpu"
else:
    device_num = 0
    torch.cuda.set_device(device_num)
    deviceName = f"cuda:{device_num}"
    print('Current device:', torch.cuda.current_device())

# `device` is the torch.device that later cells move tensors and modules to.
device = torch.device(deviceName)
print(deviceName)

Available cuda: 1
Current device: 0
cuda:0



**Build a dictionary and pretrained embedding system**

Here we load the pretrained word2vec vocabulary and vectors. Please refer to this notebook to see how to train them.

`fvec.vectors` is where we get the pretrained vectors. <PAD>, <BOS>, <EOS> and <UNK> are four special tokens that stand for PADding, Begin-Of-Sentence, End-Of-Sentence and UNKnown respectively. We simply prepend them to the vocabulary.

In [6]:
 # Locate the pretrained word2vec text file on the mounted Drive.
# The previous `try/finally` always ended with the second path, because a
# `finally` clause runs unconditionally. Select explicitly instead: prefer the
# path that was effectively used so far, and fall back to the "Colab Notebooks"
# copy only when the preferred file is missing.
_w2v_candidates = [
    '/content/drive/MyDrive/Kaavish_Implementation_01_res/UrduW2V/data_big_W2V.txt',
    '/content/drive/MyDrive/Colab Notebooks/Kaavish/Kaavish_Implementation_01_res/UrduW2V/data_big_W2V.txt',
]
# If neither file exists, keep the preferred path so the loader reports it in its error.
file_path = next((p for p in _w2v_candidates if os.path.isfile(p)), _w2v_candidates[0])

fvec = KeyedVectors.load_word2vec_format(file_path, binary=False)
word_vec = fvec.vectors
# The four special tokens are prepended, which fixes their ids:
# <PAD>=0, <BOS>=1, <EOS>=2, <UNK>=3.
vocab = ['<PAD>', '<BOS>', '<EOS>', '<UNK>']
vocab.extend(list(fvec.vocab.keys()))
# Prepend four all-zero rows so that vocab[i] and word_vec[i] stay aligned.
word_vec = np.concatenate((np.zeros((4, word_vec.shape[1]), dtype=word_vec.dtype), word_vec))
word_vec = torch.tensor(word_vec).float()
del fvec
print("total %d words" % len(word_vec))

total 146817 words


In [7]:
# Folder on the mounted Drive where the vocabulary, embeddings and checkpoints live.
# The previous `try/finally` always ended with the second path (a `finally`
# clause runs unconditionally). Select explicitly: prefer the folder that was
# effectively used so far, fall back to the "Colab Notebooks" copy if it is missing.
_save_folder_candidates = [
    '/content/drive/MyDrive/Kaavish_Implementation_01_res/modelTrained',
    '/content/drive/MyDrive/Colab Notebooks/Kaavish/Kaavish_Implementation_01_res/modelTrained',
]
save_folder = next((p for p in _save_folder_candidates if os.path.isdir(p)), _save_folder_candidates[0])

In [8]:
#save_folder = '/content/drive/MyDrive/Colab Notebooks/Kaavish/Kaavish_Implementation_01_res/modelTrained'
# Persist the vocabulary list and the embedding matrix inside the checkpoint folder.
vocab_check_point = f'{save_folder}/vocab.pkl'
word_vec_check_point = f'{save_folder}/word_vec.pkl'
torch.save(vocab, vocab_check_point)
torch.save(word_vec, word_vec_check_point)


**Build a word-index convertor**

We don't want to feed raw strings to the network during training; instead, we map each word to a unique integer index. In the text generation phase, we then convert the indices back to strings.

In [9]:
# Two lookup tables over the same vocabulary list: integer id -> token and token -> integer id.
idx_to_word = dict(enumerate(vocab))
word_to_idx = {token: index for index, token in idx_to_word.items()}

**Load preprocessed data**

You can prepare your own data, or simply use the data provided in the data folder.

In [10]:
essays = []
topics = []

# Locate the preprocessed corpus. The previous `try/finally` always ended with
# the second path (a `finally` clause runs unconditionally); select explicitly
# and fall back to the "Colab Notebooks" copy only if the preferred file is missing.
_data_candidates = [
    "/content/drive/MyDrive/Kaavish_Implementation_01_res/data_keyed_5_shortPara.txt",
    "/content/drive/MyDrive/Colab Notebooks/Kaavish/Kaavish_Implementation_01_res/data_keyed_5_shortPara.txt",
]
data_path = next((p for p in _data_candidates if os.path.isfile(p)), _data_candidates[0])

# Count the lines first so tqdm can show a total. The `with` block closes the
# handle (the bare `open(...)` used before was never closed). The encoding is
# explicit because the corpus is Urdu text and must not depend on the locale.
with open(data_path, 'r', encoding='utf-8') as f:
    num_lines = sum(1 for _ in f)

# Each line has the form "<essay tokens> </d> <topic tokens>", tokens separated by spaces.
with open(data_path, encoding='utf-8') as f:
    for line in tqdm(f, total=num_lines):
        essay, topic = line.replace('\n', '').split(' </d> ')
        essays.append(essay.strip().split(' '))
        topics.append(topic.strip().split(' '))

assert len(topics) == len(essays)

100%|██████████| 92325/92325 [00:02<00:00, 36992.44it/s]


In [11]:
# Number of (essay, topic) samples read from the data file.
print(len(essays))

92325


We then map the training and test corpora to integer indices word by word, with the help of our converter. Note that a word is mapped to <UNK> if it is not in the dictionary.

In [12]:
divider = 90000        # no. at which you want to divide corpus into train and test samples
total = 92325          # total no. of samples -- replace with len(essays) if want to use all samples in dataset


def _encode_documents(documents):
    """Map every token of every document to its vocabulary id.

    Tokens missing from ``word_to_idx`` are mapped to the id of ``<UNK>``.
    A tqdm progress bar is shown while the documents are processed.
    """
    return [
        [word_to_idx[w] if (w in word_to_idx) else word_to_idx['<UNK>'] for w in doc]
        for doc in tqdm(documents)
    ]


# Samples [0, divider) form the training split, samples [divider, total) the test split.
corpus_indice = _encode_documents(essays[:divider])
topics_indice = _encode_documents(topics[:divider])
corpus_test = _encode_documents(essays[divider:total])
topics_test = _encode_documents(topics[divider:total])

100%|██████████| 90000/90000 [00:02<00:00, 31843.27it/s]
100%|██████████| 90000/90000 [00:00<00:00, 341846.45it/s]
100%|██████████| 2325/2325 [00:00<00:00, 34980.87it/s]
100%|██████████| 2325/2325 [00:00<00:00, 292415.27it/s]


In [13]:
# Size of the test split versus the size of the whole dataset.
print(len(corpus_test))
print(len(essays))

2325
92325


In [14]:

def viewData(topics, X):
    """Print the topic words of one sample.

    Both id sequences are decoded through the global ``idx_to_word`` table,
    but only the decoded topics are printed; the decoded essay is discarded.

    Args:
        topics: iterable of vocabulary ids of the topic words.
        X: iterable of vocabulary ids of the essay tokens.
    """
    topic_words = list(map(idx_to_word.__getitem__, topics))
    # Decoded only so that an id missing from the table still raises KeyError.
    essay_words = list(map(idx_to_word.__getitem__, X))
    print(topic_words)

#print(len(corpus_indice))

# viewData(topics_indice[10], corpus_indice[10])

# Spot-check ten training samples: print their topic words and how many topics each has.
for i in range(1000,1010):
    viewData(topics_indice[i], corpus_indice[i])
    print(len(topics_indice[i]))

['مدد', 'حوصلہ', 'فراہم', 'بھیجے', 'وسائل']
5
['بھرپور', 'کوشش', 'قوت', 'امریکہ', 'امریکا']
5
['ہو', 'کردیا', 'زخمی', 'درجنوں', 'شدید']
5
['فورسز', 'جس', 'قبضہ', 'عراق', 'افواج']
5
['قبضہ', 'فلوجہ', 'ہوگئے', 'رمادی', 'تقریبا']
5
['وکٹ', 'تیسرا', 'انڈیز', 'دے', 'نیوزی']
5
['جواب', 'کن', 'ساتھ', 'سیریز', '3-0']
5
['گئے', 'روزہ', 'ہدف', 'باؤلنگ', 'عوض']
5
['ہو', 'سٹیفنی', 'جواب', 'نمایاں', 'مطلوبہ']
5
['اسے', 'جائے', 'رہی', 'میچ', 'وڈ']
5


In [15]:
from random import shuffle

def shuffleData(topics_indice, corpus_indice):
    """Shuffle topics and essays with one shared random permutation.

    The earlier version converted both inputs to NumPy arrays before indexing.
    The essays have different lengths, so that produced a ragged object array,
    which recent NumPy releases reject with a ValueError. Plain list indexing
    gives the same result without depending on NumPy.

    Args:
        topics_indice: list of topic-id lists, one per sample.
        corpus_indice: list of essay-id lists, aligned with ``topics_indice``.

    Returns:
        ``(topics_indice, corpus_indice)`` as new lists of lists in the same
        shuffled order; the inputs are left untouched.
    """
    ind_list = list(range(len(topics_indice)))
    shuffle(ind_list)
    # Apply the same permutation to both lists so every topic stays paired with its essay.
    shuffled_topics = [list(topics_indice[i]) for i in ind_list]
    shuffled_corpus = [list(corpus_indice[i]) for i in ind_list]
    return shuffled_topics, shuffled_corpus

# Shuffling here is disabled; this only prints the topic words of the first training sample.
# topics_indice, corpus_indice = shuffleData(topics_indice, corpus_indice)
viewData(topics_indice[0], corpus_indice[0])

['کھیلے', 'زمبابوے', 'آخری', 'دیش', 'گئے']


In [16]:
# Sanity check: report every training sample whose topic list does not hold exactly 5 ids.
# NOTE(review): the message says 'less than 5' but the test is `!= 5`, so longer lists are reported too.
for t in topics_indice:
    if len(t) != 5:
        print('less than 5')

In [17]:
# Token count of every training essay; the bare `length` on the last line displays the list.
length = [len(doc) for doc in corpus_indice]
length

[98,
 90,
 80,
 86,
 88,
 99,
 87,
 82,
 86,
 96,
 89,
 97,
 80,
 97,
 82,
 83,
 87,
 80,
 94,
 87,
 91,
 98,
 91,
 91,
 94,
 91,
 92,
 87,
 97,
 96,
 93,
 85,
 85,
 93,
 82,
 94,
 81,
 84,
 89,
 85,
 99,
 94,
 81,
 92,
 91,
 91,
 93,
 94,
 89,
 98,
 99,
 90,
 83,
 82,
 97,
 95,
 97,
 89,
 88,
 83,
 89,
 83,
 99,
 96,
 90,
 90,
 88,
 83,
 82,
 97,
 94,
 98,
 98,
 92,
 97,
 98,
 99,
 85,
 88,
 91,
 93,
 99,
 91,
 97,
 92,
 84,
 84,
 90,
 85,
 90,
 89,
 91,
 84,
 89,
 95,
 100,
 84,
 87,
 90,
 90,
 92,
 82,
 93,
 80,
 99,
 92,
 98,
 92,
 83,
 85,
 93,
 94,
 95,
 90,
 93,
 97,
 90,
 97,
 100,
 87,
 98,
 95,
 97,
 83,
 87,
 88,
 86,
 88,
 95,
 95,
 80,
 90,
 89,
 96,
 83,
 89,
 89,
 100,
 100,
 99,
 92,
 81,
 87,
 96,
 81,
 100,
 82,
 85,
 85,
 83,
 84,
 90,
 94,
 81,
 80,
 80,
 81,
 84,
 98,
 89,
 88,
 94,
 83,
 85,
 96,
 91,
 99,
 82,
 97,
 94,
 97,
 86,
 98,
 90,
 84,
 94,
 94,
 91,
 89,
 85,
 97,
 92,
 98,
 81,
 80,
 100,
 91,
 94,
 84,
 95,
 88,
 80,
 92,
 83,
 92,
 81,
 90,
 83,
 91,

In [18]:
# Drop the raw token lists; only the integer-encoded copies are used from here on.
# Re-running the encoding cell after this point requires reloading the data first.
del essays
del topics

**Batch data iterator**

We want to iterate through the training data in batches and feed them into the network. This is how we prepare the batches:

In [19]:
def data_iterator(corpus_indice, topics_indice, batch_size, num_steps):
    """Yield fixed-size training batches built from encoded essays and topics.

    Every essay is wrapped as ``<BOS> essay <EOS>`` (ids 1 and 2), right-padded
    with 0 to ``num_steps + 1`` tokens, and split into an input sequence and
    the same sequence shifted by one position as the target.

    Essays whose wrapped length exceeds ``num_steps + 1`` are truncated to that
    length. Previously such an essay raised a ValueError when it was copied into
    the fixed-width batch buffer.

    Args:
        corpus_indice: list of essays, each a list of vocabulary ids.
        topics_indice: list of topic-id lists aligned with ``corpus_indice``;
            all lists in one batch must have the same length.
        batch_size: number of samples per batch; a trailing incomplete batch is dropped.
        num_steps: number of time steps of the input/target sequences.

    Yields:
        ``(x, y, mask, key_words)`` tensors:
        ``x`` int64 ``[batch_size, num_steps]`` inputs,
        ``y`` int64 ``[batch_size, num_steps]`` targets (``x`` shifted left by one),
        ``mask`` float32 ``[batch_size, num_steps]``, 1.0 where ``x`` is not padding (id 0),
        ``key_words`` int64 ``[batch_size, num_keywords]``.
    """
    bos_id, eos_id = 1, 2
    epoch_size = len(corpus_indice) // batch_size
    for batch_idx in range(epoch_size):
        start, stop = batch_idx * batch_size, (batch_idx + 1) * batch_size
        raw_data = corpus_indice[start:stop]
        key_words = np.array(topics_indice[start:stop], dtype=np.int64)

        # One extra column so that inputs and shifted targets both have num_steps entries.
        data = np.zeros((len(raw_data), num_steps + 1), dtype=np.int64)
        for row, doc in enumerate(raw_data):
            tokens = np.array([bos_id, *doc, eos_id], dtype=np.int64)[:num_steps + 1]
            data[row, :tokens.shape[0]] = tokens

        x = data[:, :num_steps]
        y = data[:, 1:]
        mask = np.float32(x != 0)  # id 0 is <PAD>
        yield (torch.tensor(x), torch.tensor(y), torch.tensor(mask), torch.tensor(key_words))

**Build model: MTA-LSTM**
This is the most important part in the notebook.

**Bahdanau Attention**

In [20]:
class Attention(nn.Module):
    """Bahdanau (additive / MLP) attention over topic embeddings, scaled by a coverage vector.

    For every keyword ``j`` the unnormalised score is
    ``va(tanh(Wa(query) + Ua(topic_j))) * coverage_j``; the scores are then
    normalised with a softmax over the keywords.
    """

    def __init__(self, hidden_size, embed_size):
        super(Attention, self).__init__()

        self.Ua = nn.Linear(embed_size, hidden_size, bias=False)   # projects topic embeddings
        self.Wa = nn.Linear(hidden_size, hidden_size, bias=False)  # projects the decoder query
        self.va = nn.Linear(hidden_size, 1, bias=True)             # maps the energy to a scalar score
        # to store attention scores
        self.alphas = None

    def forward(self, query, topics, coverage_vector):
        """Compute attention weights and the attended topic vector.

        Args:
            query: decoder state, ``[batch, hidden]`` (a 1-D ``[hidden]`` query is
                broadcast over the batch).
            topics: topic embeddings, ``[batch, num_keywords, embed]``.
            coverage_vector: per-keyword coverage, ``[batch, num_keywords]``.

        Returns:
            ``(mt, alphas)`` with ``mt`` of shape ``[batch, embed]`` (weighted sum of
            the topic embeddings) and ``alphas`` of shape ``[batch, num_keywords]``.
        """
        # Project the query ONCE. The previous loop re-assigned
        # `query = self.Wa(query)` on every keyword iteration, so keyword i was
        # scored against Wa applied i+1 times instead of a single projection.
        # NOTE: checkpoints trained with the old behaviour will produce
        # different attention weights after this fix.
        proj_query = self.Wa(query)
        proj_query = proj_query.reshape(-1, 1, proj_query.shape[-1])  # [batch or 1, 1, hidden]
        proj_keys = self.Ua(topics)                                   # [batch, num_keywords, hidden]

        # Score all keywords at once, then weight by the coverage vector.
        scores = self.va(torch.tanh(proj_query + proj_keys)).squeeze(2)
        scores = scores * coverage_vector

        # turn scores to probabilities
        alphas = F.softmax(scores, dim=1)
        self.alphas = alphas

        # mt vector is the weighted sum of the topics
        mt = torch.bmm(alphas.unsqueeze(1), topics).squeeze(1)

        # mt shape: [batch x embed], alphas shape: [batch x num_keywords]
        return mt, alphas

**Attention Decoder**

In [21]:
class AttentionDecoder(nn.Module):
    """Single decoding step: topic attention followed by an LSTM.

    The LSTM consumes the current input embedding concatenated with the
    attended topic vector, hence ``input_size = embed_size * 2``.
    """

    def __init__(self, hidden_size, embed_size, num_layers, dropout=0.5):
        super().__init__()

        self.hidden_size, self.embed_size = hidden_size, embed_size
        self.num_layers, self.dropout = num_layers, dropout

        # topic attention
        self.attention = Attention(hidden_size, embed_size)

        # lstm over [input embedding ; attended topic vector]
        self.rnn = nn.LSTM(
            input_size=embed_size * 2,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )

    def forward(self, input, output, hidden, phi, topics, coverage_vector):
        """Run one step and return ``(output, hidden, score, coverage_vector)``.

        ``output`` is the previous decoder output and serves as the attention
        query; ``score`` holds the attention weights of this step; the returned
        coverage vector is the incoming one reduced by ``score / phi``.
        """
        # 1. attention weights and attended topic vector, the latter reshaped to [1 x batch x embed]
        attended, score = self.attention(output.squeeze(0), topics, coverage_vector)
        attended = attended.unsqueeze(0)

        # 2. update coverage vector [batch x num_keywords]
        updated_coverage = coverage_vector - score / phi

        # 3. concat the input with the attended topics and feed the LSTM
        rnn_input = torch.cat([input, attended], dim=2)
        output, hidden = self.rnn(rnn_input, hidden)

        return output, hidden, score, updated_coverage

**MTA-LSTM model**

In [22]:
# Named (hx, cx) pair for LSTM states; not referenced elsewhere in the visible part of the notebook.
LSTMState = namedtuple('LSTMState', ['hx', 'cx'])

class MTALSTM(nn.Module):
    """Multi-Topic-Aware LSTM: an attention decoder over topic words with a coverage vector.

    Args:
        hidden_dim: LSTM hidden size.
        embed_dim: word-embedding size (must match ``weight``).
        num_keywords: number of topic words per sample.
        num_layers: number of stacked LSTM layers.
        weight: pretrained embedding matrix, kept frozen.
        num_labels: vocabulary size of the output (adaptive) softmax.
        bidirectional: stored on the instance; not used by the visible code.
        dropout: LSTM inter-layer dropout; forced to 0 for a single layer.
    """

    def __init__(self, hidden_dim, embed_dim, num_keywords, num_layers, weight,
                 num_labels, bidirectional, dropout=0.5, **kwargs):
        super(MTALSTM, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.num_layers = num_layers
        self.num_labels = num_labels
        self.bidirectional = bidirectional
        # Inter-layer dropout has no effect with a single LSTM layer.
        if num_layers <= 1:
            self.dropout = 0
        else:
            self.dropout = dropout
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False
        self.Uf = nn.Linear(embed_dim * num_keywords, num_keywords, bias=False)

        # attention decoder. It now receives the effective dropout computed above;
        # the raw `dropout` argument was passed before, so the single-layer
        # override never reached the LSTM.
        self.decoder = AttentionDecoder(hidden_size=hidden_dim,
                                        embed_size=embed_dim,
                                        num_layers=num_layers,
                                        dropout=self.dropout)

        # adaptive softmax
        self.adaptiveSoftmax = nn.AdaptiveLogSoftmaxWithLoss(hidden_dim,
                                                             num_labels,
                                                             cutoffs=[round(num_labels / 20), 4*round(num_labels / 20)])

    def _keyword_gate(self, topics_embed):
        """Return sigmoid(Uf(flattened topic embeddings)), shape [batch x num_keywords]."""
        flat_topics = topics_embed.reshape(topics_embed.shape[0], -1).float()
        return torch.sigmoid(self.Uf(flat_topics))

    def _decode_steps(self, step_inputs, topics_embed, phi, output, hidden, coverage_vector):
        """Feed ``step_inputs`` one by one through the decoder.

        Returns the stacked decoder outputs, the final hidden state, the stacked
        attention weights and the final coverage vector.
        """
        output_states = []
        attn_weight = []
        for step_input in step_inputs:
            output, hidden, score, coverage_vector = self.decoder(input=step_input.unsqueeze(0),
                                                                  output=output,
                                                                  hidden=hidden,
                                                                  phi=phi,
                                                                  topics=topics_embed,
                                                                  coverage_vector=coverage_vector)
            output_states.append(output)
            attn_weight.append(score)
        return torch.stack(output_states), hidden, torch.stack(attn_weight), coverage_vector

    def forward(self, inputs, topics, output, hidden=None, mask=None, target=None, coverage_vector=None, seq_length=None):
        """Teacher-forced pass over a batch that returns the adaptive-softmax loss.

        ``mask`` and ``target`` are required despite their ``None`` defaults;
        ``seq_length`` is accepted but not used here.

        Returns:
            ``(outputs, output_states, hidden, attn_weight, coverage_vector)`` where
            ``outputs`` is the result of ``AdaptiveLogSoftmaxWithLoss``.
        """
        embeddings = self.embedding(inputs)
        topics_embed = self.embedding(topics)
        # phi [batch x num_keywords]: number of non-padded tokens times the keyword gate
        phi = torch.sum(mask, dim=1, keepdim=True) * self._keyword_gate(topics_embed)

        # loop through the sequence, one time step at a time
        step_inputs = embeddings.permute([1, 0, 2]).unbind(0)
        output_states, hidden, attn_weight, coverage_vector = self._decode_steps(
            step_inputs, topics_embed, phi, output, hidden, coverage_vector)

        # calculate loss by adaptiveSoftmax
        outputs = self.adaptiveSoftmax(output_states.reshape(-1, output_states.shape[-1]), target.t().reshape((-1,)))

        return outputs, output_states, hidden, attn_weight, coverage_vector

    def inference(self, inputs, topics, output, hidden=None, mask=None, coverage_vector=None, seq_length=None):
        """Decode from the last token of ``inputs`` and return log-probabilities over the vocabulary.

        ``seq_length`` is required despite its ``None`` default and takes the place
        of the mask sum used in ``forward``; ``mask`` is accepted but not used here.
        """
        embeddings = self.embedding(inputs)
        topics_embed = self.embedding(topics)

        phi = seq_length.float() * self._keyword_gate(topics_embed)

        # Keep only the last time step of the input sequence.
        queries = embeddings.permute([1, 0, 2])[-1].unsqueeze(0)

        step_inputs = queries.permute([1, 0, 2]).unbind(0)
        output_states, hidden, attn_weight, coverage_vector = self._decode_steps(
            step_inputs, topics_embed, phi, output, hidden, coverage_vector)

        outputs = self.adaptiveSoftmax.log_prob(output_states.reshape(-1, output_states.shape[-1]))
        return outputs, output_states, hidden, attn_weight, coverage_vector

    def _model_device(self):
        """Device the embedding weights live on, i.e. the device of the model."""
        return self.embedding.weight.device

    def init_hidden(self, batch_size):
        """Return a zero ``(h0, c0)`` LSTM state, created on the model's own device."""
        state_shape = (self.num_layers, batch_size, self.hidden_dim)
        model_device = self._model_device()
        hidden = (torch.zeros(*state_shape, device=model_device),
                  torch.zeros(*state_shape, device=model_device))
        return hidden

    def init_coverage_vector(self, batch_size, num_keywords):
        """Return an all-ones coverage vector [batch x num_keywords] on the model's own device."""
        return torch.ones([batch_size, num_keywords], device=self._model_device())


**Greedy decode strategy**

In [23]:

def pad_topic(topics, max_num=5):
    """Encode topic words and pad or truncate them to a fixed-width id row.

    The previous implementation indexed a 1-D tensor as if it were 2-D
    (``topics[i][j]``) and therefore failed for any non-empty input. It also
    printed debugging output and moved the ids to the GPU for no reason.

    Args:
        topics: iterable of topic words (strings).
        max_num: number of topic slots; change it to match the number of topics in use.

    Returns:
        Integer NumPy array of shape ``(1, max_num)``. Words missing from
        ``word_to_idx`` map to ``<UNK>``, unused slots hold 0 (``<PAD>``), and
        words beyond ``max_num`` are dropped.
    """
    unk_id = word_to_idx['<UNK>']
    topic_ids = [word_to_idx.get(word, unk_id) for word in topics][:max_num]
    ans = np.zeros((1, max_num), dtype=int)
    if topic_ids:
        ans[0, :len(topic_ids)] = topic_ids
    return ans

In [24]:

def predict_rnn(topics, num_chars, model, idx_to_word, word_to_idx):
    """Generate text for ``topics`` with greedy decoding.

    Changes compared with the earlier version:
      * inputs are always moved to the notebook's ``device``. They used to be
        moved only when ``use_gpu`` was set, while the start state was moved
        unconditionally, which mixed devices when ``use_gpu`` was False on a
        CUDA machine;
      * decoding runs under ``torch.no_grad()`` so no autograd graph is
        accumulated across the generated steps;
      * ``num_chars == 0`` no longer fails on an undefined loop variable.

    Args:
        topics: list of topic words; every word must be in ``word_to_idx``.
        num_chars: maximum number of tokens to generate.
        model: model exposing ``init_hidden``, ``init_coverage_vector`` and ``inference``.
        idx_to_word: id -> word table.
        word_to_idx: word -> id table.

    Returns:
        ``(text, words, attentions, ids)``: the generated words joined without a
        separator, the list of generated words, the attention weights as a
        ``[num_keywords x steps]`` tensor (including the step that produced
        ``<EOS>``, if any) and the generated token ids.
    """
    bos_id, eos_id = 1, 2
    output_idx = [bos_id]
    topics = torch.tensor([word_to_idx[x] for x in topics])
    topics = topics.reshape((1, topics.shape[0])).to(device)
    hidden = model.init_hidden(batch_size=1)
    if use_gpu:
        # Kept for backward compatibility: moves the notebook-level adaptive_softmax module.
        adaptive_softmax.to(device)
    coverage_vector = model.init_coverage_vector(topics.shape[0], topics.shape[1])
    attentions = torch.zeros(num_chars, topics.shape[1])
    # NOTE(review): the nominal sequence length fed to the coverage update is hard-coded to 50.
    seq_length = torch.tensor(50).reshape(1, 1).to(device)
    # Attention query for the very first step.
    output = torch.zeros(1, hidden_dim).to(device)

    steps = 0
    with torch.no_grad():
        for t in range(num_chars):
            steps = t + 1
            X = torch.tensor(output_idx[-1]).reshape((1, 1)).to(device)
            if t > 0:
                # Drop the leading stack dimension added by model.inference.
                output = output.squeeze(0)
            pred, output, hidden, attn_weight, coverage_vector = model.inference(
                inputs=X, topics=topics, output=output, hidden=hidden,
                coverage_vector=coverage_vector, seq_length=seq_length)
            pred = pred.argmax(dim=1)  # greedy strategy
            attentions[t] = attn_weight[0].data
            if pred[-1] == eos_id:
                break
            output_idx.append(int(pred[-1]))

    words = [idx_to_word[i] for i in output_idx[1:]]
    return (''.join(words), words, attentions[:steps].t(), output_idx[1:])

In [25]:
# Scratch check of the padding-mask expression used in data_iterator: non-zero ids map to 1.0.
test = [1, 15, 23]
test = np.array(test, dtype=np.int64)
mm = np.float32(test != 0)
mm

array([1., 1., 1.], dtype=float32)

**Beam search strategy**

In [26]:
def beam_search(topics, num_chars, model, idx_to_word, word_to_idx, is_sample=False):
    """Generate text for ``topics`` with beam search (beam width = global ``beam_size``).

    Changes compared with the earlier version:
      * ``seq_length`` and the device placement of ``topics`` no longer depend
        on ``use_gpu``; before, ``use_gpu = False`` raised a NameError on
        ``seq_length``;
      * the duplicated "first step" and "later steps" expansion code is shared;
      * decoding runs under ``torch.no_grad()``.

    Args:
        topics: list of topic words; every word must be in ``word_to_idx``.
        num_chars: number of decoding steps (at least one step is always run).
        model: model exposing ``init_hidden``, ``init_coverage_vector`` and ``inference``.
        idx_to_word: id -> word table.
        word_to_idx: word -> id table.
        is_sample: if True, successors are sampled from the predicted distribution
            instead of taking the highest-probability words.

    Returns:
        ``(text, words, attentions, coverage)`` of the best beam, cut at the first
        ``<EOS>`` (or at ``num_chars - 1`` entries if there is none). ``words``
        starts with ``<BOS>``; ``attentions`` is ``[num_keywords x len(words)]``
        and ``coverage`` is ``[len(words) x num_keywords]``.
    """
    topics = torch.tensor([word_to_idx[x] for x in topics])
    topics = topics.reshape((1, topics.shape[0])).to(device)
    if use_gpu:
        # Kept for backward compatibility: moves the notebook-level adaptive_softmax module.
        adaptive_softmax.to(device)
    # NOTE(review): the nominal sequence length fed to the coverage update is hard-coded to 50.
    seq_length = torch.tensor(50).reshape(1, 1).to(device)
    num_topics = topics.shape[1]

    def _expand(beam):
        """Advance one beam by a single step and return its ``beam_size`` successors."""
        score, words, last_idx, attn_hist, cov_hist, hidden, output, coverage_vector = beam
        X = torch.tensor(last_idx).reshape((1, 1)).to(device)
        log_prob, output, hidden, attn_weight, coverage_vector = model.inference(inputs=X,
                                                                                 topics=topics,
                                                                                 output=output,
                                                                                 hidden=hidden,
                                                                                 coverage_vector=coverage_vector,
                                                                                 seq_length=seq_length)
        log_prob = log_prob.cpu().detach().reshape(-1).numpy()
        if is_sample:
            top_indices = np.random.choice(vocab_size, beam_size, replace=False, p=np.exp(log_prob))
        else:
            top_indices = np.argsort(-log_prob)
        # Histories get one more row per step so they stay aligned with `words`.
        attn_hist = torch.cat((attn_hist, attn_weight[0].cpu().data), 0)
        cov_hist = torch.cat((cov_hist, coverage_vector.cpu().data), 0)
        return [(score + log_prob[word_idx],
                 words + [idx_to_word[word_idx]],
                 word_idx,
                 attn_hist,
                 cov_hist,
                 hidden,
                 output.squeeze(0),
                 coverage_vector)
                for word_idx in top_indices[:beam_size]]

    # Beam layout: (log-prob, words, last word id, attention history,
    #               coverage history, LSTM state, attention query, coverage vector).
    beams = [(0.0,
              [idx_to_word[1]],
              1,
              torch.zeros(1, num_topics),
              torch.ones(1, num_topics),
              model.init_hidden(batch_size=1),
              torch.zeros(1, hidden_dim).to(device),
              model.init_coverage_vector(topics.shape[0], num_topics))]

    with torch.no_grad():
        # One initial step plus (num_chars - 1) further steps, as before.
        for _ in range(max(num_chars, 1)):
            beam_candidates = []
            for b in beams:
                beam_candidates.extend(_expand(b))
            beam_candidates.sort(key = lambda x:x[0], reverse = True) # decreasing order
            beams = beam_candidates[:beam_size] # truncate to get new beams

    best_words = beams[0][1]
    if '<EOS>' in best_words:
        first_eos = best_words.index('<EOS>')
    else:
        first_eos = num_chars-1
    return (''.join(best_words[:first_eos]), best_words[:first_eos], beams[0][3][:first_eos].t(), beams[0][4][:first_eos])

**Attention visualization**

In [27]:
# plt.switch_backend('agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
    
def showAttention(input_sentence, output_words, attentions):
    """Plot the attention matrix as a heat map with word labels on both axes.

    The tick positions are now set explicitly before the labels are assigned.
    The previous ``[''] + labels`` trick combined with ``MultipleLocator``
    relied on an implicit extra tick before the first cell, which is fragile
    and can leave the labels shifted against the cells.

    Args:
        input_sentence: topic words joined by single spaces (row labels).
        output_words: list of generated words (column labels).
        attentions: CPU tensor of attention weights, rows = topics, columns = steps.
    """
    topic_words = input_sentence.split(' ')

    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.subplots(1)
    cmap = 'viridis'
    cax = ax.matshow(attentions.numpy(), cmap=cmap)
    fig.colorbar(cax)

    # One tick per row / column, labelled with the corresponding word.
    ax.set_yticks(list(range(len(topic_words))))
    ax.set_yticklabels(topic_words)
    ax.set_xticks(list(range(len(output_words))))
    ax.set_xticklabels(output_words)

    # Scale the figure with the number of words; keep it non-degenerate for empty output.
    word_size = 0.5
    fig.set_figheight(word_size * max(len(topic_words), 1))
    fig.set_figwidth(word_size * max(len(output_words), 1))
    plt.show()

def evaluateAndShowAttention(input_sentence, method='beam_search', is_sample=False):
    """Generate text for the given topic words and plot the attention weights.

    Args:
        input_sentence: list of topic words.
        method: 'beam_search' selects beam search; any other value selects greedy decoding.
        is_sample: forwarded to beam search only.
    """
    num_chars = 100     # change this to set size of output
    if method != 'beam_search':
        generated = predict_rnn(input_sentence, num_chars, model, idx_to_word, word_to_idx)
    else:
        generated = beam_search(input_sentence, num_chars, model, idx_to_word, word_to_idx, is_sample=is_sample)
    # Only the generated words and the attention matrix are needed for the plot.
    _, output_words, attentions, _ = generated

    showAttention(' '.join(input_sentence), output_words, attentions)

**Bleu score calculation**

In [28]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

def evaluate_bleu(model, topics_test, corpus_test, num_test, method='beam_search', is_sample=False):
    """Average bigram BLEU score of generated essays against the references.

    Changes compared with the earlier version:
      * ``is_sample`` is forwarded to ``beam_search``; it used to be ignored and
        a literal ``False`` was passed instead;
      * the average is taken over the number of samples actually evaluated, so
        ``num_test`` larger than the test set no longer deflates the score, and
        an empty evaluation returns 0.0 instead of dividing by zero.

    Args:
        model: the trained model handed to the decoding function.
        topics_test: list of topic-id lists.
        corpus_test: list of reference essays as id lists, aligned with ``topics_test``.
        num_test: maximum number of test samples to evaluate.
        method: 'beam_search' selects beam search; any other value selects greedy decoding.
        is_sample: forwarded to beam search only.

    Returns:
        Mean ``sentence_bleu`` with weights ``(0, 1, 0, 0)`` over the evaluated samples.
    """
    num_chars = 100     # change this to set size of output
    num_eval = len(corpus_test[:num_test])
    if num_eval == 0:
        return 0.0

    bleu_2_score = 0
    for i in tqdm(range(num_eval)):
        topic_words = [idx_to_word[x] for x in topics_test[i]]
        if method == 'beam_search':
            _, output_words, _, _ = beam_search(topic_words, num_chars, model, idx_to_word, word_to_idx, is_sample)
        else:
            _, output_words, _, _ = predict_rnn(topic_words, num_chars, model, idx_to_word, word_to_idx)
        # Padding (0) and <EOS> (2) are removed from the reference.
        reference = [idx_to_word[x] for x in corpus_test[i] if x not in [0, 2]]
        bleu_2_score += sentence_bleu([reference], output_words, weights=(0, 1, 0, 0))

    return bleu_2_score / num_eval

# Inspect one reference essay in the form evaluate_bleu builds it (padding id 0 and <EOS> id 2 removed),
# then its raw length, its decoded length, and the size of a 100-sample slice.
print([[idx_to_word[x] for x in corpus_test[10] if x not in [0, 2]]])
print(len(corpus_test[10]))
print(len([ idx_to_word[x] for x in corpus_test[10] ]))
print(len(corpus_test[:100]))

[['تھا', 'چھوٹے', 'ماموں', 'نے', 'ایک', 'دم', 'اچھل', 'کر', 'کہا', '۔', 'آہاہا', '،', 'ہاں', '،', '!', 'ہم', 'سب', 'خوشی', 'کے', 'مارے', 'آنکھیں', 'پھاڑ', 'پھاڑ', 'کر', 'ایک', 'دوسرے', 'کی', 'طرف', 'دیکھنے', 'لگے', '۔', 'مگر', 'اسٹور', 'تک', 'پہنچنے', 'کے', 'لیے', 'صرف', 'ابا', 'کا', 'کمرہ', 'ہی', 'راستہ', 'تھا', 'ماموں', 'کٹورا', 'لے', 'کر', 'ننگے', 'پاو\x04¿ں', 'بڑھے', 'اچانک', 'دیدی', 'بولی', ':', 'سب', 'کتابوں', 'میں', 'لکھا', 'ہے', 'جوری', 'بری', 'بات', 'ہے', '۔', 'ہٹ', '!', 'غریب', 'کے', 'لیے', 'چوری', 'کرنے', 'میں', 'گناہ', 'نہیں', 'ہوتا', '،', 'وہ', 'جو', 'سلطانہ', 'ڈاکو', 'تھا', 'وہ', 'سب', 'لوٹ', 'کر', 'غریبوں', 'میں', 'بانٹ', 'دیا', 'کرتا', 'تھا', 'میں']]
93
93
100


**Some configurations**

In [29]:
# Hyper-parameters and global settings read by the cells below.
embedding_dim = 100     # depends on your Word2Vec model embedding size
# hidden_dim = 512
hidden_dim = 128
lr = 1e-3 * 0.5
# Only referenced by the commented-out SGD optimizer in the visible cells.
momentum = 0.01


num_epoch = 80

# In the visible cells this is only referenced by a commented-out gradient hook.
clip_value = 0.1
use_gpu = True
# use_gpu = False
num_layers = 2
# num_layers = 1
bidirectional = False
# batch_size = 32
batch_size = 4
num_keywords = 5            # change this if more keywords
verbose = 1
check_point = 5
# beam_size and vocab_size are read as globals by beam_search.
beam_size = 2
is_sample = True
vocab_size = len(vocab)
# device = torch.device(deviceName)
loss_function = nn.NLLLoss()

# Standalone adaptive softmax. MTALSTM builds its own `adaptiveSoftmax`; in the visible
# cells this module is only moved to the device by predict_rnn / beam_search.
adaptive_softmax = nn.AdaptiveLogSoftmaxWithLoss(
    hidden_dim, len(vocab), cutoffs=[round(vocab_size / 20), 4*round(vocab_size / 20)])

In [30]:
# NOTE(review): the only use of this package in the visible cells (`from keras_radam import RAdam`)
# is commented out, so this unpinned install looks unnecessary; confirm before removing.
!pip install keras-rectified-adam

Collecting keras-rectified-adam
  Downloading https://files.pythonhosted.org/packages/35/6f/e91e36f09178df814ccfc4f6fc7239175cb6b67ce5fe9fbb579152aa326a/keras-rectified-adam-0.18.0.tar.gz
Building wheels for collected packages: keras-rectified-adam
  Building wheel for keras-rectified-adam (setup.py) ... [?25l[?25hdone
  Created wheel for keras-rectified-adam: filename=keras_rectified_adam-0.18.0-cp37-none-any.whl size=8950 sha256=e8e2fde2437bb8d33d2c37f485148b73cc9899a175573cda773bc0a03538786a
  Stored in directory: /root/.cache/pip/wheels/4c/7f/90/bdcf51356f0efcc737e0fb1d5ea310418ae4c78b4614fbc67e
Successfully built keras-rectified-adam
Installing collected packages: keras-rectified-adam
Successfully installed keras-rectified-adam-0.18.0


In [31]:
# NOTE(review): the next cell builds `model` again with identical arguments, so this
# instance is discarded; one of the two constructions is redundant.
model = MTALSTM(hidden_dim=hidden_dim, embed_dim=embedding_dim, num_keywords=num_keywords, 
                num_layers=num_layers, num_labels=len(vocab), weight=word_vec, bidirectional=bidirectional)


In [32]:
# Build the model, its optimizer and the plateau-based learning-rate scheduler.
model = MTALSTM(hidden_dim=hidden_dim, embed_dim=embedding_dim, num_keywords=num_keywords,
                num_layers=num_layers, num_labels=len(vocab), weight=word_vec, bidirectional=bidirectional)
# optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

optimizer = optim.Adam(model.parameters(), lr=lr)

#from keras_radam import RAdam
#optimizer = RAdam(model.parameters(), lr=lr)

# The scheduler multiplies the learning rate by 0.4 after 2 epochs without improvement of the monitored value.
lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.4, patience=2, min_lr=1e-7, verbose=True)
# optimizer = optim.Adadelta(model.parameters(), lr=lr)

# Always place the model on the notebook's `device`. The previous code hard-coded
# 'cuda:0' when use_gpu was set, which crashes on a CPU-only runtime and can
# disagree with the device that the decoding helpers move their inputs to.
model = model.to(device)
if use_gpu and device.type == 'cuda':
  print("Dump to cuda")
elif use_gpu:
  print("use_gpu is set but CUDA is unavailable; model stays on %s" % device)


Dump to cuda


In [33]:

def params_init_uniform(m):
    """Re-initialise the weight of an ``nn.Linear`` module uniformly in [-0.04, 0.04].

    Intended for ``model.apply``. Modules whose type is not exactly
    ``nn.Linear`` are left untouched, and biases are never modified.
    """
    if type(m) is not nn.Linear:
        return
    bound = 0.04
    nn.init.uniform_(m.weight, -bound, bound)
        
# Apply the initialiser to every submodule; as the last expression, the returned model is displayed.
model.apply(params_init_uniform)

MTALSTM(
  (embedding): Embedding(146817, 100)
  (Uf): Linear(in_features=500, out_features=5, bias=False)
  (decoder): AttentionDecoder(
    (attention): Attention(
      (Ua): Linear(in_features=100, out_features=128, bias=False)
      (Wa): Linear(in_features=128, out_features=128, bias=False)
      (va): Linear(in_features=128, out_features=1, bias=True)
    )
    (rnn): LSTM(200, 128, num_layers=2, dropout=0.5)
  )
  (adaptiveSoftmax): AdaptiveLogSoftmaxWithLoss(
    (head): Linear(in_features=128, out_features=7343, bias=False)
    (tail): ModuleList(
      (0): Sequential(
        (0): Linear(in_features=128, out_features=32, bias=False)
        (1): Linear(in_features=32, out_features=22023, bias=False)
      )
      (1): Sequential(
        (0): Linear(in_features=128, out_features=8, bias=False)
        (1): Linear(in_features=8, out_features=117453, bias=False)
      )
    )
  )
)

# Load previous checkpoint

In [34]:
# Checkpoint files are named <kind>_<Type>_<version>; 'trainable' is the resumable
# training state and 'best' is the alternative checkpoint type.
version_num = 1
# Type = 'best'
Type = 'trainable'
# NOTE(review): the model file uses the extension '.pk' while the others use '.pkl';
# left unchanged so that existing checkpoints are still found.
model_check_point = '%s/model_%s_%d.pk' % (save_folder, Type, version_num)
optim_check_point = '%s/optim_%s_%d.pkl' % (save_folder, Type, version_num)
loss_check_point = '%s/loss_%s_%d.pkl' % (save_folder, Type, version_num)
epoch_check_point = '%s/epoch_%s_%d.pkl' % (save_folder, Type, version_num)
bleu_check_point = '%s/bleu_%s_%d.pkl' % (save_folder, Type, version_num)
loss_values = []
epoch_values = []
bleu_values = []
# Only the existence of the model file is checked; the other files are expected to sit beside it.
if os.path.isfile(model_check_point):
    print('Loading previous status (ver.%d)...' % version_num)
    model.load_state_dict(torch.load(model_check_point, map_location='cpu'))
    model = model.to(device)
    # Load the optimizer state on the CPU as well, like the model weights above;
    # Optimizer.load_state_dict moves it to the parameters' device afterwards.
    # Without map_location, a checkpoint saved on a GPU cannot be restored on a CPU-only runtime.
    optimizer.load_state_dict(torch.load(optim_check_point, map_location='cpu'))
    # The scheduler is rebuilt around the restored optimizer; its own state is not checkpointed here.
    lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.4, patience=2, min_lr=1e-7, verbose=True)
    loss_values = torch.load(loss_check_point)
    epoch_values = torch.load(epoch_check_point)
    bleu_values = torch.load(bleu_check_point)
    print('Load successfully')
else:
    print("ver.%d doesn't exist" % version_num)

Loading previous status (ver.1)...
Load successfully


In [35]:
def isnan(x):
    """Return the result of comparing ``x`` with itself using ``!=``.

    NaN is the only floating-point value that compares unequal to itself, so
    for a Python float this is True exactly when ``x`` is NaN. For tensor or
    array inputs the comparison is presumably element-wise -- TODO confirm
    against the actual call sites.

    NOTE(review): the training cell uses ``np.isnan`` rather than this helper.
    """
    return x != x

# Dump every trainable parameter (name followed by its tensor) for inspection.
# Frozen parameters (requires_grad == False) are skipped.
for name, p in model.named_parameters():
    if not p.requires_grad:
        continue
    print(name, p)

Uf.weight Parameter containing:
tensor([[-0.0443, -0.1080,  0.0762,  ...,  0.1006,  0.0372, -0.0090],
        [ 0.0621, -0.0320, -0.0580,  ...,  0.0659, -0.0534, -0.0218],
        [ 0.0103,  0.0520, -0.0090,  ...,  0.0262,  0.0397, -0.0869],
        [-0.0251,  0.1063, -0.0021,  ...,  0.0074, -0.1113, -0.0216],
        [-0.0430, -0.1499, -0.0318,  ...,  0.0165, -0.0107, -0.0323]],
       device='cuda:0', requires_grad=True)
decoder.attention.Ua.weight Parameter containing:
tensor([[ 0.0875,  0.1000,  0.2211,  ..., -0.1830, -0.1067, -0.0955],
        [ 0.0010,  0.2574, -0.2827,  ...,  0.0358, -0.0143,  0.1897],
        [ 0.0322, -0.2989,  0.2179,  ..., -0.0422,  0.0434, -0.0881],
        ...,
        [ 0.0051,  0.2908, -0.2869,  ...,  0.0465, -0.0956,  0.1361],
        [-0.1012,  0.0736, -0.2038,  ...,  0.1308,  0.0572,  0.1367],
        [-0.0212,  0.3074, -0.2535,  ...,  0.0589, -0.0762,  0.1670]],
       device='cuda:0', requires_grad=True)
decoder.attention.Wa.weight Parameter contain

In [36]:
def decay_lr(optimizer, epoch, factor=0.1, lr_decay_epoch=60):
    """Scale the learning rate of every parameter group on a fixed schedule.

    Args:
        optimizer: object exposing ``param_groups`` (a list of dicts holding
            an ``'lr'`` entry), e.g. a ``torch.optim`` optimizer.
        epoch: current epoch number; decay is applied when it is an exact
            multiple of ``lr_decay_epoch``.
        factor: multiplier applied to each group's learning rate.
        lr_decay_epoch: number of epochs between two decays.

    Returns:
        The same ``optimizer`` object, modified in place.
    """
    if epoch % lr_decay_epoch == 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * factor
        # The attribute is `param_groups` (plural); the previous singular
        # spelling raised AttributeError whenever a decay actually happened.
        if optimizer.param_groups:
            print('lr decayed to %.4f' % optimizer.param_groups[0]['lr'])
    return optimizer

**Train the model**

In [37]:
#device = torch.device('cuda:0')

In [None]:
# Main training cell: runs the remaining epochs, evaluates BLEU after each one,
# writes checkpoints and prints a per-epoch summary.
# NOTE(review): evaluate_bleu is defined in a later cell (Evaluation section);
# that cell has to be executed before this one on a fresh kernel.

# Wall-clock reference for the cumulative "since" time in the epoch summary.
since = time.time()
autograd.set_detect_anomaly(False)
# Resume bookkeeping: continue after the last recorded epoch and start from the
# best BLEU recorded so far (both fall back to 0 when the history is empty).
prev_epoch = 0 if not epoch_values else epoch_values[-1]
best_bleu = 0 if not bleu_values else max(bleu_values)

for epoch in range(num_epoch - prev_epoch):
    # Shift to the absolute 0-based epoch index so numbering continues from prev_epoch.
    epoch += prev_epoch
    start = time.time()
    # num counts processed batches; total_loss accumulates the per-batch loss.
    num, total_loss = 0, 0
#     optimizer = decay_lr(optimizer=optimizer, epoch=epoch+1)
    topics_indice, corpus_indice = shuffleData(topics_indice, corpus_indice) # shuffle data at every epoch
    data = data_iterator(corpus_indice, topics_indice, batch_size, max(length) + 1)
    # The recurrent state is created once per epoch and then carried from batch to batch.
    hidden = model.init_hidden(batch_size=batch_size)
    # NOTE(review): `weight` (ones with index 0 zeroed) is only created and moved
    # to the device in this cell; nothing below reads it -- possibly a leftover.
    weight = torch.ones(len(vocab))
    weight[0] = 0
    num_iter = len(corpus_indice) // batch_size
    for X, Y, mask, topics in tqdm(data, total=num_iter):
        num += 1
#         hidden.detach_()
        if use_gpu:
            X = X.to(device)
            Y = Y.to(device)
            mask = mask.to(device)
            topics = topics.to(device)
#             hidden = hidden.to(device)
#             hidden[0].to(device)
#             hidden[1].to(device)
            loss_function = loss_function.to(device)
            weight = weight.to(device)
        optimizer.zero_grad()
        # init hidden layer
#         hidden = model.init_hidden(num_layers, batch_size, hidden_dim)
        # Coverage vector and initial decoder output are re-created for every batch.
        coverage_vector = model.init_coverage_vector(batch_size, num_keywords)
        init_output = torch.zeros(batch_size, hidden_dim).to(device)
        # inputs, topics, output, hidden=None, mask=None, target=None, coverage_vector=None, seq_length=None):
        output, _, hidden, _, _ = model(inputs=X, topics=topics, output=init_output, hidden=hidden, mask=mask, target=Y, coverage_vector=coverage_vector)
#         output, hidden = model(X, topics)
        # Detach both state tensors in place so the next batch's backward pass
        # does not reach back into this batch's graph.
        hidden[0].detach_()
        hidden[1].detach_()
        
        # Negate output.output, lay it out as (batch_size, -1) and apply the mask.
        # Presumably output.output holds per-token target log-probabilities from
        # the adaptive softmax -- TODO confirm against model.forward.
        loss = (-output.output).reshape((-1, batch_size)).t() * mask
#         loss = loss.sum(dim=1)
        # Per-row average over the masked positions, then mean over the batch.
        # NOTE(review): a row whose mask sums to 0 would divide by zero here.
        loss = loss.sum(dim=1) / mask.sum(dim=1)
        loss = loss.mean()
        loss.backward()
        
        # With the clip_grad_norm_ call below commented out, `norm` stays 0.0,
        # so the "norm" field of the epoch summary is always 0.
        norm = 0.0
#         norm = nn.utils.clip_grad_norm_(model.parameters(), 10)
        # Element-wise gradient clipping to the range [-1, 1].
        nn.utils.clip_grad_value_(model.parameters(), 1)
            
        optimizer.step()
        total_loss += float(loss.item())
        
        # Abort as soon as the accumulated loss becomes NaN, after printing every
        # parameter that has a gradient.
        if np.isnan(total_loss):
            for name, p in model.named_parameters():
                if p.grad is None:
                    continue 
                print(name, p)
            assert False, "Gradient explode"
    
    # NOTE(review): total_loss is a scalar, so np.mean returns it unchanged; the
    # scheduler therefore sees the summed epoch loss, not the per-batch average
    # (total_loss / num) that is logged below.
    one_iter_loss = np.mean(total_loss)
    lr_scheduler.step(one_iter_loss)
#     print("One iteration loss {:.3f}".format(one_iter_loss))
    
    # validation
    bleu_score = 0
    num_test = 500
    bleu_score = evaluate_bleu(model, topics_test, corpus_test, num_test=num_test, method='predict_rnn', is_sample=False)
    
    # Append this epoch's metrics to the running history (epochs stored 1-based).
    bleu_values.append(bleu_score)
    loss_values.append(total_loss / num)
    epoch_values.append(epoch+1)
    
    # save checkpoint
    # if ((epoch + 1) % check_point == 0) or (epoch == (num_epoch - 1)) or epoch+1 > 7 or bleu_score > 4:
    if (epoch + 1) > 0:     # save all models
        model_check_point = '%s/model_trainable_%d.pk' % (save_folder, epoch+1)
        optim_check_point = '%s/optim_trainable_%d.pkl' % (save_folder, epoch+1)
        loss_check_point = '%s/loss_trainable_%d.pkl' % (save_folder, epoch+1)
        epoch_check_point = '%s/epoch_trainable_%d.pkl' % (save_folder, epoch+1)
        bleu_check_point = '%s/bleu_trainable_%d.pkl' % (save_folder, epoch+1)
        torch.save(model.state_dict(), model_check_point)
        torch.save(optimizer.state_dict(), optim_check_point)
        torch.save(loss_values, loss_check_point)
        torch.save(epoch_values, epoch_check_point)
        torch.save(bleu_values, bleu_check_point)
    
    # save current best result
    if bleu_score > best_bleu:
        best_bleu = bleu_score
        print('current best bleu: %.4f' % best_bleu)
        model_check_point = '%s/model_best_%d.pk' % (save_folder, epoch+1)
        optim_check_point = '%s/optim_best_%d.pkl' % (save_folder, epoch+1)
        loss_check_point = '%s/loss_best_%d.pkl' % (save_folder, epoch+1)
        epoch_check_point = '%s/epoch_best_%d.pkl' % (save_folder, epoch+1)
        bleu_check_point = '%s/bleu_best_%d.pkl' % (save_folder, epoch+1)
        torch.save(model.state_dict(), model_check_point)
        torch.save(optimizer.state_dict(), optim_check_point)
        torch.save(loss_values, loss_check_point)
        torch.save(epoch_values, epoch_check_point)
        torch.save(bleu_values, bleu_check_point)
        
    # calculate time
    # Split the time elapsed since the cell started into hours / minutes / seconds.
    end = time.time()
    s = end - since
    h = math.floor(s / 3600)
    m = s - h * 3600
    m = math.floor(m / 60)
    s -= (m * 60 + h * 3600)
    

    # verbose 
    # if ((epoch + 1) % verbose == 0) or (epoch == (num_epoch - 1)):
    # "time" is the duration of this epoch only; "since" is cumulative for the cell.
    print('epoch %d/%d, loss %.4f, norm %.4f, predict bleu: %.4f, time %.3fs, since %dh %dm %ds'
            % (epoch + 1, num_epoch, total_loss / num, norm, bleu_score, end - start, h, m, s))

    # Qualitative check after every epoch: three fixed keyword lists decoded
    # with beam search and is_sample=True.
    evaluateAndShowAttention(['ایک', 'پاکستان', 'بھارت', 'کشمیر', 'قبضہ'], method='beam_search', is_sample=True)
    evaluateAndShowAttention(['کرکٹ', 'بنگلہ', 'دیش', 'کھیل', 'جیت'], method='beam_search', is_sample=True)
    evaluateAndShowAttention(['دھماکہ', 'دہشت', 'گرد', 'کیا', 'زخمی'], method='beam_search', is_sample=True)


In [39]:
# Sanity check: number of CUDA devices visible to this process.
print(torch.cuda.device_count())

1


In [41]:
#evaluateAndShowAttention(['ایک', 'پاکستان', 'بھارت', 'کشمیر', 'قبضہ'], method='beam_search', is_sample=True)
#evaluateAndShowAttention(['کرکٹ', 'بنگلہ', 'دیش', 'کھیل', 'جیت'], method='beam_search', is_sample=True)
#evaluateAndShowAttention(['دھماکہ', 'دہشت', 'گرد', 'کیا', 'زخمی'], method='beam_search', is_sample=True)


# Evaluation


In [42]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_bleu(model, topics_test, corpus_test, num_test, method='beam_search', is_sample=False):
    """Average unigram BLEU of generated text over the first ``num_test`` test samples.

    For each of the first ``num_test`` entries of ``corpus_test`` the matching
    entry of ``topics_test`` is converted to words and decoded with either
    ``beam_search`` (``method == 'beam_search'``) or ``predict_rnn`` (any other
    value). The generated words are scored against the reference with
    ``sentence_bleu`` using weights ``(1, 0, 0, 0)``, i.e. unigram precision only.

    Args:
        model: model handed to the decoding function.
        topics_test: per-sample sequences of topic word indices.
        corpus_test: per-sample sequences of reference word indices.
        num_test: maximum number of samples to evaluate.
        method: ``'beam_search'`` or anything else for ``predict_rnn``.
        is_sample: forwarded to ``beam_search``; ``predict_rnn`` is called
            without it.

    Returns:
        Mean BLEU over the samples actually evaluated, or ``0.0`` when there
        are none.
    """
    num_chars = 100     # change this to set size of output
    samples = corpus_test[:num_test]
    # Average over what was really scored: dividing by num_test understated the
    # score when fewer samples were available and crashed for num_test == 0.
    num_evaluated = len(samples)
    if num_evaluated == 0:
        return 0.0

    bleu_score = 0
    for i in range(num_evaluated):
        topic_words = [idx_to_word[x] for x in topics_test[i]]
        if method == 'beam_search':
            _, output_words, _, _ = beam_search(topic_words, num_chars, model, idx_to_word, word_to_idx, is_sample=is_sample)
        else:
            _, output_words, _, _ = predict_rnn(topic_words, num_chars, model, idx_to_word, word_to_idx)
        # Indices 0 and 2 are dropped from the reference (<PAD> and <EOS> if the
        # indices follow the order of the vocab list -- confirm against word_to_idx).
        reference = [idx_to_word[x] for x in samples[i] if x not in [0, 2]]
        bleu_score += sentence_bleu([reference], output_words, weights=(1, 0, 0, 0))

    return bleu_score / num_evaluated

In [43]:
# Maps checkpoint version number -> BLEU score. It must be a dict: later cells
# assign AllBleu[i] = score and read AllBleu.keys() / AllBleu.values().
AllBleu = {}


In [44]:
import random

In [None]:
# Sweep over the saved "trainable" checkpoints (versions 1..79), reload each one
# that exists on disk and record its BLEU score in AllBleu[version].
AllBleu = {}
for i in range(1, 80):
    version_num = i
    Type = 'trainable'
    model_check_point = '%s/model_%s_%d.pk' % (save_folder, Type, version_num)
    optim_check_point = '%s/optim_%s_%d.pkl' % (save_folder, Type, version_num)
    loss_check_point = '%s/loss_%s_%d.pkl' % (save_folder, Type, version_num)
    epoch_check_point = '%s/epoch_%s_%d.pkl' % (save_folder, Type, version_num)
    bleu_check_point = '%s/bleu_%s_%d.pkl' % (save_folder, Type, version_num)
    loss_values = []
    epoch_values = []
    bleu_values = []
    if os.path.isfile(model_check_point):
        print('Loading previous status (ver.%d)...' % version_num)
        # Deserialize onto the CPU first, then move the model to the active device.
        model.load_state_dict(torch.load(model_check_point, map_location='cpu'))
        model = model.to(device)
        # map_location keeps the optimizer checkpoint loadable when the device
        # it was saved from is unavailable; load_state_dict casts the state onto
        # the device of the corresponding parameters.
        optimizer.load_state_dict(torch.load(optim_check_point, map_location='cpu'))
        lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.4, patience=2, min_lr=1e-7, verbose=True)
        loss_values = torch.load(loss_check_point)
        epoch_values = torch.load(epoch_check_point)
        bleu_values = torch.load(bleu_check_point)
        print('Load successfully')
        # Score on the first 100 test samples using predict_rnn decoding.
        score = evaluate_bleu(model, topics_test, corpus_test, num_test=100, method='predict_rnn', is_sample=False)

        print("BleuScore:",score)

        AllBleu[i]=score

    else:
        print("ver.%d doesn't exist" % version_num)




In [None]:
import matplotlib.pyplot as plt
  
# Checkpoint versions (dict keys) and their BLEU scores (dict values).
x = AllBleu.keys()
y = AllBleu.values()

# Prepend the origin so the curve starts at (0, 0).
epoch_axis = [0] + list(x)
bleu_axis = [0] + list(y)
plt.plot(epoch_axis, bleu_axis)

# Axis labels and title so the figure stands on its own.
plt.xlabel('no.of epochs trained')
plt.ylabel('Bleu score')
plt.title('BLEU Scores for different epochs - Google Colab')

# Render the figure.
plt.show()

In [None]:
def inference(input_sentence, method='beam_search', is_sample=False, showResult=False):
    """Generate text for a list of topic words and return the generated words.

    Args:
        input_sentence: list of topic words (strings) used as decoder input.
        method: ``'beam_search'`` selects ``beam_search``; any other value
            selects ``predict_rnn``.
        is_sample: forwarded to ``beam_search`` only.
        showResult: when True, print the input words and the generated words.

    Returns:
        The generated word list with its first element removed.
    """
    max_words = 100     # change this to set size of output
    if method != 'beam_search':
        decoded = predict_rnn(input_sentence, max_words, model, idx_to_word, word_to_idx)
    else:
        decoded = beam_search(input_sentence, max_words, model, idx_to_word, word_to_idx, is_sample=is_sample)
    # Both decoders return a 4-tuple; only the word list (second item) is used here.
    _, output_words, _, _ = decoded

    if showResult:
        print('input =', ' '.join(input_sentence))
        print('output =', ' '.join(output_words))
    return output_words[1:]

# Demo: generate text for the topic words of sample #100 (default beam search,
# no sampling); the returned word list is shown as the cell output.
inference([idx_to_word[word] for word in topics_indice[100]])