In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from collections import Counter

from typing import Callable, Optional
from copy import deepcopy

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch import optim
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
!pip install pkbar
import pkbar

Collecting pkbar
  Downloading pkbar-0.5-py3-none-any.whl (9.2 kB)
Installing collected packages: pkbar
Successfully installed pkbar-0.5
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [3]:
!pip install rouge
from rouge import Rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [4]:
class Parameters:
    hidden_size: int = 150  
    dec_hidden_size: Optional[int] = 200  
    embed_size: int = 100 
    eps=1e-31
    batch_size=16 
    enc_bidi = True 
    enc_rnn_dropout = 0.1 
    enc_attn = True 
    dec_attn = True 
    pointer = True 
    dec_in_dropout=0.1
    dec_rnn_dropout=0.1
    dec_out_dropout=0.1
    max_src_len: int = 65 
    max_tgt_len: int = 15  
    vocab_min_frequency: int = 3
    embed_file: Optional[str] = 'C:/Users/Nirmal/Documents/Python Scripts/glove.6B.100d.txt'  
    data_path: str = 'C:/Users/Nirmal/Documents/Python Scripts/cl_train_news_summary_more.csv'
    val_data_path: Optional[str] = 'C:/Users/Nirmal/Documents/Python Scripts/cl_train_news_summary_more.csv'
    test_data_path: str = 'C:/Users/Nirmal/Documents/Python Scripts/cl_valid_news_summary_more.csv'
    resume_train = False
    encoder_weights_path='encoder_sum.pt'
    decoder_weights_path='decoder_sum.pt'
    encoder_decoder_adapter_weights_path='adapter_sum.pt'
    losses_path='val_losses.pkl'
    print_every = 100

In [5]:
def simple_tokenizer(text, lower=False, newline=None):
    if lower:
        text = text.lower()
    if newline is not None:
        text = text.replace('\n', ' ' + newline + ' ')
        return text.split()


In [6]:
class Vocab(object):
    PAD = 0
    SOS = 1
    EOS = 2
    UNK = 3

def __init__(self):
    self.word2index = {}
    self.word2count = Counter()
    self.reserved = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
    self.index2word = self.reserved[:]
    self.embeddings = None

def add_words(self, words):
    for word in words:
  
      if word not in self.word2index:

        self.word2index[word] = len(self.index2word)

        self.index2word.append(word)

    self.word2count.update(words)
  
  def load_embeddings(self, file_path: str, dtype=np.float32) -> int:
    ''' Load the embedding vectors from a file into the vocabulary'''
    num_embeddings = 0
    vocab_size = len(self)
    with open(file_path, 'rb') as f:
        for line in f:
            line = line.split()
            word = line[0].decode('utf-8')

            idx = self.word2index.get(word)
        if idx is not None:
            vec = np.array(line[1:], dtype=dtype)
        if self.embeddings is None:
            n_dims = len(vec)
            self.embeddings = np.random.normal(np.zeros((vocab_size, n_dims))).astype(dtype)
            self.embeddings[self.PAD] = np.zeros(n_dims)
            self.embeddings[idx] = vec
            num_embeddings += 1
            return num_embeddings

def save_to_file(self, filename):
    ''' Save the Vocab object to a file'''
    with open(filename,'wb') as f:
        pickle.dump(self,f) 

def __getitem__(self, item):
    ''' Get the next item when iterating over the instance'''
    if type(item) is int:
        return self.index2word[item]
    return self.word2index.get(item, self.UNK)

def __len__(self):
    ''' Return the length of the instance or vocabulary'''
    return len(self.index2word)


def load_vocab(filename):
    ''' Load a Vocab instance from a file'''
    with open(filename,'rb') as f:
        v = pickle.load(f)
    return v

In [7]:
class Dataset(object):
    def __init__(self, filename: str, tokenize: Callable=simple_tokenizer, max_src_len: int=None,
               max_tgt_len: int=None, max_rows: int=None, truncate_src: bool=False, truncate_tgt: bool=False):
    print("Reading dataset %s..." % filename, end=' ', flush=True)
    
    self.filename = filename
    self.pairs = []
    self.src_len = 0
    self.tgt_len = 0
    self.max_rows = max_rows

    if max_rows is None:
        df = pd.read_csv(filename, encoding='utf-8')
    else:
        df = pd.read_csv(filename, encoding='utf-8', nrows=max_rows
                        )

    sources = df['text'].apply(lambda x : tokenize(x))

    if truncate_src:
        sources = [src[:max_src_len] if len(src)>max_src_len else src for src in sources]

    targets = df['summary'].apply(lambda x : tokenize(x))

    if truncate_tgt:
        targets = [tgt[:max_tgt_len] if len(tgt)>max_tgt_len else tgt for tgt in targets]
        
 
    src_length = [len(src)+1 for src in sources]
    tgt_length = [len(tgt)+1 for tgt in targets]

    max_src = max(src_length)
    max_tgt = max(tgt_length)

    self.src_len = max_src
    self.tgt_len = max_tgt

    self.pairs.append([(src, tgt, src_len, tgt_len) for src,tgt,src_len,tgt_len in zip(sources,targets,src_length,tgt_length)])
    self.pairs = self.pairs[0]
    print("%d pairs." % len(self.pairs))

def build_vocab(self, min_freq, embed_file: str=None) -> Vocab:
    
    total_words=[src+tgr for src,tgr,len_src,len_tgr in self.pairs]
    total_words = [item for sublist in total_words for item in sublist]
    word_counts = Counter(total_words)
    vocab=Vocab()
    for word,count in word_counts.items():
        if(count>min_freq):
            vocab.add_words([word])  
    count = vocab.load_embeddings(embed_file)
    print("%d pre-trained embeddings loaded." % count)

    return vocab  


In [8]:
class MyDataset(nn.Module):
    ''' A Dataset Class where we store all the data needed during the training phase'''
    
    def __init__(self, src_sents, trg_sents, vocab):
        self.src_sents = src_sents
        self.trg_sents = trg_sents
        self.vocab=vocab
        self._len = len(src_sents)

    def __getitem__(self, index):
        ''' Return the ith items from the object
            Input:
            - Index: integer, index of the items to return
            Output:
            - a dictionary with keys x the source texts, y the targets, 
              x_len length of source texts, y_len the length of targets
        '''
        return {'x':self.src_sents[index], 
                'y':self.trg_sents[index], 
                'x_len':len(self.src_sents[index]), 
                'y_len':len(self.trg_sents[index])}
    
    def __len__(self):
        ''' Return the length of the object'''
        return self._len


In [9]:
def tensorize(vocab, tokens):
    ''' Convert the tokens received to a tensor '''
    return torch.tensor([vocab[token] for token in tokens])

def pad_sequence(vectorized_sent, max_len):
    ''' Padding the sentence (tensor) to max_len '''
    pad_dim = (0, max_len - len(vectorized_sent))
    return F.pad(vectorized_sent, pad_dim, 'constant').tolist()

def preprocess(x,y,p,vocab):
    ''' Prepare a source text x and a target summary y: convert them to tensors,
        pads the sentences to its max length.
    '''
    tensors_src = tensorize(vocab, x)
    tensors_trg = tensorize(vocab, y) 
    return {'x':pad_sequence(tensors_src, p.max_src_len), 
          'y':pad_sequence(tensors_trg, p.max_tgt_len), 
          'x_len':len(tensors_src), 
          'y_len':len(tensors_trg)}

def sort_batch_by_len(data_dict,p,vocab):
    ''' Return a batch of sentences processed and ordered by its length
    '''
    data=[]
    res={'x':[],'y':[],'x_len':[],'y_len':[]}
    for i in range(data_dict['x_len']):
        data.append(preprocess(data_dict['x'][i],data_dict['y'][i],p,vocab))
    for i in range(len(data)):
        res['x'].append(data[i]['x'])
        res['y'].append(data[i]['y'])
        res['x_len'].append(len(data[i]['x']))
        res['y_len'].append(len(data[i]['y']))  
    
    sorted_indices = np.array(res['x_len']).argsort()[::-1].tolist()
    data_batch = {name:[_tensor[i] for i in sorted_indices]
                  for name, _tensor in res.items()}
    return data_batch


In [10]:
class EncoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, bidi=True, rnn_drop: float=0):
    super(EncoderRNN, self).__init__()

    self.hidden_size = hidden_size

    self.num_directions = 2 if bidi else 1

    self.gru = nn.GRU(embed_size, hidden_size, bidirectional=bidi, dropout=rnn_drop)

def forward(self, embedded,hidden,input_lengths=None):
    ''' Run a Forward pass of the encoder to return outputs
        Input:
        - embedded: tensor, the embedding of the input data (word of the soure text)
        - hidden: a tensor, the previous hidden state of the encoder
        - input:lengths: a list of integers, length of the inputs 
    '''
    if input_lengths is not None:
        embedded = pack_padded_sequence(embedded, input_lengths,batch_first=True)
    
    output, hidden = self.gru(embedded,hidden)
    
    if input_lengths is not None:
        output, _ = pad_packed_sequence(output)
    if self.num_directions > 1:
        batch_size = hidden.size(1)
        hidden = hidden.transpose(0, 1).contiguous().view(1, batch_size, self.hidden_size * self.num_directions)
    return output, hidden

  def init_hidden(self, batch_size, device):
        return torch.zeros(self.num_directions, batch_size, self.hidden_size, device=device) 


In [11]:
class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, enc_attn=True, dec_attn=True,
               enc_attn_cover=True, pointer=True,
               in_drop: float=0, rnn_drop: float=0, out_drop: float=0, enc_hidden_size=None,
               epsilon: float=0.0, device: str="cpu"):
    ''' Initialize the decoder instance defining its parameters:
            Input:
                - vocab_size: integer, number of words in the vocabulary 
                - embed_size: integer, size of the embedding layer
                - hidden_size: integer, size of the hidden layer (Hyperparameter)
                - enc_attn: activate the attention in the encoder
                - dec_attn: activate the attention in the decoder
                - enc_attn_cover: activate the coverage mechanism in the attention
                - pointer: activate the pointer generation
                - in_drop: dropout probability to apply to the input of the decoder
                - rnn_drop: dropout probability to apply to the GRU layer of the decoder
                - out_drop: dropout probability to apply to the output of the decoder
                - enc_hidden_size: dimension if the hidden state of the encoder
                - epsilon: float
                - device: cpu or gpu, device to store the tensors
    '''

    super(DecoderRNN, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.combined_size = hidden_size
    self.device = device
    self.eps = epsilon
    self.in_drop = nn.Dropout(in_drop) if in_drop > 0 else None
    self.gru = nn.GRU(embed_size, hidden_size, dropout=rnn_drop)
    
    if not enc_hidden_size: enc_hidden_size = self.hidden_size
    self.enc_bilinear = nn.Bilinear(hidden_size, enc_hidden_size, 1)
    
    self.combined_size += enc_hidden_size
    if enc_attn_cover:
        self.cover_weight = nn.Parameter(torch.rand(1))
        self.dec_bilinear = nn.Bilinear(self.hidden_size, self.hidden_size, 1)
        self.combined_size += self.hidden_size
        self.out_drop = nn.Dropout(out_drop) if out_drop > 0 else None
        self.ptr = nn.Linear(self.combined_size, 1)
        self.out = nn.Linear(self.combined_size, vocab_size)

def forward(self, embedded, hidden, encoder_hidden=None, decoder_states=None, coverage_vector=None, *,
              encoder_word_idx=None, ext_vocab_size: int=None, log_prob: bool=True):
    ''' Run a Forward pass of the decoder to return outputs
        Input:
        - embedded: tensor, the embedding of the input data (decoder output in the last step
        - hidden: a tensor, the previous hidden state of the decoder
        - decoder_states: tensor, hidden state of the decoder in the last step
        - coverage_vector: tensor, coverage vector at this step
        - encoder_word_idx: tensor, indexes of the words in the source text
        - ext_vocab_size: integer, vocabulary size of the extended vocabulary
        - log_prob: bool, use of Log Softmax or Softmax in the output
    '''
    batch_size = embedded.size(0)
    combined = torch.zeros(batch_size, self.combined_size, device=self.device)
    if self.in_drop: embedded = self.in_drop(embedded)
    output, hidden = self.gru(embedded.unsqueeze(0), hidden)
    combined[:, :self.hidden_size] = output.squeeze(0)        
    offset = self.hidden_size
    enc_attn, prob_ptr = None, None

    num_enc_steps = encoder_hidden.size(0)
    enc_total_size = encoder_hidden.size(2)
    enc_attn = self.enc_bilinear(hidden.expand(num_enc_steps, batch_size, -1).contiguous(),encoder_hidden)
  
    if coverage_vector is not None:
        enc_attn += self.cover_weight * torch.log(coverage_vector.transpose(0, 1).unsqueeze(2) + self.eps)
    enc_attn = F.softmax(enc_attn, dim=0).transpose(0, 1)

    enc_context = torch.bmm(encoder_hidden.permute(1, 2, 0), enc_attn)
    combined[:, offset:offset+enc_total_size] = enc_context.squeeze(2)
    offset += enc_total_size
    enc_attn = enc_attn.squeeze(2)
    
    if decoder_states is not None and len(decoder_states) > 0:
        dec_attn = self.dec_bilinear(hidden.expand_as(decoder_states).contiguous(),
                                      decoder_states)
        dec_attn = F.softmax(dec_attn, dim=0).transpose(0, 1)
        dec_context = torch.bmm(decoder_states.permute(1, 2, 0), dec_attn)
        combined[:, offset:offset + self.hidden_size] = dec_context.squeeze(2)
        offset += self.hidden_size
    
    out_embed = combined
    logits = self.out(out_embed) 

 
    prob_ptr = torch.sigmoid(self.ptr(combined)) 
    prob_gen = 1 - prob_ptr
    gen_output = F.softmax(logits, dim=1)  
    output = prob_gen * gen_output
    pad_dim = (0, ext_vocab_size - output.size(1))
    output=F.pad(output, pad_dim, 'constant')

    ptr_output = enc_attn
    encoder_word_idx_l = encoder_word_idx.long()
    try:
        output.scatter_add_(1, encoder_word_idx_l, prob_ptr * ptr_output)
    except:
        prob_po = prob_ptr * ptr_output 
        print(output.shape,encoder_word_idx_l.shape,prob_ptr.shape, ptr_output.shape, prob_po.shape)
        print(output)
        print(encoder_word_idx_l)
        print(prob_po)
        output.scatter_add_(1, encoder_word_idx_l, prob_po)

    output = torch.log(output + self.eps)

    return output, hidden, enc_attn, prob_ptr 


In [12]:
def get_coverage_vector(enc_attn_weights):
    """Combine the past attention weights into one vector"""
    coverage_vector = torch.sum(torch.cat(enc_attn_weights), dim=0)
    
    return coverage_vector  


In [13]:
def get_next_batch(data, p, vocab, i, batch_size, device):
    ''' Generate and return the next batch of the data during training
        Input:
        - data: list, input data to the model
        - p: a class Parameters object, model and training parameters
        - vocab: a class Vocab object, vocabulary of the data
        - i: integer, index or iterator
        - batch_size: integer, batch size
        - device: string, where to train the model, cpu or gpu 
    '''
    vocab_ext=deepcopy(vocab)

    try:
        data_dict=data[i:i+batch_size]
    except:
        data_dict=data[i:len(data)]
    data_batch = sort_batch_by_len(data_dict,p,vocab_ext)
    for word in data_dict['x']:
        vocab_ext.add_words(word)

    data_batch_extra=sort_batch_by_len(data_dict,p,vocab_ext)    
    x_extra=torch.tensor(data_batch_extra['x']).to(device)
    
    x, x_len = torch.tensor(data_batch['x']).to(device), torch.tensor(data_batch['x_len']).to(device)
    y, y_len = torch.tensor(data_batch['y']).to(device), torch.tensor(data_batch['y_len']).to(device)

    return x, x_len, y, y_len, x_extra, vocab_ext


In [14]:
def train(dataset,val_dataset,vocab,p,embedding_weights, learning_rate, num_epochs):
    ''' Run all the steps in the training phase
        Input:
        - dataset: Dataset object, training data
        - val_dataset: Dataset object, validation data
        - vocab: a class Vocab object, the vocabulary of the datasets
        - p: a class Parameters object, model and training parameters
        - embedding_weigths: tensor, the embedding vectors
        - learning_rate: float, learning rate parameter
        - num_epochs: integer, number of epochs of the training
    '''
    eps = p.eps
    batch_size =p.batch_size
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    enc_dec_adapter = nn.Linear(p.hidden_size * 2, p.dec_hidden_size).to(DEVICE)
    embedding = nn.Embedding(len(vocab), p.embed_size, padding_idx=vocab.PAD,
                             _weight=embedding_weights).to(DEVICE)
    
    embedding.weight.requires_grad=False
    encoder = EncoderRNN(p.embed_size, p.hidden_size, p.enc_bidi,rnn_drop=p.enc_rnn_dropout).to(DEVICE)
    decoder = DecoderRNN(len(vocab), p.embed_size, p.dec_hidden_size,
                                  enc_attn=p.enc_attn, dec_attn=p.dec_attn,
                                  pointer=p.pointer,
                                  in_drop=p.dec_in_dropout, rnn_drop=p.dec_rnn_dropout,
                                  out_drop=p.dec_out_dropout, enc_hidden_size=p.hidden_size * 2,
                                  device=DEVICE, epsilon=p.eps).to(DEVICE)

    if(os.path.exists(p.encoder_weights_path) and p.resume_train):
        encoder.load_state_dict(torch.load(p.encoder_weights_path,map_location=torch.device(DEVICE)))
    if(os.path.exists(p.decoder_weights_path) and p.resume_train):
        decoder.load_state_dict(torch.load(p.decoder_weights_path,map_location=torch.device(DEVICE)))
    if(os.path.exists(p.encoder_decoder_adapter_weights_path) and p.resume_train):   
        enc_dec_adapter.load_state_dict(torch.load(p.encoder_decoder_adapter_weights_path,map_location=torch.device(DEVICE)))
    
    cnn_data=MyDataset([pair[0] for pair in dataset.pairs],[pair[1] for pair in dataset.pairs],vocab)
    
    val_data=MyDataset([pair[0] for pair in val_dataset.pairs],[pair[1] for pair in val_dataset.pairs],vocab)
    

    criterion = nn.NLLLoss(ignore_index=vocab.PAD)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    adapter_optimizer=optim.Adam([{'params':enc_dec_adapter.parameters()}], lr=learning_rate)
    losses=[]
    val_losses=[]
    if(os.path.exists(p.losses_path) and p.resume_train):
        with open(p.losses_path,'rb') as f:
        val_losses=pickle.load(f)
        
    for _e in range(num_epochs):
        i=0
        print('\nEpoch: %d/%d' % (_e + 1, num_epochs))
        kbar = pkbar.Kbar(target=len(cnn_data), width=8)
        while i<len(cnn_data):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            adapter_optimizer.zero_grad()

            x, x_len, y, y_len, x_extra, vocab_ext = get_next_batch(cnn_data, p, vocab, i, batch_size, device=DEVICE)
    
            encoder_embedded = embedding(x)
            encoder_hidden=encoder.init_hidden(x.size(0), DEVICE)
            encoder_outputs, encoder_hidden =encoder(encoder_embedded,encoder_hidden,x_len)
            decoder_input = torch.tensor([vocab.SOS] * x.size(0), device=DEVICE)
            decoder_hidden = enc_dec_adapter(encoder_hidden)
            
            decoder_states = []
            enc_attn_weights = []
            loss=0
            for di in range(y.size(1)):
                decoder_embedded = embedding(decoder_input)
                if enc_attn_weights:
                    coverage_vector = get_coverage_vector(enc_attn_weights)
                else:
                    coverage_vector = None
                    
                decoder_output, decoder_hidden, dec_enc_attn, dec_prob_ptr = decoder(decoder_embedded, decoder_hidden, encoder_outputs,
                            torch.cat(decoder_states) if decoder_states else None, coverage_vector,
                            encoder_word_idx=x_extra,log_prob=True,ext_vocab_size=len(vocab_ext))  
                decoder_output.to(DEVICE)
                decoder_hidden.to(DEVICE)
                dec_enc_attn.to(DEVICE)
                dec_prob_ptr.to(DEVICE)
                
                decoder_states.append(decoder_hidden)
                prob_distribution = torch.exp(decoder_output)
                _, top_idx = decoder_output.data.topk(1)
                gold_standard = y[:,di]
                nll_loss= criterion(decoder_output, gold_standard)    
                loss+=nll_loss 
                decoder_input = y[:,di]
                if (coverage_vector is not None and criterion): 
                    coverage_loss = torch.sum(torch.min(coverage_vector, dec_enc_attn)) / batch_size #* cover_loss            
                    loss+=coverage_loss
                enc_attn_weights.append(dec_enc_attn.unsqueeze(0)) 
                
            loss.backward()
            clip_grad_norm_(encoder.parameters(), 1)
            clip_grad_norm_(decoder.parameters(), 1)
            clip_grad_norm_(enc_dec_adapter.parameters(), 1)
            
            encoder_optimizer.step()
            decoder_optimizer.step()
            adapter_optimizer.step() 
            
            if i%(p.print_every*batch_size)==0:
                kbar.update(i, values=[("loss", loss.data.item())])
            i+=batch_size
               
        loss=loss.data.item()/x.size(0)
        kbar.add(1, values=[("loss", loss)])
        
        kbar2 = pkbar.Kbar(target=len(val_data), width=8)
        
        val_loss=0
        i=0
        while(i<len(val_data)):
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            adapter_optimizer.zero_grad()
            x, x_len, y, y_len, x_extra, vocab_ext = get_next_batch(val_data, p, vocab, i, batch_size, device=DEVICE)
            encoder_embedded = embedding(x)
            encoder_hidden=encoder.init_hidden(x.size(0), device=DEVICE)
            encoder_outputs, encoder_hidden =encoder(encoder_embedded,encoder_hidden,x_len)
            decoder_input = torch.tensor([vocab.SOS] * x.size(0), device=DEVICE)
            decoder_hidden = enc_dec_adapter(encoder_hidden)
            
            decoder_states = []
            enc_attn_weights = []
            for di in range(y.size(1)):
                try:
                    decoder_embedded = embedding(decoder_input)
                except:
                    print('Dec input: ',decoder_input.shape,' x:', x.shape,' x_len:',x_len.shape, ' Vocab:', 
                          vocab.embeddings.shape,' Vocab Ext:', vocab_ext.embeddings.shape)
                    decoder_embedded = embedding(decoder_input)

                if enc_attn_weights:
                    coverage_vector = get_coverage_vector(enc_attn_weights)
                else:
                    coverage_vector = None
                decoder_output, decoder_hidden, dec_enc_attn, dec_prob_ptr = decoder(decoder_embedded, decoder_hidden, encoder_outputs,
                            torch.cat(decoder_states) if decoder_states else None, coverage_vector,
                            encoder_word_idx=x_extra,log_prob=True,ext_vocab_size=len(vocab_ext))  

                decoder_output.to(DEVICE)
                decoder_hidden.to(DEVICE)
                dec_enc_attn.to(DEVICE)
                dec_prob_ptr.to(DEVICE)

                decoder_states.append(decoder_hidden)      
                prob_distribution = torch.exp(decoder_output)

                _, top_idx = decoder_output.data.topk(1)
                gold_standard = y[:,di]

                nll_loss= criterion(decoder_output, gold_standard)    
                val_loss+=nll_loss.data.item()
                
                decoder_input = top_idx.view(-1) 
                if (coverage_vector is not None and criterion):
                    coverage_loss = torch.sum(torch.min(coverage_vector, dec_enc_attn)) / batch_size #* cover_loss            
                    val_loss+=coverage_loss.data.item()
                enc_attn_weights.append(dec_enc_attn.unsqueeze(0))  
            if i%(p.print_every*batch_size)==0:
                kbar2.update(i, values=[("Val loss", val_loss)])

            i+=batch_size
            
        avg_val_loss=val_loss/len(val_data)        
        kbar2.add(1, values=[("Train loss", loss), ("Val loss", val_loss), ("Avg Val loss", avg_val_loss)])
        
        if(len(val_losses)>0 and avg_val_loss<min(val_losses)):
            torch.save(encoder.state_dict(), p.encoder_weights_path)
            torch.save(decoder.state_dict(), p.decoder_weights_path)
            torch.save(enc_dec_adapter.state_dict(), p.encoder_decoder_adapter_weights_path)
        val_losses.append(avg_val_loss) 
    
    with open(p.losses_path,'wb') as f:
        pickle.dump(val_losses,f) 


In [15]:
def predict(sent,vocab,p,batch_size=1):
    ''' Function to predict the summary of the source text sentence
        Input:
        - sent: string, text to summarize
        - vocab: a class Vocab object, vocabulary of the texts
        - p: a class Parameters object, model parameters
        - batch_size: integer, batch size of the data to predict
    '''
    eps=p.eps
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embedding_weights = torch.from_numpy(vocab.embeddings).to(DEVICE)
    enc_dec_adapter = nn.Linear(p.hidden_size * 2, p.dec_hidden_size).to(DEVICE)
    embedding = nn.Embedding(len(vocab), p.embed_size, padding_idx=vocab.PAD,_weight=embedding_weights).to(DEVICE)
    encoder = EncoderRNN(p.embed_size, p.hidden_size, p.enc_bidi,rnn_drop=p.enc_rnn_dropout).to(DEVICE)
    decoder = DecoderRNN(len(vocab), p.embed_size, p.dec_hidden_size,
                                  enc_attn=p.enc_attn, dec_attn=p.dec_attn,
                                  pointer=p.pointer,
                                  in_drop=p.dec_in_dropout, rnn_drop=p.dec_rnn_dropout,
                                  out_drop=p.dec_out_dropout, enc_hidden_size=p.hidden_size * 2,
                                  device=DEVICE).to(DEVICE) 
    sent_vec=[vocab[word] for word in sent.split()]
    vocab_ext=deepcopy(vocab)
    for word in sent.split():
        vocab_ext.add_words(word)
    sent_vec_extra=[vocab_ext[word] for word in sent.split()] 
    if(len(sent_vec_extra)<p.max_src_len):
        pad_dim = (0, p.max_src_len-len(sent_vec_extra))
        sent_vec_extra_tensor=F.pad(torch.tensor(sent_vec_extra), pad_dim , 'constant')
    else:
        sent_vec_extra_tensor=torch.tensor(sent_vec_extra)
        
    if(len(sent_vec)<p.max_src_len):
        pad_dim = (0, p.max_src_len-len(sent_vec))
        sent_vec_tensor=F.pad(torch.tensor(sent_vec), pad_dim, 'constant')
    else:
        sent_vec_tensor=torch.tensor(sent_vec)
        
    if(os.path.exists(p.encoder_weights_path)):
        encoder.load_state_dict(torch.load(p.encoder_weights_path,map_location=torch.device(DEVICE)))
    if(os.path.exists(p.decoder_weights_path)):
        decoder.load_state_dict(torch.load(p.decoder_weights_path,map_location=torch.device(DEVICE)))
    if(os.path.exists(p.encoder_decoder_adapter_weights_path)):    
        enc_dec_adapter.load_state_dict(torch.load(p.encoder_decoder_adapter_weights_path,map_location=torch.device(DEVICE)))

    x=sent_vec_tensor.view(1,-1).to(DEVICE)
    x_extra=sent_vec_extra_tensor.view(1,-1).to(DEVICE)
    encoder_embedded = embedding(x)
    encoder_hidden=encoder.init_hidden(x.size(0), DEVICE)
    encoder_outputs, encoder_hidden =encoder(encoder_embedded,encoder_hidden,
                                             torch.tensor(p.max_src_len).view(1).to(DEVICE))
    decoder_input = torch.tensor([vocab.SOS] * batch_size, device=DEVICE)
    decoder_hidden = enc_dec_adapter(encoder_hidden)
    
    decoder_states = []
    enc_attn_weights = []
    output=[]
    for di in range(p.max_tgt_len):
        decoder_embedded = embedding(decoder_input)
        if enc_attn_weights:
            coverage_vector = get_coverage_vector(enc_attn_weights)
        else:
            coverage_vector = None
        decoder_output, decoder_hidden, dec_enc_attn, dec_prob_ptr = decoder(decoder_embedded, decoder_hidden, encoder_outputs,
                    torch.cat(decoder_states).to(DEVICE) if decoder_states else None, coverage_vector,
                    encoder_word_idx=x_extra,log_prob=True,ext_vocab_size=len(vocab_ext))  
        decoder_output.to(DEVICE)
        decoder_hidden.to(DEVICE)
        dec_enc_attn.to(DEVICE)
        dec_prob_ptr.to(DEVICE)
        decoder_states.append(decoder_hidden)
        prob_distribution = torch.exp(decoder_output)
        _, top_idx = decoder_output.data.topk(1)
        output.append(top_idx.squeeze().data.item())
        enc_attn_weights.append(dec_enc_attn.unsqueeze(0))
        decoder_input = top_idx.view(-1)
    output=[vocab_ext[idx] for idx in output]    
    return output 

In [16]:
def prediction(sent,vocab,embedding, encoder, enc_dec_adapter, decoder, device, p,batch_size=1):
    ''' Function to predict the summary of the source text sentence
        Input:
        - sent: string, text to summarize
        - vocab: a class Vocab object, vocabulary of the texts
        - p: a class Parameters object, model parameters
        - batch_size: integer, batch size of the data to predict
    '''
    eps=p.eps
    sent_vec=[vocab[word] for word in sent.split()]
    vocab_ext=deepcopy(vocab)
    for word in sent.split():
        vocab_ext.add_words(word)
    sent_vec_extra=[vocab_ext[word] for word in sent.split()] 
    if(len(sent_vec_extra)<p.max_src_len):
        pad_dim = (0, p.max_src_len-len(sent_vec_extra))
        sent_vec_extra_tensor=F.pad(torch.tensor(sent_vec_extra), pad_dim , 'constant')
    else:
        sent_vec_extra_tensor=torch.tensor(sent_vec_extra)
        
    if(len(sent_vec)<p.max_src_len):
        pad_dim = (0, p.max_src_len-len(sent_vec))
        sent_vec_tensor=F.pad(torch.tensor(sent_vec), pad_dim, 'constant')
    else:
        sent_vec_tensor=torch.tensor(sent_vec)
        
    x=sent_vec_tensor.view(1,-1).to(device)
    x_extra=sent_vec_extra_tensor.view(1,-1).to(device)
    encoder_embedded = embedding(x)
    encoder_hidden=encoder.init_hidden(x.size(0), device)
    encoder_outputs, encoder_hidden =encoder(encoder_embedded,encoder_hidden,
                                             torch.tensor(p.max_src_len).view(1).to(device))
    decoder_input = torch.tensor([vocab.SOS] * batch_size, device=device)
    decoder_hidden = enc_dec_adapter(encoder_hidden)
    
    decoder_states = []
    enc_attn_weights = []
    output=[]
    for di in range(p.max_tgt_len):
        decoder_embedded = embedding(decoder_input)
        if enc_attn_weights:
            coverage_vector = get_coverage_vector(enc_attn_weights)
        else:
            coverage_vector = None
            
        decoder_output, decoder_hidden, dec_enc_attn, dec_prob_ptr = decoder(decoder_embedded, decoder_hidden, encoder_outputs,
                    torch.cat(decoder_states).to(device) if decoder_states else None, coverage_vector,
                    encoder_word_idx=x_extra,log_prob=True,ext_vocab_size=len(vocab_ext))  
        decoder_output.to(device)
        decoder_hidden.to(device)
        dec_enc_attn.to(device)
        dec_prob_ptr.to(device)
        decoder_states.append(decoder_hidden)
        prob_distribution = torch.exp(decoder_output)
        _, top_idx = decoder_output.data.topk(1)
        output.append(top_idx.squeeze().data.item())
        enc_attn_weights.append(dec_enc_attn.unsqueeze(0))
        decoder_input = top_idx.view(-1)
        
    output=[vocab_ext[idx] for idx in output]    
    return output 

In [17]:
def eval_metrics(preds, targets, avg=True):
    ''' Evaluate the ROUGE metrics ROUGE-2 and ROUGE-L for every pair predicted summary - target summary
    
        Input:
           - preds: list of strings, predicted summaries
           - targets: list of string, target summaries
        Output:
            - rouge2_f_metric: list of float, the Rouge-2 fscore for every predicted summary
            - rougel_f_metric: list of float, the Rouge-L fscore for every predicted summary
    '''
    rouge = Rouge()
    scores = rouge.get_scores(preds, targets, avg)
    if avg:
        rouge2_f_metric = scores['rouge-2']['f']
        rouge2_p_metric = scores['rouge-2']['p']
        rouge2_r_metric = scores['rouge-2']['r']
        rougel_f_metric = scores['rouge-l']['f']
        rougel_p_metric = scores['rouge-l']['p']
        rougel_r_metric = scores['rouge-l']['r']
    else:
        rouge2_f_metric = [score['rouge-2']['f'] for score in scores]
        rouge2_p_metric = [score['rouge-2']['p'] for score in scores]
        rouge2_r_metric = [score['rouge-2']['r'] for score in scores]
        rougel_f_metric = [score['rouge-l']['f'] for score in scores]
        rougel_p_metric = [score['rouge-l']['p'] for score in scores]
        rougel_r_metric = [score['rouge-l']['r'] for score in scores]
    
    return rouge2_f_metric, rouge2_p_metric, rouge2_r_metric, rougel_f_metric, rougel_p_metric, rougel_r_metric

def save_to_df(text, labeled_summaries, predicted_summaries, r2_f, r2_p, r2_r, rl_f, rl_p, rl_r):
    ''' Stores the metric results into a pandas dataframe'''
    results = pd.DataFrame(columns=['text', 'summary','pred_summary','rouge2-f','rouge2-p','rouge2-r','rougel-f', 'rougel-p', 'rougel-r'])
    results['text'] = text
    results['summary'] = labeled_summaries
    results['pred_summary'] = predicted_summaries
    results['rouge2-f'] = r2_f
    results['rouge2-p'] = r2_p
    results['rouge2-r'] = r2_r
    results['rougel-f'] = rl_f
    results['rougel-p'] = rl_p
    results['rougel-r'] = rl_r

    return results

In [18]:
def get_predictions(x_test, vocab, params, print_every=20):
    ''' Generate the predicted summaries of the source texts on x_test
        Input:
        - x_test: list of strings, the source texts
        - vocab: a Vocab Class object, vocabulary of the texts
        - params: a Parameters object, parameter of the model
        - print_every: integer, print progress every print_every iterations
    '''
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embedding_weights = torch.from_numpy(vocab.embeddings).to(DEVICE)
    enc_dec_adapter = nn.Linear(params.hidden_size * 2, params.dec_hidden_size).to(DEVICE)
    embedding = nn.Embedding(len(vocab), params.embed_size, padding_idx=vocab.PAD,_weight=embedding_weights).to(DEVICE)
    encoder = EncoderRNN(params.embed_size,params.hidden_size, params.enc_bidi,rnn_drop=params.enc_rnn_dropout).to(DEVICE)
    decoder = DecoderRNN(len(vocab), params.embed_size, params.dec_hidden_size,
                                  enc_attn=params.enc_attn, dec_attn=params.dec_attn,
                                  pointer=params.pointer,
                                  in_drop=params.dec_in_dropout, rnn_drop=params.dec_rnn_dropout,
                                  out_drop=params.dec_out_dropout, enc_hidden_size=params.hidden_size * 2,
                                  device=DEVICE).to(DEVICE) 

    if(os.path.exists(params.encoder_weights_path)):
        encoder.load_state_dict(torch.load(params.encoder_weights_path,map_location=torch.device(DEVICE)))
    if(os.path.exists(params.decoder_weights_path)):
        decoder.load_state_dict(torch.load(params.decoder_weights_path,map_location=torch.device(DEVICE)))
    if(os.path.exists(params.encoder_decoder_adapter_weights_path)):    
        enc_dec_adapter.load_state_dict(torch.load(params.encoder_decoder_adapter_weights_path,map_location=torch.device(DEVICE)))

        
    predicted_summaries = []
    kbar = pkbar.Kbar(target=len(x_test), width=8)
    for i,doc in enumerate(x_test):
        pred_summ = prediction(doc,vocab,embedding,encoder,enc_dec_adapter,decoder,DEVICE,params,batch_size=1)
        predicted_summaries.append(' '.join(pred_summ))
        if i%print_every==0:
            kbar.update(i)
            
    return predicted_summaries


In [19]:
def generate_predictions(x_test, vocab, params, print_every=20):
    ''' Generate the predicted summaries of the source texts on x_test
        Input:
        - x_test: list of strings, the source texts
        - vocab: a Vocab Class object, vocabulary of the texts
        - params: a Parameters object, parameter of the model
        - print_every: integer, print progress every print_every iterations
    '''

    predicted_summaries = []

    kbar = pkbar.Kbar(target=len(x_test), width=8)

    for i,doc in enumerate(x_test):

        pred_summ = predict(doc,vocab,params,batch_size=1)
        predicted_summaries.append(' '.join(pred_summ))
 
        if i%print_every==0:
            kbar.update(i)
            
   
    return predicted_summaries


In [20]:
params = Parameters()

In [21]:

dataset = Dataset(params.data_path, simple_tokenizer, params.max_src_len, params.max_tgt_len, max_rows=64000,
                        truncate_src=True, truncate_tgt=True)
valid_dataset = Dataset(params.val_data_path, simple_tokenizer, params.max_src_len, params.max_tgt_len, max_rows= 3200,
                        truncate_src=True, truncate_tgt=True)
print(dataset.src_len, valid_dataset.src_len,dataset.tgt_len, valid_dataset.tgt_len)

Reading dataset /kaggle/input/cleaned-news-summary/cl_train_news_summary_more.csv... 64000 pairs.
Reading dataset /kaggle/input/cleaned-news-summary/cl_train_news_summary_more.csv... 3200 pairs.
61 54 14 13


In [22]:
vocab = dataset.build_vocab(params.vocab_min_frequency, embed_file=params.embed_file)
vocab.save_to_file('vocab_train.pkl')
embedding_weights = torch.from_numpy(vocab.embeddings)

29022 pre-trained embeddings loaded.


In [24]:
train(dataset,valid_dataset,vocab, params, embedding_weights,learning_rate=0.001,num_epochs = 10)

  "num_layers={}".format(dropout, num_layers))



Epoch: 1/10
1601/3200 [===>....] - ETA: 50s - Val loss: 5743.1058 - Train loss: 2.8980 - Avg Val loss: 3.5510
Epoch: 2/10
1601/3200 [===>....] - ETA: 53s - Val loss: 5731.3381 - Train loss: 2.5627 - Avg Val loss: 3.5395
Epoch: 3/10
1601/3200 [===>....] - ETA: 53s - Val loss: 5797.5375 - Train loss: 2.3181 - Avg Val loss: 3.5613
Epoch: 4/10
1601/3200 [===>....] - ETA: 53s - Val loss: 5668.7522 - Train loss: 2.0249 - Avg Val loss: 3.5443
Epoch: 5/10
1601/3200 [===>....] - ETA: 55s - Val loss: 5768.4707 - Train loss: 1.8409 - Avg Val loss: 3.5840
Epoch: 6/10
1601/3200 [===>....] - ETA: 56s - Val loss: 5762.2783 - Train loss: 1.7314 - Avg Val loss: 3.5798
Epoch: 7/10
1601/3200 [===>....] - ETA: 54s - Val loss: 5787.8225 - Train loss: 1.7196 - Avg Val loss: 3.5747
Epoch: 8/10
1601/3200 [===>....] - ETA: 55s - Val loss: 5756.4203 - Train loss: 1.6308 - Avg Val loss: 3.6272
Epoch: 9/10
1601/3200 [===>....] - ETA: 55s - Val loss: 5874.0652 - Train loss: 1.6695 - Avg Val loss: 3.6691
Epoch: 10

Load the test dataset to evaluate the model

In [25]:
test_dataset = Dataset(params.test_data_path, simple_tokenizer, params.max_src_len, params.max_tgt_len, max_rows= 3200,
                        truncate_src=True, truncate_tgt=True)
print('Length Test Dataset:', len(test_dataset.pairs))
x_test = [' '.join(pair[0]) for pair in test_dataset.pairs]
y_test = [' '.join(pair[1]) for pair in test_dataset.pairs]
preds = get_predictions(x_test, vocab, params, print_every=100)
r2_f, r2_p, r2_r, rl_f, rl_p, rl_r = eval_metrics(preds, y_test, False)
print('\nMean Rouge-2 FScore: ',np.mean(r2_f), 'Mean Rouge-L FScore: ',np.mean(rl_f))
test_results = save_to_df(x_test, y_test, preds, r2_f, r2_p, r2_r, rl_f, rl_p, rl_r)
test_results.head(5)

Reading dataset /kaggle/input/cleaned-news-summary/cl_valid_news_summary_more.csv... 3200 pairs.
Length Test Dataset: 3200
Mean Rouge-2 FScore:  0.12921766904371598 Mean Rouge-L FScore:  0.331027675203172


Unnamed: 0,text,summary,pred_summary,rouge2-f,rouge2-p,rouge2-r,rougel-f,rougel-p,rougel-r
0,hrd ministry formed threemember special invest...,govt forms sit ryan murder case cbse seeks saf...,hrd ministry formed sit probe murder sevenyear...,0.0,0.0,0.0,0.190476,0.181818,0.2
1,letter written jail sheena bora murder accused...,indrani asks furniture jewellery divorce report,indrani mukerjea asks furniture jewellery artw...,0.210526,0.142857,0.4,0.5,0.357143,0.833333
2,enforcement directorate ed friday conducted se...,ed raids 35 premises nirav modi â assets seized,ed raids new locations fraudaccused diamond je...,0.095238,0.076923,0.125,0.285714,0.25,0.333333
3,japan acknowledged first time worker died radi...,japan admits 1st death 2011 fukushima nuclear ...,japan acknowledged first time worker died nucl...,0.095238,0.071429,0.142857,0.285714,0.230769,0.375
4,entire village germany auctioned weekend bids ...,entire village germany auctioned,village germany auctioned weekend bids â crore...,0.25,0.153846,0.666667,0.375,0.25,0.75


In [28]:
def calculate_rouge(y_test, preds):
    rouge = Rouge()
    scores = rouge.get_scores(preds, y_test, avg=True)
    return scores

In [29]:
rouge_scores = calculate_rouge(y_test, preds)
print(f"ROUGE Scores: {rouge_scores}")

ROUGE Scores: {'rouge-1': {'r': 0.5022704274891789, 'p': 0.29198233450577155, 'f': 0.3656299544703699}, 'rouge-2': {'r': 0.2001990327380952, 'p': 0.0967695811133316, 'f': 0.12921766904371532}, 'rouge-l': {'r': 0.4546986268939392, 'p': 0.2643932855339099, 'f': 0.33102767520317194}}
