## Importing libraries


In [None]:
# Imports and global configuration for the notebook.
from tqdm import tqdm
import re
import warnings
import numpy as np
warnings.filterwarnings("ignore")
import pandas as pd
# None means "never truncate cell contents". The former -1 sentinel was
# deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
from tensorflow.keras import backend as K
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed,Add,AdditiveAttention
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.callbacks import EarlyStopping
from keras.preprocessing.text import text_to_word_sequence
import tensorflow as tf
import random as rn

# Seed NumPy, TensorFlow and the stdlib RNG so runs are repeatable.
np.random.seed(42)
tf.random.set_seed(32)
rn.seed(12)


In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the article/summary table from disk and preview its first five rows.
text_data = pd.read_csv(filepath_or_buffer='/content/text_data.csv')
text_data.head(n=5)

In [None]:
def conv_str(text):
  """Split ``text`` on whitespace into a list of string tokens.

  Args:
    text: A cell value from the summary column, normally a ``str``.

  Returns:
    A list of ``str`` tokens when ``text`` is a string or is already a
    list/tuple of tokens; otherwise ``str(text)`` (e.g. ``'nan'`` for a
    missing value).
  """
  # Already tokenised (e.g. the mapping cell was run twice): keep the tokens
  # instead of collapsing the list into its repr string.
  if isinstance(text, (list, tuple)):
    return [str(item) for item in text]
  try:
    return [str(item) for item in text.split()]
  except (AttributeError, TypeError):
    # Non-string values (NaN, numbers, None) cannot be split.
    return str(text)

In [None]:
# Tokenise every summary with conv_str; this overwrites the 'summary' column.
text_data['summary']=text_data['summary'].map(conv_str)

In [None]:
# Fixed sequence lengths (in tokens) passed to pad_sequences for articles
# and summaries; max_summary_len also caps the length of a decoded summary.
max_text_len = 500
max_summary_len = 100

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the article/summary pairs for validation (fixed seed).
x_train, x_validation, y_train, y_validation = train_test_split(
    np.array(text_data['article']),
    np.array(text_data['summary']),
    random_state=33,
    test_size=0.1,
)
# Alias for the original misspelled name, which other cells still reference.
y_validatioin = y_validation

### Treating rare words as the unknown token (`ukn`)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# First pass: fit a tokenizer on the raw training articles to get word counts.
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_train))

In [None]:
# Gather every article word that occurs fewer than `thresh` times.
thresh = 2
rare_word = [
    token
    for token, count in x_tokenizer.word_counts.items()
    if count < thresh
]

In [None]:
# Number of rare article words, followed by a small sample of them.
print(len(rare_word))
rare_word[:5]

39072


['naoma', 'gianato', 'msha', 'illegitimacy', 'indignantly']

In [None]:
# One 'ukn' placeholder per rare word.
tokenrare = ['ukn'] * len(rare_word)

In [None]:
# Lookup table: rare word -> replacement placeholder.
dictionary_1 = {rare: placeholder for rare, placeholder in zip(rare_word, tokenrare)}

In [None]:
# Replace rare article words with the 'ukn' placeholder, token by token.
#
# The earlier str.replace() approach rewrote every *substring* match, so a rare
# word embedded in a longer word corrupted that word too, and rare words with
# punctuation attached were never matched. text_to_word_sequence applies the
# same default lower-casing and punctuation filtering as Tokenizer, so its
# tokens line up exactly with the keys of dictionary_1.
x_trunk = []
for article in x_train:
    tokens = text_to_word_sequence(article)
    x_trunk.append(' '.join(dictionary_1.get(token, token) for token in tokens))

In [None]:
# Second pass: refit on the rare-word-masked articles, with 'ukn' as the
# out-of-vocabulary token for words not seen during fitting.
x_tokenizer = Tokenizer(oov_token='ukn')
x_tokenizer.fit_on_texts(list(x_trunk))

In [None]:
# Convert text sequences into integer sequences (each word becomes its vocabulary index)
x_tr_seq    =   x_tokenizer.texts_to_sequences(x_trunk)
x_val_seq   =   x_tokenizer.texts_to_sequences(x_validation)

# Pad with trailing zeros up to max_text_len.
# NOTE(review): pad_sequences truncates from the front by default
# (truncating='pre'), so only the last max_text_len tokens of a longer
# article are kept — confirm this is intended.
x_tr    =   pad_sequences(x_tr_seq,  maxlen=max_text_len, padding='post')
x_val   =   pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')

# Size of vocabulary ( +1 for padding token)
x_voc   =  len(x_tokenizer.word_index) + 1

In [None]:
# Preview a few tokenised training summaries instead of dumping the whole array.
y_train[:5]

In [None]:
# First pass: fit a tokenizer on the training summaries to get word counts.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(y_train)

In [None]:
# Gather every summary word that occurs fewer than `thresh` times.
thresh = 2
rare_word = [
    token
    for token, count in y_tokenizer.word_counts.items()
    if count < thresh
]

In [None]:
# Number of rare summary words, followed by a small sample of them.
print(len(rare_word))
rare_word[3:10]

In [None]:
# One 'ukn' placeholder per rare summary word.
tokenrare = ['ukn'] * len(rare_word)

In [None]:
# Lookup table: rare summary word -> replacement placeholder.
dictionary_1 = {rare: placeholder for rare, placeholder in zip(rare_word, tokenrare)}

In [None]:
# Replace rare summary tokens with 'ukn'. Fresh token lists are built so the
# lists stored inside y_train are not mutated and the cell can be re-run.
y_trunk = []
for summary_tokens in y_train:
    if isinstance(summary_tokens, str):
        # conv_str falls back to a plain string for non-text cells; a string
        # cannot be edited item by item, so tokenise it first.
        summary_tokens = summary_tokens.split()
    y_trunk.append(
        [dictionary_1.get(token.lower(), token) for token in summary_tokens]
    )

In [None]:
# Second pass: refit on the rare-word-masked summaries, with 'ukn' as the
# out-of-vocabulary token.
y_tokenizer = Tokenizer(oov_token='ukn')
y_tokenizer.fit_on_texts(y_trunk)

# Convert text sequences into integer sequences (each token becomes its vocabulary index)
y_tr_seq    =   y_tokenizer.texts_to_sequences(y_trunk)
y_val_seq   =   y_tokenizer.texts_to_sequences(y_validatioin)

# Pad with trailing zeros up to max_summary_len.
# NOTE(review): pad_sequences truncates from the front by default
# (truncating='pre'); a summary longer than max_summary_len would lose its
# leading tokens — confirm this is intended.
y_tr    =   pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
y_val   =   pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

# Size of vocabulary ( +1 for padding token)
y_voc  =   len(y_tokenizer.word_index) +1

In [None]:
# Summary vocabulary size (word_index length plus one).
y_voc

## Loading GloVe vectors


In [None]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
embeddings_dictionary = dict()
# The context manager closes the file even if a line fails to parse.
with open("/content/drive/MyDrive/glove.42B.300d.txt", encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Skip blank lines rather than failing on records[0].
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
# Build the article embedding matrix: row `row` holds the GloVe vector of the
# word with that tokenizer index; words without a GloVe vector stay all-zero.
embedding_matrix_x = np.zeros((x_voc + 1, 300))
for token, row in x_tokenizer.word_index.items():
    glove_vector = embeddings_dictionary.get(token)
    if glove_vector is None:
        continue
    embedding_matrix_x[row] = glove_vector
embedding_matrix_x.shape

In [None]:
# Build the summary embedding matrix: row `row` holds the GloVe vector of the
# word with that tokenizer index; words without a GloVe vector stay all-zero.
embedding_matrix_y = np.zeros((y_voc + 1, 300))
for token, row in y_tokenizer.word_index.items():
    glove_vector = embeddings_dictionary.get(token)
    if glove_vector is None:
        continue
    embedding_matrix_y[row] = glove_vector
embedding_matrix_y.shape

## Encoder


In [None]:
class Encoder(tf.keras.Model):
    """Embedding lookup followed by a bidirectional GRU over the source tokens."""

    def __init__(self, vocab_size, embedding_dim,embedding_matrix_x, hidden_units):
        """Create the embedding table (initialised from ``embedding_matrix_x``)
        and the bidirectional GRU with ``hidden_units`` units per direction."""
        super().__init__()

        self.hidden_units = hidden_units
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            weights=[embedding_matrix_x],
        )
        gru_layer = tf.keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
            dropout=0.08,
            recurrent_dropout=0.05,
        )
        self.bi_gru = tf.keras.layers.Bidirectional(gru_layer)

    def call(self, encoder_input,encoder_states):
        """Encode a batch of token ids.

        Args:
            encoder_input: token ids, (batch_size, seq_length).
            encoder_states: initial [forward, backward] GRU states, each
                (batch_size, hidden_units).

        Returns:
            ``(encoder_output, [state_fwd, state_back])``. With Bidirectional's
            default concatenating merge the per-step output should have
            2 * hidden_units features.
        """
        # (batch_size, seq_length, embedding_dim)
        embedded = self.embedding(encoder_input)

        encoder_output, forward_state, backward_state = self.bi_gru(
            embedded, initial_state=encoder_states)

        return encoder_output, [forward_state, backward_state]

## Attention mechanism with coverage vector (w_c)



In [None]:
class additiveAttention(tf.keras.layers.AdditiveAttention):
    """Additive (Bahdanau-style) attention with an optional coverage term.

    The score of source position ``i`` is
    ``V . tanh(Wh h_i + Ws s_t + wc c_i)``, where ``h_i`` is the encoder output
    at ``i``, ``s_t`` the decoder state and ``c_i`` the attention accumulated
    on position ``i`` so far.
    """

    def __init__(self, hidden_units,is_coverage=False):
        """
        Args:
            hidden_units: width of the projected attention features.
            is_coverage: when true, the returned coverage vector accumulates
                the attention weights and ``wc`` is trained.
        """
        super().__init__()

        self.Wh = tf.keras.layers.Dense(hidden_units) # weight matrix for encoder hidden state
        self.Ws = tf.keras.layers.Dense(hidden_units) # weight matrix for decoder state
        # Coverage weight vector. It maps each position's scalar coverage to
        # hidden_units features, so the score of a position depends on *its
        # own* coverage. The previous Dense(1) over the whole coverage vector
        # produced a single scalar shared by every position.
        self.wc = tf.keras.layers.Dense(hidden_units, use_bias=False)
        self.V = tf.keras.layers.Dense(1)
        self.coverage = is_coverage
        if not self.coverage:
            self.wc.trainable = False

    def call(self,keys):
        """Compute the context vector for one decoding step.

        Args:
            keys: ``[decoder_state, encoder_output, coverage]`` with shapes
                (batch, hidden_units), (batch, seq_length, features) and
                (batch, seq_length).

        Returns:
            ``(context_vector, attention_weights, coverage)``: the attention-
            weighted sum of encoder outputs (batch, features), the weights
            (batch, seq_length, 1) and the coverage vector (batch, seq_length),
            updated only when coverage is enabled.
        """
        decoder_state = keys[0]
        encoder_output = keys[1]
        coverage = keys[2]

        decoder_state = tf.expand_dims(decoder_state, 1)   # (batch, 1, hidden_units)
        coverage_feature = tf.expand_dims(coverage, -1)    # (batch, seq_length, 1)

        score = self.V(tf.nn.tanh(
                        self.Wh(encoder_output) +
                        self.Ws(decoder_state) +
                        self.wc(coverage_feature)
                        ))  # (batch, seq_length, 1)

        attention_weights = tf.nn.softmax(score, axis=1) # (batch, seq_length, 1)

        # Only update the coverage vector if coverage is enabled. Squeeze the
        # last axis explicitly so a batch of size 1 keeps its batch dimension.
        if self.coverage:
            coverage = coverage + tf.squeeze(attention_weights, axis=-1)

        context_vector = attention_weights * encoder_output # (batch, seq_length, features)
        context_vector = tf.reduce_sum(context_vector, axis=1) # (batch, features)

        return context_vector, attention_weights, coverage

### Decoder

In [None]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder that emits log-probabilities over the vocabulary."""

    def __init__(self, vocab_size, embedding_dim,embedding_matrix_y,hidden_units):
        """Create the embedding table (initialised from ``embedding_matrix_y``),
        the GRU and the two dense projection layers."""
        super().__init__()

        self.hidden_units = hidden_units
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            weights=[embedding_matrix_y],
        )
        self.gru = tf.keras.layers.GRU(
            hidden_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.W1 = tf.keras.layers.Dense(hidden_units)
        self.W2 = tf.keras.layers.Dense(vocab_size)
        # Pointer generator (not implemented yet):
        # wh = tf.keras.layers.Dense(1)
        # ws = tf.keras.layers.Dense(1)
        # wx = tf.keras.layers.Dense(1)

    def call(self, decoder_input, decoder_state, encoder_output,context_vector):
        """Advance the decoder by one step.

        Args:
            decoder_input: previous token ids, (batch_size, 1).
            decoder_state: previous GRU state, (batch_size, hidden_units).
            encoder_output: accepted for interface symmetry; not read here.
            context_vector: attention context for this step.

        Returns:
            ``(p_vocab, decoder_state)``: log-softmax scores over the
            vocabulary and the new GRU state.
        """
        # (batch_size, 1, embedding_dim)
        embedded = self.embedding(decoder_input)

        # Only the final state is needed; the per-step outputs are discarded.
        _, decoder_state = self.gru(embedded, initial_state=decoder_state)

        # Join the context vector and the new decoder state feature-wise.
        features = tf.concat([context_vector, decoder_state], axis=-1)
        features = tf.reshape(features, (-1, features.shape[1]))

        # Two stacked linear layers followed by a log-softmax.
        logits = self.W2(self.W1(features))
        p_vocab = tf.nn.log_softmax(logits)

        # p_gen (pointer generator, not implemented yet):
        #p_gen = tf.nn.sigmoid(self.wh(context_vector)+self.ws(decoder_state)+self.wx(decoder_input))

        return p_vocab, decoder_state

In [None]:
#Set Parameters
# The extra +1 matches the row count used when the embedding matrices were
# allocated (x_voc+1 and y_voc+1 rows).
input_vocab_size = x_voc+1
output_vocab_size = y_voc +1
# The embedding dimension and hidden size are set where the models are built.


### Data generator

In [None]:
def data_generator(X,y,BATCH_SIZE,shuffle=True):
    """Wrap ``(X, y)`` in a batched, prefetching ``tf.data.Dataset``.

    Args:
        X: source sequences.
        y: target sequences, aligned with ``X``.
        BATCH_SIZE: examples per batch; a final incomplete batch is dropped.
        shuffle: when true, shuffle with a buffer covering all of ``X``.

    Returns:
        The batched dataset with automatic prefetching.
    """
    pairs = tf.data.Dataset.from_tensor_slices((X, y))
    if shuffle:
        pairs = pairs.shuffle(len(X))
    batches = pairs.batch(BATCH_SIZE, drop_remainder=True)
    return batches.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
# Aliases for the padded training sequences.
body_seqs=x_tr
target_seqs=y_tr

# Aliases for the padded validation sequences.
body_seqs_val=x_val
target_seqs_val=y_val

In [None]:
# Batched datasets: training data is shuffled, validation data keeps its order.
train_dataset = data_generator(body_seqs,target_seqs,BATCH_SIZE=64,
                       shuffle=True)
val_dataset = data_generator(body_seqs_val,target_seqs_val,BATCH_SIZE=64,
                       shuffle=False)

In [None]:
# Model hyper-parameters. embedding_dim matches the 300 columns of the
# embedding matrices; batch_size matches the BATCH_SIZE=64 used for the datasets.
embedding_dim = 300
hidden_units = 128
batch_size=64

# Instantiate the three model components; coverage is enabled in the attention layer.
encoder = Encoder(input_vocab_size, embedding_dim,embedding_matrix_x, hidden_units)
attention = additiveAttention(hidden_units,is_coverage=True)
decoder = Decoder(output_vocab_size, embedding_dim,embedding_matrix_y,hidden_units)

In [None]:
# Smoke test: push one training batch through encoder, attention and decoder
# for a single decoding step.
encoder_input, decoder_target = next(iter(train_dataset))
encoder_init_states = [tf.zeros((batch_size, encoder.hidden_units)) for i in range(2)]
encoder_output, encoder_states = encoder(encoder_input,encoder_init_states)
decoder_state = encoder_states[0]
# Use batch_size instead of a literal 64 so the coverage vector always has the
# same leading dimension as the initial encoder states.
coverage_vector = tf.zeros((batch_size,encoder_input.shape[1]))
decoder_input_t = decoder_target[:,0]
context_vector, attention_weights, coverage_vector = attention([decoder_state,encoder_output,coverage_vector])
p_vocab,decoder_state = decoder(tf.expand_dims(decoder_input_t,1),decoder_state,encoder_output,context_vector)

In [None]:
# Adam with its default settings; train_step uses it to apply the gradients.
optimizer = tf.keras.optimizers.Adam()

def nll_loss(p_vocab,target):
    """Negative log-likelihood with padding positions zeroed out.

    Args:
        p_vocab: log-probability assigned to each target token.
        target: target token ids; id 0 is treated as padding.

    Returns:
        ``-p_vocab`` where ``target != 0`` and 0 elsewhere.
    """
    negative_log_prob = -p_vocab
    keep = tf.math.not_equal(target, 0)
    return negative_log_prob * tf.cast(keep, dtype=negative_log_prob.dtype)

def coverage_loss(attention_weights,coverage_vector,target):
    """Coverage penalty: sum over source positions of min(attention, coverage).

    Args:
        attention_weights: attention for this step, (batch, seq_length, 1).
        coverage_vector: accumulated attention, (batch, seq_length).
        target: target token ids for this step; id 0 is treated as padding.

    Returns:
        The per-example penalty, zeroed where ``target`` is padding.
    """
    stacked = tf.concat(
        [attention_weights, tf.expand_dims(coverage_vector, axis=2)],
        axis=2,
    )
    per_position_min = tf.reduce_min(stacked, axis=2)
    penalty = tf.reduce_sum(per_position_min, axis=1)
    keep = tf.math.not_equal(target, 0)
    return penalty * tf.cast(keep, dtype=penalty.dtype)

### Model Training


### Training function

In [None]:
@tf.function
def train_step(encoder_input, decoder_target):
    """Run one teacher-forced training step on a batch and apply the gradients.

    Args:
        encoder_input: source token ids, indexed as (batch, source_len).
        decoder_target: target token ids, indexed as (batch, target_len);
            id 0 is treated as padding.

    Returns:
        Scalar tensor: the batch mean of each sequence's summed step loss
        divided by its number of non-padding target tokens.
    """
    loss = tf.zeros(batch_size)
    lambda_cov = 1  # weight of the coverage penalty relative to the NLL term
    with tf.GradientTape() as tape:
        # NOTE(review): the models are called without training=True, so the
        # encoder GRU dropout may be inactive in this loop — confirm.
        # run body_sequence input through encoder
        encoder_init_states = [tf.zeros((batch_size, encoder.hidden_units)) for _ in range(2)]
        encoder_output, encoder_states = encoder(encoder_input, encoder_init_states)
        # initialize decoder with encoder forward state
        decoder_state = encoder_states[0]
        coverage_vector = tf.zeros((batch_size, encoder_input.shape[1]))
        # loop over each word in target sequence
        for t in range(decoder_target.shape[1] - 1):
            decoder_input_t = decoder_target[:, t]
            decoder_target_t = decoder_target[:, t + 1]
            # The coverage penalty must compare this step's attention with the
            # coverage accumulated *before* this step. Against the updated
            # coverage, min(a, c + a) is always a and the penalty is constant.
            previous_coverage = coverage_vector
            # get attention scores
            context_vector, attention_weights, coverage_vector = attention(
                [decoder_state, encoder_output, coverage_vector])
            # get vocabulary distribution for each example at time t
            p_vocab, decoder_state = decoder(
                tf.expand_dims(decoder_input_t, 1), decoder_state, encoder_output, context_vector)
            # log-probability of the target word at t+1 for every example, as
            # one batched gather instead of a Python loop unrolled per example
            p_vocab_target = tf.gather(p_vocab, decoder_target_t, axis=1, batch_dims=1)
            # calculate the loss at each time step t and add to current loss
            loss += (nll_loss(p_vocab_target, decoder_target_t)
                     + lambda_cov * coverage_loss(attention_weights, previous_coverage, decoder_target_t))

        # get the non-padded length of each sequence in the batch
        seq_len_mask = tf.cast(tf.math.logical_not(tf.math.equal(decoder_target, 0)), tf.float32)
        batch_seq_len = tf.reduce_sum(seq_len_mask, axis=1)

        # Normalise by sequence length; divide_no_nan keeps an all-padding
        # target from turning the batch loss into NaN.
        batch_loss = tf.reduce_mean(tf.math.divide_no_nan(loss, batch_seq_len))

    # Update trainable variables. The attention layer is included; it was
    # previously left out and therefore never trained.
    variables = (encoder.trainable_variables
                 + attention.trainable_variables
                 + decoder.trainable_variables)
    gradients = tape.gradient(batch_loss, variables)
    # Skip variables that do not take part in the loss (gradient is None).
    grads_and_vars = [(grad, var) for grad, var in zip(gradients, variables) if grad is not None]
    optimizer.apply_gradients(grads_and_vars)

    return batch_loss

### Validation step

In [None]:
@tf.function
def val_step(encoder_input, decoder_target):
    """Compute the teacher-forced loss of one validation batch (no weight update).

    Args:
        encoder_input: source token ids, indexed as (batch, source_len).
        decoder_target: target token ids, indexed as (batch, target_len);
            id 0 is treated as padding.

    Returns:
        Scalar tensor: the batch mean of each sequence's summed step loss
        divided by its number of non-padding target tokens.
    """
    loss = tf.zeros(batch_size)
    lambda_cov = 1  # weight of the coverage penalty relative to the NLL term

    encoder_init_states = [tf.zeros((batch_size, encoder.hidden_units)) for _ in range(2)]
    encoder_output, encoder_states = encoder(encoder_input, encoder_init_states)
    decoder_state = encoder_states[0]
    coverage_vector = tf.zeros((batch_size, encoder_input.shape[1]))

    for t in range(decoder_target.shape[1] - 1):
        decoder_input_t = decoder_target[:, t]
        decoder_target_t = decoder_target[:, t + 1]
        # The coverage penalty must compare this step's attention with the
        # coverage accumulated *before* this step. Against the updated
        # coverage, min(a, c + a) is always a and the penalty is constant.
        previous_coverage = coverage_vector
        # get attention scores
        context_vector, attention_weights, coverage_vector = attention(
            [decoder_state, encoder_output, coverage_vector])
        # get vocabulary distribution for each example at time t
        p_vocab, decoder_state = decoder(
            tf.expand_dims(decoder_input_t, 1), decoder_state, encoder_output, context_vector)
        # log-probability of the target word at t+1 for every example, as one
        # batched gather instead of a Python loop unrolled per example
        p_vocab_target = tf.gather(p_vocab, decoder_target_t, axis=1, batch_dims=1)
        # calculate the loss at each time step t and add to current loss
        loss += (nll_loss(p_vocab_target, decoder_target_t)
                 + lambda_cov * coverage_loss(attention_weights, previous_coverage, decoder_target_t))

    # get the non-padded length of each sequence in the batch
    seq_len_mask = tf.cast(tf.math.logical_not(tf.math.equal(decoder_target, 0)), tf.float32)
    batch_seq_len = tf.reduce_sum(seq_len_mask, axis=1)

    # Normalise by sequence length; divide_no_nan keeps an all-padding target
    # from turning the batch loss into NaN.
    val_batch_loss = tf.reduce_mean(tf.math.divide_no_nan(loss, batch_seq_len))

    return val_batch_loss

## Training loop

## Testing the model

In [None]:
def decode_seq(encoder_input):
    """Greedily decode a summary for a single encoded article.

    At every step the highest-scoring token is fed back as the next decoder
    input. Decoding stops once the 'end' token has been produced or the
    summary holds ``max_summary_len`` ids.

    Args:
        encoder_input: source token ids with a leading batch dimension of 1.

    Returns:
        The list of generated token ids, beginning with the 'start' id.
    """
    # Encode the article starting from all-zero GRU states.
    zero_states = [tf.zeros((1, encoder.hidden_units)) for _ in range(2)]
    encoder_output, encoder_states = encoder(encoder_input, zero_states)
    # The decoder starts from the encoder's forward state.
    decoder_state = encoder_states[0]

    start_id = target_word_index['start']
    decoder_input_t = tf.ones(1) * start_id
    summary = [start_id]
    coverage_vector = tf.zeros((1, encoder_input.shape[1]))

    while True:
        if decoder_input_t[0].numpy() == target_word_index['end']:
            break
        if len(summary) >= max_summary_len:
            break
        context_vector, attention_weights, coverage_vector = attention(
            [decoder_state, encoder_output, coverage_vector])
        p_vocab, decoder_state = decoder(
            tf.expand_dims(decoder_input_t, 1), decoder_state, encoder_output, context_vector)
        decoder_input_t = tf.argmax(p_vocab, axis=1)
        summary.append(int(decoder_input_t[0].numpy()))
    return summary

In [None]:
# Lookup tables taken from the fitted tokenizers: index -> word for summaries
# and articles, and word -> index for summaries.
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

In [None]:
def seq2text(input_seq):
    """Turn article token ids back into text.

    Ids equal to 0 are skipped; every remaining word is followed by a single
    space, so a non-empty result ends with a space.
    """
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

def seq2summary(input_seq,ukn_token):
    """Turn summary token ids back into text.

    Ids equal to 0 and the 'start'/'end' ids are skipped. The 'ukn' id is
    rendered as ``ukn_token``; every other id is looked up in the summary
    vocabulary. Each emitted word is followed by a single space.
    """
    pieces = []
    for token_id in input_seq:
        if (token_id == 0
                or token_id == target_word_index['start']
                or token_id == target_word_index['end']):
            continue
        if token_id == target_word_index['ukn']:
            pieces.append(ukn_token + ' ')
        else:
            pieces.append(reverse_target_word_index[token_id] + ' ')
    return ''.join(pieces)
def search(list, platform):
    """Return True when some element of ``list`` equals ``platform``, else False."""
    return any(element == platform for element in list)

In [None]:
# Decode a handful of validation articles and print reference vs. prediction.
source_ukn_id = x_tokenizer.word_index['ukn']

# Clamp the upper bound so a small validation set does not raise IndexError.
for i in range(31, min(50, len(x_val))):
    encoder_input_sum = tf.expand_dims(x_val[i], 0)
    summary = decode_seq(encoder_input_sum)

    # Recover the first source word that was encoded as 'ukn'.
    # text_to_word_sequence tokenises like the default Tokenizer, and
    # pad_sequences keeps the *last* max_text_len tokens by default, so the
    # non-zero ids of x_val[i] line up one-to-one with these tokens.
    source_tokens = text_to_word_sequence(x_validation[i])[-max_text_len:]
    source_ids = [int(token_id) for token_id in x_val[i] if token_id != 0]
    ukn_token = 'ukn'
    for token_id, token in zip(source_ids, source_tokens):
        if token_id == source_ukn_id:
            ukn_token = token
            break

    # The validation summaries are stored under the misspelled name
    # y_validatioin; y_validation was never defined by the split cell.
    print("Original summary:", y_validatioin[i])

    print("Predicted summary:", seq2summary(summary, ukn_token))

    print("\n")