<a href="https://colab.research.google.com/github/SandeshRangreji/Pointer-Generator-Networks/blob/main/PGNModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
  tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
pip install tensorflow-addons

In [1]:
from google.colab import drive

import time
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import re
import numpy as np

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# unzips Glove word embeddings
!unrar x "/content/drive/MyDrive/Pointer Generator Networks/glove.6B.200d.rar" -d "/content/PGN/data/"


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from /content/drive/MyDrive/Pointer Generator Networks/glove.6B.200d.rar

Creating    /content/PGN                                              OK
Creating    /content/PGN/data                                         OK
Extracting  /content/PGN/data/glove.6B.200d.txt                            1%  3%  5%  6%  8% 10% 11% 13% 15% 16% 18% 20% 22% 23% 25% 27% 28% 30% 32% 33% 35% 37% 38% 40% 42% 44% 45% 47% 49% 50% 52% 54% 55% 57% 59% 61% 62% 64% 66% 67% 69% 71% 72% 74% 76% 77% 79% 81% 83% 84% 86% 88% 89% 91% 93% 94% 96% 98% 99%  OK 
All OK


In [2]:
# class to handle data loading, splitting, preprocessing, tokenization
class Data:

  def __init__(self, vocab_size, oov_token):
    # dictionary for contractions
    self.tokenizer = Tokenizer(num_words = vocab_size, filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n', oov_token=oov_token)
    self.contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

  # function to perform contractions
  def split_contractions(self, sentence):
    # parameters:
    # sentence: article/summary (string)
    # making a list of words from the article/summary
    # return: article/summary after contractions (string)
    li_sentence = sentence.split(' ')
    # iterating through each word and replacing the contracted word if it is present in contraction dictionary
    for i in range(len(li_sentence)):
      li_sentence[i] = self.contractions.get(li_sentence[i], li_sentence[i])
    # combining the list to form a string again
    sentence = ' '.join(li_sentence)
    return sentence

  # function to handle preprocessing of articles and summaries
  def preprocess(self, sentence):
    # parameters:
    # sentence: article or summary to be processed
    # returns:
    # sentence: cleaned article/summary
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    sentence = self.split_contractions(sentence)
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sentence)
    # removing trailing spaces
    sentence = sentence.lower().strip()
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    sentence = '<start> ' + sentence + ' <end>'
    return sentence  
  
  # splitting data into articles and summaries for training and testing
  def split_data(self, dataset):
    # parameters:
    # dataset : tfds of cnn_dailymail dataset (bytes)
    # returns:
    # train_articles, train_summaries, eval_articles, eval_summaries : lists of training and eval articles and summaries (string)
    train_articles = []
    train_summaries = []
    eval_articles = []
    eval_summaries = []
    # iterating through train dataset and storing articles and summaries seperately
    for text in tfds.as_numpy(dataset['train']):
      # decoding from bytes to string
      article = self.preprocess(text['article'].decode("utf-8"))
      summaries = self.preprocess(text['highlights'].decode("utf-8"))
      train_articles.append(article)
      train_summaries.append(summaries)

    # iterating through validation dataset and storing articles and summaries seperately
    for text in tfds.as_numpy(dataset['validation']):
      # decoding from bytes to string
      article = self.preprocess(text['article'].decode("utf-8"))
      summaries = self.preprocess(text['highlights'].decode("utf-8"))
      eval_articles.append(article)
      eval_summaries.append(summaries)
    return train_articles, train_summaries, eval_articles, eval_summaries

  # function to tokenize data
  def tokenize(self, train_articles, train_summaries, eval_articles, eval_summaries, vocab_size , embedding_dim, max_length_articles, max_length_summaries, truncating_type, padding_type, oov_token):
    # parameters:
    # train_articles, train_summaries, eval_articles, eval_summaries : lists of training and eval articles and summaries (string)
    # vocab_size: size of vocabulary
    # embedding_dim: dimensions of word embeddings
    # max_lengths_articles: number of words in the longest article
    # max_lengths_summaries: number of words in the longest summary
    # truncating_type: pre/post truncatation
    # padding_type: pro/post padding
    # oov_token: specifies what oov_token should be used
    # return:
    # train_articles, train_summaries, eval_articles, eval_summaries : lists of training and eval articles and summaries (sequences)
    # initialize tokenizer
    # fit tokenizer on training input (vocab)
    self.tokenizer.fit_on_texts(train_articles)
    # get word index from tokenizer
    word_index = self.tokenizer.word_index
    # tokenize articles for training 
    train_articles = self.tokenizer.texts_to_sequences(train_articles)
    train_articles = pad_sequences(train_articles ,maxlen=max_length_articles, padding=padding_type, truncating=truncating_type)
    # tokenize summaries for training 
    eval_articles = self.tokenizer.texts_to_sequences(eval_articles)
    eval_articles = pad_sequences(eval_articles, maxlen=max_length_articles, padding=padding_type, truncating=truncating_type)
    # tokenize articles for eval 
    train_summaries = self.tokenizer.texts_to_sequences(train_summaries)
    train_summaries = pad_sequences(train_summaries ,maxlen=max_length_summaries, padding=padding_type, truncating=truncating_type)
    # tokenize summaries for eval 
    eval_summaries = self.tokenizer.texts_to_sequences(eval_summaries)
    eval_summaries = pad_sequences(eval_summaries, maxlen=max_length_summaries, padding=padding_type, truncating=truncating_type)
    return train_articles, train_summaries, eval_articles, eval_summaries, word_index

  # function to get a dictionery which is the reverse of the word_index
  def get_reverse_word_index(self, word_index):
    # parameters:
    # word_index : { word : id }
    # returns
    # reverse_word_index : { id : word }
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    return reverse_word_index

  # get important metrics of the dataset that is needed to build a model
  def get_data_metrics(self):
    # return:
    # average length of articles, average length of summaries, length of longest article, length of longest summary
    # loading dataset
    ds = tfds.load("cnn_dailymail")
    # decoding and splitting dataset into train and eval articles and summaries
    train_articles, train_summaries, eval_articles, eval_summaries = self.split_data(ds)
    train_article_sum = 0
    max_article_len = 0
    max_summary_len = 0
    train_summaries_sum = 0
    # iterating through dataset to count total number of words
    for i in range(len(train_articles)):
      # current article and summary length (no. of words)
      article_len = len(train_articles[i].split())
      summary_len = len(train_summaries[i].split())
      # finding length of article with most number of words
      if(article_len>max_article_len):
        max_article_len = article_len
      # finding length of summary with most number of words
      if(summary_len>max_summary_len):
        max_summary_len = summary_len
      # calculating total number of words accross all articles and summary to calculate average
      train_article_sum = train_article_sum + article_len
      train_summaries_sum = train_summaries_sum + summary_len
    return train_article_sum/len(train_articles), train_summaries_sum/len(train_summaries), max_article_len, max_summary_len 

  # function to load pretrained word embeddings and prepare them for embedding layer
  def get_word_embeddings(self, word_index, vocab_size, embedding_dim):
    # parameters:
    # word_index: { word:id }
    # vocab_size: size of vocabulary
    # embedding_dim: dimension of word embeddings
    embeddings_index = {}
    # opening and reading word embeddings from file
    with open('/content/PGN/data/glove.6B.200d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))

    # converting word embeddings to matrix (weights for embedding layer) using word_index
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embeddings_matrix[i] = embedding_vector
    return embeddings_index, embeddings_matrix

  # function to convert list of articles and summaries to iterable, batched datasets
  def batch_datasets(self, train_articles, train_summaries, eval_articles, eval_summaries, BATCH_SIZE):
    # parameters:
    # train_articles, train_summaries, eval_articles, eval_summaries : lists of training and eval articles and summaries (sequences)
    # BATCH_SIZE: size of one batch in the dataset
    # return:
    # train_dataset, val_dataset: dataset objects for training and evaluation, batched according to BATCH_SIZE
    # making a dataset object from the train articles and summaries
    train_dataset = tf.data.Dataset.from_tensor_slices((train_articles, train_summaries))
    # batching training dataset
    train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)
    # making a dataset object from the evaluation articles and summaries
    val_dataset = tf.data.Dataset.from_tensor_slices((eval_articles, eval_summaries))
    # batching evaluation dataset
    val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)
    return train_dataset, val_dataset

  def __call__(self, num_training_examples = 2048, vocab_size = 25000, embedding_dim = 200, max_length_articles = 2880, max_length_summaries = 1344, truncating_type='post', padding_type='post', oov_token='<OOV>', BATCH_SIZE=64):
    # parameters:
    # vocab_size: size of vocabulary
    # embedding_dim: dimensions of word embeddings
    # max_lengths_articles: number of words in the longest article
    # max_lengths_summaries: number of words in the longest summary
    # truncating_type: pre/post truncatation
    # padding_type: pro/post padding
    # oov_token: specifies what oov_token should be used
    # returns:
    # train_dataset, val_dataset: dataset objects for training and evaluation, batched according to BATCH_SIZE
    # word_index : { word : id }
    # reverse_word_index : { id : word }

    # loading data in bytes from tfds
    ds=tfds.load("cnn_dailymail")
    # splitting data into train and test sets of articles and summaries
    train_articles, train_summaries, eval_articles, eval_summaries = self.split_data(ds)
    # tokenizing data
    train_articles, train_summaries, eval_articles, eval_summaries, word_index = self.tokenize(
        train_articles[:num_training_examples], 
        train_summaries[:num_training_examples], 
        eval_articles[:2048], 
        eval_summaries[:2048],
        vocab_size , 
        embedding_dim, 
        max_length_articles, 
        max_length_summaries, 
        truncating_type, 
        padding_type, 
        oov_token)
    vocab_size = len(word_index)
    # converting list of articles to train and evaluation datasets that are in batches
    train_dataset, validation_dataset = self.batch_datasets(train_articles, train_summaries, eval_articles, eval_summaries, BATCH_SIZE)
    return train_dataset, validation_dataset, word_index, self.get_reverse_word_index(word_index), vocab_size

In [3]:
vocab_size = 100000
num_examples = 2048
embedding_dim = 200
max_length_articles = 1024
max_length_summaries = 128
truncating_type ='post'
padding_type ='post'
oov_token = "<OOV>"
BATCH_SIZE = 8
data = Data(vocab_size, oov_token)
# load, split, batch data
train_dataset, val_dataset, word_index, reverse_word_index, vocab_size = data(
        num_examples,
        vocab_size, 
        embedding_dim, 
        max_length_articles, 
        max_length_summaries, 
        truncating_type, 
        padding_type, 
        oov_token,
        BATCH_SIZE)

In [4]:
len(word_index)

43826

In [5]:
# get word embeddings in the form of a matrix
embeddings_index, embeddings_matrix = data.get_word_embeddings(word_index, vocab_size, embedding_dim)

In [6]:
embeddings_matrix.shape

(43827, 200)

In [7]:
# sample data from dataset
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([16, 1024]), TensorShape([16, 128]))

In [8]:
# class for Encoder model
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, embeddings_matrix):
    # parameters:
    # vocab_size: size of vocabulary
    # embedding_dims: dimension of word embeddings
    # enc_units: number of LSTM units in the encoder
    # batch_sz: batch size of data
    # embedding_matrix: word embeddings in the form of a matrix
    super(Encoder, self).__init__()
    # initializing model layers and some parameters of those layers
    self.batch_sz = batch_sz
    self.l1_enc_units = enc_units
    self.l2_enc_units = enc_units//2
    self.l3_enc_units = enc_units//4
    # initializing embedding layer with pretrained word embeddings
    self.embedding = tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length_articles, weights=[embeddings_matrix], trainable=False)

    ##________ LSTM layer in Encoder ------- ##
    self.lstm_layer_1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.l1_enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform'))
    
    self.lstm_layer_2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.l2_enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform'))
    
    self.lstm_layer_3 = tf.keras.layers.LSTM(self.l3_enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    
    # self.lstm_layer_3 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8,
                                  #  return_sequences=True,
                                  #  return_state=True,
                                  #  recurrent_initializer='glorot_uniform'))


  # build encoder model
  def call(self, x, hidden):
    x = self.embedding(x)
    # print(x.shape)
    # output_3, h3, c3 = self.lstm_layer_1(x, initial_state = hidden )
    output_1, forward_h1, forward_c1, backward_h1, backward_c1 = self.lstm_layer_1(x, initial_state = hidden )
    output_2, forward_h2, forward_c2, backward_h2, backward_c2 = self.lstm_layer_2(output_1)
    output_3, h3, c3 = self.lstm_layer_3(output_2)
    # output_3, forward_h3, forward_c3, backward_h3, backward_c3 = self.lstm_layer_3(output_2)
    return output_3, h3, c3

  # initializing weights
  def initialize_hidden_state(self):
    # return [tf.zeros((self.batch_sz, self.l1_enc_units)), tf.zeros((self.batch_sz, self.l1_enc_units))]
    return [tf.zeros((self.batch_sz, self.l1_enc_units)), tf.zeros((self.batch_sz, self.l1_enc_units)), tf.zeros((self.batch_sz, self.l1_enc_units)), tf.zeros((self.batch_sz, self.l1_enc_units))]

In [9]:
## Test Encoder Stack

units = 1024
encoder = Encoder(vocab_size, embedding_dim, units, BATCH_SIZE, embeddings_matrix)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (16, 1024, 256)
Encoder h vecotr shape: (batch size, units) (16, 256)
Encoder c vector shape: (batch size, units) (16, 256)


In [10]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='bahdanau'):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.attention_type = attention_type
    # Embedding Layer
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    #Final Dense layer on which softmax will be applied
    self.fc = tf.keras.layers.Dense(vocab_size)
    # Define the fundamental cell for decoder recurrent structure
    self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)
    # Sampler
    self.sampler = tfa.seq2seq.sampler.TrainingSampler()
    # Create attention mechanism with memory = None
    self.attention_mechanism = self.build_attention_mechanism(self.dec_units, None, self.batch_sz*[max_length_articles], self.attention_type)
    # Wrap attention mechanism with the fundamental rnn cell of decoder
    self.rnn_cell = self.build_rnn_cell(batch_sz)
    # Define the decoder with respect to fundamental rnn cell
    self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

  def build_rnn_cell(self, batch_sz):
    rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, self.attention_mechanism, attention_layer_size=self.dec_units)
    return rnn_cell

  def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='bahdanau'):
    # ------------- #
    # typ: Which sort of attention (Bahdanau, Luong)
    # dec_units: final dimension of attention outputs 
    # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
    # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

    if(attention_type=='bahdanau'):
      return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
    else:
      return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

  def build_initial_state(self, batch_sz, encoder_state, Dtype):
    decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
    decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
    return decoder_initial_state

  def call(self, inputs, initial_state):
    x = self.embedding(inputs)
    outputs, dec_h, dec_c = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_summaries-1])
    return outputs, dec_h, dec_c

In [11]:
dec_units = units//4
# Test decoder stack
decoder = Decoder(vocab_size, embedding_dim, dec_units, BATCH_SIZE, 'bahdanau')
sample_x = tf.random.uniform((BATCH_SIZE, max_length_summaries))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)

sample_decoder_outputs, dec_h, dec_c = decoder(sample_x, initial_state)
print(sample_x.shape)
print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)
# print("Decoder hidden Shape: ", dec_h)
print("Decoder cell Shape: ", dec_c.shape)

(16, 128)
Decoder Outputs Shape:  (16, 127, 43826)
Decoder cell Shape:  (16,)


In [12]:
optimizer = tf.keras.optimizers.Adam()


def loss_function(real, pred):
  # real shape = (BATCH_SIZE, max_length_output)
  # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
  loss = cross_entropy(y_true=real, y_pred=pred)
  mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
  mask = tf.cast(mask, dtype=loss.dtype)  
  loss = mask* loss
  loss = tf.reduce_mean(loss)
  return loss

In [13]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_h, enc_c = encoder(inp, enc_hidden)
    # dec_input = targ
    # real = targ
    dec_input = targ[ : , :-1 ] # Ignore <end> token
    real = targ[ : , 1: ]         # ignore <start> token
    # Set the AttentionMechanism object with encoder_outputs
    decoder.attention_mechanism.setup_memory(enc_output)
    # Create AttentionWrapperState as initial_state for decoder
    decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
    pred, _, _ = decoder(dec_input, decoder_initial_state)
    logits = pred.rnn_output
    loss = loss_function(real, logits)

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss

In [14]:
EPOCHS = 1
# steps_per_epoch = num_examples//BATCH_SIZE
steps_per_epoch = 1
for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  # print(enc_hidden[0].shape, enc_hidden[1].shape)

  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  # if (epoch + 1) % 2 == 0:
    # checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.5497
Epoch 1 Loss 4.5497
Time taken for 1 epoch 168.8712022304535 sec



In [15]:
def evaluate_sentence(sentence):
    sentence = data.preprocess(sentence)
    inputs = data.tokenizer.texts_to_sequences([sentence])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                          maxlen=max_length_articles,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]
    # enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size, units))]
    enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units)), tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
    e_out, e_h, e_c = encoder(inputs, enc_start_state)

    decoded_seq = np.zeros((1,max_length_summaries))
    decoded_seq[0,0]=word_index['<start>']
    li_dec = decoded_seq
    decoded_seq_word = '<start>'

    li=[]
    len_pred_summary = max_length_summaries - 2
    decoder.attention_mechanism.setup_memory(e_out)
    decoder_initial_state = decoder.build_initial_state(inference_batch_size, [e_h, e_c], tf.float32)
    decoded_seq, h, c = decoder(decoded_seq, decoder_initial_state)
    print(decoded_seq[0][0].shape)
    # decoded_new = numpy.reshape(decoded_seq, )
    ans_arr = np.argmax(decoded_seq[0][0], axis = 1)
    print(ans_arr.shape)
    return ans_arr

    # while decoded_seq_word !='<end>' and len_pred_summary>=0:
        
    #     decoded_seq, h,c = decoder(decoded_seq, decoder_initial_state)
    #     print(decoded_seq[0][0][i].shape)
    #     decoded_seq = np.argmax(decoded_seq[0][0][i])
    #     print(decoded_seq)
    #     decoded_seq_word = reverse_word_index.get(decoded_seq, 0)
    #     # decoded_seq = np.zeros((1,64))
    #     li_dec[0,i] = word_index.get(decoded_seq_word, 0)
    #     decoded_seq = li_dec
      
    #     e_h = h
    #     e_c = c
    #     li.append(decoded_seq_word)
        
    #     len_pred_summary = len_pred_summary-1
    #     i = i+1
    
    # # li.remove('<end>')
    # out_final = " ".join(li)
    
    # return out_final

def summarize(sentence):
  result = evaluate_sentence(sentence)
  result = data.tokenizer.sequences_to_texts([result])
  print('Input: %s' % (sentence))
  print('Predicted translation: {}'.format(result))

In [16]:
# def evaluate_sentence(sentence):
#   sentence = data.preprocess(sentence)
#   # print(sentence)
#   # inputs = [data.tokenizer.word_index[i] for i in sentence.split(' ')]
#   inputs = data.tokenizer.texts_to_sequences([sentence])
#   # print(inputs)
#   inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
#                                                           maxlen=max_length_articles,
#                                                           padding='post')
#   # print(inputs)
#   inputs = tf.convert_to_tensor(inputs)
#   inference_batch_size = inputs.shape[0]
#   result = ''

#   print("before encoder")
#   enc_start_state = [tf.zeros((inference_batch_size, 32)), tf.zeros((inference_batch_size,32)), tf.zeros((inference_batch_size, 32)), tf.zeros((inference_batch_size,32))]
#   enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)
#   print("out of encoder")

#   dec_h = enc_h
#   dec_c = enc_c

#   start_tokens = tf.fill([inference_batch_size], data.tokenizer.word_index['<start>'])
#   print(start_tokens)
#   end_token = data.tokenizer.word_index['<end>']

#   # --OLD--
#   # greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

#   # Instantiate BasicDecoder object
#   # --OLD--
#   # decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
#   # Setup Memory in decoder stack
#   decoder.attention_mechanism.setup_memory(enc_out)

#   # set decoder_initial_state
#   decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)

#   ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
#   ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
#   ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function
#   decoder_embedding_matrix = decoder.embedding.variables[0]

#   # --NEW--
#   print("before decoder")
#   dec_seq = []
#   dec_out = start_tokens
#   dec_len = 0
#   # --OLD--
#   # outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
#   # --NEW--
#   while dec_out != end_token and dec_len <= max_length_summaries:
#     print("1")
#     print(dec_out)
#     dec_out, dec_h, dec_c = decoder(dec_out, decoder_initial_state)
#     print("2")
#     dec_out = np.argmax(dec_out[0.-1,:])
#     dec_seq.append(dec_out)
#     dec_len += 1
    
#   print("after decoder")
#   # --OLD--
#   # return outputs.sample_id.numpy()
#   return [dec_seq]

# def summarize(sentence):
#   result = evaluate_sentence(sentence)
#   # result = data.tokenizer.sequences_to_texts(result)
#   print('Input: %s' % (sentence))
#   print('Predicted translation: {}'.format(result))

In [17]:
test_sentence = "There are a number of job descriptions waiting for Darren Fletcher when he settles in at West Brom but the one he might not have expected is Saido Berahino’s nanny. Fletcher’s unveiling as the deadline day signing from Manchester United was almost eclipsed by the twenty-one-year-old striker, who is acquiring the habit of talking himself into trouble. Ten years Berahino’s senior, Fletcher will be expected to mentor a player who told the world this week that he wanted to play for a bigger club. Tony Pulis has advised Saido Berahino to focus on his performances at West Brom . Darren Fletcher has signed for the baggies where he will be asked to provide a role model for young players . That is off the pitch. On it, the Scotland midfielder wants to prove he is good enough to cut the mustard in the Premier League after finding starts harder and harder to come by at Old Trafford."
summarize(test_sentence)

(127, 43826)
(127,)
Input: There are a number of job descriptions waiting for Darren Fletcher when he settles in at West Brom but the one he might not have expected is Saido Berahino’s nanny. Fletcher’s unveiling as the deadline day signing from Manchester United was almost eclipsed by the twenty-one-year-old striker, who is acquiring the habit of talking himself into trouble. Ten years Berahino’s senior, Fletcher will be expected to mentor a player who told the world this week that he wanted to play for a bigger club. Tony Pulis has advised Saido Berahino to focus on his performances at West Brom . Darren Fletcher has signed for the baggies where he will be asked to provide a role model for young players . That is off the pitch. On it, the Scotland midfielder wants to prove he is good enough to cut the mustard in the Premier League after finding starts harder and harder to come by at Old Trafford.
Predicted translation: ['horvath the the the the the the the the the the the the the the

In [None]:
reverse_word_index[72]

'there'

In [None]:
tf.__version__

'2.4.1'

In [None]:
a = np.zeros((1,1))
a[0,0] = 69
a

array([[69.]])