In [1]:
# Ask the runtime for its NVIDIA GPU status (the recorded run reports the
# command as not found).
!nvidia-smi

/bin/bash: nvidia-smi: command not found


In [3]:
# Mount Google Drive at /content/drive; `filepath` below points into it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install tqdm tensorflow_datasets tensorflow_addons pickle5

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 13.8 MB/s 
[?25hCollecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 48.5 MB/s 
Installing collected packages: tensorflow-addons, pickle5
Successfully installed pickle5-0.0.12 tensorflow-addons-0.15.0


In [5]:
from tqdm import tqdm
import pickle5 as pickle
import pandas as pd
tqdm.pandas()
import os, string, re, json, ast
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from nltk import wordpunct_tokenize
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Layer, Input, Dense, Embedding, Concatenate, TimeDistributed, RNN, LSTMCell, AdditiveAttention, GRU, LSTM
from tensorflow.keras.backend import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.losses import Loss, SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow_addons as tfa

In [6]:
# Base Drive folder that the tokenizer, dataset pickles, optimizer weights and
# model checkpoints in this notebook are loaded from / saved to.
filepath = '/content/drive/MyDrive/text_summarizer_models/no_glove/'

In [7]:
# Restore the pickled tokenizer. The context manager closes the file handle as
# soon as loading finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code; only load pickles you created.
with open(filepath + 'tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [8]:
# Number of entries in the tokenizer's word index plus one; the extra slot is
# presumably id 0, which the model treats as padding (see get_masks) -- confirm.
vocab_size = len(tokenizer.word_index) + 1
# Alias of vocab_size; not referenced again in the cells shown in this notebook.
max_features = vocab_size

In [9]:
# Display the vocabulary size computed above.
vocab_size

197747

In [10]:
# Keep a direct handle on the tokenizer's word index; its keys are used later as
# the vocabulary of the StringLookup layers built in Summarizer.
word_index = tokenizer.word_index

In [11]:
# Model / data hyperparameters shared by the cells below.
embed_size = 192  # embedding dimension passed to both Encoder and Decoder
maxlen_article = 550  # article length in tokens; X_train / X_val have this many columns
maxlen_summary = 50  # number of summary positions iterated over when computing the loss
batch_size = 32  # mini-batch size (the same value Keras `fit` uses by default)

In [12]:
# Load the training inputs (articles) and targets (summaries). Context managers
# make sure each file handle is closed once its pickle has been read.
# NOTE: unpickling can execute arbitrary code; only load pickles you created.
with open(filepath + 'X_train.pickle', 'rb') as x_train_file:
    X_train = pickle.load(x_train_file)
with open(filepath + 'y_train.pickle', 'rb') as y_train_file:
    y_train = pickle.load(y_train_file)

In [13]:
# Load the validation inputs (articles) and targets (summaries). Context managers
# make sure each file handle is closed once its pickle has been read.
# NOTE: unpickling can execute arbitrary code; only load pickles you created.
with open(filepath + 'X_val.pickle', 'rb') as x_val_file:
    X_val = pickle.load(x_val_file)
with open(filepath + 'y_val.pickle', 'rb') as y_val_file:
    y_val = pickle.load(y_val_file)

In [14]:
# Sanity check: (number of training examples, tokens per article).
X_train.shape

(79519, 550)

In [15]:
# Sanity check: (number of validation examples, tokens per article).
X_val.shape

(8836, 550)

## Text Summarizer Model

In [16]:
class Encoder(Layer):
    """Embed input token ids and run them through a single LSTM layer.

    Args:
        input_vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, input_vocab_size, embedding_dim):
        super().__init__()
        self.input_vocab_size = input_vocab_size

        # Lookup table turning token ids into dense vectors.
        self.embedding = Embedding(
            self.input_vocab_size,
            embedding_dim,
            embeddings_initializer='glorot_normal',
        )

        # One 128-unit LSTM that returns its per-step outputs together with
        # its final states. (No further recurrent layers are stacked on top.)
        self.lstm1 = LSTM(128, return_sequences=True, return_state=True)

    def call(self, tokens, state=None):
        """Encode a batch of token ids.

        Args:
            tokens: Token ids to embed.
            state: Accepted but not used by this layer.

        Returns:
            A pair `(sequence_output, hidden_state)`: the LSTM's output
            sequence and the first of the two states it returns. The second
            (cell) state is discarded.
        """
        embedded = self.embedding(tokens)
        sequence_output, hidden_state, _cell_state = self.lstm1(embedded)
        return sequence_output, hidden_state

In [17]:
import typing
from typing import Any, Tuple

class DecoderInput(typing.NamedTuple):
    """Bundle of everything the decoder needs for one call."""

    # Token ids fed to the decoder's embedding layer.
    new_tokens: Any
    # Encoder output that the attention layer attends over.
    enc_output: Any
    # Mask passed to the attention layer for the encoder positions.
    mask: Any

class DecoderOutput(typing.NamedTuple):
    """Result of one decoder call."""

    # Unnormalised scores produced by the decoder's final dense layer.
    logits: Any
    # Attention scores returned by the decoder's attention layer.
    attention_weights: Any

In [18]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention with bias-free dense projections of query and value.

  Args:
    units: Output width of both projection layers.
  """

  def __init__(self, units):
    super().__init__()
    # Eqn. (4): W1 projects the query, W2 projects the value to form the key.
    self.W1 = tf.keras.layers.Dense(units, use_bias=False)
    self.W2 = tf.keras.layers.Dense(units, use_bias=False)

    self.attention = AdditiveAttention()

  def call(self, query, value, mask):
    """Attend from `query` over `value`.

    Args:
      query: Tensor projected with W1 and used as the attention query.
      value: Tensor used as the attention value; its W2 projection is the key.
      mask: Mask passed to the attention layer for the value positions.

    Returns:
      `(context_vector, attention_weights)` as produced by the wrapped
      AdditiveAttention layer with `return_attention_scores=True`.
    """
    projected_query = self.W1(query)  # Eqn. (4): W1 @ ht
    projected_key = self.W2(value)    # Eqn. (4): W2 @ hs

    # All-True mask with the query's shape minus its last axis.
    all_query_positions = tf.ones(tf.shape(query)[:-1], dtype=bool)

    context_vector, attention_weights = self.attention(
        inputs=[projected_query, value, projected_key],
        mask=[all_query_positions, mask],
        return_attention_scores=True,
    )
    return context_vector, attention_weights

In [19]:
class Decoder(Layer):
    """Decoder layer: embedding -> LSTM -> attention over the encoder output -> logits.

    Args:
        output_vocab_size: Size of the embedding table and of the logits layer.
        embedding_dim: Width of each embedding vector.
        dec_units: Units of the LSTM, the attention projections and `Wc`.
    """

    def __init__(self, output_vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.output_vocab_size = output_vocab_size
        self.embedding_dim = embedding_dim

        # For Step 1. The embedding layer converts token IDs to vectors
        self.embedding = Embedding(self.output_vocab_size, embedding_dim, embeddings_initializer = 'glorot_normal')

        # For Step 2. The LSTM that processes the embedded tokens; it returns
        # its output sequence plus two states.
        self.lstm = LSTM(self.dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')

        # For step 3. The RNN output will be the query for the attention layer.
        self.attention = BahdanauAttention(self.dec_units)

        # For step 4. Eqn. (3): converting `ct` to `at`
        self.Wc = Dense(dec_units, activation=tf.keras.activations.tanh,
                                    use_bias=False)

        # For step 5. This fully connected layer produces the logits for each
        # output token.
        self.fc = Dense(self.output_vocab_size)
    
    def call(self, inputs: DecoderInput, state=None) -> Tuple[DecoderOutput, tf.Tensor]:
        """Run the decoder on `inputs.new_tokens`.

        Args:
            inputs: DecoderInput carrying the new token ids, the encoder
                output and the mask for the encoder positions.
            state: Accepted for the caller's convenience.
                NOTE(review): this value is never forwarded to `self.lstm`,
                so the LSTM starts from its default initial state on every
                call and nothing is carried over between calls -- confirm
                this is intended.

        Returns:
            `(DecoderOutput(logits, attention_weights), state)` where `state`
            is the first of the two states returned by the LSTM; the second
            one (`carry_state`) is discarded.
        """

        # Step 1. Lookup the embeddings
        vectors = self.embedding(inputs.new_tokens)
    
        # Step 2. Process the embedded tokens with the LSTM (no initial state
        # is supplied here; see the NOTE in the docstring).
        rnn_output, state, carry_state = self.lstm(vectors)

        # Step 3. Use the RNN output as the query for the attention over the
        # encoder output.
        context_vector, attention_weights = self.attention(query = rnn_output, value = inputs.enc_output, mask = inputs.mask)

        # Step 4. Eqn. (3): Join the context_vector and rnn_output
        #     [ct; ht] shape: (batch t, value_units + query_units)
        context_and_rnn_output = tf.concat([context_vector, rnn_output], axis = -1)

        # Step 4. Eqn. (3): `at = tanh(Wc@[ct; ht])`
        attention_vector = self.Wc(context_and_rnn_output)
        
        # Step 5. Generate logit predictions:
        logits = self.fc(attention_vector)

        return DecoderOutput(logits, attention_weights), state

In [20]:
class MaskedLoss(Loss):
    """Sparse categorical cross-entropy on logits, summed over non-padding targets.

    Positions whose target id is 0 contribute nothing. The result is the
    *sum* of the remaining per-position losses, not a mean; callers are
    expected to normalise it themselves.
    """

    def __init__(self):
        # Run the Keras base initializer so inherited attributes (name,
        # reduction, ...) exist; skipping it leaves the object half-initialised.
        super().__init__(name='masked_loss')
        # Per-element losses are needed so padding can be masked out below.
        self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

    def __call__(self, y_true, y_pred):
        """Return the total loss over all positions where `y_true != 0`.

        Args:
            y_true: Integer target ids; 0 marks padding.
            y_pred: Logits for each target position.

        Returns:
            Scalar tensor holding the summed, padding-masked loss.
        """
        # Calculate the loss for each item in the batch.
        loss = self.loss(y_true, y_pred)

        # Mask off the losses on padding.
        mask = tf.cast(y_true != 0, tf.float32)
        loss *= mask

        # Return the total.
        return tf.reduce_sum(loss)

In [21]:
class TextSummarizerModel(Model):
    """Encoder-decoder summarizer with custom teacher-forced train/test steps.

    The encoder and decoder are sized from the module-level `vocab_size` and
    `embed_size`; the decoder uses 128 units. The number of target positions
    visited per batch comes from the module-level `maxlen_summary`.

    Args:
        use_tf_function: When True, `train_step` dispatches to the
            `tf.function`-wrapped `_tf_train_step`; otherwise it calls
            `_train_step` directly.
    """

    def __init__(self, use_tf_function=True):
        super(TextSummarizerModel, self).__init__()
        self.encoder = Encoder(vocab_size, embed_size)
        self.decoder = Decoder(vocab_size, embed_size, 128)
        self.use_tf_function = use_tf_function

    def _loop_step(self, new_tokens, input_mask, enc_output, dec_state):
        """Run the decoder on one target position and score its prediction.

        Args:
            new_tokens: Two adjacent target columns; the first is fed to the
                decoder, the second is the label for its prediction.
            input_mask: Mask of the non-padding encoder positions.
            enc_output: Encoder output sequence.
            dec_state: State value handed to the decoder call.

        Returns:
            `(step_loss, dec_state)`: the loss returned by `self.loss` for
            this position and the state returned by the decoder.
        """
        input_token, target_token = new_tokens[:, 0:1], new_tokens[:, 1:2]

        # Run the decoder one step.
        decoder_input = DecoderInput(new_tokens=input_token,
                               enc_output=enc_output,
                               mask=input_mask)
        dec_result, dec_state = self.decoder(decoder_input, state=dec_state)

        # `self.loss` returns the total for non-padded tokens
        y = target_token
        y_pred = dec_result.logits
        step_loss = self.loss(y, y_pred)
        return step_loss, dec_state

    def get_masks(self, input_tokens, target_tokens):
        """Return boolean masks marking the non-zero (non-padding) ids."""
        return (input_tokens != 0), (target_tokens != 0)

    def test_step(self, inputs):
        """Compute the average per-token loss for one evaluation batch.

        Args:
            inputs: `(X_eval, y_eval)` pair of input and target token ids.

        Returns:
            Dict with key 'average_loss'.
        """
        max_target_length = maxlen_summary
        X_eval, y_eval = inputs
        input_mask, target_mask = self.get_masks(X_eval, y_eval)
        enc_output, enc_state = self.encoder(X_eval)
        dec_state = enc_state
        loss = tf.constant(0.0)
        for t in range(max_target_length - 1):
            new_tokens = y_eval[:, t:t+2]
            step_loss, dec_state = self._loop_step(new_tokens, input_mask,
                                             enc_output, dec_state)
            loss = loss + step_loss

        # Average the loss over all non padding tokens.
        average_loss = loss / tf.reduce_sum(tf.cast(target_mask, tf.float32))
        return {'average_loss': average_loss}

    def _train_step(self, input_tokens, target_tokens):
        """Run one optimisation step on the batch Keras passes in.

        The whole incoming batch is used as-is: Keras `fit` has already split
        the data into mini-batches, so no further slicing by a global batch
        size (or dependence on the size of the training set) is needed here.

        Args:
            input_tokens: Input token ids for this batch.
            target_tokens: Target token ids for this batch.

        Returns:
            Dict with key 'batch_loss' holding the average per-token loss.
        """
        max_target_length = maxlen_summary
        input_mask, target_mask = self.get_masks(input_tokens, target_tokens)

        with tf.GradientTape() as tape:
            # Encode the input
            enc_output, enc_state = self.encoder(input_tokens)

            # Initialize the decoder's state to the encoder's final state.
            # This only works if the encoder and decoder have the same number of
            # units.
            dec_state = enc_state
            loss = tf.constant(0.0)

            for t in tf.range(max_target_length - 1):
                # Pass in two tokens from the target sequence:
                # 1. The current input to the decoder.
                # 2. The target for the decoder's next prediction.
                new_tokens = target_tokens[:, t:t+2]
                step_loss, dec_state = self._loop_step(new_tokens, input_mask,
                                                 enc_output, dec_state)
                loss = loss + step_loss

            # Average the loss over all non padding tokens.
            average_loss = loss / tf.reduce_sum(tf.cast(target_mask, tf.float32))

        # Apply an optimization step
        variables = self.trainable_variables
        gradients = tape.gradient(average_loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        # Return a dict mapping metric names to current value
        return {'batch_loss': average_loss}

    @tf.function
    def _tf_train_step(self, input_tokens, output_tokens):
        """`tf.function`-compiled wrapper around `_train_step`."""
        return self._train_step(input_tokens, output_tokens)

    def train_step(self, inputs):
        """Keras training hook: unpack the batch and run one training step.

        Args:
            inputs: `(input_tokens, output_tokens)` pair for this batch.

        Returns:
            The metrics dict produced by the underlying training step.
        """
        input_tokens, output_tokens = inputs
        if self.use_tf_function:
            return self._tf_train_step(input_tokens, output_tokens)
        else:
            return self._train_step(input_tokens, output_tokens)


In [22]:
class BatchLogs(tf.keras.callbacks.Callback):
    """Collect one logged value per training batch.

    Args:
        key: Name of the entry to read from the `logs` dict after each batch.
    """

    def __init__(self, key):
        # Run the Keras base initializer so the callback's inherited state
        # is set up before our own attributes are added.
        super().__init__()
        self.key = key
        self.logs = []

    def on_train_batch_end(self, n, logs):
        """Append `logs[self.key]` for the batch that just finished."""
        self.logs.append(logs[self.key])

# Collector for the 'batch_loss' value reported by the model's train step.
batch_loss = BatchLogs('batch_loss')

In [23]:
# File name of the optimizer state to restore, and its full path under `filepath`.
opt_path = 'optimizer_weights_03-02-2022_17:_34:_15_12.npy'
optimizer_weights_path = filepath + opt_path

In [24]:
# Load the saved optimizer state. allow_pickle=True is presumably needed because
# the file holds a pickled object array (see OptimizerSaver) -- confirm.
# NOTE: unpickling can execute arbitrary code; only load files you created.
optimizer_weights = np.load(optimizer_weights_path, allow_pickle = True)

In [25]:
# Build a fresh model; use_tf_function keeps its default of True.
model = TextSummarizerModel()

In [26]:
# Adam optimizer with its default arguments.
optimizer = tf.keras.optimizers.Adam()

In [27]:
class OptimizerSaver(tf.keras.callbacks.Callback):
    """Write the optimizer's weights to a .npy file at the end of every epoch.

    Reads the module-level `optimizer`, `filepath` and `dt_string` when the
    hook fires. `dt_string` is defined in a later cell, so that cell must
    have run before training starts.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Save `optimizer.get_weights()` under a timestamp + epoch file name."""
        weights = optimizer.get_weights()
        optimizer_path = filepath + 'optimizer_weights_' + dt_string + '_{}.npy'.format(epoch)

        # The weights are arrays of different shapes. Pack them into an
        # explicit 1-D object array, one slot per weight, instead of relying
        # on NumPy to build a ragged array implicitly (deprecated, and an
        # error on recent NumPy versions).
        packed = np.empty(len(weights), dtype=object)
        for slot, weight in enumerate(weights):
            packed[slot] = weight
        np.save(optimizer_path, packed, allow_pickle=True)

op_saver = OptimizerSaver()

In [28]:
# Configure the loss and optimizer. The model's custom train/test steps read
# both back via `self.loss` and `self.optimizer`.
model.compile(optimizer = optimizer, loss = MaskedLoss())

In [29]:
# File name (relative to `filepath`) of the model weights to restore below.
weights_path = 'retrained_model_checkpoint_weights_03-02-2022_17:_34:_15_13.h5'

In [30]:
# Start training; the recorded run was interrupted by hand after a few batches.
# NOTE(review): presumably this short run only exists to create the model and
# optimizer variables so the saved state can be restored in the following
# cells -- confirm.
model.fit(X_train, y_train)

   3/2485 [..............................] - ETA: 7:54:06 - batch_loss: 11.9235

KeyboardInterrupt: ignored

In [31]:
# Restore the optimizer state loaded from `optimizer_weights_path`.
optimizer.set_weights(optimizer_weights)

In [32]:
# Compile again with the restored optimizer and a fresh MaskedLoss instance.
model.compile(optimizer = optimizer, loss = MaskedLoss())

In [33]:
# Flag the model as built; presumably required so that load_weights below
# accepts this subclassed model -- confirm.
model.built = True

In [34]:
# Restore the model weights from the checkpoint named in `weights_path`.
model.load_weights(filepath + weights_path)

In [35]:
from datetime import datetime

# Timestamp of this session; it is embedded in the file names of the model
# checkpoints and of the optimizer weights written by OptimizerSaver.
now = datetime.now()
dt_string = now.strftime("%d-%m-%Y_%H:_%M:_%S")

In [36]:
# Save the model weights (weights only, every epoch, not just the best one).
# The file name embeds the session timestamp and the epoch number.
# The deprecated `period` argument is omitted: `save_freq="epoch"` already
# saves once per epoch, which is what `period=1` requested.
ckpt_filepath = filepath + 'retrained_model_checkpoint_weights_'+ dt_string +'_{epoch:02d}.h5'
callback = tf.keras.callbacks.ModelCheckpoint(
    ckpt_filepath,
    monitor="batch_loss",
    verbose=0,
    save_best_only=False,
    save_weights_only=True,
    mode="auto",
    save_freq="epoch",
)



### Model Testing

In [37]:
class Summarizer(tf.Module):
    """Wrap a trained encoder/decoder pair for token-by-token summary generation.

    Args:
        encoder: Layer called as `encoder(input_tokens)` and returning
            `(enc_output, enc_state)`.
        decoder: Layer called as `decoder(DecoderInput, state=...)` and
            returning `(DecoderOutput, state)`.
    """

    def __init__(self, encoder, decoder):
        self.encoder = encoder
        self.decoder = decoder

        # NOTE(review): with `mask_token=''` and the default single OOV slot,
        # StringLookup assigns id 0 to the mask, id 1 to out-of-vocabulary
        # strings and ids from 2 upwards to the vocabulary. Confirm that these
        # ids line up with the ids the tokenizer assigned to the same words
        # (the model's output layer has `vocab_size = len(word_index) + 1`
        # units); if they do not, `num_oov_indices=0` may be needed on both
        # lookups.
        self.output_token_string_from_index = (tf.keras.layers.StringLookup(
            vocabulary = list(word_index.keys()),
            mask_token='',
            invert=True))
        index_from_string = tf.keras.layers.StringLookup(vocabulary = list(word_index.keys()), mask_token='')

        # The output should never generate padding or the start token. The end
        # token must remain sampleable: if it were masked too, no sequence
        # could ever be marked `done` in `abstractive_summarize`.
        token_mask_ids = index_from_string(['', '<s>']).numpy()
        # Plain `bool` instead of the removed `np.bool` alias.
        token_mask = np.zeros([vocab_size], dtype=bool)
        token_mask[np.array(token_mask_ids)] = True
        self.token_mask = token_mask
        self.start_token = index_from_string(tf.constant('<s>'))
        self.end_token = index_from_string(tf.constant('</s>'))

    def tokens_to_text(self, result_tokens):
        """Map token ids to strings and join each row into one stripped string.

        Args:
            result_tokens: Token ids with the tokens of each example along axis 1.

        Returns:
            String tensor with one space-joined, whitespace-stripped entry per row.
        """
        result_text_tokens = self.output_token_string_from_index(result_tokens)

        result_text = tf.strings.reduce_join(result_text_tokens,
                                       axis=1, separator=' ')

        result_text = tf.strings.strip(result_text)
        return result_text

    def sample(self, logits, temperature):
        """Pick the next token id for every example.

        Args:
            logits: Decoder logits with a length-1 axis at position 1.
            temperature: 0.0 selects the arg-max token; any other value
                samples from the logits divided by `temperature`.

        Returns:
            Tensor of chosen token ids, one column per example.
        """
        # Give the vocabulary mask two leading axes so it broadcasts against
        # the logits.
        token_mask = self.token_mask[tf.newaxis, tf.newaxis, :]

        # Set the logits for all masked tokens to -inf, so they are never chosen.
        logits = tf.where(token_mask, -np.inf, logits)

        if temperature == 0.0:
            # Greedy decoding: no sampling and no division by the temperature.
            new_tokens = tf.argmax(logits, axis=-1)
        else:
            # tf.random.categorical expects 2-D logits, so drop the length-1 axis.
            logits = tf.squeeze(logits, axis=1)
            new_tokens = tf.random.categorical(logits/temperature,
                                            num_samples=1)
        return new_tokens

    def abstractive_summarize(self, input_tokens, *, max_length=maxlen_article, return_attention=True, temperature=1.0):
        """Generate a summary for every row of `input_tokens`.

        Args:
            input_tokens: Input token ids; 0 is treated as padding when
                building the attention mask.
            max_length: Maximum number of decoder steps.
                NOTE(review): the default is `maxlen_article` (the input
                length), not `maxlen_summary`; the early exit below only
                happens in eager mode, so a traced call always runs
                `max_length` steps -- confirm this is intended.
            return_attention: Whether to include the attention scores in the result.
            temperature: Sampling temperature forwarded to `sample`.

        Returns:
            Dict with key 'text' and, if `return_attention` is true, key
            'attention' holding the per-step attention scores concatenated
            along axis 1.
        """
        batch_size = tf.shape(input_tokens)[0]
        enc_output, enc_state = self.encoder(input_tokens)

        dec_state = enc_state
        new_tokens = tf.fill([batch_size, 1], self.start_token)

        result_tokens = []
        attention = []
        done = tf.zeros([batch_size, 1], dtype=tf.bool)

        for _ in range(max_length):
            dec_input = DecoderInput(new_tokens=new_tokens,
                             enc_output=enc_output,
                             mask=(input_tokens!=0))

            dec_result, dec_state = self.decoder(dec_input, state=dec_state)

            attention.append(dec_result.attention_weights)

            new_tokens = self.sample(dec_result.logits, temperature)

            # If a sequence produces an `end_token`, set it `done`
            done = done | (new_tokens == self.end_token)
            # Once a sequence is done it only produces 0-padding.
            new_tokens = tf.where(done, tf.constant(0, dtype=tf.int64), new_tokens)

            # Collect the generated tokens
            result_tokens.append(new_tokens)

            # Stop early only when running eagerly; inside a traced function
            # the Python loop is unrolled for all `max_length` steps.
            if tf.executing_eagerly() and tf.reduce_all(done):
                break

        # Convert the list of generated token ids to a list of strings.
        result_tokens = tf.concat(result_tokens, axis=-1)
        result_text = self.tokens_to_text(result_tokens)

        if return_attention:
            attention_stack = tf.concat(attention, axis=1)
            return {'text': result_text, 'attention': attention_stack}
        else:
            return {'text': result_text}

    @tf.function(input_signature=[tf.TensorSpec(dtype=tf.int32, shape=[None, maxlen_article])])
    def tf_summarize(self, input_tokens):
        """Traced entry point: summarize int32 token ids of width `maxlen_article`."""
        return self.abstractive_summarize(input_tokens)

In [38]:
# Wrap the trained model's encoder and decoder for inference.
summarizer = Summarizer(encoder = model.encoder, decoder = model.decoder)

In [39]:
# Export the summarizer as a SavedModel in ./summarizer, exposing the traced
# `tf_summarize` function as the 'serving_default' signature.
tf.saved_model.save(summarizer, 'summarizer',
                    signatures={'serving_default': summarizer.tf_summarize})



INFO:tensorflow:Assets written to: summarizer/assets


INFO:tensorflow:Assets written to: summarizer/assets


In [None]:
# Smoke test: summarize the first two validation articles and show the text.
summarizer.tf_summarize(X_val[:2, :])['text']

In [40]:
# Move the exported SavedModel directory into the Drive models folder.
!mv summarizer /content/drive/MyDrive/text_summarizer_models/