# Pointer Generator Network

In [None]:
# Import local files
import vocabulary as vocab
from example import Example
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt

In [None]:
from vocabulary import UNK
from vocabulary import START
from vocabulary import END
from vocabulary import PAD

In [None]:
# Import modules
import numpy as np
import pandas as pd
import tensorflow as tf

### Load Dataset

In [None]:
dataset = pd.read_excel('Dataset.xlsx', 'Sheet1')

In [None]:
dataset

In [None]:
# Split dataset into input (source) and output (target) sentences
src = np.array(dataset['Src'])
target = np.array(dataset['Target'])

In [None]:
target[:10]

In [None]:
src[:10]

### Preprocess source

In [None]:
# Function used to preprocess src sentences before creating the vocabulary
def preprocess_src(text):
    """Return `text` as a string with surrounding whitespace removed and lowercased."""
    return str(text).strip().lower()

# Normalize every source sentence in place
for position, sentence in enumerate(src):
    src[position] = preprocess_src(sentence)

### Generate Vocabulary

In [None]:
src_vocab = vocab.Vocabulary()
target_vocab = vocab.Vocabulary()

Run either the first cell or the second cell below this cell to load the vocabulary. Do not run them both.

While training this model, the vocabulary was generated from the training set (second cell).

In [None]:
# Load vocabulary using pretrained glove word embedding. Glove word embedding of any dimension can be loaded through this function.
# Use the last parameter load_word_embedding=False to load only the vocabulary from the word embedding file
# and not the word embeddings
vocab.load_from_glove_vector(src_vocab, 'glove.6B.100d.txt', load_word_embedding=True)

In [None]:
# Generate vocabulary using sentences. The structure sentences should be a list of lists
# [ [sentence 1], [sentence 2], [sentence 3]... ]
vocab.generate_vocab_from_text(src_vocab, src, 5000)

Load python vocabulary, the vocabulary of output sentences

In [None]:
# Loads the target vocabulary from an Excel file. After the vocabulary object,
# the arguments are the file name and then the sheet name.
# NOTE(review): the sheet name passed here is identical to the file name —
# confirm the workbook really contains a sheet called 'Python Vocabulary.xlsx'.
vocab.load_vocab_from_excel(
    target_vocab, 'Python Vocabulary.xlsx', 'Python Vocabulary.xlsx')

### Generate examples

In [None]:
# Longest source and target sentences, counted in whitespace-separated tokens
src_max_len = max(len(str(sentence).split()) for sentence in src)
target_max_len = max(len(str(sentence).split()) for sentence in target)

In [None]:
examples = []
for i, j in zip(src, target):
    examples.append(
        Example(i, j, src_vocab, target_vocab, src_max_len, target_max_len))

In [None]:
# Shuffle examples
np.random.shuffle(examples)

#### Convert examples (Only for Attention mechanism)

Convert the examples so that they are compatible with the attention mechanism. The attention mechanism will allow the <unk> tokens to be replaced with words from the input sentence. If you want this to be a simple Seq2Seq model without attention, do not run the cell below.

In [None]:
# Replace every encoder id that falls outside the source vocabulary with UNK
for example in examples:
    encoder_ids = example.enc_input
    for position, token_id in enumerate(encoder_ids):
        if token_id >= src_vocab.vocab_size:
            encoder_ids[position] = src_vocab.word_index[UNK]

# Replace every decoder id that falls outside the target vocabulary with UNK
for example in examples:
    decoder_ids = example.dec_input
    for position, token_id in enumerate(decoder_ids):
        if token_id >= target_vocab.vocab_size:
            decoder_ids[position] = target_vocab.word_index[UNK]

In [None]:
# Train / test split: the last `test_size` percent of the (already shuffled)
# examples are held out for testing
test_size = 0  # percentage of examples reserved for testing
num_of_train_exs = len(examples) - int((test_size / 100) * len(examples))
x_train, x_test = examples[:num_of_train_exs], examples[num_of_train_exs:]

In [None]:
print('Number of training examples:', len(x_train))
print('Number of testing examples:', len(x_test))


### Define variables

In [None]:
src_vocab_size = src_vocab.vocab_size
target_vocab_size = target_vocab.vocab_size
# Get the max number of source oov word for extended vocabulary
max_src_oov_word = max([len(ex.source_oov_words) for ex in examples])

In [None]:
BUFFER_SIZE = len(examples)
BATCH_SIZE = 64
steps_per_epoch = len(examples)//BATCH_SIZE
embedding_dim = 256
units = 1024

### Model

In [None]:
class Encoder(tf.keras.Model):
    """Embeds source token ids and runs them through two stacked bidirectional LSTMs.

    NOTE: attribute names and layer creation order are left untouched because
    saved weights are restored into this model via `load_weights`.
    """

    def __init__(self, enc_units, emb_dims, src_vocab_size):
        # enc_units: number of units of each LSTM direction
        # emb_dims: size of the source word embeddings
        # src_vocab_size: number of words in the source vocabulary
        super(Encoder, self).__init__()

        # Define the two bidirectional LSTM layers; both return the full sequence
        self.lstm1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                enc_units, return_sequences=True, recurrent_initializer='glorot_uniform'))
        self.lstm2 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                enc_units, return_sequences=True, recurrent_initializer='glorot_uniform'))

        # NOTE(review): the table has one row more than src_vocab_size —
        # presumably reserved for an extra id; confirm against the vocabulary module
        self.src_embedding = tf.keras.layers.Embedding(src_vocab_size + 1, emb_dims)

    def call(self, x):
        # x: (batch_size, max_inp_len) np.ndarray of source token ids
        # Returns the activations of the second bidirectional LSTM for every time step

        x = tf.convert_to_tensor(x)

        embeddings = self.src_embedding(x) # (batch size, max_inp_len, emb_dims)

        # Pass through the stacked bidirectional LSTMs. Bidirectional's default
        # merge mode concatenates the forward and backward outputs, so the
        # feature dimension is 2 * enc_units.
        activations = self.lstm1(embeddings) # (batch size, max_inp_len, 2 * enc_units)
        activations = self.lstm2(activations) # (batch size, max_inp_len, 2 * enc_units)

        return activations

In [None]:
class Attention(tf.keras.Model):
    """Additive attention over the encoder activations.

    Scores each encoder time step as V(tanh(W1(activations) + W2(s_prev))),
    softmaxes the scores over the time axis and returns the weighted sum of the
    activations (the context vector) together with the attention weights.
    """

    def __init__(self, units):
        # units: size of the intermediate projection used to compute the scores
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, activations, s_prev):
        # activations (batch size, max_inp_length, enc_units)
        # s_prev (batch size, dec_units)

        # Add a time dimension to s_prev so it broadcasts over the input steps
        s_prev = tf.expand_dims(s_prev, 1) # (batch size, 1, dec_units)
        w1 = self.W1(activations) # (batch size, max_inp_length, units)
        w2 = self.W2(s_prev) # (batch size, 1, units)
        # V projects down to a single score per input position; the score is
        # also kept on the instance as self.score
        self.score = self.V(tf.nn.tanh(w1 + w2)) # (batch size, max_inp_length, 1)

        # Normalize over the input positions (axis 1)
        attention_weights = tf.nn.softmax(self.score, axis=1) # (batch size, max_inp_length, 1)
        context = attention_weights * activations # (batch size, max_inp_length, enc_units)
        context = tf.reduce_sum(context, axis=1) # (batch size, enc_units)

        return context, attention_weights

In [None]:
class PGen(tf.keras.Model):
    """Computes the generation probability p_gen used to mix the vocabulary
    distribution with the copy (attention) distribution.

    The decoder input embedding, the decoder hidden state and the context vector
    are each projected to `units` dimensions, summed, and squashed to a single
    value in (0, 1) by a sigmoid-activated Dense layer.
    """

    def __init__(self, units):
        # units: size of the three intermediate projections
        super(PGen, self).__init__()
        self.Wh = tf.keras.layers.Dense(units)
        self.Ws = tf.keras.layers.Dense(units)
        self.Wx = tf.keras.layers.Dense(units)
        self.pgen = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, x, dec_hidden, context_vec):
        # x: word embedding at time t (batch size, emb_dims)
        # dec_hidden: hidden state of decoder at time t (batch size, dec_units)
        # context_vec: context vector at time t (batch size, enc_units)

        wx = self.Wx(x) # (batch size, units)
        ws = self.Ws(dec_hidden) # (batch size, units)
        wh = self.Wh(context_vec) # (batch size, units)

        # Sum the projections and map them to one probability per example
        pgen = self.pgen(wx + ws + wh) # (batch size, 1)

        return pgen

In [None]:
class Decoder(tf.keras.Model):
    """Single-step attention decoder that also emits the pointer-generator gate.

    Each call consumes one target token per example and returns the new LSTM
    states, the softmaxed vocabulary distribution, the attention weights over
    the source positions and p_gen.

    NOTE: attribute names and layer creation order are left untouched because
    saved weights are restored into this model via `load_weights`.
    """

    def __init__(self, dec_units, emb_dims, target_vocab_size):
        # dec_units: number of LSTM units (also used for the attention and p_gen projections)
        # emb_dims: size of the target word embeddings
        # target_vocab_size: number of words in the target vocabulary
        super(Decoder, self).__init__()
        self.dec_units = dec_units

        # NOTE(review): the table has one row more than target_vocab_size —
        # presumably reserved for an extra id; confirm against the vocabulary module
        self.target_embedding = tf.keras.layers.Embedding(target_vocab_size + 1, emb_dims)
        # Define LSTM layer; return_state=True makes it return its final states too
        self.lstm = tf.keras.layers.LSTM(dec_units, return_state=True, recurrent_initializer='glorot_uniform')

        # Projects the decoder output to unnormalized vocabulary scores
        self.fc = tf.keras.layers.Dense(target_vocab_size)

        self.attention = Attention(dec_units)

        self.pgen = PGen(dec_units)

    def call(self, x, activations, dec_hidden, dec_cell):
        # x: (batch size, 1)
        # activations: (batch size, max_inp_length, enc_units)
        # dec_hidden: (batch size, dec_units)
        # dec_cell: (batch size, dec_units)

        # Get context vector (batch size, enc_units) and attention weights (batch size, max_inp_length, 1)
        context, attention_weights = self.attention(activations, dec_hidden)

        # Get embeddings
        x = self.target_embedding(x) # (batch size, 1, emb_dims)

        # Calculate pgen from the flattened embedding, the previous hidden state and the context
        pgen = self.pgen(tf.reshape(x, [x.shape[0], -1]), dec_hidden, context)

        # concatenate context vector and word embeddings
        # concatenate on last axis
        concat = tf.concat([tf.expand_dims(context, 1), x], axis=-1) # (batch size, 1, enc_units + emb_dims)

        # Pass through the lstm. With return_state=True the layer returns
        # (output, hidden state, cell state); the output is kept as dec_hidden
        # and the separately returned hidden state is discarded.
        dec_hidden, _, dec_cell = self.lstm(concat, initial_state=[dec_hidden, dec_cell])

        # Get the prediction at time t
        predictions = self.fc(dec_hidden) # (batch size, target_vocab_size)

        # Softmax the predictions
        predictions = tf.nn.softmax(predictions, axis=1)

        # Attention weights are flattened to (batch size, max_inp_length) before returning
        return dec_hidden, dec_cell, predictions, tf.reshape(attention_weights, [x.shape[0], -1]), pgen

    def initialize_states(self, batch_size):
        # Zero-filled initial hidden and cell states, each (batch size, dec_units)
        return tf.zeros((batch_size, self.dec_units)), tf.zeros((batch_size, self.dec_units))

In [None]:
# Define encoder and decoder
encoder = Encoder(units, embedding_dim, src_vocab_size)
decoder = Decoder(units, embedding_dim, target_vocab_size)

In [None]:
# Only run this cell if you have saved weights
encoder.load_weights('enc_weights')
decoder.load_weights('dec_weights')

### Training

In [None]:
loss_plot = []

In [None]:
# Scale the attention over non-pad tokens
# enc_pad_mask is a list of lists of 1s and 0s. The 0s represent the padded area of encoder input
def apply_attention_mask(attention_dists, enc_pad_mask):
    """Zero the attention on padded source positions and renormalize each row.

    Args:
        attention_dists: attention weights, (batch size, src_max_len).
        enc_pad_mask: 1 for real tokens and 0 for padding, broadcastable to
            (batch size, src_max_len).

    Returns:
        Tensor of shape (batch size, src_max_len) whose rows sum to 1 over the
        non-padded positions. A row whose mask is all zeros comes back as all
        zeros instead of NaN.
    """
    # Cast explicitly so integer masks multiply cleanly with float attention
    mask = tf.cast(enc_pad_mask, attention_dists.dtype)
    masked_dists = tf.math.multiply(attention_dists, mask)

    # (batch size, 1): total attention mass left on real tokens
    masked_sum = tf.reduce_sum(masked_dists, axis=-1, keepdims=True)

    # divide_no_nan returns 0 where the denominator is 0 (fully padded row)
    return tf.math.divide_no_nan(masked_dists, masked_sum)

In [None]:
# Calculate final distribution using vocabulary distribution and copy distribution
def calculate_final_dist(inp, vocab_dists, copy_dists, enc_pad_mask, pgen, max_src_oov_word, ptr_net=False):
    """Combine the vocabulary distribution with the copy distribution for one time step.

    Args:
        inp: source token ids, (batch size, src_max_len).
        vocab_dists: decoder predictions for this step, (batch size, target_vocab_size).
        copy_dists: attention over the source positions, (batch size, src_max_len).
        enc_pad_mask: encoder padding mask passed on to apply_attention_mask.
        pgen: generation probability, (batch size, 1).
        max_src_oov_word: number of extra slots appended after the source
            vocabulary for out-of-vocabulary source words.
        ptr_net: when False the vocabulary distribution is returned unchanged
            (Seq2Seq with attention).

    Returns:
        (batch size, target_vocab_size) when ptr_net is False, otherwise
        (batch size, target_vocab_size + src_vocab_size + max_src_oov_word).
    """
    if not ptr_net:
        # Final distribution is just the vocabulary distribution
        return vocab_dists

    masked_copy = apply_attention_mask(copy_dists, enc_pad_mask)

    # Weight the two distributions by p_gen and (1 - p_gen)
    weighted_vocab = tf.math.multiply(pgen, vocab_dists)
    weighted_copy = tf.math.multiply((1 - pgen), masked_copy)

    # Project each example's attention onto the extended source vocabulary:
    # the weight of source position k is written at index inp[row, k]
    extended_shape = [src_vocab_size + max_src_oov_word]
    copy_dists_projected = [
        tf.scatter_nd(tf.expand_dims(inp[row, :], 1), weighted_copy[row, :], extended_shape)
        for row in range(inp.shape[0])
    ]

    # Concatenate the vocabulary part and the copy part along the feature axis
    return tf.concat([weighted_vocab, copy_dists_projected], axis=1)

#### Define loss function and optimizer

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

def loss_function(real, pred, dec_pad_mask, eps=1e-10):
    """Masked negative log-likelihood of the target tokens for one time step.

    Args:
        real: target ids for this step, (batch size, 1).
        pred: final distribution from calculate_final_dist,
            (batch size, target_vocab_size + src_vocab_size + max_src_oov_word) or
            (batch size, target_vocab_size).
        dec_pad_mask: (batch size, 1); 0 marks PAD positions whose loss is dropped.
        eps: lower bound applied to the probabilities before the log.

    Returns:
        Scalar tensor: the mean over the batch of the masked loss.
    """
    real = tf.convert_to_tensor(real)

    # Pair every target id with its row number so gather_nd can pick
    # pred[row, target id]. The row index uses real's dtype because tf.stack
    # requires matching dtypes (tf.range alone would be int32).
    row_idx = tf.expand_dims(tf.range(0, real.shape[0], dtype=real.dtype), 1)  # (batch size, 1)
    indices = tf.stack((row_idx, real), 2)  # (batch size, 1, 2)
    correct_probs = tf.gather_nd(pred, indices)  # (batch size, 1)

    # Negative log likelihood of the correct tokens. The floor keeps log(0)
    # from producing inf, which would turn into NaN once multiplied by a zero
    # mask entry and poison the batch mean.
    loss = -tf.math.log(tf.maximum(correct_probs, eps))  # (batch size, 1)

    # Apply dec_pad_mask to exclude the loss associated with PAD tokens
    loss = tf.math.multiply(loss, tf.cast(dec_pad_mask, dtype=tf.float32))  # (batch size, 1)

    # Average over the whole batch (scalar)
    return tf.reduce_mean(loss)

In [None]:
def train_step(inp, targ, enc_pad_mask, dec_pad_mask):
    """Run one teacher-forced training step on a batch and update the weights.

    Args:
        inp: source token ids, np.ndarray (batch size, src_max_len).
        targ: target token ids, np.ndarray (batch size, target_max_len). Ids at
            or above target_vocab_size are used as-is for the loss and mapped
            to UNK only when fed back into the decoder. The array is not modified.
        enc_pad_mask: encoder padding mask, (batch size, src_max_len).
        dec_pad_mask: decoder padding mask, np.ndarray (batch size, target_max_len).

    Returns:
        The summed per-step loss divided by targ.shape[1].
    """
    loss = 0
    batch_size, target_len = targ.shape[0], targ.shape[1]
    unk_index = target_vocab.word_index[UNK]

    with tf.GradientTape() as tape:
        # Get encoder output
        activations = encoder(inp)

        # Initialize decoder states
        dec_hidden, dec_cell = decoder.initialize_states(batch_size)

        # Initial token (START)
        dec_inp = tf.expand_dims([target_vocab.word_index[START]] * batch_size, 1)

        for t in range(1, target_len):
            # Get decoder output
            dec_hidden, dec_cell, predictions, attention_weights, pgen = \
                decoder(dec_inp, activations, dec_hidden, dec_cell)

            # Calculate final distribution (vocabulary distribution + attention over inputs)
            # Set the last argument ptr_net=False if you want to disable the pointer mechanism,
            # in that case the final distribution will be the vocabulary distribution only
            final_dists = calculate_final_dist(
                inp, predictions, attention_weights, enc_pad_mask, pgen, max_src_oov_word, ptr_net=True)

            # The loss is computed against the original (possibly extended) target ids
            step_targets = targ[:, t]
            loss += loss_function(
                tf.expand_dims(step_targets, 1), final_dists, tf.expand_dims(dec_pad_mask[:, t], 1))

            # Teacher forcing: the next decoder input is the current target token.
            # Ids outside the target vocabulary have no embedding row of their
            # own, so they are fed as UNK. np.where builds a new array, leaving
            # the caller's targ untouched.
            next_tokens = np.where(step_targets >= target_vocab_size, unk_index, step_targets)
            dec_inp = tf.expand_dims(next_tokens, 1)

    # Average the summed loss for reporting
    batch_loss = (loss / int(target_len))

    # Gradients are taken outside the tape context so the backward pass itself
    # is not recorded; they are computed w.r.t. the summed loss, as before
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, trainable_variables)

    # Back propagate and update weights
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return batch_loss

In [None]:
EPOCHS = 25

for epoch in range(EPOCHS):
    start = time.time()

    total_loss = 0
    num_batches = 0

    # Shuffle the dataset
    np.random.shuffle(x_train)

    for batch, i in enumerate(range(0, len(x_train), BATCH_SIZE)):
        # Slicing past the end is safe, so the last (possibly smaller) batch
        # needs no special case; the batch width follows BATCH_SIZE
        exs = x_train[i:i + BATCH_SIZE]

        inp = np.array([ex.enc_input for ex in exs])
        targ = np.array([ex.dec_input for ex in exs])
        enc_pad_mask = np.array([ex.enc_pad_mask for ex in exs])
        dec_pad_mask = np.array([ex.dec_pad_mask for ex in exs])

        batch_loss = train_step(inp, targ, enc_pad_mask, dec_pad_mask)

        total_loss += batch_loss
        num_batches += 1

        if batch % 5 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                        batch,
                                                        batch_loss.numpy()))

    # Average over the batches actually processed: this counts the partial last
    # batch and avoids a division by zero when there are fewer than BATCH_SIZE
    # examples
    epoch_loss = total_loss / max(num_batches, 1)

    # Store loss per epoch
    loss_plot.append(epoch_loss)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        epoch_loss))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
# Save model weights
encoder.save_weights('enc_weights', save_format='tf')
decoder.save_weights('dec_weights', save_format='tf')

In [None]:
# Plot training loss
plt.plot(loss_plot)
plt.xlabel('EPOCHS')
plt.ylabel('LOSS')

### Predict

In [None]:
def evaluate(inp):
    """Greedily decode a prediction for one source sentence.

    Args:
        inp: raw source sentence.

    Returns:
        A tuple (output, preprocessed input): the space-separated predicted
        tokens wrapped in '<start>' / '<end>', and the example's
        preprocessed_enc_input.
    """
    # Generate example
    ex = Example(inp, '', src_vocab, target_vocab, src_max_len, 0)
    # Pass the input to the encoder as a batch of one
    inp = np.reshape(ex.enc_input, (1, len(ex.enc_input)))

    activations = encoder(inp)

    dec_inp = tf.expand_dims([ex.dec_input], 0)
    dec_hidden, dec_cell = decoder.initialize_states(1)

    unk_index = target_vocab.word_index[UNK]
    tokens = ['<start>']

    for _ in range(target_max_len):
        dec_hidden, dec_cell, predictions, attention_weights, pgen = decoder(
                dec_inp, activations, dec_hidden, dec_cell)

        final_dist = calculate_final_dist(
            inp, predictions, attention_weights, ex.enc_pad_mask, pgen, len(ex.source_oov_words), ptr_net=True)

        prediction_idx = tf.argmax(final_dist[0]).numpy()

        # The final distribution is laid out as
        # [target vocabulary | source vocabulary | source OOV words]
        if prediction_idx < target_vocab_size:
            word = target_vocab.index_word[prediction_idx]
            tokens.append(word)

            if word == END:
                return ' '.join(tokens).strip(), ex.preprocessed_enc_input

        elif prediction_idx < (src_vocab_size + target_vocab_size):
            word = src_vocab.index_word[prediction_idx - target_vocab_size]
            tokens.append(word)

            if word == END:
                return ' '.join(tokens).strip(), ex.preprocessed_enc_input

        else:
            # Copied out-of-vocabulary source word; it is joined with a space
            # like every other token
            tokens.append(ex.source_oov_words[prediction_idx - src_vocab_size - target_vocab_size])

        # Feed the prediction back as the next decoder input, mirroring
        # train_step: ids outside the target vocabulary are fed as UNK
        next_token = prediction_idx if prediction_idx < target_vocab_size else unk_index
        dec_inp = tf.expand_dims([next_token], 0)  # (1, 1)

    # Length limit reached without predicting END
    tokens.append('<end>')
    return ' '.join(tokens).strip(), ex.preprocessed_enc_input

In [None]:
# Show predictions for up to the first 20 examples; slicing avoids an
# IndexError when fewer than 20 examples are available
for ex in examples[:20]:
    result, sent = evaluate(ex.preprocessed_enc_input)
    print(result)
    print(sent, '\n\n')