<a href="https://colab.research.google.com/github/NiloyPurkait/GSoC-2020/blob/master/V2.0/Transformers/Adversarial_Training(with_error).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Adversarial training script


In [None]:
#! pip install tf-nightly-gpu

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import io
import unicodedata
import re
from re import finditer

## Setup input pipeline

In [None]:

# Mount Google Drive inside the Colab VM so the data files below can be read.
from google.colab import drive
drive.mount('/content/gdrive')

# Training corpus; passed to create_generator_dataset in a later cell.
file_path = "/content/gdrive/My Drive/f_data.txt"
# Test split; defined here but not referenced again in the visible cells.
test_path = "/content/gdrive/My Drive/data/processed_graphs/eng/gat/test_data.txt"

In [None]:
from pretraining import *
from transformer_generator import *
from transformer_discriminator import *

In [None]:

# batch_size is also read as a global by the adversarial training cells below.
batch_size = 16
# Maximum sequence length handed to the dataset builder.
max_len = 40
# create_generator_dataset is presumably provided by the star-imports above
# (verify); it yields (input, target) batches plus the shared tokenizer.
train_dataset, tokenizer_txt = create_generator_dataset(file_path, BATCH_SIZE=batch_size, MAX_LEN=max_len)

## Loss and metrics

In [None]:
def discriminator_loss(real_output, fake_output):
    """Return the discriminator's binary cross-entropy loss.

    Scores for real sequences are compared against a target of all ones and
    scores for generated (fake) sequences against a target of all zeros; the
    two terms are summed. Both arguments are interpreted as logits.

    Args:
        real_output: discriminator scores for real sequences.
        fake_output: discriminator scores for generated sequences.

    Returns:
        Scalar tensor: loss on real scores plus loss on fake scores.
    """
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    loss_on_real = bce(tf.ones_like(real_output), real_output)
    loss_on_fake = bce(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake



def generator_loss(real_output, fake_output):
    """Return the generator's binary cross-entropy loss.

    The generator succeeds when the discriminator scores its sequences as
    real, so the discriminator's scores on generated sequences are compared
    against a target of all ones. Scores are interpreted as logits.

    Args:
        real_output: accepted but not read by this function.
        fake_output: discriminator scores for generated sequences; converted
            to a float32 tensor before the loss is taken.

    Returns:
        Scalar loss tensor.
    """
    fake_logits = tf.convert_to_tensor(fake_output, dtype=tf.float32)
    wanted = tf.ones_like(fake_logits, dtype=tf.float32)
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    return bce(wanted, fake_logits)




## Set hyperparameters

To keep this example small and relatively fast, the values for *num_layers, d_model, and dff* have been reduced. 

The values used in the base model of transformer were; *num_layers=6*, *d_model = 512*, *dff = 2048*. See the [paper](https://arxiv.org/abs/1706.03762) for all the other versions of the transformer.

Note: By changing the values below, you can get the model that achieved state of the art on many tasks.

In [None]:
# Generator (transformer) hyperparameters; all four are passed to
# Transformer(...) when the generator is built.
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

# Two ids beyond the tokenizer vocabulary are reserved: evaluate_ below uses
# vocab_size as the first decoder token and vocab_size + 1 as the stop token.
input_vocab_size = target_vocab_size = tokenizer_txt.vocab_size + 2
 
dropout_rate = 0.1

# Fixed-rate Adam optimizer used for every generator update in this notebook.
generator_optimizer = tf.keras.optimizers.Adam(1e-4)


In [None]:
# NOTE(review): this schedule is created but not handed to any optimizer in the
# visible cells; generator_optimizer was built with a constant 1e-4.
learning_rate = CustomSchedule(d_model)


# Running mean of whatever loss values the training steps feed in.
train_loss = tf.keras.metrics.Mean(name='train_loss')
# Token-level accuracy; only pretrain_step updates it in this notebook.
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [None]:
# Build the generator network from the hyperparameters defined above.
# NOTE(review): pe_input / pe_target are set to the vocabulary size here;
# confirm against Transformer's definition what these arguments are meant to be.
generator = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size,
                          rate=dropout_rate)

In [None]:
def pretrain_loss_function(real, pred):
  """Masked sparse categorical cross-entropy used for generator pretraining.

  Positions where `real` equals 0 are excluded: their per-token loss is
  zeroed and they do not count towards the denominator.

  Args:
    real: target token ids.
    pred: logits produced by the generator for each target position.

  Returns:
    Sum of the unmasked per-token losses divided by the number of unmasked
    positions.
  """
  per_token = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(real, pred)

  # 1.0 where the target id is non-zero, 0.0 where it is 0.
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token.dtype)

  return tf.reduce_sum(per_token * keep) / tf.reduce_sum(keep)

def pretrain_step(inp, tar):
  """Run one supervised (teacher-forced) update of the generator.

  The decoder is fed the target without its last token and is scored against
  the target without its first token. Gradients of the masked cross-entropy
  are applied with generator_optimizer, and the train_loss / train_accuracy
  metrics are updated.

  Args:
    inp: batch of encoded input sequences.
    tar: batch of encoded target sequences.
  """
  decoder_in = tar[:, :-1]
  decoder_target = tar[:, 1:]

  enc_mask, look_ahead_mask, dec_mask = create_masks(inp, decoder_in)

  with tf.GradientTape() as gen_tape:
    logits, _ = generator(inp, decoder_in,
                          True,
                          enc_mask,
                          look_ahead_mask,
                          dec_mask)
    step_loss = pretrain_loss_function(decoder_target, logits)

  # Read the variable list after the forward pass, as the original does.
  grads = gen_tape.gradient(step_loss, generator.trainable_variables)
  generator_optimizer.apply_gradients(
      zip(grads, generator.trainable_variables))

  train_loss(step_loss)
  train_accuracy(decoder_target, logits)



## Pass data through generator to be able to load in weights

In [None]:

# Run a single pretraining step on the first batch only (note the `break`),
# so the generator has been called once before weights are loaded into it.
# NOTE(review): that step also applies one gradient update; presumably the
# load_weights call below replaces the resulting weights -- confirm.
for (inpt, targ) in train_dataset:
  pretrain_step(inpt, targ)
  break
generator.load_weights('./generator_weights.h5')

## Define discriminator and load in weights

In [None]:
# Define the discriminator, give it its own optimizer, and load its weights.
# DATA_MAX_LEN matches the default maxlen (250) of the pad() helper below.
DATA_MAX_LEN = 250
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)
# Vocabulary size is the tokenizer's plus 2, the same as the generator's.
discriminator = TransformerDiscriminator(tokenizer_txt.vocab_size+2, maxlen=DATA_MAX_LEN)
discriminator.load_weights('./discriminator_weights.h5')

## Create the checkpoint path and the checkpoint manager.
 This will be used to save checkpoints every `n` epochs.

In [None]:
# Relative directory where checkpoints are written and looked up.
checkpoint_path = "./content/checkpoints/train"
# Only the generator and its optimizer are tracked; the discriminator and its
# optimizer are not part of this checkpoint.
ckpt = tf.train.Checkpoint(generator=generator,
                           optimizer=generator_optimizer)

# Keeps at most the five most recent checkpoints.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)


In [None]:

# If a checkpoint exists, restore the latest one.
# NOTE(review): this runs after generator.load_weights(...) above, so a found
# checkpoint presumably takes precedence over the .h5 weights -- confirm.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')


## Define helper functions to print and pad predictions

In [None]:
def render_preds(batch_pred, inp, tar, n=2):
    """Print decoded predictions next to their targets and inputs.

    A header line with the types and shapes of `batch_pred` and `inp` is
    printed first. Rows are then printed in order, and the loop stops after
    the row whose index equals `n`, so rows 0..n inclusive are shown.

    Args:
        batch_pred: batch of predicted token-id sequences.
        inp: batch of encoded inputs, indexed in step with `batch_pred`.
        tar: batch of encoded targets, indexed in step with `batch_pred`.
        n: index of the last row to print.
    """
    print(type(batch_pred), type(inp), batch_pred.shape, inp.shape)

    for row, predicted_ids in enumerate(batch_pred):
        print('\n| Predicted: ', decode_text(predicted_ids, tokenizer_txt))
        print('| True: ', decode_text(tar[row], tokenizer_txt))
        print('| Input RDF: ', decode_text(inp[row], tokenizer_txt))
        print()

        if row == n:
            break

def pad(tensor, maxlen=250):
    """Right-pad a batch of sequences with zeros to a common length.

    Thin wrapper around Keras' pad_sequences with padding='post' and a fill
    value of 0. The truncation side is not set here, so the Keras default
    applies to sequences longer than `maxlen`.

    Args:
        tensor: batch of sequences, e.g. a batch of generated token ids.
        maxlen: width of the padded result.

    Returns:
        Whatever pad_sequences returns for these arguments.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    return pad_sequences(tensor, maxlen=maxlen, padding='post', value=0)

## Define function to clean generator output and concatenate sequences during training

In [None]:
## max_len global variable


def gen_batch(preds, inp, tar):
  '''
  Cleans predictions and creates concatenated
  triple-text batches for the discriminator.

  Each predicted sequence is decoded to text, cut at the first '<end>'
  marker, and re-encoded with a single '<end>' appended. The cleaned
  generations and the real targets are each placed to the right of their
  input triple and padded to the width used by pad().

  Takes predictions, input and target batch
  Returns training data for discriminator (all_data, all_labels)
          as well as the generated triple-text batch (gens).
          all_labels is 0 for generated rows and 1 for real rows.
  '''
  # Decode, drop everything after the first '<end>', and re-tokenize.
  gen_data = [
      tokenizer_txt.encode(
          decode_text(sent, tokenizer_txt).split('<end>')[0] + '<end>')
      for sent in preds
  ]

  # Pad the generations only to the longest one in the batch. Padding them
  # to the full width here would make the concatenation below wider than the
  # final width, and the final pad would then cut the row from the front
  # (Keras' default truncation side), discarding the input triple for the
  # generated rows only.
  gen_data = pad(gen_data, maxlen=None)

  # Horizontally stack input triples and generated sequences
  gens = pad(tf.concat([inp, gen_data], axis=-1, name='concat'))

  # Horizontally stack input triples and real target sequences
  real = pad(tf.concat([inp, tar], axis=-1, name='concat'))

  # Vertically stack generated and real sequences. Label counts follow the
  # actual number of rows, so a short final batch stays aligned.
  all_data = tf.concat([gens, real], axis=0)
  all_labels = tf.concat([
      tf.zeros((gens.shape[0], 1)),
      tf.ones((real.shape[0], 1)),
  ], axis=0)

  return all_data, all_labels, gens




# Expose gen_batch, which runs plain Python (decode, re-encode, pad), to
# TensorFlow code by wrapping it in tf.py_function.
@tf.function( experimental_relax_shapes=True)
def tf_gen_batch(preds, inp,  tar):
  """Call gen_batch through tf.py_function.

  Returns the same three values as gen_batch, declared as int32
  (discriminator inputs), float32 (labels) and int32 (generated rows).
  """
  disc_inputs, disc_labels, generated = tf.py_function(
      func=gen_batch,
      inp=[preds, inp, tar],
      Tout=[tf.int32, tf.float32, tf.int32])
  return disc_inputs, disc_labels, generated




## Define training step

In [None]:
##
## Nested gradient tapes
##


def train_step(inp, tar):
    """Run one adversarial update with the discriminator tape nested in the generator tape.

    The generator produces teacher-forced predictions, which gen_batch turns
    into discriminator inputs. The discriminator is updated on real and
    generated rows against noisy 0/1 labels. The generator is then updated
    to push the discriminator's scores on generated rows towards 1.
    Both losses are fed into the shared train_loss metric.

    Args:
        inp: batch of encoded input triples.
        tar: batch of encoded target texts.

    NOTE(review): the generated rows are built from tf.argmax followed by a
    decode / re-encode round trip, i.e. from integer ids. No gradient is
    expected to flow from the discriminator score back to the generator
    weights along that path, so gen_grads is likely all None. A
    differentiable relaxation or a policy-gradient estimator would be
    needed -- confirm.
    """
    # Teacher forcing: the decoder sees the target without its last token.
    tar_inp = tar[:, :-1]

    # Get encoding, combined and decoding masks
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    # Same logits-based cross-entropy the loss helpers in this file use.
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    # Initialize generator gradient tape
    with tf.GradientTape() as gen_tape:

        # Get predictions from generator
        predictions, _ = generator(inp, tar_inp,
                                   True,
                                   enc_padding_mask,
                                   combined_mask,
                                   dec_padding_mask)

        # Most likely token id per position
        batch_pred = tf.argmax(predictions, axis=-1)

        # Build discriminator training data: cleaned generations plus real rows
        disc_inputs, labels, gens = tf_gen_batch(batch_pred, inp, tar)

        # Add noise to labels
        labels += 0.05 * tf.random.uniform(tf.shape(labels))

        # Discriminator's scores for the generated rows only
        disc_preds = discriminator(gens)
        disc_preds = tf.convert_to_tensor(disc_preds, dtype=tf.float32)

        # The generator loss must be computed while gen_tape is recording;
        # computed after the `with` block it is not on the tape at all.
        g_loss = generator_loss(tf.ones_like(disc_preds), disc_preds)

        # Initialize discriminator gradient tape
        with tf.GradientTape() as disc_tape:

            # Score real and generated rows together
            disc_scores = discriminator(disc_inputs)
            # Compare scores with the noisy labels. discriminator_loss takes
            # (real scores, fake scores), not (labels, scores), so the
            # cross-entropy is applied directly here.
            d_loss = bce(labels, disc_scores)

        # Get discriminator gradients and apply using optimizer
        disc_grads = disc_tape.gradient(d_loss, discriminator.trainable_weights)
        discriminator_optimizer.apply_gradients(zip(disc_grads, discriminator.trainable_weights))

    # Get generator gradients and apply using optimizer
    gen_grads = gen_tape.gradient(g_loss, generator.trainable_weights)
    generator_optimizer.apply_gradients(zip(gen_grads, generator.trainable_weights))

    # Track generator loss
    train_loss(g_loss)
    # Track discriminator loss (same metric, so the reported value mixes both)
    train_loss(d_loss)


In [None]:
# Number of passes over train_dataset performed by train().
EPOCHS = 10

## Define training function

In [None]:
def train():
  """Run the training loop for EPOCHS epochs.

  Each epoch resets the train_loss and train_accuracy metrics and calls
  whichever train_step is currently defined on every batch of train_dataset.
  Progress is printed every 50 batches and at the end of each epoch, and a
  checkpoint is saved after every 5th epoch.

  The printed loss is the running mean of everything train_step feeds into
  train_loss. The adversarial train_step definitions in this notebook do not
  update train_accuracy.
  """
  for epoch_no in range(1, EPOCHS + 1):
    started_at = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    for batch, (inp, tar) in enumerate(train_dataset):
      train_step(inp, tar)

      # Report only on every 50th batch (including batch 0).
      if batch % 50:
        continue
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch_no, batch, train_loss.result(), train_accuracy.result()))

    if epoch_no % 5 == 0:
      saved_to = ckpt_manager.save()
      print ('Saving checkpoint for epoch {} at {}'.format(epoch_no,
                                                          saved_to))

    print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch_no,
                                                  train_loss.result(),
                                                  train_accuracy.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - started_at))

In [None]:
# Train using the train_step defined in the preceding cells (nested tapes).
train()

In [None]:
##
## Sequential gradient tapes
##

def train_step(inp, tar):
    """Run one adversarial update using two separate (sequential) gradient tapes.

    The generator forward pass and the generator loss are recorded on one
    tape. The discriminator is then updated on real and generated rows
    against noisy 0/1 labels using a second tape. Finally the generator
    gradients are applied. Both losses are fed into the shared train_loss
    metric.

    Args:
        inp: batch of encoded input triples.
        tar: batch of encoded target texts.

    NOTE(review): the generated rows are built from tf.argmax followed by a
    decode / re-encode round trip, i.e. from integer ids. No gradient is
    expected to flow from the discriminator score back to the generator
    weights along that path, so the printed generator gradients are likely
    all None -- confirm.
    """
    # Teacher forcing: the decoder sees the target without its last token.
    tar_inp = tar[:, :-1]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    # Same logits-based cross-entropy the loss helpers in this file use.
    bce = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    # Record the generator forward pass and its loss. A tape only sees
    # operations executed inside its context, so both have to happen here.
    with tf.GradientTape() as gen_tape:
        predictions, _ = generator(inp, tar_inp,
                                   True,
                                   enc_padding_mask,
                                   combined_mask,
                                   dec_padding_mask)

        batch_pred = tf.argmax(predictions, axis=-1)

        disc_inputs, labels, gens = gen_batch(batch_pred, inp, tar)

        # Discriminator's scores for the generated rows only. These, not its
        # scores on the mixed real + generated batch, drive the generator.
        disc_preds = discriminator(gens)
        disc_preds = tf.convert_to_tensor(disc_preds, dtype=tf.float32)

        g_loss = generator_loss(tf.ones_like(disc_preds), disc_preds)

    # Add noise to labels
    labels += 0.05 * tf.random.uniform(tf.shape(labels))

    # Train the discriminator
    with tf.GradientTape() as disc_tape:
        disc_scores = discriminator(disc_inputs)
        # discriminator_loss takes (real scores, fake scores), not
        # (labels, scores), so the cross-entropy is applied directly here.
        d_loss = bce(labels, disc_scores)

    disc_grads = disc_tape.gradient(d_loss, discriminator.trainable_weights)
    discriminator_optimizer.apply_gradients(
        zip(disc_grads, discriminator.trainable_weights)
    )

    # Train the generator (note that we should *not* update the weights
    # of the discriminator here).
    gen_grads = gen_tape.gradient(g_loss, generator.trainable_weights)
    # Debug output kept from the original experiment.
    print(gen_grads)

    generator_optimizer.apply_gradients(zip(gen_grads, generator.trainable_weights))
    train_loss(g_loss)
    train_loss(d_loss)


In [None]:
# Train again; train() now picks up the sequential-tape train_step, which
# replaced the earlier definition of the same name.
train()

In [None]:
# Writes to the same path the generator weights were loaded from earlier in
# this notebook.
generator.save_weights('./generator_weights.h5')

## Evaluate

In [None]:
def evaluate_(inp_sentence):
  """Greedy-decode one encoded input sequence with the generator.

  Decoding starts from the start id (tokenizer_txt.vocab_size) and appends
  the highest-scoring token one step at a time. It stops when the stop id
  (tokenizer_txt.vocab_size + 1) is predicted or after MAX_LENGTH steps.

  Args:
    inp_sentence: a single encoded input sequence (no batch dimension).

  Returns:
    1-D tensor of token ids beginning with the start id; the stop id is not
    included.
  """
  start_id = tokenizer_txt.vocab_size
  stop_id = tokenizer_txt.vocab_size + 1

  # Add a batch dimension of 1 to both encoder and decoder inputs.
  encoder_input = tf.expand_dims(inp_sentence, 0)
  output = tf.expand_dims([start_id], 0)

  for _ in range(MAX_LENGTH):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        encoder_input, output)

    # Use the trained model of this notebook, which is named `generator`.
    # predictions.shape == (batch_size, seq_len, vocab_size)
    predictions, _ = generator(encoder_input,
                               output,
                               False,
                               enc_padding_mask,
                               combined_mask,
                               dec_padding_mask)

    # select the last word from the seq_len dimension
    predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

    # return the result if the predicted_id is equal to the end token
    if predicted_id == stop_id:
      return tf.squeeze(output, axis=0)

    # concatenate the predicted_id to the output which is given to the
    # decoder as its input.
    output = tf.concat([output, predicted_id], axis=-1)

  return tf.squeeze(output, axis=0)

In [None]:
# Upper bound on decoding steps, read as a global by evaluate_.
MAX_LENGTH=250
# Take the first batch of the training set as sample inputs / targets.
rdfb, txtb = next(iter(train_dataset))

In [None]:
# Decode the first input of the sampled batch.
predicted_sentence = evaluate_(rdfb[0])

In [None]:
# Turn the predicted token ids back into text (shown as the cell's output).
decode_text(predicted_sentence, tokenizer_txt)