<a href="https://colab.research.google.com/github/NiloyPurkait/GSoC-2020/blob/master/RDF_GAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import random
import pickle
import math
import os
import re
import unicodedata
from functools import reduce
import numpy as np
import sentencepiece as spm
from tqdm import tqdm
import time

import tensorflow as tf
from tensorflow.keras import backend as K

import logging


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL
logging.getLogger('tensorflow').setLevel(logging.FATAL)

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:

#tiny number
_NEG_INF = -1e8

# Preprocessing Helper functions 


In [0]:



def _tensorize(vocab, text):
    """
    Function to convert texts into number sequences first, and then
    add padding. Basically, tensorising them.
    :param vocab: The vocab which is used to lookup ids
    :type vocab: tf.tokenizer obj
    :param text: A list of sentences or a text file
    :type text: list
    :return: tensorised text data
    :rtype: tf.tensor
    """
    tensor = vocab.texts_to_sequences(text)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,
                                                           padding='post')

    return tensor


In [0]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [0]:

def padding(tensor, max_length):
    """
    Pads the given tensor to a maximum sequence length along
    axis 1.
    for ex -
    let the tensor be [1,2,3,4] if th given max_length is 5
    the tensor becomes [1,2,34,0]
    Mostly used to pad the target sentences of the multilingual
    model and the node_list of all models,

    :param tensor:A tf tensor
    :type tensor:tf.tensor
    :param max_length:Dimension along axis 1, of the new tensor
    :type max_length:int
    :return:The padded tensor
    :rtype:tf tensor.
    """

    padding = tf.constant([[0, 0], [0, max_length - tensor.shape[1]]])
    padded_tensor = tf.pad(tensor, padding, mode='CONSTANT')

    return padded_tensor

In [0]:
# from src.DataLoader imports

## Dataset Loading Functions

In [0]:
def LoadGatDataset(train_path, eval_path, test_path, srv_vocab,
                   tgt_vocab, lang, num_examples=None):
    train_ = {}
    eval_ = {}
    test_ = {}


    # load the train and eval datasets
    with open(train_path, 'rb') as f:
        train_set = pickle.load(f)
    with open(eval_path, 'rb') as f:
        eval_set = pickle.load(f)
    with open(test_path, 'rb') as f:
        test_set = pickle.load(f)
    with open(srv_vocab, 'rb') as f:
        src_vocab = pickle.load(f)

    train_input, train_tgt = zip(*train_set)
    eval_input, eval_tgt = zip(*eval_set)
    (train_nodes, train_labels, train_node1, train_node2) = zip(*train_input)
    (eval_nodes, eval_labels, eval_node1, eval_node2) = zip(*eval_input)
    (test_nodes, test_labels, test_node1, test_node2) = zip(*test_set)

    train_["train_node_tensor"] = _tensorize(src_vocab, train_nodes)
    train_["train_label_tensor"] = _tensorize(src_vocab, train_labels)
    train_["train_node1_tensor"] = _tensorize(src_vocab, train_node1)
    train_["train_node2_tensor"] = _tensorize(src_vocab, train_node2)

    eval_["eval_node_tensor"] = _tensorize(src_vocab, eval_nodes)
    eval_["eval_label_tensor"] = _tensorize(src_vocab, eval_labels)
    eval_["eval_node1_tensor"] = _tensorize(src_vocab, eval_node1)
    eval_["eval_node2_tensor"] = _tensorize(src_vocab, eval_node2)

    test_["test_node_tensor"] = _tensorize(src_vocab, test_nodes)
    test_["test_label_tensor"] = _tensorize(src_vocab, test_labels)
    test_["test_node1_tensor"] = _tensorize(src_vocab, test_node1)
    test_["test_node2_tensor"] = _tensorize(src_vocab, test_node1)


    train_tgt_tensor = src_vocab.texts_to_sequences(train_tgt)
    train_["train_tgt_tensor"] = tf.keras.preprocessing.sequence.pad_sequences(train_tgt_tensor, padding='post')
    eval_tgt_tensor = src_vocab.texts_to_sequences(eval_tgt)
    eval_["eval_tgt_tensor"] = tf.keras.preprocessing.sequence.pad_sequences(eval_tgt_tensor, padding='post')
    target_vocab = src_vocab

    return (train_, eval_, test_, src_vocab, target_vocab, max_length(train_tgt_tensor))

In [0]:
def GetGATDataset(train_path, eval_path,
                  test_path, src_vocab,
                  tgt_vocab, lang,
                  set=None):
  

    (train, eval, test, src_vocab, tgt_vocab, max_length_targ) = LoadGatDataset(train_path,
                                                                                eval_path,
                                                                                test_path, src_vocab,
                                                                                tgt_vocab, lang)
    node_tensor = padding(train["train_node_tensor"], 16)
    label_tensor = padding(train["train_label_tensor"], 16)
    node1_tensor = padding(train["train_node1_tensor"], 16)
    node2_tensor = padding(train["train_node2_tensor"], 16)

    eval_nodes = padding(eval["eval_node_tensor"], 16)
    eval_labels = padding(eval["eval_label_tensor"], 16)
    eval_node1 = padding(eval["eval_node1_tensor"], 16)
    eval_node2 = padding(eval["eval_node2_tensor"], 16)

    test_nodes = padding(test["test_node_tensor"], 16)
    test_labels = padding(test["test_label_tensor"], 16)
    test_node1 = padding(test["test_node1_tensor"], 16)
    test_node2 = padding(test["test_node2_tensor"], 16)

    print('\nTrain Tensor shapes (nodes, labels, node1, node2, target) : ')
    print(node_tensor.shape, label_tensor.shape, node1_tensor.shape, node2_tensor.shape, train["train_tgt_tensor"].shape)
    print('\nEval Tensor shapes (nodes, labes, node1, node2) : ')
    print(eval_nodes.shape, eval_labels.shape, eval_node1.shape, eval_node2.shape, eval["eval_tgt_tensor"].shape)
    print('\nTest Tensor shapes (nodes, labes, node1, node2) : ')
    print(test_nodes.shape, test_labels.shape, test_node1.shape, test_node2.shape)

    TRAIN_BUFFER_SIZE = len(train["train_tgt_tensor"])
    EVAL_BUFFER_SIZE = len(eval["eval_tgt_tensor"])
    BATCH_SIZE = batch_size
    steps_per_epoch = len(train["train_tgt_tensor"]) // BATCH_SIZE
    src_vocab_size = len(src_vocab.word_index) + 1

    tgt_vocab_size = len(tgt_vocab.word_index) + 1

    dataset_size = train["train_tgt_tensor"].shape[0]

    dataset = tf.data.Dataset.from_tensor_slices((node_tensor, label_tensor,
                                                  node1_tensor, node2_tensor, train["train_tgt_tensor"])).shuffle(TRAIN_BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

    eval_set = tf.data.Dataset.from_tensor_slices((eval_nodes, eval_labels,
                                                   eval_node1, eval_node2, eval["eval_tgt_tensor"])).shuffle(EVAL_BUFFER_SIZE)
    eval_set = eval_set.batch(BATCH_SIZE, drop_remainder=True)

    test_set = tf.data.Dataset.from_tensor_slices((test_nodes, test_labels,
                                                   test_node1, test_node2))
    test_set = test_set.batch(BATCH_SIZE, drop_remainder=True)

    if set == None:
        return (dataset, eval_set, test_set, TRAIN_BUFFER_SIZE, BATCH_SIZE, steps_per_epoch,
                src_vocab_size, src_vocab, tgt_vocab_size, tgt_vocab,
                max_length_targ, dataset_size)
    elif set == 'test':
        return (test_set, TRAIN_BUFFER_SIZE, BATCH_SIZE, steps_per_epoch,
                src_vocab_size, src_vocab, tgt_vocab_size, tgt_vocab)


  



## Load processed Dataset

In [0]:




train_path = '/content/gdrive/My Drive/data/reif_train'
eval_path = '/content/gdrive/My Drive/data/reif_eval'
test_path = '/content/gdrive/My Drive/data/reif_test'
src_vocab = '/content/gdrive/My Drive/data/reif_src_vocab'
tgt_vocab =  '/content/gdrive/My Drive/data/train_vocab.model'


lang = 'eng'
batch_size = 2

In [11]:
(dataset, eval_set, test_set, BUFFER_SIZE, BATCH_SIZE, steps_per_epoch,
     src_vocab_size, src_vocab, tgt_vocab_size, tgt_vocab, max_length_targ, dataset_size) = GetGATDataset(train_path, eval_path,
                                                                                                     test_path, src_vocab,
                                                                                                      tgt_vocab, lang)


Train Tensor shapes (nodes, labels, node1, node2, target) : 
(34352, 16) (34352, 16) (34352, 16) (34352, 16) (34352, 82)

Eval Tensor shapes (nodes, labes, node1, node2) : 
(4316, 16) (4316, 16) (4316, 16) (4316, 16) (4316, 70)

Test Tensor shapes (nodes, labes, node1, node2) : 
(4224, 16) (4224, 16) (4224, 16) (4224, 16)


## Probe the loaded dataset

In [12]:
tgt_vocab_size, src_vocab_size

(10248, 10248)

In [13]:
 dataset_size, BUFFER_SIZE, BATCH_SIZE, steps_per_epoch

(34352, 34352, 2, 17176)

## Make example input and target batches

In [0]:
#Example inputs and target batch for testing

nodes_, labels_, node1_, node2_, target_ = next(iter(dataset))

In [15]:
 def convert_to_string(target_, n):
    '''
    args:
      target_ : target tensor batch
      n = nth element in batch 

    usage:
      Takes in a tensor of (batch_size, vocab_size) and
      converts a batch instance into its string equivalent
    '''
    sequence = [tf.reduce_sum(i).numpy() for i in target_[n]]
    text = ' '.join([src_vocab.index_word[i] if i!=0 else '<pad>' for i in sequence])
    print(text)

convert_to_string(target_, 1)

<start> banyumasan people , one of the ethnic groups found in java , in the region of malaysia eat ayam penyet . <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


# Implement Encoder Module Layers:
To assemble the encoder modules, we must first define the following layers:
- Shared Embedding Layer
- Graph Attention Layer
- Feed Forward Layer


### Define Embedding Layer

In [0]:
# Embedding layer

class EmbeddingSharedWeights(tf.keras.layers.Layer):
    """Calculates input embeddings and pre-softmax linear with shared weights."""

    def __init__(self, vocab_size, hidden_size):
        """Specify characteristic parameters of embedding layer.

        Args:
          vocab_size: Number of tokens in the embedding. (Typically ~32,000)
          hidden_size: Dimensionality of the embedding. (Typically 512 or 1024)
        """
        super(EmbeddingSharedWeights, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

    def build(self, input_shape):
        """Build embedding layer."""
        with tf.name_scope("embedding_and_softmax"):
            # Create and initialize weights. The random normal initializer was chosen
            # arbitrarily, and works well.
            self.shared_weights = self.add_weight(
                "weights",
                shape=[self.vocab_size, self.hidden_size],
                dtype="float32",
                initializer=tf.random_normal_initializer(
                    mean=0., stddev=self.hidden_size ** -0.5))
        super(EmbeddingSharedWeights, self).build(input_shape)

    def get_config(self):
        return {
            "vocab_size": self.vocab_size,
            "hidden_size": self.hidden_size,
        }

    def call(self, inputs, mode="embedding"):
        """Get token embeddings of inputs.

        Args:
          inputs: An int64 tensor with shape [batch_size, length]
          mode: string, a valid value is one of "embedding" and "linear".
        Returns:
          outputs: (1) If mode == "embedding", output embedding tensor, float32 with
            shape [batch_size, length, embedding_size]; (2) mode == "linear", output
            linear tensor, float32 with shape [batch_size, length, vocab_size].
        Raises:
          ValueError: if mode is not valid.
        """
        if mode == "embedding":
            return self._embedding(inputs)
        elif mode == "linear":
            return self._linear(inputs)
        else:
            raise ValueError("mode {} is not valid.".format(mode))

    def _embedding(self, inputs):
        """Applies embedding based on inputs tensor."""
        with tf.name_scope("embedding"):
            # Create binary mask of size [batch_size, length]
            mask = tf.cast(tf.not_equal(inputs, 0), tf.float32)
            embeddings = tf.gather(self.shared_weights, inputs)
            embeddings *= tf.expand_dims(mask, -1)
            # Scale embedding by the sqrt of the hidden size
            embeddings *= self.hidden_size ** 0.5

            return embeddings

    def _linear(self, inputs):
        """Computes logits by running inputs through a linear layer.

        Args:
          inputs: A float32 tensor with shape [batch_size, length, hidden_size]
        Returns:
          float32 tensor with shape [batch_size, length, vocab_size].
        """
        with tf.name_scope("presoftmax_linear"):
            batch_size = tf.shape(inputs)[0]
            length = tf.shape(inputs)[1]

            x = tf.reshape(inputs, [-1, self.hidden_size])
            logits = tf.matmul(x, self.shared_weights, transpose_b=True)

            return tf.reshape(logits, [batch_size, length, self.vocab_size])



Define Graph Attention Layer


In [0]:
# Encoder layers = Embedding shared weights + GA Layer + FFN Layer


class GraphAttentionLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, num_heads, reg_scale=0.001, rate=0.1):
        """
        Graph Attention Network Layer, takes input and returns embedded
        node features with self attention applied on the feature matrix
        """
        super(GraphAttentionLayer, self).__init__()
        self.in_dim = d_model
        self.out_dim = dff
        self.num_heads = num_heads
        self.dropout_rate = rate
        self.kernels = []
        self.biases = []
        self.attn_kernels = []

        self.lrelu = tf.keras.layers.LeakyReLU()
        self.dropout = tf.keras.layers.Dropout(rate)
        self.reg = tf.keras.regularizers.l2(l=reg_scale)

        for head in range(self.num_heads):
            kernel = self.add_weight(shape=(self.in_dim, self.out_dim),
                                     initializer='glorot_uniform',
                                     regularizer=self.reg,
                                     name='kernel_{}'.format(head))
            bias = self.add_weight(shape=(self.out_dim,),
                                   initializer='glorot_uniform',
                                   regularizer=self.reg,
                                   name='bias_{}'.format(head))
            self.kernels.append([kernel, bias])
            # Attention kernels
            attn_kernel_self = self.add_weight(shape=(self.out_dim, 1),
                                               initializer='glorot_uniform',
                                               regularizer=self.reg,
                                               name='attn_kernel_self_{}'.format(head))
            attn_kernel_neighs = self.add_weight(shape=(self.out_dim, 1),
                                                 initializer='glorot_uniform',
                                                 regularizer=self.reg,
                                                 name='attn_kernel_neigh_{}'.format(head))
            self.attn_kernels.append([attn_kernel_self, attn_kernel_neighs])

    def call(self, nodes):
        inputs = nodes

        outputs = []
        for head in range(self.num_heads):
            kernel = self.kernels[head]
            attention_kernel = self.attn_kernels[head]
            features = tf.keras.backend.dot(inputs, kernel[0])
            features = tf.add(features, kernel[1])
            attn_for_self = tf.keras.backend.dot(features, attention_kernel[0])
            attn_for_neighs = tf.keras.backend.dot(features, attention_kernel[1])
            # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
            dense = tf.matmul(attn_for_self, attn_for_neighs, transpose_b=True)
            dense = self.lrelu(dense)

            # Mask values before activation (Vaswani et al., 2017)
            # mask_local = -10e9 * (1.0 - adj)
            # dense += mask_local

            # Apply softmax to get attention coefficients
            dense = tf.math.softmax(dense)  # (N x N)

            # Apply dropout to features and attention coefficients
            if self.trainable:
                dense = self.dropout(dense)  # (N x N)
                features = self.dropout(features)  # (N x F')

            # Linear combination with neighbors' features
            node_features = tf.matmul(dense, features)  # (N x F')
            outputs.append(node_features)

        output = tf.reduce_mean(tf.stack(outputs), axis=0)  # N x F')
        output = tf.nn.relu(output)

        return output



### Define Feed-Forward Layer

In [0]:
# Feed forward layer

class FeedForwardNetwork(tf.keras.layers.Layer):
    """Fully connected feedforward network."""

    def __init__(self, hidden_size, filter_size, relu_dropout):
        """Initialize FeedForwardNetwork.

        Args:
          hidden_size: int, output dim of hidden layer.
          filter_size: int, filter size for the inner (first) dense layer.
          relu_dropout: float, dropout rate for training.
        """
        super(FeedForwardNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.filter_size = filter_size
        self.relu_dropout = relu_dropout

    def build(self, input_shape):
        self.filter_dense_layer = tf.keras.layers.Dense(
            self.filter_size,
            use_bias=True,
            activation=tf.nn.relu,
            name="filter_layer")
        self.output_dense_layer = tf.keras.layers.Dense(
            self.hidden_size, use_bias=True, name="output_layer")
        super(FeedForwardNetwork, self).build(input_shape)

    def get_config(self):
        return {
            "hidden_size": self.hidden_size,
            "filter_size": self.filter_size,
            "relu_dropout": self.relu_dropout,
        }

    def call(self, x, training):
        """Return outputs of the feedforward network.

        Args:
          x: tensor with shape [batch_size, length, hidden_size]
          training: boolean, whether in training mode or not.

        Returns:
          Output of the feedforward network.
          tensor with shape [batch_size, length, hidden_size]
        """
        # Retrieve dynamically known shapes
        batch_size = tf.shape(x)[0]
        length = tf.shape(x)[1]

        output = self.filter_dense_layer(x)
        if training:
            output = tf.nn.dropout(output, rate=self.relu_dropout)
        output = self.output_dense_layer(output)

        return output



## Assemble Graph Encoder module
We put together the encoder module using the defined layers above

In [0]:
# Graph Encoder Layer

class GraphEncoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, trainable, dff,
                 filter_size, reg_scale=0.001, rate=0.1):

        super(GraphEncoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.node_role_layer = tf.keras.layers.Dense(self.d_model, input_shape=(2 * d_model,))
        self.enc_layers = []
        for _ in range(num_layers):
            gat_layer = GraphAttentionLayer(d_model, dff, num_heads,
                                            reg_scale=reg_scale, rate=rate)
            ffn_layer = FeedForwardNetwork(dff, filter_size, rate)
            self.enc_layers.append([gat_layer, ffn_layer])

        self.dropout = tf.keras.layers.Dropout(rate)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.edge_layer = tf.keras.layers.Dense(self.d_model)
        self.trainable = trainable

    def call(self, node_tensor, label_tensor, node1_tensor, node2_tensor):
        # adding embedding and position encoding.

        edge_tensor = tf.concat([node1_tensor, node2_tensor], 2)
        edge_tensor = tf.cast(self.node_role_layer(edge_tensor), dtype=tf.float32)

        node_tensor *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        edge_tensor *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        edges = self.edge_layer(tf.add(edge_tensor, label_tensor))


        for i, layer in enumerate(self.enc_layers):
            if i == 0:
                x = self.enc_layers[i][0](node_tensor)
                x = self.enc_layers[i][1](x, self.trainable)
                x += edges 
            else:
                shortcut = x
                x = self.enc_layers[i][0](x)
                x = self.enc_layers[i][1](x, self.trainable)
                x += edges
                x += shortcut

        return self.layernorm(x)

# Implement RNN Decoder Module Layers
To assemble the Decoder module, we will use the following layers:
- Embedding Layer
- Bidirectional GRU
- Bahanadu Attention Layer

In [0]:
# RNN Decoder =  embedding + birirect_GRU + BahanaduAttention

### Define Bahdanau Attention Layer

In [0]:
# Bahanadu Attention layer

class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, hidden_size)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

### Assemble RNN decoder Module
We combine all the decoder layers under one class

In [0]:
# RNN Decoder
class RNNDecoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(RNNDecoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.forward_gru = tf.keras.layers.GRU(self.dec_units,
                                                    return_sequences=True,
                                                    return_state=True,
                                                    go_backwards=False,
                                                    recurrent_initializer='glorot_uniform')
        self.backward_gru = tf.keras.layers.GRU(self.dec_units,
                                                     return_sequences=True,
                                                     return_state=True,
                                                     go_backwards=True,
                                                     recurrent_initializer='glorot_uniform')
        self.gru = tf.keras.layers.Bidirectional(self.forward_gru, backward_layer=self.backward_gru,
                                                 merge_mode='ave')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)
        self.layernorm = tf.keras.layers.LayerNormalization()

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1) , x], axis=-1) 

        # passing the concatenated vector to the GRU
        output = self.gru(x)
        output, state = output[0], output[2]
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        output = self.layernorm(output)

        # output shape == (batch_size, vocab)
        x = tf.nn.softmax(self.fc(output))

        return x, state, attention_weights



# Implement GAT Model as Generator
We put together the encoder and decoder modules from above to construct the Graph Attention model:

GAT model gets input tensors (nodes, labels, node1, node2) and state (generated sequence uptil time t), and will output the probabilits of next token in sequence, as well as the decoder's hidden state.

The Encoder recieves the input tensors and uses the Graph Attention module to encode them. The Decoder will then be fed the current state of the generated sequence, along with the encoder hidden state and the encoder output the next token in the sequence, the decoder rnn state, and the attention weights. 

In [0]:
class GATModel(tf.keras.Model):
    """
    Model that uses Graph Attention encoder and RNN decoder (for now)
    """

    def __init__(self, 
                 enc_layers, enc_units,  emb_dim, num_heads,
                 hidden_size, filter_size, batch_size,  reg_scale,
                 dropout, src_vocab_size, tgt_vocab_size,
                 target_lang):
      
        super(GATModel, self).__init__()
        self.emb_layer = EmbeddingSharedWeights(
            src_vocab_size, emb_dim)

        self.tgt_emb_layer = EmbeddingSharedWeights(
            tgt_vocab_size, emb_dim)

        self.encoder = GraphEncoder(enc_layers, emb_dim, num_heads, True,  hidden_size, # trainable=True
                                    filter_size, reg_scale=reg_scale, rate=dropout)
        self.decoder = RNNDecoder(tgt_vocab_size, emb_dim, enc_units, batch_size)
        self.vocab_tgt_size = tgt_vocab_size
        self.batch_size=batch_size
        self.num_heads = num_heads
        self.target_lang = target_lang
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
        self.hidden = tf.keras.layers.Dense(hidden_size)


    def __call__(self, nodes, labels, node1, node2, state):
        """
        Puts the tensors through encoders and decoders
        :param adj: Adjacency matrices of input example
        :type adj: tf.tensor
        :param nodes: node features
        :type nodes: tf.tensor
        :param targ: target sequences
        :type targ: tf.tensor
        :return: output probability distribution
        :rtype: tf.tensor
        """
        node_tensor = tf.cast(self.emb_layer(nodes), dtype=tf.float32)
        label_tensor = tf.cast(self.emb_layer(labels), dtype=tf.float32)
        node1_tensor = tf.cast(self.emb_layer(node1), dtype=tf.float32)
        node2_tensor = tf.cast(self.emb_layer(node2), dtype=tf.float32)
        #reward = tf.cast(self.emb_layer(reward), dtype=tf.float32)

        enc_output = self.encoder(node_tensor, label_tensor, node1_tensor, node2_tensor) # self.num_heads, self.encoder.trainable
        batch = enc_output.shape[0]
        self.enc_output_hidden = tf.reshape(enc_output, shape=[batch, -1])
        enc_hidden = self.hidden(self.enc_output_hidden)


        predictions, dec_hidden, _ = self.decoder(state, enc_hidden, enc_output)

        return predictions, dec_hidden

# Implement LSTM model as Discriminator

In [0]:
from tensorflow.keras.layers import Input, Lambda,  Concatenate, LSTM
 
from tensorflow.keras.layers import Dense, Embedding,Activation, Dropout
from tensorflow.keras.models import Model

def Discriminator(V, E, H=64, dropout=0.1):
    '''
    Disciriminator model.
    # Arguments:
        V: int, Vocabrary size
        E: int, Embedding size
        H: int, LSTM hidden size
        dropout: float
    # Returns:
        discriminator: keras model
            input: word ids, shape = (B, T)
            output: probability of true data or not, shape = (B, 1)
    '''

    def Highway(x, num_layers=1, activation='relu', name_prefix=''):

      '''
      Layer wrapper function for Highway network
      # Arguments:
          x: tensor, shape = (B, input_size)
      # Optional Arguments:
          num_layers: int, dafault is 1, the number of Highway network layers
          activation: keras activation, default is 'relu'
          name_prefix: str, default is '', layer name prefix
      # Returns:
          out: tensor, shape = (B, input_size)
      '''
      input_size = K.int_shape(x)[1]
      for i in range(num_layers):
          gate_ratio_name = '{}Highway/Gate_ratio_{}'.format(name_prefix, i)
          fc_name = '{}Highway/FC_{}'.format(name_prefix, i)
          gate_name = '{}Highway/Gate_{}'.format(name_prefix, i)

          gate_ratio = Dense(input_size, activation='sigmoid', name=gate_ratio_name)(x)
          fc = Dense(input_size, activation=activation, name=fc_name)(x)
          x = Lambda(lambda args: args[0] * args[2] + args[1] * (1 - args[2]), name=gate_name)([fc, x, gate_ratio])
      return x


    input_ = Input(shape=(None,), dtype='int32', name='Input')   # (B, T)
    out = Embedding(V, E, mask_zero=True, name='Embedding')(input_)  # (B, T, E)
    out = LSTM(H)(out)
    out = Highway(out, num_layers=1)
    out = Dropout(dropout, name='Dropout')(out)
    out = Dense(1, activation='sigmoid', name='FC')(out)

    discriminator = Model(input_, out)
    return discriminator

# Define generator loss, Agent and Environment

## Define Agent class

In [0]:
class Agent(object):
    '''
    On each step, Agent act on state.
    Then Environment return next state, reward, and so on.
    '''
    def __init__(self, BATCH_SIZE, src_vocab_size, generator_model, lr=1e-3):
        '''
        # Arguments:
            BATCH_SIZE: int, batch_size
            src_vocab_size: int, Vocabrary size
            emb_dim: int, Embedding size
        # Optional Arguments:
            lr: float, learning rate, default is 0.001
        '''
        self.num_actions = src_vocab_size
        self.B = BATCH_SIZE
        self.V = src_vocab_size
        self.lr = lr
        self.eps = 0.1
        self.generator = generator_model

    def act(self, nodes, labels, node1, node2, state, epsilon=0, deterministic=False):
        '''
        # Arguments:
            state: numpy array, dtype=int, shape = (B, t)
            epsilon: float, 0 <= epsilon <= 1,
                if epsilon is 1, the Agent will act completely random.
        # Returns:
            action: numpy array, dtype=int, shape = (B, 1)
        '''
        try:
            word = state.numpy()#[:, -1].reshape([self.B, 1], dtype=np.int32)
        except AttributeError as e:
            word =np.expand_dims( state[:,-1] , axis=1)
            pass
        

        return self._act_on_word(nodes, labels, node1, node2, word, epsilon=epsilon, deterministic=deterministic)


    def prepare_prob(self, x):
          x = np.array(x)
          tot = sum(x)
          if tot > 1.0:
              leftover = tot - 1.0 
              x[-1] =  x[-1] - leftover
          elif tot < 1.0:
              leftover = 1.0 - tot 
              x[-1] =  x[-1] + leftover
          return x


    def sampling_word(self, prob):
        '''
        # Arguments:
            prob: numpy array, dtype=float, shape = (B, V),
        # Returns:
            action: numpy array, dtype=int, shape = (B, )
        '''
        action = np.zeros((self.B,), dtype=np.int32)
        for i in range(self.B):
            p = prob[i]
            p = self.prepare_prob(p)

            action[i] = np.random.choice(self.V, p=p)
        return action

    def _act_on_word(self, nodes, labels, node1, node2, word, epsilon=0, deterministic=False, PAD=0, EOS=8):
        '''
        # Arguments:
            word: numpy array, dtype=int, shape = (B, 1),
                word indicates current word.
            epsilon: float, 0 <= epsilon <= 1,
                if epsilon is 1, the Agent will act completely random.
        # Returns:
            action: numpy array, dtype=int, shape = (B, 1)
        '''
        action = None
        is_PAD = word == PAD
        is_EOS = word == EOS
        is_end = is_PAD.astype(np.int) + is_EOS.astype(np.int)
        is_end = 1 - is_end

        is_end = is_end.reshape([self.B, 1])
        if np.random.rand() <= epsilon:
            action = np.random.randint(low=0, high=self.num_actions, size=(self.B, 1))
            return epsilon, action * is_end
        elif not deterministic:
            probs, dec_hidden = self.generator(nodes, labels, node1, node2, word)
            action = self.sampling_word(probs).reshape([self.B, 1])
        else:
            probs = self.generator(nodes, labels, node1, node2, word) # (B, T)
            action = np.argmax(probs, axis=-1).reshape([self.B, 1])

        return probs, action * is_end


## Define Environment class

In [0]:
class Environment(object):
    '''
    On each step, Agent act on state.
    Then Environment return next state, reward, and so on.
    '''
    def __init__(self, discriminator, agent_copy, n_sample=5):
        '''
        Environment class for Reinforcement Learning
        # Arguments:
            discriminator: keras model
            g_beta: SeqGAN.rl.Agent, copy of Agent
                params of g_beta.generator should be updated with those of original
                generator on regular occasions.
        # Optional Arguments
            n_sample: int, default is 16, the number of Monte Calro search sample
        '''

        self.n_sample = n_sample
        self.BOS = 7
        self.discriminator = discriminator
        self.g_beta = agent_copy
        self.B = batch_size
        self.reset()

    def reset(self):
        self.t = 1
        self._state = np.zeros([self.B, 1], dtype=np.int32)
        self._state[:, 0] = self.BOS



    def get_state(self):
        return self._state     #[:, 1:]   # Exclude BOS

    def _append_state(self, word, state=None):
        '''
        # Arguments:
            word: numpy array, dtype=int, shape = (B, 1)
        '''

        if state is None:

            #print(word)
            #print(self._state)
            self._state = np.concatenate([self._state, word], axis=-1)
        else:

            return np.concatenate([state, word], axis= -1)

    def step(self, nodes, labels, node1, node2, action, targ_shape):
        '''
        Step t -> t + 1 and returns a result of the Agent action.
        # Arguments:
            action: numpy array, dtype=int, shape = (B, 1),
                state is Y_0:t-1, and action is y_t
        # Returns:
            next_state: numpy array, dtype=int, shape = (B, t)
            reward: numpy array, dtype=float, shape = (B, 1)
            is_episode_end: bool
            info: dict
        '''
        self.t = self.t + 1

        reward = self.Q(nodes, labels, node1, node2, action, self.n_sample, targ_shape)
        is_episode_end = self.t > targ_shape

        self._append_state(action)
        next_state = self.get_state()
        info = None

        return [next_state, reward, is_episode_end, info]

    def render(self, head=1):
        for i in range(head):
            ids = self.get_state()[i]
            words = [self.data_generator.id2word[id] for id in ids.tolist()]
            print(''.join(words))
        print('-' * 80)


    def Q(self, nodes, labels, node1, node2, action, targ_shape, n_sample=16):
        '''
        State-Action value function using Rollout policy
        # Arguments:
            action: numpy array, dtype=int, shape = (B, 1)

        # Optional Arguments:
            n_sample: int, default is 16, number of samples for Monte Calro Search

        # Returns:
            reward: numpy array, dtype=float, shape = (B, ), State-Action value

        # Requires:
            t, T: used to define time range.
            state: determined texts, Y[0:t-1], used for Rollout.
            action: next words, y[t], used for sentence Y[0:t].
            g_beta: Rollout policy.
        '''
 
        reward = np.zeros([self.B, 1])
        if self.t == 2:
            Y_base = self._state    # Initial case
        else:
            Y_base = self.get_state()    # (B, t-1)

        if self.t >= targ_shape:
            Y = self._append_state(action, state=Y_base)
            return self.discriminator.predict(Y)

        # Rollout
        for idx_sample in range(n_sample):
            Y = Y_base

            _, y_t = self.g_beta.act(nodes, labels, node1, node2, Y, epsilon=self.g_beta.eps)

            Y = self._append_state(y_t, state=Y)
            
            for tau in range(self.t+1, targ_shape):
                _, y_tau = self.g_beta.act(nodes, labels, node1, node2, Y, epsilon=self.g_beta.eps)
                Y = self._append_state(y_tau, state=Y)   
            reward += self.discriminator.predict(Y) / n_sample
            #print('generated : ', self._append_state(y_tau, state=Y))
            #print('reward : ', reward)

        return reward

## Initialize training variables and model

In [0]:
vocab_size = 10248
embedding_size = 16
BATCH_SIZE = batch_size
epochs = 10
step=0
steps = epochs * steps_per_epoch

In [0]:
'''
GATModel args: 

    enc_layers, enc_units,  emb_dim, num_heads,
    hidden_size, filter_size, batch_size,  reg_scale,
    dropout, src_vocab_size, tgt_vocab_size,
    target_lang

'''

# Generator model
generator = GATModel(2, 64,  64, 2,
                 64, 64, BATCH_SIZE, 0.0,
                 0.2, src_vocab_size, tgt_vocab_size,
                 tgt_vocab)

# Generator optimizer
gen_optimizer = tf.keras.optimizers.Adam()


In [0]:
## Generator checkpoint/not necessary for discriminator##################


checkpoint_dir = './gdrive/My Drive/RDF_GAN/training_checkpoints'




In [0]:

# Discriminator model
discriminator = Discriminator(vocab_size,
                              embedding_size,
                              H=64, dropout=0.1).compile('adam', 'binary_crossentropy')

In [0]:
discriminator = Discriminator(vocab_size,
                              embedding_size,
                              H=64, dropout=0.1)
discriminator.compile('adam', 'binary_crossentropy')

In [0]:
# Agent instance, takes initialized generator
agent = Agent(BATCH_SIZE, vocab_size, generator)


# Environment instance, takes initialized discriminator and generator
environment =  Environment(discriminator, agent)

## Define function to generate discriminator data

In [0]:
def get_disc_data(real, fake):

    real.extend(fake[0])
    random.shuffle(real)

    disc_data = real
    disc_data = np.vstack(disc_data)
    
    X, y = disc_data[:,:-1], disc_data[:,-1]
    return X, y

## Define Loss function

In [0]:
def generator_loss(y_pred, y_true, reward):

    out = K.clip(y_pred, 1e-8, 1-1e-8)

    y_true = K.one_hot(y_true, num_classes=vocab_size)

    log_lik = y_true*K.log(out)
    
    return K.sum(-log_lik * reward)

In [0]:
def generate_output(state, targ):


  print('\nReal : ')
  for t in targ:
    print(' '.join(['<PAD>' if i==0 else tgt_vocab.index_word[i] for i in np.array(t)]))
    break
    
  print('\nGenerated : ')
  for s in state:
    print(' '.join(['<PAD>' if i==0 else tgt_vocab.index_word[i] for i in s]), '\n')
    break

In [0]:
def save_progress(path):
  agent.generator.save_weights(path+'/generator_weights')
  discriminator.save_weights(path+'/discriminator_weights')

## Define training step

In [0]:

# Training step 
#@tf.function
def train_step(nodes, labels, node1, node2, targ, agent, environment, discriminator):
  loss = 0
  st = 0

  with tf.GradientTape() as tape:

    # Lists to store training data for discriminator
    disc_fake = []

    # Store the target sequences as a negative class instance 
    disc_real = [[np.concatenate([np.array(y), np.array([1])])] for y in targ]

    # predictions with Rollout policy, using environment class
    for t in range(1, targ.shape[1]):
      st+=1
      if st%10==0:
          print('\nt-Step :', st)


      # Get state at time t
      state = targ[:,t-1 :t]

      # Get predictions and action from agent
      probs, action = agent.act(nodes, labels, node1, node2, state)

      # Generate next state and reward from environment
      state_, reward, is_episode_end, info = environment.step(nodes, labels, node1, node2, action, targ.shape[1])

      # Calculate loss
      
      #print(probs.shape, K.one_hot(targ[:,t], num_classes=10013).shape)
      loss += generator_loss(probs, targ[:,t], reward)#action


    # generate output to examine
    generate_output(state_, targ)

    # Store the complete generated sequences as a negative class instance 
    disc_fake.append([[np.concatenate([np.array(s), np.array([0])])] for s in state_])



  
    # Get X_train, y_train for discriminator
    d_x, d_y = get_disc_data(disc_real, disc_fake)

    # Train Discriminator
    print('Discriminator loss :')
    discriminator.fit(d_x, d_y, batch_size=BATCH_SIZE, epochs=1)

  # Calculate batch loss
  batch_loss = (loss / int(targ.shape[1]))

  # Define trainable variables
  variables = agent.generator.trainable_variables
  
  # Define gradients
  gradients = tape.gradient(loss, variables)

  # Update model
  gen_optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss, discriminator, agent

## Execute training loop

In [0]:
# Training algorithm



step=0
steps = epochs * steps_per_epoch
epochs=5
for epoch in range(epochs):

    with tqdm(total=(34352 // batch_size)) as pbar:
        for (batch, (node_, label_, node1_, node2_, targ_)) in tqdm(enumerate(dataset)):
            start = time.time()
            step += 1

            batch_loss, discriminator, agent = train_step(node_, label_, node1_, node2_, targ_, agent, environment, discriminator)
            environment = Environment(discriminator, agent)

            save_progress(checkpoint_dir)

            print('Epoch {} Batch {} '.format(epoch, batch))
            print('Generator Batch Loss: ')
            print(tf.math.reduce_mean(batch_loss).numpy())

            print('Time {} \n'.format(time.time() - start))
            pbar.update(1)


  0%|          | 0/17176 [00:00<?, ?it/s]
0it [00:00, ?it/s][A


t-Step : 10

t-Step : 20

t-Step : 30

t-Step : 40

t-Step : 50

t-Step : 60

t-Step : 70

t-Step : 80

Real : 
<start> 1974 is one of the model years of the amc matador which was made in thames , new zealand . <end> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Generated : 
<start> sludge_metal amsterdam_airport_schiphol damon 09 royce volkswagen except 21st mexico saab officially vituti 25ft gottingen in2000 granola amatariciana falls 1939-01-03 uab suburban_legends <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <

  0%|          | 1/17176 [03:10<910:34:19, 190.86s/it]
1it [03:10, 190.84s/it][A

Epoch 0 Batch 0 
Generator Batch Loss: 
8.98693
Time 190.79594469070435 


t-Step : 10

t-Step : 20

t-Step : 30

t-Step : 40

t-Step : 50

t-Step : 60

t-Step : 70

t-Step : 80

Real : 
<start> adolfo suarez madrid barajas airport is located in alcobendas , part of the community of madrid in spain . the airport is operated by enaire which is located in madrid . <end> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Generated : 
<start> bi batlle stella violet visvesaraya loctated inauguration 1982m "ajo blanco" alexandre airpori jose alpena_county_regional_airport 214 headed madeleine caterpillar_inc. indie_rock does yet sebastian ries 905 metres lpena asteroid a894 malay maple 2001-03-01 undia launches <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

  0%|          | 2/17176 [03:53<698:48:23, 146.48s/it]
2it [03:53, 146.47s/it][A

Epoch 0 Batch 1 
Generator Batch Loss: 
9.087674
Time 42.92470955848694 


t-Step : 10

t-Step : 20

t-Step : 30

t-Step : 40

t-Step : 50

t-Step : 60

t-Step : 70

t-Step : 80

Real : 
<start> alan shepard was born on 1923 11 18 . <end> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Generated : 
<start> 30843 jan manchester prime_minister_of_romania i whom university_of_texas_system talib reddy elizabeth_garrett <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD

  0%|          | 3/17176 [04:37<551:36:57, 115.64s/it]
3it [04:37, 115.62s/it][A

Epoch 0 Batch 2 
Generator Batch Loss: 
8.789771
Time 43.65278601646423 


t-Step : 10

t-Step : 20

t-Step : 30

t-Step : 40

t-Step : 50

t-Step : 60

t-Step : 70

t-Step : 80

Real : 
<start> the book alcatraz versus the evil librarians comes from the u . s where there are many asian americans . <end> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Generated : 
<start> involves theam akeen "1927" andre itilian manufacturer "4/22" 1fc jorge_orosmán_da_silva harbour reddy mughan "nurturing excellence" deers bajji wheel summity a_epsth_2nd_group ambient_music 352 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PA

  0%|          | 4/17176 [05:21<449:06:26, 94.15s/it] 
4it [05:21, 94.14s/it] [A

Epoch 0 Batch 3 
Generator Batch Loss: 
8.681596
Time 44.017436265945435 


t-Step : 10

t-Step : 20

t-Step : 30

t-Step : 40

t-Step : 50

t-Step : 60

t-Step : 70

t-Step : 80

Real : 
<start> michele marcolini has been associated with f . c . bari 1908 . <end> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Generated : 
<start> closely aip_advances presidency 75 kuttikkattor clerk alongside otkrytiye 185.42 (centimetres) inline-four_engine agnes drum_and_bass aaarhus arapiraquens <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

  0%|          | 5/17176 [06:03<374:28:10, 78.51s/it]
5it [06:03, 78.50s/it][A

Epoch 0 Batch 4 
Generator Batch Loss: 
8.497562
Time 42.001798152923584 


t-Step : 10

t-Step : 20

t-Step : 30

t-Step : 40

t-Step : 50

t-Step : 60

t-Step : 70

t-Step : 80

Real : 
<start> the comic character , arion , is also known by he name ahri ahn . <end> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Generated : 
<start> alcatraz_versus_the_scriveners_bones native albennie_jones rq masum compostela organization 1996-05-30 informally informally andreas_voßkuhle filipinos_in_japan zaragoza spaceport lasalle medicine <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PA

  0%|          | 6/17176 [06:47<325:09:19, 68.17s/it]
6it [06:47, 68.17s/it][A

Epoch 0 Batch 5 
Generator Batch Loss: 
8.392691
Time 44.05031108856201 


t-Step : 10

t-Step : 20

t-Step : 30

t-Step : 40

t-Step : 50

t-Step : 60

t-Step : 70

t-Step : 80

Real : 
<start> the atlas ii , from the u . s . , was made by lockheed martin and launched from the spaceport florida launch complex 36 . <end> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Generated : 
<start> involved desserts 1 straus coritiba_foot_ball_club here techsystems acoustic political madrid orfeo england serkey rule yemi sroke 1856-09-22 dalpatbhai seattle genada parliament_of_catalonia riverside skyrypnyk utterar uttar a_glastonbury_romance 77 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <P

  0%|          | 7/17176 [07:31<290:13:16, 60.85s/it]
7it [07:31, 60.85s/it][A

Epoch 0 Batch 6 
Generator Batch Loss: 
8.373082
Time 43.76064896583557 



In [0]:
BOS = tgt_vocab.word_index['<start>']
EOS = tgt_vocab.word_index['<end>']

BOS, EOS



In [0]:
v
