In [1]:
import glob
import json
import pandas as pd
import tensorflow as tf
import spacy
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense,Flatten, Concatenate, TimeDistributed, Bidirectional, Attention, Reshape
from tensorflow.keras.models import Model
from tensorflow import TensorShape
import tensorflow_addons as tfa
from langdetect import detect
import tensorflow_datasets as tfds
import itertools
from numba import jit, cuda

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Report the TensorFlow version the kernel imported.
print("TensorFlow version: ", tf.__version__)

In [None]:
# Stop immediately when TensorFlow reports no GPU device; otherwise show its name.
device_name = tf.test.gpu_device_name()
if device_name:
    print(f'Found GPU at: {device_name}')
else:
    raise SystemError('GPU device not found')

In [None]:
# Weights & Biases run setup; init() is called without arguments here.
import wandb
wandb.init()

In [None]:
%%time

# Load the CNN/DailyMail training split as (article, highlights) pairs and
# collect the texts into a two-column DataFrame.
ds = tfds.load('cnn_dailymail', split='train', as_supervised=True)
h, a = [], []

for article, highlights in ds:
    # .numpy() on a string tensor gives raw bytes. Decode them: calling str()
    # on bytes keeps the literal b'...' wrapper and backslash escapes in the text.
    h.append(highlights.numpy().decode('utf-8', errors='replace'))
    a.append(article.numpy().decode('utf-8', errors='replace'))
cnn = pd.DataFrame({'article': a, 'highlights': h})

In [None]:
%%time

def _first_words(text, limit=800):
    """Re-join `text` from at most its first `limit` whitespace-separated words."""
    return " ".join(text.split()[:limit])


# Cap both text columns at 800 words (whitespace is normalised to single spaces).
cnn['article'] = cnn.article.apply(_first_words)
cnn['highlights'] = cnn.highlights.apply(_first_words)

# Number of space-separated pieces per text: one more than the count of spaces.
cnn['len_bt'] = cnn.article.apply(lambda text: text.count(" ") + 1)
cnn['len_ab'] = cnn.highlights.apply(lambda text: text.count(" ") + 1)

#cnn.query('len_bt <= 1000 and len_ab <= 30', inplace = True)
# Keep only the first 1000 rows for the rest of the notebook.
cnn = cnn.iloc[:1000]
cnn

In [None]:
# Scatter of article length (x) against highlights length (y), using the len_bt / len_ab columns.
plt.scatter(cnn.len_bt,cnn.len_ab)

In [None]:
# Duplicates the tensorflow import from the first cell.
import tensorflow as tf
# Number of physical GPU devices TensorFlow lists.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

In [None]:
#@jit(target ="cuda")
def clean_text(bodytext):
    """Normalise an iterable of tokens into a list of lowercase alphabetic words.

    Every token is converted with ``str()``, lower-cased, stripped of all
    ``string.punctuation`` characters and of any character outside
    ``string.printable``. Tokens that are not purely alphabetic after that
    are discarded. The surviving words are returned between a leading
    ``'<start>'`` and a trailing ``'<end>'`` marker.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    non_printable = re.compile('[^%s]' % re.escape(string.printable))

    def _normalise(token):
        lowered = str(token).lower().translate(strip_punct)
        return non_printable.sub('', lowered)

    kept = [word for word in map(_normalise, bodytext) if word.isalpha()]
    return ['<start>', *kept, '<end>']

In [None]:
%%time

# Tokenise every article and summary with spaCy, clean the tokens, then map the
# words to integer ids with one Keras tokenizer fitted on both sides together.
nlp = spacy.load("en_core_web_lg")
bt_vector = []
bt_list = [clean_text(nlp(text)) for text in cnn.article]
ab_list = [clean_text(nlp(text)) for text in cnn.highlights]
com_list = ab_list + bt_list

# filters='' keeps the '<start>' / '<end>' markers intact as vocabulary entries.
bt_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
bt_tokenizer.fit_on_texts(com_list)
data_bt = bt_tokenizer.texts_to_sequences(bt_list)
data_ab = bt_tokenizer.texts_to_sequences(ab_list)

# Articles and summaries are both post-padded to the single longest sequence.
longest_seq = max(map(len, data_bt + data_ab))
pad = tf.keras.preprocessing.sequence.pad_sequences
data_bt = pad(data_bt, padding='post', maxlen=longest_seq)
data_ab = pad(data_ab, padding='post', maxlen=longest_seq)

In [None]:
def max_len(tensor):
    """Return the length of the longest element in `tensor`."""
    return max(map(len, tensor))

In [None]:

# Hold out 20% of the padded sequences. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition.
X_train, X_test, Y_train, Y_test = train_test_split(data_bt, data_ab,
                                                    test_size=0.2, random_state=42)
BATCH_SIZE = 20
BUFFER_SIZE = len(X_train)
steps_per_epoch = BUFFER_SIZE//BATCH_SIZE
embedding_dims = 256
rnn_units = 64
dense_units = 64
Dtype = tf.float32
#dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


a = bt_tokenizer.word_index
# Spot-check: which word carries id 9326? index_word is the tokenizer's own
# id -> word mapping, and .get() prints None instead of raising when the
# vocabulary has fewer than 9326 entries.
print(bt_tokenizer.index_word.get(9326))

In [None]:
# One more than the number of known words; presumably this leaves room for id 0,
# which the loss treats as padding (it masks y == 0) -- TODO confirm.
vocab_size = len(bt_tokenizer.word_index)+1  
vocab_size

In [None]:
def initialize_initial_state():
    """Return two zero tensors of shape (BATCH_SIZE, rnn_units).

    The pair is handed to the encoder LSTM as its ``initial_state``.
    """
    state_shape = (BATCH_SIZE, rnn_units)
    return [tf.zeros(state_shape) for _ in range(2)]


encoder_initial_cell_state = initialize_initial_state()

In [None]:
#ENCODER
class EncoderNetwork(tf.keras.Model):
    """Holds the encoder's embedding layer and LSTM layer.

    The class defines no ``call``; the two layer attributes are invoked
    directly by the training and inference code.
    """

    def __init__(self,input_vocab_size,embedding_dims, rnn_units ):
        super().__init__()
        keras_layers = tf.keras.layers
        # Token ids -> dense vectors of size `embedding_dims`.
        self.encoder_embedding = keras_layers.Embedding(
            input_dim=input_vocab_size,
            output_dim=embedding_dims,
        )
        # Returns the per-timestep outputs followed by the two final states.
        self.encoder_rnnlayer = keras_layers.LSTM(
            rnn_units,
            return_sequences=True,
            return_state=True,
        )
    
#DECODER
class DecoderNetwork(tf.keras.Model):
    """Attention decoder assembled from TensorFlow Addons seq2seq parts.

    Besides its constructor arguments, the class reads the notebook-level
    globals ``dense_units``, ``BATCH_SIZE`` and ``longest_seq``.
    """

    def __init__(self,output_vocab_size, embedding_dims, rnn_units):
        super().__init__()
        keras_layers = tf.keras.layers
        self.decoder_embedding = keras_layers.Embedding(
            input_dim=output_vocab_size,
            output_dim=embedding_dims,
        )
        # Projects decoder outputs to one logit per vocabulary entry.
        self.dense_layer = keras_layers.Dense(output_vocab_size)
        self.decoder_rnncell = keras_layers.LSTMCell(rnn_units)
        # Sampler used while training.
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        # The attention mechanism starts with memory=None; memory is supplied
        # later through setup_memory().
        self.attention_mechanism = self.build_attention_mechanism(
            dense_units, None, BATCH_SIZE*[longest_seq]
        )
        self.rnn_cell = self.build_rnn_cell(BATCH_SIZE)
        self.decoder = tfa.seq2seq.BasicDecoder(
            self.rnn_cell,
            sampler=self.sampler,
            output_layer=self.dense_layer,
        )

    def build_attention_mechanism(self, units,memory, memory_sequence_length):
        """Create a Luong attention mechanism with the given units and memory."""
        #return tfa.seq2seq.BahdanauAttention(units, memory = memory, memory_sequence_length=memory_sequence_length)
        return tfa.seq2seq.LuongAttention(
            units,
            memory=memory,
            memory_sequence_length=memory_sequence_length,
        )

    def build_rnn_cell(self, batch_size ):
        """Wrap the decoder LSTM cell with the attention mechanism.

        `batch_size` is accepted but not used.
        """
        return tfa.seq2seq.AttentionWrapper(
            self.decoder_rnncell,
            self.attention_mechanism,
            attention_layer_size=dense_units,
        )

    def build_decoder_initial_state(self, batch_size, encoder_state,Dtype):
        """Return the wrapper's initial state with its cell state set to `encoder_state`."""
        zero_state = self.rnn_cell.get_initial_state(batch_size=batch_size, dtype=Dtype)
        return zero_state.clone(cell_state=encoder_state)

# Encoder and decoder are built over the same vocabulary (one tokenizer covers
# both articles and summaries); a single Adam optimizer updates both.
encoderNetwork = EncoderNetwork(vocab_size,embedding_dims, rnn_units)
decoderNetwork = DecoderNetwork(vocab_size,embedding_dims, rnn_units)
optimizer = tf.keras.optimizers.Adam()

In [None]:
def loss_function(y_pred, y):
    """Masked sparse categorical cross-entropy, averaged over real tokens.

    Args:
        y_pred: logits, noted here as [batch_size, Ty, output_vocab_size].
        y: integer targets, noted here as [batch_size, Ty]; positions equal
            to 0 are treated as padding and contribute nothing.

    Returns:
        Scalar tensor: the summed per-token loss divided by the number of
        non-padding targets (0 when every target is padding).
    """
    sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
    # 1.0 where the target is a real token, 0.0 where it is padding.
    mask = tf.cast(tf.math.not_equal(y, 0), dtype=loss.dtype)
    loss = mask * loss
    # Divide by the number of real tokens rather than by every position:
    # a plain mean would shrink the loss in proportion to the amount of padding.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

def train_step(input_batch, output_batch,encoder_initial_cell_state):
    """Run one optimisation step on a batch and return its loss.

    Args:
        input_batch: source token ids fed to the encoder.
        output_batch: target token ids; all but the last column feed the
            decoder, all but the first column are the prediction targets.
        encoder_initial_cell_state: initial state passed to the encoder LSTM.

    Returns:
        The loss tensor computed by ``loss_function`` for this batch.
    """
    with tf.GradientTape() as tape:
        source_embedded = encoderNetwork.encoder_embedding(input_batch)
        encoder_outputs, state_h, state_c = encoderNetwork.encoder_rnnlayer(
            source_embedded, initial_state=encoder_initial_cell_state)

        # The decoder input drops the last column; the targets are the same
        # sequence shifted one step to the left.
        decoder_input = output_batch[:, :-1]
        decoder_target = output_batch[:, 1:]
        target_embedded = decoderNetwork.decoder_embedding(decoder_input)

        # Attention reads the encoder outputs; the decoder starts from the
        # encoder's final states.
        decoderNetwork.attention_mechanism.setup_memory(encoder_outputs)
        decoder_initial_state = decoderNetwork.build_decoder_initial_state(
            BATCH_SIZE, encoder_state=[state_h, state_c], Dtype=tf.float32)

        decoded, _, _ = decoderNetwork.decoder(
            target_embedded,
            initial_state=decoder_initial_state,
            sequence_length=BATCH_SIZE*[longest_seq-1])

        loss = loss_function(decoded.rnn_output, decoder_target)

    # Update every trainable weight of both networks.
    variables = encoderNetwork.trainable_variables + decoderNetwork.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss


In [None]:
def generator(X_train, Y_train, batch_size):
    """Yield aligned (X, Y) slices of up to `batch_size` rows, cycling forever.

    Args:
        X_train: sliceable sequence of inputs; its length sets the sample count.
        Y_train: sliceable sequence of targets, sliced with the same offsets.
        batch_size: number of rows per yielded slice; must be positive.

    Yields:
        Tuples ``(X_train[offset:offset+batch_size], Y_train[offset:offset+batch_size])``.
        After the last slice the sequence restarts from offset 0.

    Raises:
        ValueError: on first iteration, if `batch_size` is not positive.
    """
    if batch_size <= 0:
        # A negative step makes range() empty, which would spin forever below.
        raise ValueError("batch_size must be positive, got {}".format(batch_size))
    num_samples = len(X_train)
    print(num_samples)
    if num_samples == 0:
        # Nothing to yield: end the generator instead of looping forever.
        return
    while True:
        for offset in range(0, num_samples, batch_size):
            yield X_train[offset:offset+batch_size], Y_train[offset:offset+batch_size]
    
# Each next(train_samples) returns a slice of up to 8000 (X, Y) rows from the training split.
train_samples = generator(X_train, Y_train, batch_size = 8000)


In [None]:
# Open a TF1-compat interactive session whose GPU options have allow_growth switched on.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.InteractiveSession(config=config)

In [None]:
# Train for a fixed number of epochs, reporting the loss every second batch
# and the mean loss at the end of each epoch.
epochs = 100
for i in range(1, epochs+1):

    encoder_initial_cell_state = initialize_initial_state()
    total_loss = 0.0
    n_batches = 0

    # Build the epoch's dataset once, before iterating. Rebuilding it inside
    # the batch loop costs one full tensor-slices construction per step and,
    # without shuffle(), leaves every epoch after the first unshuffled.
    dataset = (tf.data.Dataset.from_tensor_slices(next(train_samples))
               .shuffle(BUFFER_SIZE)
               .batch(BATCH_SIZE, drop_remainder=True))

    for (batch, (input_batch, output_batch)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
        total_loss += batch_loss
        n_batches += 1
        if (batch+1)%2 == 0:
            # The printed value is the loss of this batch, not a running total.
            print("batch loss: {} epoch {} batch {} ".format(batch_loss.numpy(), i, batch+1))

    if n_batches:
        print("epoch {} mean loss: {:.4f}".format(i, float(total_loss) / n_batches))

In [None]:
# Greedy-decode a summary for one held-out article: encode the padded token
# ids, start the attention decoder from the encoder's final states, then step
# the decoder one token at a time, feeding each prediction back in through the
# trained decoder embedding matrix.
input_raw = X_test[0].reshape(1, -1)

# Inference runs with a batch of one, so the zero initial state is (1, rnn_units).
encoder_initial_cell_state = [tf.zeros((1, rnn_units)), tf.zeros((1, rnn_units))]
encoder_emb_inp = encoderNetwork.encoder_embedding(input_raw)
a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp,
                                                initial_state=encoder_initial_cell_state)

start_tokens = tf.fill([1], bt_tokenizer.word_index['<start>'])
end_token = bt_tokenizer.word_index['<end>']

greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

# Not consumed by the decode loop (the sampler embeds start_tokens itself);
# kept so the notebook namespace is unchanged.
decoder_input = tf.expand_dims([bt_tokenizer.word_index['<start>']]* 1,1)
decoder_emb_inp = decoderNetwork.decoder_embedding(decoder_input)

decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoderNetwork.rnn_cell,
                                            sampler=greedy_sampler,
                                            output_layer=decoderNetwork.dense_layer)
# Attention must read the full encoder output sequence `a`, exactly as the
# training step does. The final cell state `c_tx` has no time axis to attend over.
decoderNetwork.attention_mechanism.setup_memory(a)
# The decoder starts from the encoder's final [a_tx, c_tx] states.
decoder_initial_state = decoderNetwork.build_decoder_initial_state(1,
                                                                   encoder_state=[a_tx, c_tx],
                                                                   Dtype=tf.float32)

# The output length is unknown in advance, so decoding is capped at twice the
# padded sequence length.
maximum_iterations = longest_seq * 2

# Initialise the inference decoder with the trained embedding matrix.
decoder_embedding_matrix = decoderNetwork.decoder_embedding.variables[0]
(first_finished, first_inputs, first_state) = decoder_instance.initialize(
    decoder_embedding_matrix,
    start_tokens=start_tokens,
    end_token=end_token,
    initial_state=decoder_initial_state)

inputs = first_inputs
state = first_state

# predictions grows by one column (one sampled token id) per decode step.
predictions = np.empty((1,0), dtype = np.int32)
for j in range(maximum_iterations):
    outputs, next_state, next_inputs, finished = decoder_instance.step(j, inputs, state)
    inputs = next_inputs
    state = next_state
    outputs = np.expand_dims(outputs.sample_id, axis=-1)
    predictions = np.append(predictions, outputs, axis=-1)
    # Stop as soon as the sampler reports the sequence finished instead of
    # always running to the iteration cap.
    if bool(tf.reduce_all(finished)):
        break

In [None]:
# Turn each row of predicted ids back into words, stopping at the end marker.
print("\nSummary:")
# Look the end-marker id up in the tokenizer rather than assuming a fixed id.
end_id = bt_tokenizer.word_index['<end>']
for i in range(len(predictions)):
    line = predictions[i,:]
    seq = list(itertools.takewhile(lambda index: index != end_id, line))
    # Skip ids that have no vocabulary entry (for example a predicted padding id)
    # so the lookup cannot raise KeyError.
    print(" ".join(bt_tokenizer.index_word[w] for w in seq if w in bt_tokenizer.index_word))

In [None]:
# Summary statistics of the working DataFrame.
cnn.describe()