In [None]:
import glob
import json
import pandas as pd
import tensorflow as tf
import spacy
import re
import string
from sklearn.model_selection import train_test_split


In [None]:
# Load every paper JSON and keep only the fields used downstream.
path = '/home/prajakta/Documents/SharpestMinds/COVID-analysis/data/*.json'
# glob yields files in arbitrary, filesystem-dependent order; sorting makes
# the 15-paper subset selected below identical on every run.
files = sorted(glob.glob(path))
papers = []
for file in files:
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(file, encoding='utf-8') as json_file:
        text = json.load(json_file)
    papers.append([text['paper_id'], text['bodytext'], text['abstract']])
data = pd.DataFrame(papers, columns=['paper_id', 'bodytext', 'abstract'])
# Drop papers with an empty abstract, then keep the first 15 as a small
# working set. (The mask is deliberately not named `filter`, which would
# shadow the builtin for the rest of the kernel session.)
has_abstract = data.abstract != ""
data = data[has_abstract][:15]

In [None]:
def clean_text(bodytext):
    """Normalise a sequence of tokens into a list of clean lowercase words.

    Each item of ``bodytext`` is converted with ``str()``, lower-cased,
    stripped of ASCII punctuation and of every character outside
    ``string.printable``. Only results that are purely alphabetic are kept.

    Args:
        bodytext: iterable of tokens (the notebook passes a spaCy ``Doc``).

    Returns:
        list[str]: the kept words, wrapped in ``'<start>'`` and ``'<end>'``
        markers. An empty input yields ``['<start>', '<end>']``.
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _normalise(token):
        # Order matters: punctuation is removed first, then whatever
        # non-printable characters remain.
        lowered = str(token).lower()
        return non_printable.sub('', lowered.translate(strip_punctuation))

    normalised = (_normalise(token) for token in bodytext)
    kept = [candidate for candidate in normalised if candidate.isalpha()]
    return ['<start>', *kept, '<end>']

In [None]:
# Tokenise each paper with spaCy, clean the tokens, then map words to integer
# ids with one tokenizer shared by body texts and abstracts.
nlp = spacy.load("en_core_web_sm")
bt_vector = []
bt_list, ab_list = [], []
for body_raw, abstract_raw in zip(data.bodytext, data.abstract):
    bt_list.append(clean_text(nlp(body_raw)))
    ab_list.append(clean_text(nlp(abstract_raw)))

# Fit on abstracts followed by body texts so both share a single vocabulary.
com_list = ab_list + bt_list
# filters='' keeps the '<start>' / '<end>' markers intact.
bt_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
bt_tokenizer.fit_on_texts(com_list)

# Convert to id sequences and right-pad each set to its own longest sequence.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
data_bt = pad_sequences(bt_tokenizer.texts_to_sequences(bt_list), padding='post')
data_ab = pad_sequences(bt_tokenizer.texts_to_sequences(ab_list), padding='post')

In [None]:
def max_len(tensor):
    """Return the length of the longest element of ``tensor``.

    Args:
        tensor: non-empty iterable whose elements support ``len()``.

    Returns:
        int: the largest ``len(element)``.

    Raises:
        ValueError: if ``tensor`` is empty (raised by ``max``).
    """
    return max(map(len, tensor))

In [None]:
# Hold out 20% of the papers for testing. A fixed random_state keeps the
# split (and therefore every downstream result) reproducible across kernel
# restarts; without it train_test_split reshuffles on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_bt, data_ab, test_size=0.2, random_state=42)

# Training configuration.
BATCH_SIZE = 5
BUFFER_SIZE = len(X_train)                     # shuffle buffer spans the whole training set
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE    # full batches only
embedding_dims = 256
# NOTE(review): 793 is an unexplained magic number. train_step one-hot
# encodes an LSTM state of this width and compares it with the abstracts,
# so it was presumably chosen to equal the padded abstract length - confirm.
rnn_units = 793
dense_units = 1024
Dtype = tf.float32   #used to initialize DecoderCell Zero state

In [None]:
# Padded sequence lengths of the body-text (source) and abstract (target) arrays.
Tx, Ty = max_len(data_bt), max_len(data_ab)

# Both sides share one tokenizer, so the two vocabulary sizes are equal.
# The +1 accounts for id 0, which the loss treats as padding.
input_vocab_size = output_vocab_size = len(bt_tokenizer.word_index) + 1

# Shuffle over the whole training set and emit only full batches.
train_pairs = tf.data.Dataset.from_tensor_slices((X_train, Y_train))
dataset = train_pairs.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
#ENCODER
class EncoderNetwork(tf.keras.Model):
    """Holder for the encoder layers: a token embedding and an LSTM.

    The class defines no ``call``; code elsewhere in the notebook invokes
    ``encoder_embedding`` and ``encoder_rnnlayer`` directly.
    """

    def __init__(self, input_vocab_size, embedding_dims, rnn_units):
        """Create the layers.

        Args:
            input_vocab_size: number of rows in the embedding table.
            embedding_dims: size of each embedding vector.
            rnn_units: number of LSTM units.
        """
        super().__init__()
        embedding_layer = tf.keras.layers.Embedding(
            input_dim=input_vocab_size,
            output_dim=embedding_dims,
        )
        # return_sequences + return_state: the layer is configured to hand
        # back the per-step outputs together with its final states.
        lstm_layer = tf.keras.layers.LSTM(
            rnn_units,
            return_sequences=True,
            return_state=True,
        )
        self.encoder_embedding = embedding_layer
        self.encoder_rnnlayer = lstm_layer


encoderNetwork = EncoderNetwork(input_vocab_size, embedding_dims, rnn_units)


In [None]:
def loss_function(y_pred, y):
    """Sparse categorical cross-entropy averaged over non-padding tokens.

    Positions where ``y`` equals 0 are treated as padding and contribute
    nothing. The summed loss is divided by the number of real tokens rather
    than by the total number of positions, so the value does not shrink as
    more padding is added to a batch.

    Args:
        y_pred: logits; expected shape [batch_size, Ty, output_vocab_size].
        y: integer target ids; expected shape [batch_size, Ty], 0 = padding.

    Returns:
        Scalar tensor: mean per-token loss, or 0 when the batch contains
        only padding.
    """
    sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    # reduction='none' keeps one loss value per position so it can be masked.
    per_token_loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
    mask = tf.cast(tf.math.not_equal(y, 0), dtype=per_token_loss.dtype)  # 1 for real tokens, 0 for padding
    masked_loss = per_token_loss * mask
    # divide_no_nan returns 0 instead of NaN when there are no real tokens.
    return tf.math.divide_no_nan(tf.reduce_sum(masked_loss), tf.reduce_sum(mask))




def train_step(input_batch, output_batch,encoder_initial_cell_state):
    """Run one optimisation step of the encoder on a single batch.

    Embeds ``input_batch``, runs the encoder LSTM from
    ``encoder_initial_cell_state``, one-hot encodes the second value returned
    by the LSTM and scores it against ``output_batch`` with ``loss_function``.
    Gradients of that loss with respect to ``encoderNetwork``'s trainable
    variables are then applied through the module-level ``optimizer``.

    Args:
        input_batch: batch of body-text token ids fed to the embedding layer.
        output_batch: batch of abstract token ids used as the loss target.
        encoder_initial_cell_state: initial state passed to the LSTM.

    Returns:
        The loss computed by ``loss_function`` for this batch.

    NOTE(review): ``optimizer`` is not defined in the visible part of the
    notebook; it must exist as a global before this function is called.
    NOTE(review): the value passed to ``loss_function`` as predictions is a
    one-hot encoding of an LSTM state, not logits produced by an output
    layer. This looks like work in progress - confirm the intended model
    head before relying on the loss.
    """
    # Placeholder value; overwritten inside the tape context below.
    loss = 0
    with tf.GradientTape() as tape:
        encoder_emb_inp = encoderNetwork.encoder_embedding(input_batch)
        # The LSTM was built with return_sequences=True and return_state=True,
        # so it returns three values; presumably `a` is the per-step output
        # and a_tx / c_tx the final hidden and cell states - TODO confirm.
        a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp, 
                                                        initial_state =encoder_initial_cell_state)
        # NOTE(review): `logits` is assigned but never used below.
        logits = c_tx
        # Calculate the loss.
        # NOTE(review): a_tx is an LSTM output, presumably floating point,
        # while one-hot encoding normally takes integer indices and is
        # presumably not differentiable - verify that this line runs and
        # that gradients actually reach the encoder variables.
        ab_ohe = tf.one_hot(a_tx, input_vocab_size)
        
        loss = loss_function(ab_ohe, output_batch)

    # All trainable weights of the encoder's layers.
    variables = encoderNetwork.trainable_variables
    # Differentiate the loss with respect to those variables.
    gradients = tape.gradient(loss, variables)

    # apply_gradients takes (gradient, variable) pairs.
    grads_and_vars = zip(gradients,variables)
    optimizer.apply_gradients(grads_and_vars)
    return loss
        

In [None]:
#RNN LSTM hidden and memory state initializer
def initialize_initial_state():
    """Return a list of two zero tensors, each shaped (BATCH_SIZE, rnn_units).

    The pair is used as the ``initial_state`` of the encoder LSTM.
    """
    state_shape = (BATCH_SIZE, rnn_units)
    return [tf.zeros(state_shape) for _ in range(2)]

In [None]:
# Smoke-test the encoder on a single batch before training.
encoder_initial_cell_state = initialize_initial_state()
# Pull one batch explicitly: `input_batch` is otherwise only assigned inside
# the training loop further down, so on a fresh kernel (Restart & Run All)
# this cell would fail with a NameError.
input_batch, output_batch = next(iter(dataset))
encoder_emb_inp = encoderNetwork.encoder_embedding(input_batch)
a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp,
                                                initial_state=encoder_initial_cell_state)

In [None]:
epochs = 15
# train_step applies gradients through a module-level `optimizer`; it is not
# created anywhere else in the visible notebook, so define it before training.
optimizer = tf.keras.optimizers.Adam()

for i in range(1, epochs+1):

    # Fresh zero state at the start of every epoch.
    encoder_initial_cell_state = initialize_initial_state()
    total_loss = 0.0

    for batch, (input_batch, output_batch) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
        total_loss += batch_loss
        if (batch+1) % 5 == 0:
            # The value printed here is the loss of this batch, not a running total.
            print("batch loss: {} epoch {} batch {} ".format(batch_loss.numpy(), i, batch+1))

    # Per-epoch summary: with few steps per epoch the periodic print above may
    # never fire, and the accumulated loss was otherwise never reported.
    print("epoch {} mean loss: {}".format(i, float(total_loss) / max(steps_per_epoch, 1)))