In [1]:
import glob
import json
import pandas as pd
import tensorflow as tf
import spacy
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np


In [2]:
# Load every paper JSON from the data directory into a single DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the data before running.
path = '/home/prajakta/Documents/SharpestMinds/COVID-analysis/data/*.json'
# glob returns matches in arbitrary filesystem order; sorting makes the row
# order (and everything derived from it) reproducible between runs.
files = sorted(glob.glob(path))
papers = []
for file in files:
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file, encoding='utf-8') as json_file:
        text = json.load(json_file)
    papers.append([text['paper_id'], text['bodytext'], text['abstract']])
data = pd.DataFrame(papers, columns=['paper_id', 'bodytext', 'abstract'])
# Keep only the papers whose abstract is not the empty string. The mask is
# deliberately not called `filter`, which would shadow the builtin for the
# rest of the kernel session.
has_abstract = data['abstract'] != ""
data = data[has_abstract]

In [3]:
def clean_text(bodytext):
    """Normalise a sequence of tokens and wrap it in start/end markers.

    Each item of ``bodytext`` is converted with ``str()``, lower-cased,
    stripped of ``string.punctuation`` characters and of any character that
    is not in ``string.printable``. Only results that are purely alphabetic
    are kept, so numbers, mixed alphanumerics and empty strings are dropped.

    Args:
        bodytext: any iterable of tokens (the notebook passes the result of
            ``nlp(...)``); items only need to support ``str()``.

    Returns:
        list[str]: ``'<start>'``, the surviving tokens in input order,
        then ``'<end>'``.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    non_printable = re.compile('[^%s]' % re.escape(string.printable))

    def normalise(token):
        # Order matters: punctuation is removed before non-printable characters.
        lowered = str(token).lower()
        return non_printable.sub('', lowered.translate(strip_punctuation))

    kept = [candidate for candidate in map(normalise, bodytext) if candidate.isalpha()]
    return ['<start>', *kept, '<end>']

In [4]:
# Tokenise every body text and abstract with spaCy, then normalise the tokens
# with clean_text (which also adds the <start>/<end> markers).
nlp = spacy.load("en_core_web_sm")
bt_vector = []
bt_list = [clean_text(nlp(body)) for body in data['bodytext']]
ab_list = [clean_text(nlp(abstract_text)) for abstract_text in data['abstract']]

# One shared vocabulary, fitted on the abstracts followed by the body texts.
# filters='' because clean_text has already removed punctuation.
com_list = ab_list + bt_list
bt_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
bt_tokenizer.fit_on_texts(com_list)
data_bt = bt_tokenizer.texts_to_sequences(bt_list)
data_ab = bt_tokenizer.texts_to_sequences(ab_list)

# Body texts and abstracts are both padded (at the end) to the length of the
# single longest sequence, so inputs and targets share one time dimension.
longest_seq = max(len(sequence) for sequence in data_bt + data_ab)
pad_options = dict(padding='post', maxlen=longest_seq)
data_bt = tf.keras.preprocessing.sequence.pad_sequences(data_bt, **pad_options)
data_ab = tf.keras.preprocessing.sequence.pad_sequences(data_ab, **pad_options)

In [5]:
def max_len(tensor):
    """Return the length of the longest element in ``tensor``.

    Args:
        tensor: a non-empty iterable whose elements all support ``len()``.

    Returns:
        int: the largest ``len(element)``.

    Raises:
        ValueError: if ``tensor`` is empty.
    """
    return max(map(len, tensor))

In [6]:
# Hold out 20% of the papers for testing. The split is seeded so that the same
# papers land in train/test on every run; without a seed the partition (and
# therefore every reported score) changes each time the cell is executed.
SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    data_bt, data_ab, test_size=0.2, random_state=SEED
)

# Training configuration.
BATCH_SIZE = 5
BUFFER_SIZE = len(X_train)  # number of training examples
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# Model dimensions.
embedding_dims = 256
rnn_units = 1024
dense_units = 1024
Dtype = tf.float32

# TODO: optional tf.data input pipeline, not used yet:
# dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [7]:
# Number of distinct token ids the model must handle: one per word known to
# the tokenizer, plus one extra id (presumably 0, the value pad_sequences
# pads with by default - confirm against the Keras Tokenizer docs).
vocab_size = 1 + len(bt_tokenizer.word_index)


In [8]:
def initialize_initial_state():
    """Return a list of two all-zero tensors of shape (BATCH_SIZE, rnn_units).

    NOTE(review): presumably meant as the initial hidden and cell state of an
    LSTM - confirm once it is actually passed to a layer.
    """
    state_shape = (BATCH_SIZE, rnn_units)
    return [tf.zeros(state_shape) for _ in range(2)]


encoder_initial_cell_state = initialize_initial_state()

In [9]:
# Build the network: embedding -> LSTM -> per-timestep vocabulary logits.
inputs = keras.Input(shape=(longest_seq,))
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dims)
recurrent_layer = layers.LSTM(rnn_units, return_sequences=True, return_state=True)
projection_layer = layers.Dense(vocab_size)

emb = embedding_layer(inputs)
# With return_state=True the LSTM call yields several outputs; element 0 is
# the per-timestep sequence output, which is the only one used here.
lstm = recurrent_layer(emb)
dense = projection_layer(lstm[0])
print(dense)

# TODO: the encoder/decoder variant is not wired in yet. The earlier sketch
# fed [lstm[1], lstm[2]] as initial_state to a second LSTM over separate
# decoder inputs, followed by a softmax Dense layer, and built the model from
# [inputs, decoder_inputs].

model = keras.Model(inputs=inputs, outputs=dense)
print(inputs.shape, dense.shape)

# The Dense layer has no activation, so the loss is told it receives logits.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    loss=loss_fn,
    optimizer=keras.optimizers.Adam(),
    metrics=["sparse_categorical_accuracy"],  # TODO: choose a better metric (AUC)
)

history = model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=15,
    validation_split=0.2,
)

# evaluate() returns the loss followed by the compiled metrics, in that order.
test_scores = model.evaluate(X_test, Y_test, verbose=1)
for label, score in zip(("Test loss:", "Test accuracy:"), test_scores):
    print(label, score)

Tensor("dense/Identity:0", shape=(None, 7143, 6282), dtype=float32)
(None, 7143) (None, 7143, 6282)
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15


KeyboardInterrupt: 

In [10]:
# Print each layer's output shape and parameter count for the model built above.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 7143)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 7143, 256)         1608192   
_________________________________________________________________
lstm (LSTM)                  [(None, 7143, 1024), (Non 5246976   
_________________________________________________________________
dense (Dense)                (None, 7143, 6282)        6439050   
Total params: 13,294,218
Trainable params: 13,294,218
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Run the model on the first padded body-text sequence. A leading axis is
# added so the single sequence is passed as a batch of one.
p_data = np.expand_dims(data_bt[0], axis=0)
summary = model.predict(p_data)

In [None]:
indexes = tf.math.argmax(summary, axis=2).numpy()
words = [bt_tokenizer.index_word[y] for x in indexes for y in x]
" ".join(words)