# Imports

In [None]:
import numpy as np
import pandas as pd
import os
from collections import Counter
import pickle
from itertools import chain

import gc
import warnings
warnings.simplefilter('ignore')

from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

import tensorflow as tf
from keras.models import Model, Sequential
from keras.layers.recurrent import LSTM
from keras.layers import Embedding, Dense, Input, RepeatVector, TimeDistributed, concatenate, add, Dropout
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

%config InteractiveShell.ast_node_interactivity = 'all'

### Necessary for tf to allocate GPU memory on demand (only as much as it needs), instead of reserving everything that is available

In [None]:
# Build a TF session configuration with GPU memory growth switched on and
# open a session that uses it.
config = tf.ConfigProto()
# allow_growth asks TensorFlow to grow its GPU allocation as needed.
config.gpu_options.allow_growth = True
# NOTE(review): this session is not explicitly handed to Keras anywhere in the
# visible code -- confirm the setting actually applies to the model below.
session = tf.Session(config=config)

# Helper Functions

In [None]:
def extract_embeddings(w2v_model_path, embeddings_path):
    """Load a previously trained word2vec model and save its embedding
    weights/vectors to disk as a numpy matrix.

    Args:
        w2v_model_path: path of the saved gensim Word2Vec model.
        embeddings_path: file the weight matrix is written to (``np.save``
            format). The name is used verbatim: because a file object is
            passed to ``np.save``, no ``.npy`` suffix is appended.
    """
    model = Word2Vec.load(w2v_model_path)
    weights = model.wv.syn0

    # Context manager so the handle is flushed and closed deterministically
    # instead of being left open until garbage collection.
    with open(embeddings_path, 'wb') as fh:
        np.save(fh, weights)


def update_dicts(w2v_model_path):
    """Build word->index and index->word mappings from a word2vec vocabulary.

    Every vocabulary word keeps the index stored on its vocab entry. The
    special '<unk>' token is then assigned the next index, i.e. the size of
    the mapping before it was added.

    Args:
        w2v_model_path: path of the saved gensim Word2Vec model.

    Returns:
        Tuple ``(word2idx, idx2word)`` of dictionaries.
    """
    vocab = Word2Vec.load(w2v_model_path).wv.vocab

    word2idx = {word: entry.index for word, entry in vocab.items()}
    word2idx['<unk>'] = len(word2idx)

    idx2word = {index: word for word, index in word2idx.items()}

    return word2idx, idx2word


def create_embeddings_layer(embeddings_path):
    """Create a frozen Keras Embedding layer from a saved weights matrix.

    Args:
        embeddings_path: file holding the matrix in ``np.save`` format.

    Returns:
        An ``Embedding`` layer whose input/output dimensions are the two
        dimensions of the matrix, initialised with it and not trainable.
    """
    # Context manager so the file handle is closed as soon as the array has
    # been read instead of leaking until garbage collection.
    with open(embeddings_path, 'rb') as fh:
        weights = np.load(fh)

    layer = Embedding(input_dim=weights.shape[0],
                      output_dim=weights.shape[1],
                      weights=[weights], trainable=False)
    print("(vocab_size, output_dim) : ", weights.shape)
    return layer

# Retrieve the data needed to build the embedding layer
abs_model_path = 'PubMed_200k_RCT_model_10000'  # trained word2vec model
abs_embed_path = 'Abstracts_embeddings.plz'  # where the weight matrix is saved

# Abstracts: dump the word2vec weights to disk
extract_embeddings(abs_model_path, abs_embed_path)

# Abstracts: word <-> id lookup dictionaries
ABS_word2idx, ABS_idx2word = update_dicts(abs_model_path)

# Number of entries in the dictionary
print("Dictionary Length: ", len(ABS_idx2word))

def loadW2V(w2vPath):
    """Load and return the gensim Word2Vec model saved at ``w2vPath``."""
    from gensim.models import Word2Vec

    return Word2Vec.load(w2vPath)

def transform_input_text(texts, abs_model_path):
    """Turn raw input texts into fixed-length sequences of word embeddings.

    Each text is lower-cased, split on single spaces and cut to
    ``max_input_seq_length`` tokens. Every token is replaced by its word2vec
    vector; tokens missing from the model all share one random vector drawn
    for this call. Unused trailing positions stay zero.

    Args:
        texts: iterable of strings.
        abs_model_path: path of the saved gensim Word2Vec model.

    Returns:
        float32 array of shape
        ``(len(texts), max_input_seq_length, EMBEDDING_SIZE)``.
    """
    from keras.preprocessing.sequence import pad_sequences

    # NOTE(review): the unknown-word vector is re-drawn on every call, so it
    # differs between calls unless the numpy RNG is seeded.
    unknown_emb = np.random.rand(EMBEDDING_SIZE)
    # Go through the KeyedVectors (`.wv`): membership tests and lookups on
    # the Word2Vec model object itself are deprecated in gensim.
    vectors = loadW2V(abs_model_path).wv

    encoded = []
    for line in texts:
        x = np.zeros(shape=(max_input_seq_length, EMBEDDING_SIZE))
        tokens = line.lower().split(' ')[:max_input_seq_length]
        for idx, word in enumerate(tokens):
            x[idx, :] = vectors[word] if word in vectors else unknown_emb
        encoded.append(x)

    # pad_sequences defaults to dtype='int32', which would truncate the
    # real-valued embeddings to integers; keep them as floats.
    encoded = pad_sequences(encoded, maxlen=max_input_seq_length,
                            dtype='float32')

    print(encoded.shape)
    return encoded

def transform_target_encoding(texts):
    """Tokenise the target texts and wrap them in START/END markers.

    Each text is lower-cased, surrounded by 'START' and 'END' and split on
    single spaces. At most ``max_target_seq_length - 1`` tokens (but always at
    least one) are kept, so on long texts the trailing 'END' is cut off.

    Args:
        texts: iterable of strings.

    Returns:
        numpy object array with one token sequence per input text.
    """
    # Same cut-off as appending token by token and stopping once
    # len(tokens) + 1 >= max_target_seq_length.
    keep = max(1, max_target_seq_length - 1)

    encoded = [('START ' + line.lower() + ' END').split(' ')[:keep]
               for line in texts]

    # The sequences usually differ in length; ask for an object array
    # explicitly because newer NumPy versions refuse to build one implicitly
    # from ragged input.
    return np.array(encoded, dtype=object)

#defaults for params

MAX_INPUT_SEQ_LENGTH = 500    # cap on tokens counted per input text
MAX_TARGET_SEQ_LENGTH = 50    # cap on tokens counted per target text (incl. START/END)
MAX_INPUT_VOCAB_SIZE = 5000   # most frequent input words kept in the vocabulary
MAX_TARGET_VOCAB_SIZE = 2000  # most frequent target words kept in the vocabulary


def fit_text(X, Y, input_seq_max_length=None, target_seq_max_length=None):
    """Build the word<->id lookups and size statistics for inputs and targets.

    Input texts are split on single spaces and lower-cased; target texts are
    lower-cased and wrapped in 'START' / 'END' before splitting. Texts longer
    than the given caps are truncated before counting.

    Args:
        X: iterable of input strings.
        Y: iterable of target strings.
        input_seq_max_length: token cap per input text; defaults to
            ``MAX_INPUT_SEQ_LENGTH``.
        target_seq_max_length: token cap per target text; defaults to
            ``MAX_TARGET_SEQ_LENGTH``.

    Returns:
        dict with the keys 'input_word2idx', 'input_idx2word',
        'target_word2idx', 'target_idx2word', 'num_input_tokens',
        'num_target_tokens', 'max_input_seq_length' and
        'max_target_seq_length'. Input ids reserve 0 for 'PAD' and 1 for
        'UNK'; target ids reserve 0 for 'UNK'.
    """
    if input_seq_max_length is None:
        input_seq_max_length = MAX_INPUT_SEQ_LENGTH
    if target_seq_max_length is None:
        target_seq_max_length = MAX_TARGET_SEQ_LENGTH

    input_counter = Counter()
    target_counter = Counter()
    max_input_seq_length = 0
    max_target_seq_length = 0

    for line in X:
        tokens = [word.lower() for word in line.split(' ')]
        tokens = tokens[:input_seq_max_length]
        input_counter.update(tokens)
        max_input_seq_length = max(max_input_seq_length, len(tokens))

    for line in Y:
        tokens = ('START ' + line.lower() + ' END').split(' ')
        tokens = tokens[:target_seq_max_length]
        target_counter.update(tokens)
        # Once per line is enough; the value does not change per word.
        max_target_seq_length = max(max_target_seq_length, len(tokens))

    # Ids 0 and 1 are reserved, so real input words start at 2.
    input_word2idx = {
        word: idx
        for idx, (word, _) in enumerate(
            input_counter.most_common(MAX_INPUT_VOCAB_SIZE), start=2)
    }
    input_word2idx['PAD'] = 0
    input_word2idx['UNK'] = 1
    input_idx2word = {idx: word for word, idx in input_word2idx.items()}

    # Id 0 is reserved for unknown words, so real target words start at 1.
    target_word2idx = {
        word: idx
        for idx, (word, _) in enumerate(
            target_counter.most_common(MAX_TARGET_VOCAB_SIZE), start=1)
    }
    target_word2idx['UNK'] = 0
    target_idx2word = {idx: word for word, idx in target_word2idx.items()}

    return {
        'input_word2idx': input_word2idx,
        'input_idx2word': input_idx2word,
        'target_word2idx': target_word2idx,
        'target_idx2word': target_idx2word,
        'num_input_tokens': len(input_word2idx),
        'num_target_tokens': len(target_word2idx),
        'max_input_seq_length': max_input_seq_length,
        'max_target_seq_length': max_target_seq_length,
    }

# Get the data

Note that, due to size constraints, the two pickle files needed to run the cell below were not uploaded; they can be generated with the 'preproc' notebook after downloading the raw data.

In [None]:
from itertools import chain

# Load the pickled titles and abstracts.
# NOTE(review): both objects are used as dictionaries below (`.values()`);
# their exact schema comes from the preprocessing step -- confirm there.
with open('titlesAbstracts.pkl', 'rb') as fh:
    titles = pickle.load(fh)

with open('abstractsCorpus.pkl', 'rb') as fh:
    text = pickle.load(fh)

# Flatten the nested pieces of every abstract into one space-separated string.
textConcat = [' '.join(chain.from_iterable(each)) for each in text.values()]

# Full data set, plus the first 10000 examples used for training/validation.
Xfull = textConcat
Yfull = list(titles.values())

X = Xfull[:10000]
Y = Yfull[:10000]

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

# set up the model

### Params:

In [None]:
# Read the saved embedding matrix; the context manager closes the file handle
# as soon as the array has been loaded.
with open(abs_embed_path, 'rb') as fh:
    weights = np.load(fh)
vocab_size = weights.shape[0]
EMBEDDING_SIZE = weights.shape[1]

# Initial values only: the four settings below are overwritten further down
# with the ones computed by fit_text().
max_input_seq_length = 500
num_input_tokens = vocab_size
num_target_tokens = vocab_size
max_target_seq_length = 50

HIDDEN_UNITS = 100
default_batch_size = 64
default_epochs = 2
verbose = 1

# Vocabulary lookups and sequence statistics derived from the data subset.
conf = fit_text(X, Y)

input_word2idx = conf['input_word2idx']
input_idx2word = conf['input_idx2word']
target_word2idx = conf['target_word2idx']
target_idx2word = conf['target_idx2word']
num_input_tokens = conf['num_input_tokens']
num_target_tokens = conf['num_target_tokens']
max_input_seq_length = conf['max_input_seq_length']
max_target_seq_length = conf['max_target_seq_length']

### Model:

In [None]:
# --- Encoder: consumes a sequence of EMBEDDING_SIZE-wide vectors (sequence
# length left unspecified) and exposes only its final LSTM states.
encoder_inputs = Input(shape=(None, EMBEDDING_SIZE), name='encoder_inputs')
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs)
encoder_states = [encoder_state_h, encoder_state_c]

# --- Decoder: consumes a sequence of num_target_tokens-wide vectors and is
# initialised with the encoder states; returns the full output sequence.
decoder_inputs = Input(shape=(None, num_target_tokens), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Softmax over the target vocabulary at every decoder step.
decoder_dense = Dense(units=num_target_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (encoder input, decoder input) -> decoder output distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# --- Inference models, built from the same layers as the training model.
# Encoder side: input sequence -> [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder side: the LSTM states are fed in explicitly and returned again, so
# they can be passed from one predict() call to the next.
decoder_state_inputs = [Input(shape=(HIDDEN_UNITS,)), Input(shape=(HIDDEN_UNITS,))]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)

model.summary()

In [None]:
def generate_batch(x_samples, y_samples, batch_size):
    """Endlessly yield training batches of ``batch_size`` examples.

    Only full batches are produced; trailing samples that do not fill a batch
    are skipped. After the last batch the generator starts over.

    Args:
        x_samples: encoder inputs, indexable by sample; each batch slice is
            passed through ``pad_sequences`` to ``max_input_seq_length``.
        y_samples: target token sequences aligned with ``x_samples``.
        batch_size: number of samples per batch.

    Yields:
        ``([encoder_input, decoder_input], decoder_target)`` where the decoder
        arrays are one-hot encoded with shape
        ``(batch_size, max_target_seq_length, num_target_tokens)`` and the
        target is the decoder input shifted one step to the left.

    Raises:
        ValueError: if there are fewer samples than ``batch_size`` (the loop
            would otherwise spin forever without yielding anything).
    """
    num_batches = len(x_samples) // batch_size
    if num_batches == 0:
        raise ValueError(
            'generate_batch needs at least batch_size=%d samples, got %d'
            % (batch_size, len(x_samples)))

    one_hot_shape = (batch_size, max_target_seq_length, num_target_tokens)
    while True:
        for batch_idx in range(num_batches):
            start = batch_idx * batch_size
            end = start + batch_size
            # dtype must be given explicitly: pad_sequences defaults to
            # 'int32', which would truncate real-valued embeddings.
            encoder_input_data_batch = pad_sequences(
                x_samples[start:end], max_input_seq_length, dtype='float32')
            decoder_target_data_batch = np.zeros(shape=one_hot_shape)
            decoder_input_data_batch = np.zeros(shape=one_hot_shape)
            for line_idx, target_words in enumerate(y_samples[start:end]):
                for idx, w in enumerate(target_words):
                    w2idx = target_word2idx.get(w, 0)  # 0 is the id of 'UNK'
                    # Unknown words are left as all-zero rows rather than
                    # being one-hot encoded at index 0.
                    if w2idx == 0:
                        continue
                    decoder_input_data_batch[line_idx, idx, w2idx] = 1
                    if idx > 0:
                        # Target at step t is the input token of step t + 1.
                        decoder_target_data_batch[line_idx, idx - 1, w2idx] = 1
            yield [encoder_input_data_batch, decoder_input_data_batch], decoder_target_data_batch

In [None]:
def fit(Xtrain, Ytrain, Xtest, Ytest, epochs=None, batch_size=None, model_dir_path=None):
    """Train the module-level ``model`` and return the training history.

    Args:
        Xtrain, Ytrain: raw training texts and their target texts.
        Xtest, Ytest: raw validation texts and their target texts.
        epochs: number of epochs; defaults to ``default_epochs``.
        batch_size: samples per batch; defaults to ``default_batch_size``.
        model_dir_path: path used for both the checkpoint callback and the
            final ``model.save``; defaults to 'models_ATsigns'.

    Returns:
        The history object returned by ``model.fit_generator`` so the
        training statistics can be monitored.
    """
    # The file defines the defaults in lower case; the upper-case names used
    # previously were never defined and raised NameError when needed.
    if epochs is None:
        epochs = default_epochs
    if model_dir_path is None:
        model_dir_path = 'models_ATsigns'
    if batch_size is None:
        batch_size = default_batch_size

    # Honour the caller-supplied path instead of a hard-coded one.
    checkpoint = ModelCheckpoint(model_dir_path)

    Ytrain = transform_target_encoding(Ytrain)
    Ytest = transform_target_encoding(Ytest)

    Xtrain = transform_input_text(Xtrain, abs_model_path)
    Xtest = transform_input_text(Xtest, abs_model_path)

    train_gen = generate_batch(Xtrain, Ytrain, batch_size)
    test_gen = generate_batch(Xtest, Ytest, batch_size)

    # Only full batches are generated, hence the floor division.
    train_num_batches = len(Xtrain) // batch_size
    test_num_batches = len(Xtest) // batch_size

    history = model.fit_generator(generator=train_gen,
                                  steps_per_epoch=train_num_batches,
                                  epochs=epochs,
                                  verbose=1,
                                  validation_data=test_gen,
                                  validation_steps=test_num_batches,
                                  callbacks=[checkpoint])
    model.save(model_dir_path)
    return history

In [None]:
def summarize(input_text):
    """Generate a summary for ``input_text`` by greedy decoding.

    The text is embedded word by word, encoded into LSTM states, and the
    decoder is then sampled one token at a time (always the most probable
    one) until 'END' is produced or ``max_target_seq_length`` tokens have
    been generated.

    Args:
        input_text: raw input string.

    Returns:
        The generated words joined by single spaces, without the 'START' and
        'END' markers.
    """
    # NOTE(review): the unknown-word vector is drawn at random on every call,
    # so it is not the vector used for unknown words during training.
    unknown_emb = np.random.rand(EMBEDDING_SIZE)
    # Use the KeyedVectors (`.wv`): membership tests and lookups on the
    # Word2Vec model object itself are deprecated in gensim.
    vectors = loadW2V(abs_model_path).wv

    input_seq = np.zeros(shape=(1, max_input_seq_length, EMBEDDING_SIZE))
    tokens = input_text.lower().split(' ')[:max_input_seq_length]
    for idx, word in enumerate(tokens):
        input_seq[0, idx, :] = vectors[word] if word in vectors else unknown_emb

    states_value = encoder_model.predict(input_seq)

    # Decoding starts from a one-hot 'START' token.
    target_seq = np.zeros((1, 1, num_target_tokens))
    target_seq[0, 0, target_word2idx['START']] = 1

    generated = []
    target_text_len = 0
    terminated = False
    while not terminated:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sample_token_idx = np.argmax(output_tokens[0, -1, :])
        sample_word = target_idx2word[sample_token_idx]
        target_text_len += 1

        if sample_word != 'START' and sample_word != 'END':
            generated.append(sample_word)

        if sample_word == 'END' or target_text_len >= max_target_seq_length:
            terminated = True

        # Feed the sampled token and the updated states back into the decoder.
        target_seq = np.zeros((1, 1, num_target_tokens))
        target_seq[0, 0, sample_token_idx] = 1
        states_value = [h, c]

    return ' '.join(generated).strip()

# Train

In [None]:
# Train for 5 epochs with batches of 4 samples; `history` is whatever fit()
# returns (the training history).
history = fit(Xtrain, Ytrain, Xtest, Ytest, epochs=5, batch_size=4)

# Predict

Note that the prediction here is mostly useless and just for show as the model progresses

In [None]:
# Fixed settings for this cell
HIDDEN_UNITS = 100
MAX_DECODER_SEQ_LENGTH = 4

# Recompute the vocabulary lookups and sequence statistics from the data
# subset and publish them under the module-level names the model code reads.
conf = fit_text(X, Y)

input_word2idx, input_idx2word = conf['input_word2idx'], conf['input_idx2word']
target_word2idx, target_idx2word = conf['target_word2idx'], conf['target_idx2word']
num_input_tokens, num_target_tokens = conf['num_input_tokens'], conf['num_target_tokens']
max_input_seq_length = conf['max_input_seq_length']
max_target_seq_length = conf['max_target_seq_length']

In [None]:
from random import randint

# Pick a random example from beyond the first 10000 (the slice used as X / Y).
# randint() includes both end points, so the upper bound has to be the last
# valid index; len(Xfull) itself would raise an IndexError below.
c = randint(10000, len(Xfull) - 1)

textPredict = Xfull[c]
labelPredict = Yfull[c]

# Show the reference title, then the generated one.
labelPredict
summarize(textPredict)