# Evaluation using the SNPPhenA corpus

SNPPhenA is a corpus for extracting ranked associations of single-nucleotide polymorphisms and phenotypes from the literature.


#  -------------------------------------------------------------------------------------------

# imports

In [3]:
import tensorflow as tf
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras_tqdm import TQDMNotebookCallback
import numpy as np
np.random.seed(1337)
from keras_tqdm import TQDMNotebookCallback
import nltk
import xml.etree.ElementTree as ET
import pandas as pd
import os
import string
from nltk.tokenize import TreebankWordTokenizer
from numpy.random import random_sample
import re
import pickle
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from keras.layers import Embedding, Flatten,LSTM
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation,  Input, merge,Conv1D,MaxPooling1D,GlobalMaxPooling1D,Convolution1D
from keras import regularizers
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from keras.layers import Concatenate, concatenate
from keras import backend as K
from keras.layers import multiply
from keras.layers import merge
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *
random_seed=1337

Using TensorFlow backend.


### Define the metric function (F1) reported during training

In [4]:
from keras import backend as K

def f1(y_true, y_pred):
    """Return an F1 score computed with Keras backend operations.

    Every count is obtained by clipping a tensor to [0, 1], rounding it and
    summing all of its elements:

    * true positives      -- from the element-wise product ``y_true * y_pred``
    * predicted positives -- from ``y_pred``
    * possible positives  -- from ``y_true``

    ``K.epsilon()`` is added to each denominator so that an empty count does
    not cause a division by zero.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor, multiplied element-wise with ``y_true``.

    Returns:
        ``2 * (precision * recall) / (precision + recall + K.epsilon())``.
    """
    def _rounded_total(tensor):
        # Clip into [0, 1], round, then add up every element.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_total(y_true * y_pred)
    precision_value = hits / (_rounded_total(y_pred) + K.epsilon())
    recall_value = hits / (_rounded_total(y_true) + K.epsilon())

    ratio = (precision_value * recall_value) / (precision_value + recall_value + K.epsilon())
    return 2 * ratio


# Experiments to reproduce the results of Table 9 

### Load preprocessed data

In [5]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this cell at pickle files from a trusted source.
# Alternative data file that was previously used here:
#   '../../SNP-Disease/train_and_test_data_sentences_snp_2classWiki.pickle'
with open('../data/pickles/train_and_test_data_sentences_snp_2class.pickle', 'rb') as handle:
    # Sixteen objects are read back-to-back from the same handle; the order of
    # the names below is the order in which they are read and must not change.
    (W_train, d1_train, d2_train, Y_train, Tr_word_list,
     W_test, d1_test, d2_test, Y_test, Te_word_list,
     word_vectors, word_dict, d1_dict, d2_dict, label_dict,
     MAX_SEQUENCE_LENGTH) = [pickle.load(handle) for _ in range(16)]

### Prepare Word Embedding Layer

In [6]:
# The loaded word vectors are used directly as the embedding weight matrix;
# the embedding width is that matrix's second dimension.
embedding_matrix = word_vectors
EMBEDDING_DIM = embedding_matrix.shape[1]

def create_embedding_layer(l2_reg=0.1, use_pretrained=True, is_trainable=False):
    """Create the Keras ``Embedding`` layer used as the model's first layer.

    The layer always has ``len(word_dict)`` rows, ``EMBEDDING_DIM`` columns and
    an input length of ``MAX_SEQUENCE_LENGTH``.

    Args:
        l2_reg: factor passed to ``regularizers.l2`` for the embeddings
            regulariser. Only used when ``use_pretrained`` is true.
        use_pretrained: when true, the layer is initialised from
            ``embedding_matrix``; otherwise a plain ``Embedding`` layer is
            returned with no weights, ``trainable`` flag or regulariser given.
        is_trainable: value of the layer's ``trainable`` flag. Only used when
            ``use_pretrained`` is true.

    Returns:
        The configured ``Embedding`` layer.
    """
    vocabulary_size = len(word_dict)

    if not use_pretrained:
        return Embedding(vocabulary_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)

    pretrained_options = {
        'weights': [embedding_matrix],
        'input_length': MAX_SEQUENCE_LENGTH,
        'trainable': is_trainable,
        'embeddings_regularizer': regularizers.l2(l2_reg),
    }
    return Embedding(vocabulary_size, EMBEDDING_DIM, **pretrained_options)
            

### Create the Model

In [7]:
def build_model():
    """Build and compile the two-branch (convolutional + LSTM) classifier.

    Both branches read the same embedded input sequence:

    * Convolutional branch: two ``Conv1D -> MaxPooling1D(3) -> Dropout(0.5)``
      stages (256 filters of width 7, then 128 filters of width 5), followed
      by ``GlobalMaxPooling1D``.
    * Recurrent branch: two 100-unit LSTMs, the second created with
      ``go_backwards=True``, whose outputs are concatenated.

    The two branch outputs are concatenated and passed through a 256-unit
    L2-regularised dense layer with dropout and a 2-unit softmax output.

    Returns:
        The model, compiled with the ``binary_crossentropy`` loss, the
        ``adam`` optimizer and the metrics ``'acc'`` and ``f1``.
    """
    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedder = create_embedding_layer(use_pretrained=True, is_trainable=False)
    embedded_tokens = embedder(token_ids)

    # Convolutional branch: identical stage layout, different filter settings.
    conv_features = embedded_tokens
    for filter_count, kernel_width in ((256, 7), (128, 5)):
        conv_features = Conv1D(filter_count, kernel_width, activation='relu')(conv_features)
        conv_features = MaxPooling1D(3)(conv_features)
        conv_features = Dropout(0.5)(conv_features)
    conv_features = GlobalMaxPooling1D()(conv_features)

    # Recurrent branch: one LSTM per reading direction, each with its own weights.
    lstm_forward = LSTM(100, recurrent_dropout=0.05)(embedded_tokens)
    lstm_backward = LSTM(100, go_backwards=True, recurrent_dropout=0.05)(embedded_tokens)
    recurrent_features = concatenate([lstm_forward, lstm_backward])

    # Classification head on top of both branches.
    combined_features = concatenate([conv_features, recurrent_features])
    hidden = Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.05))(combined_features)
    hidden = Dropout(0.5)(hidden)
    class_probabilities = Dense(2, activation='softmax')(hidden)

    classifier = Model(token_ids, class_probabilities)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc', f1])
    return classifier


### Run the evaluation on the test dataset (results averaged over 10 executions)

In [8]:
# Averaging mode handed to precision_recall_fscore_support.
param='macro'
def train_and_evaluate_model_lstm(epochs=20, batch_size=32):
    """Train a freshly built model and score it on the test split.

    A new model is obtained from ``build_model()`` on every call, fitted on
    ``(W_train, Y_train)`` and then used to predict ``W_test``. Predicted and
    reference labels are taken as the arg-max over axis 1 and compared with
    ``precision_recall_fscore_support`` using the module-level ``param``
    averaging mode.

    Note that ``(W_test, Y_test)`` is also passed as ``validation_data``, so
    the validation figures stored in the returned history are measured on the
    test split.

    Args:
        epochs: number of training epochs (default 20, the previously
            hard-coded value).
        batch_size: mini-batch size used for fitting (default 32, the
            previously hard-coded value).

    Returns:
        Tuple ``(history, precision, recall, fscore)`` where ``history`` is
        the object returned by ``model.fit`` and the three scores are the
        values returned by ``precision_recall_fscore_support``.
    """
    # build_model() returns a brand-new model, so no explicit reset of the
    # local variable is needed before this assignment.
    model = build_model()
    history = model.fit(
        W_train, Y_train,
        epochs=epochs,
        validation_data=(W_test, Y_test),
        batch_size=batch_size,
        verbose=1,
        callbacks=[TQDMNotebookCallback()],
    )

    # Collapse the two-column outputs/targets to class indices.
    predicted = np.argmax(model.predict(W_test), axis=1)
    y_test_to_label = np.argmax(Y_test, axis=1)

    prec, reca, fscore, sup = precision_recall_fscore_support(y_test_to_label, predicted, average=param)
    print("Precision:{:.2f}% Recall:{:.2f}% Fscore:{:.2f}% ".format(prec*100, reca*100, fscore*100))
    return history, prec, reca, fscore


# One entry is appended to each accumulator per training run.
hists, precission, recall, fscores = [], [], [], []

# NOTE(review): the loop currently performs a single run (range(1)).
for i in range(1):
    hist, prec, reca, fscore = train_and_evaluate_model_lstm()
    for _bucket, _item in ((hists, hist), (precission, prec), (recall, reca), (fscores, fscore)):
        _bucket.append(_item)

# Report the mean of each score over all runs, as percentages.
print("Final Precision:{:.2f}% Recall:{:.2f}% Fscore:{:.2f}% ".format(
    *(np.average(_scores) * 100 for _scores in (precission, recall, fscores))))

Train on 935 samples, validate on 365 samples


Training:   0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1/20


Epoch 0:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 2/20


Epoch 1:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 3/20


Epoch 2:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 4/20


Epoch 3:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 5/20


Epoch 4:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 6/20


Epoch 5:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 7/20


Epoch 6:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 8/20


Epoch 7:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 9/20


Epoch 8:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 10/20


Epoch 9:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 11/20


Epoch 10:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 12/20


Epoch 11:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 13/20


Epoch 12:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 14/20


Epoch 13:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 15/20


Epoch 14:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 16/20


Epoch 15:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 17/20


Epoch 16:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 18/20


Epoch 17:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 19/20


Epoch 18:   0%|          | 0/935 [00:00<?, ?it/s]

Epoch 20/20


Epoch 19:   0%|          | 0/935 [00:00<?, ?it/s]

Precision:76.11% Recall:75.87% Fscore:75.33% 
Final Precision:76.11% Recall:75.87% Fscore:75.33% 
