In [17]:
from tqdm import tqdm
from os import listdir
import pandas as pd
from xml.dom.minidom import parse
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

stopwords = set(stopwords.words('english'))

output_path_name = "task9.2_raquel_6.txt"

output_path = "evaluations/" + output_path_name
results_path = output_path.replace('.txt', '_All_scores.log')
datadir = '../../data/Test-DDI/DrugBank'
training_data = '/home/raquel/Documents/mai/ahlt/data/Train/All'
train_df_path = '/home/raquel/Documents/mai/ahlt/data/DF/train.csv'

In [18]:

import numpy as np

from keras.layers import Dense, Input, Flatten, Reshape, concatenate, Dropout
from keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model
from keras import optimizers
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import sys



In [16]:
train_df = pd.read_csv('saved_train_nice.csv', index_col=0)

sentences = train_df.sentence_text.values
y = train_df['relation_type'].values

y_binary = ['none' if i == 'none' else 'interaction 'for i in y_train]


sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000,stratify=y)


tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_train)
word_index = tokenizer.word_index


X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)


vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
max_s = [len(x) for x in X_train]
maxlen = np.max(max_s)

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [22]:
from gensim.models import KeyedVectors

fname='/home/raquel/Documents/mai/ahlt/data/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(fname, binary=True)


len(model.wv.vocab)

In [40]:
def get_weight_matrix(w2v,vocab):
    # define weight matrix dimensions with all 0
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, w2v.vector_size))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        try:
            weight_matrix[i] = w2v[word]
        except:
            print(word)
    return weight_matrix

embedding_vectors = get_weight_matrix(model, tokenizer.word_index)
# emb_layer = Embedding(vocab_size, output_dim=w2v.vector_size, weights=[embedding_vectors], input_length=FIXED_LENGTH, trainable=False)


and
of
a
to
rsb
lsb
clofibrate
tolbutamide
mefenamic
methyldopa
aminosalicylic
diflunisal
sulfinpyrazone
chlorpropamide
bromelains
ethacrynic
oxyphenbutazone
fenoprofen
diazoxide
nalidixic
chymotrypsin
cinchophen
dextrothyroxine
oxolinic
phenyramidol
pyrazolones
triclofos
delavirdine
dicumarol
viracept
felbamate
felbatol
crixivan
tegretol
prelone
invirase
fortovase
norvir
agenerase
kaletra
theodur
rifadin
astramorph
kadian
sporanox
nizoral
trileptal
sandimmune
grispeg
neoral
atromids
cypa
ethionamide
iodinecontaining
nsaids
aminoglutethimide
orudis
pertechnetate
troleandomycin
badrenergic
paminosalicylic
cholestographic
mitotane
disopyramide
betablockers
glutethimide
quinupristin
acetylsalicylic
indocin
lodine
trimeprazine
temaril
triflupromazine
vesprin
stelazine
trifluoperazine
mellaril
nuprin
anaprox
compazine
videx
oruvail
voltaren
cataflam
relafen
oxaprozin
daypro
mgkg
promazine
sparine
trimethoprimsulfamethoxazole
hantagonists
ethchlorvynol
tsh
valproic
angiotensinconverting
symp

In [20]:
def kimCNN(EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, MAX_NB_WORDS, word_index, num_labels,loss='binary_crossentropy'):
    """
    Convolution neural network model for sentence classification.
    Parameters
    ----------
    EMBEDDING_DIM: Dimension of the embedding space.
    MAX_SEQUENCE_LENGTH: Maximum length of the sentence.
    MAX_NB_WORDS: Maximum number of words in the vocabulary.
    embeddings_index: A dict containing words and their embeddings.
    word_index: A dict containing words and their indices.
    labels_index: A dict containing the labels and their indices.
    Returns
    -------
    compiled keras model
    """
    print('Preparing embedding matrix.')
    
    MAX_SEQUENCE_LENGTH = maxlen
    num_words = vocab_size
    embedding_layer = Embedding(input_dim=num_words,
                                output_dim=EMBEDDING_DIM,
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)


    print('Training model.')

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    print(embedded_sequences.shape)


    # add first conv filter
    embedded_sequences = Reshape((MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, 1))(embedded_sequences)
    x = Conv2D(100, (5, EMBEDDING_DIM), activation='relu')(embedded_sequences)
    x = MaxPooling2D((MAX_SEQUENCE_LENGTH - 5 + 1, 1))(x)


    # add second conv filter.
    y = Conv2D(100, (4, EMBEDDING_DIM), activation='relu')(embedded_sequences)
    y = MaxPooling2D((MAX_SEQUENCE_LENGTH - 4 + 1, 1))(y)


    # add third conv filter.
    z = Conv2D(100, (3, EMBEDDING_DIM), activation='relu')(embedded_sequences)
    z = MaxPooling2D((MAX_SEQUENCE_LENGTH - 3 + 1, 1))(z)


    # concate the conv layers
    alpha = concatenate([x,y,z])

    # flatted the pooled features.
    alpha = Flatten()(alpha)

    # dropout
    alpha = Dropout(0.5)(alpha)

    # predictions
    preds = Dense(num_labels, activation='softmax')(alpha)

    # build model
    model = Model(sequence_input, preds)
    adadelta = optimizers.Adadelta()
        
    model.compile(loss='categorical_crossentropy',
                  optimizer=adadelta,
                  metrics=['acc'])
    model.summary()

    return model

In [24]:
word_embedding_size = 200
word_pos = 20
model = kimCNN(EMBEDDING_DIM=word_embedding_size, MAX_SEQUENCE_LENGTH=word_pos, MAX_NB_WORDS=len(word_index), word_index=word_index, num_labels=5)


def classify_keras(model):
    from sklearn.preprocessing import LabelBinarizer
    encoder = LabelBinarizer()
    y_train_encoded = encoder.fit_transform(y_train)
    y_test_encoded = encoder.fit_transform(y_test)
    model.fit(X_train, y_train_encoded,
                    epochs=3,
                    verbose=True,
                    validation_data=(X_test, y_test_encoded),
                    batch_size=100)
    y_pred = model.predict(X_test)
    y_class = y_pred > 0.5
    y_class = y_class.astype(int)
    y_labels = [encoder.classes_[l] for l in y_class]
    print(f1_score(y_test, y_labels, average=None))
    print(precision_score(y_test, y_labels, average="macro"))
    print(recall_score(y_test, y_labels, average="macro"))

classify_keras(model)

Preparing embedding matrix.
Training model.
(?, 97, 200)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 97)           0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 97, 200)      992200      input_9[0][0]                    
__________________________________________________________________________________________________
reshape_9 (Reshape)             (None, 97, 200, 1)   0           embedding_9[0][0]                
__________________________________________________________________________________________________
conv2d_25 (Conv2D)              (None, 93, 1, 100)   100100      reshape_9[0][0]                  
____________________________________________________

ValueError: Classification metrics can't handle a mix of multiclass and multiclass-multioutput targets

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.fit_transform(y_test)
model.fit(X_train, y_train_encoded,
                epochs=50,
                verbose=True,
                validation_data=(X_test, y_test_encoded),
                batch_size=100)

Train on 20843 samples, validate on 6948 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
 4500/20843 [=====>........................] - ETA: 16s - loss: 2.2922 - acc: 0.8562