In [1]:
from os import listdir
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from utils import get_entity_dict, smaller_subtree_containing_the_drugs

import numpy as np

from sklearn.datasets import make_classification

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix



output_path_name = "task9.2_raquel_60.txt"

output_path = "evaluations/" + output_path_name
results_path = output_path.replace('.txt', '_All_scores.log')
datadir = '../../data/Test-DDI/DrugBank'
training_data = '/home/raquel/Documents/mai/ahlt/data/Train/All'
train_df_path = '/home/raquel/Documents/mai/ahlt/data/DF/train.csv'

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('saved_train_nice.csv', index_col=0)

sentences = train_df.sentence_text.values
y_train = train_df['relation_type'].values

y_binary = ['none' if i == 'none' else 'interaction 'for i in y_train]


sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y_binary, test_size=0.25, random_state=1000,stratify=y_binary)



def vectorize_data(sentences_train,sentences_test):  
    
    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    
    X_train = vectorizer.transform(sentences_train)
    X_test =  vectorizer.transform(sentences_test)
    
    return X_train, X_test

X_train, X_test = vectorize_data(sentences_train,sentences_test)

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
max_s = [len(x) for x in X_train]
maxlen = np.max(max_s)

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


In [6]:
max_s = [len(x) for x in X_train]

97

In [11]:
np.array(X_train).shape

(20843, 97)

In [16]:
from keras.layers import LSTM
from keras.models import Sequential
from keras import layers


def baseline_nn():
    embed_dim = 200
    embed_out = 20
    model = Sequential()
    # Para hacer cnn le tengo que meter n layer embeding, como está aquí, sino no tira
    model.add(layers.Embedding(input_dim = vocab_size, output_dim=embed_out,input_length = X_train.shape[1], dropout = 0.2))
    model.add(layers.Conv1D(3, 60, activation='relu'))
#     model.add(layers.Conv1D(4, 60, activation='relu'))
#     model.add(layers.Conv1D(5, 60, activation='relu'))
    model.add(layers.GlobalMaxPool1D())
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', 
                      optimizer='adam', 
                      metrics=['accuracy'])
    model.summary()
    return model


def classify_keras(model=baseline_nn()):
    from sklearn.preprocessing import LabelBinarizer
    encoder = LabelBinarizer()
    y_train_encoded = encoder.fit_transform(y_train)
    y_test_encoded = encoder.fit_transform(y_test)
    model.fit(X_train, y_train_encoded,
                    epochs=4,
                    verbose=True,
                    validation_data=(X_test, y_test_encoded),
                    batch_size=10)
    y_pred = model.predict(X_test)
    y_class = y_pred > 0.5
    y_class = y_class.astype(int)
    y_labels = [encoder.classes_[l] for l in y_class]
    print(f1_score(y_test, y_labels, average=None))
    print(precision_score(y_test, y_labels, average="macro"))
    print(recall_score(y_test, y_labels, average="macro"))

classify_keras()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 97, 20)            98940     
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 18, 3)             4803      
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 3)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 4         
Total params: 103,747
Trainable params: 103,747
Non-trainable params: 0
_________________________________________________________________
Train on 20843 samples, validate on 6948 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
[0.56390565 0.93415058]
0.7741319894542452
0.7301450847730053


In [51]:
y_labels = [encoder.classes_[l] for l in y_class]
print(f1_score(y_test, y_labels, average=None))
print(precision_score(y_test, y_labels, average="macro"))
print(recall_score(y_test, y_labels, average="macro"))

[0.6054321  0.93269312]
0.7674267682781615
0.7707331590407378


In [22]:
(np.array(y_binary)=='interaction').sum()

4020

In [23]:
X_train[np.array(y_binary)=='interaction',:]

<4020x5138 sparse matrix of type '<class 'numpy.int64'>'
	with 77969 stored elements in Compressed Sparse Row format>

In [6]:
X_train

<27791x5138 sparse matrix of type '<class 'numpy.int64'>'
	with 590241 stored elements in Compressed Sparse Row format>

In [None]:
train_df = pd.read_csv(train_df_path, index_col=0)

# for index, row in train_df.iterrows():
#     print(train_df.loc[index, 'sentence_text'], train_df.loc[index, ['e1', 'e2']])
#     new_sentence = smaller_subtree_containing_the_drugs(train_df.loc[index, 'sentence_text'],
#                                                         train_df.loc[index, ['e1', 'e2']])
#     train_df.loc[index, 'sentence_text'] = new_sentence


sentences = train_df.sentence_text.values
y = train_df['relation_type'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000,stratify=y)


def vectorize_data(sentences_train,sentences_test):  
    
    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    
    X_train = vectorizer.transform(sentences_train)
    X_test =  vectorizer.transform(sentences_test)
    
    return X_train, X_test

X_train, X_test = vectorize_data(sentences_train,sentences_test)