In [151]:
from tqdm import tqdm
from os import listdir
import pandas as pd
from xml.dom.minidom import parse
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import numpy as np

from sklearn.datasets import make_classification

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC

stopwords = set(stopwords.words('english'))

output_path_name = "task9.2_raquel_50.txt"

output_path = "evaluations/" + output_path_name
results_path = output_path.replace('.txt', '_All_scores.log')
datadir = '../../data/Test-DDI/DrugBank'
training_data = '/home/raquel/Documents/mai/ahlt/data/Train/All'
train_df_path = '/home/raquel/Documents/mai/ahlt/data/DF/train.csv'
test_df_path = '/home/raquel/Documents/mai/ahlt/data/DF/test.csv'

import warnings
warnings.filterwarnings('ignore')


In [156]:
def classify_dense(model=LogisticRegression()):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f1_score(y_test, y_pred, average=None))
    print(precision_score(y_test, y_pred, average="macro"))
    print(recall_score(y_test, y_pred, average="macro"))    
    
def classify_dense(model=LogisticRegression()):
    model.fit(X_train.toarray(), y_train)
    y_pred = model.predict(X_test.toarray())

    print(f1_score(y_test, y_pred, average=None))
    print(precision_score(y_test, y_pred, average="macro"))
    print(recall_score(y_test, y_pred, average="macro"))    

In [120]:
from keras.models import Sequential
from keras import layers



In [139]:
def baseline_nn():
    input_dim = X_train.shape[1]

    model = Sequential()
    model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
    model.add(layers.Dense(5, activation='sigmoid'))

    model.compile(loss='categorical_crossentropy', 
                      optimizer='adam', 
                      metrics=['accuracy'])
    return model

def classify_keras(model=baseline_nn()):
    from sklearn.preprocessing import LabelBinarizer
    encoder = LabelBinarizer()
    y_train_encoded = encoder.fit_transform(y_train)
    y_test_encoded = encoder.fit_transform(y_test)
    model.fit(X_train, y_train_encoded,
                    epochs=10,
                    verbose=False,
                    batch_size=10)
    y_pred = model.predict(X_test)
    y_class = y_pred.argmax(axis=-1)
    y_labels = [encoder.classes_[l] for l in y_class]
    print(f1_score(y_test, y_labels, average=None))
    print(precision_score(y_test, y_labels, average="macro"))
    print(recall_score(y_test, y_labels, average="macro"))    
classify_keras()

[0.43436754 0.29069767 0.         0.11494253 0.90380981]
0.44594754325836866
0.32907380287254506


In [None]:
model = Sequential()
    model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
    model.add(layers.Dense(5, activation='sigmoid'))

    model.compile(loss='categorical_crossentropy', 
                      optimizer='adam', 
                      metrics=['accuracy'])
    return model

In [153]:
def vectorize_data(train_df,test_df):
    
    sentences_train = train_df.sentence_text.values
    sentences_test = test_df.sentence_text.values
    
    y_train = train_df['relation_type'].values
    y_test = test_df['relation_type'].values
    
    
    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    
    X_train = vectorizer.transform(sentences_train)
    X_test =  vectorizer.transform(sentences_test)
   
    
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = vectorize_data(train_df,test_df)
print(X_train.shape)

(27791, 5938)


In [157]:
from sklearn.naive_bayes import GaussianNB
clf3 = GaussianNB()
classify_dense(clf3)

[0.22093023 0.20728745 0.         0.20560748 0.76413302]
0.2664544794420739
0.3412766669715575


In [56]:
classify()

[0.37368421 0.2622549  0.         0.18032787 0.86800663]
0.3540076109181062
0.32969990441305863


In [60]:
clf = RandomForestClassifier(n_estimators=500, max_depth=30,n_jobs=-1,
                             class_weight='balanced', random_state=0)
classify(clf)

[0.36847104 0.33685601 0.03773585 0.34810637 0.69007715]
0.36338671275714607
0.5739660955602753


In [113]:
clf = RandomForestClassifier(n_estimators=700, max_depth=60,n_jobs=-1,
                             class_weight='balanced', random_state=0)
classify(clf)

[0.41438849 0.34820647 0.         0.35674157 0.82390438]
0.35234520149448106
0.4777753868864936


In [109]:
clf = RandomForestClassifier(n_estimators=700, max_depth=80,n_jobs=-1,
                             class_weight='balanced', random_state=0)
classify(clf)

[0.39940828 0.33965844 0.         0.34343434 0.8397692 ]
0.35781792721459327
0.4489296224567484


In [140]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(10,5),learning_rate='adaptive')
classify(clf)

[0.42159383 0.32683658 0.02020202 0.25174825 0.88098918]
0.4607302688813806
0.3650231364244492


In [150]:
rf = RandomForestClassifier(n_estimators=700, max_depth=60,n_jobs=-1,
                             class_weight='balanced', random_state=0)

mlp = MLPClassifier(hidden_layer_sizes=(10,5),learning_rate='adaptive')

from sklearn.ensemble import VotingClassifier
ensemble=VotingClassifier(estimators=[('Random Forest', rf), ('MLP', mlp)], 
                       voting='soft')
classify(ensemble)

[0.42761693 0.34396671 0.         0.24230769 0.882853  ]
0.3841840390116512
0.3772648009290561


In [130]:
X_train, y_train, X_test, y_test = vectorize_data(train_df,test_df)
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)

svd.fit(X_train)
print(svd.explained_variance_ratio_.sum())

X_train = svd.transform(X_train)  

X_test = svd.transform(X_test)  


0.7883861050632311


In [114]:
clf = RandomForestClassifier(n_estimators=500, max_depth=30,n_jobs=-1,
                             class_weight='balanced', random_state=0)
classify(clf)

[0.36847104 0.33685601 0.03773585 0.34810637 0.69007715]
0.36338671275714607
0.5739660955602753


In [103]:
clf = SVC(gamma='auto',class_weight='balanced')
classify(clf)

[0.30536913 0.22955264 0.02083333 0.31319911 0.47318312]
0.29864652225939814
0.5161744094383395


In [116]:
clf = SVC(gamma='auto',class_weight='balanced',C=0.5)
classify(clf)

[0.30950378 0.24197745 0.0265252  0.32951289 0.44484101]
0.304448349777018
0.5421351217408685


In [136]:
train_df = pd.read_csv(train_df_path, index_col=0)
test_df = pd.read_csv(test_df_path, index_col=0)


def tokenize_data(train_df,test_df):
    
    sentences_train = train_df.sentence_text.values
    sentences_test = test_df.sentence_text.values
    
    y_train = train_df['relation_type'].values
    y_test = test_df['relation_type'].values
    
    
    
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)
    
    X_train = tokenizer.texts_to_sequences(sentences_train)
    X_test = tokenizer.texts_to_sequences(sentences_test)
    vocab_size = len(tokenizer.word_index) + 1 
    
    
    maxlen = 90
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
    
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = tokenize_data(train_df,test_df)

In [75]:
lens = [len(x) for x in X_train]

In [79]:
max(lens)

103

In [71]:
classify()

[0.         0.         0.         0.         0.90052961]
0.16558073654390934
0.19742453029343465


In [83]:
clf = RandomForestClassifier(n_estimators=500, max_depth=30,n_jobs=-1,
                             class_weight='balanced', random_state=0)
classify(clf)

[0.121673   0.08962264 0.         0.06153846 0.90507555]
0.3897673890160573
0.22891335715488448


In [1]:
from keras.preprocessing.text import Tokenizer

test_df = 

def train_baseline():
    train_df = pd.read_csv(train_df_path, index_col=0)
    sentences_train = train_df.sentence_text.values
    y_train = train_df['relation_type'].values
    vectorizer = Tokenizer(num_words=5000)
    vectorizer.fit_on_texts(sentences_train)
#     vectorizer = CountVectorizer()
#     vectorizer.fit(sentences_train)
#     X_train = vectorizer.transform(sentences_train)
    
    X_train = tokenizer.texts_to_sequences(sentences_train)
    
    print('training...')
    # classifier = RandomForestClassifier(n_jobs=-1, class_weight='balanced')
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    print('trained')
    return vectorizer, classifier


vectorizer, classifier = train_baseline()


def check_interaction(sentence):
    # uses the vectorizer and the classifier already trained
    sentence_array = vectorizer.transform([sentence])
    y_pred = classifier.predict(sentence_array)

    if y_pred[0] == 'none':
        return False, "null"
    else:
        return True, y_pred[0]


def predict(datadir, output_path, test=False):
    # process each file in directory
    with open(output_path, 'w') as file:
        for f in tqdm(listdir(datadir)):

            # parse XML file, obtaining a DOM tree
            tree = parse(datadir + "/" + f)

            # process each sentence in the file
            sentences = tree.getElementsByTagName("sentence")
            for s in sentences:
                sid = s.attributes["id"].value  # get sentence id
                stext = s.attributes["text"].value  # get sentence text

                # load sentence entities
                entities = {}
                ents = s.getElementsByTagName("entity")
                for e in ents:
                    id = e.attributes["id"].value
                    offs = e.attributes["charOffset"].value.split("-")
                    entities[id] = offs

                # for each pair in the sentence, decide whether it is DDI and its type
                pairs = s.getElementsByTagName("pair")
                for p in pairs:
                    id_e1 = p.attributes["e1"].value
                    id_e2 = p.attributes["e2"].value
                    (is_ddi, ddi_type) = check_interaction(stext)
                    ddi = "1" if is_ddi else "0"
                    file.write(sid + "|" + id_e1 + "|" + id_e2 + "|" + ddi + "|" + ddi_type)
                    file.write('\n')
                    if test:
                        return


def show_results(results_path):
    import subprocess
    subprocess.call(['java', '-jar', '../../eval/evaluateDDI.jar', '../../data/Test-DDI/All', output_path])
    results_file = open(results_path, 'r')
    print(results_file.read())
    results_file.close()


predict(datadir, output_path, False)
show_results(results_path)

Using TensorFlow backend.


NameError: name 'pd' is not defined