In [9]:
from tqdm import tqdm
from os import listdir
import pandas as pd
from xml.dom.minidom import parse
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from utils import get_entity_dict, smaller_subtree_containing_the_drugs, preprocess
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
stopwords = set(stopwords.words('english'))

output_path_name = "task9.2_cascade_rf_rf_99.txt"

output_path = "evaluations/" + output_path_name
results_path = output_path.replace('.txt', '_All_scores.log')
datadir = '../../data/Test-DDI/DrugBank'
training_data = '/home/raquel/Documents/mai/ahlt/data/Train/All'
train_df_path = '../../../data/DF/train.csv'
processed_train_df_path = '../../../data/DF/train_processed.csv'

encoder = LabelBinarizer()
tokenizer = Tokenizer()
vectorizer = CountVectorizer()

import warnings
warnings.filterwarnings('ignore')

In [2]:
# train_df = pd.read_csv(train_df_path, index_col=0)
train_df = pd.read_csv(train_df_path, index_col=0)

sentences, dictionary, y_train_encoded = preprocess(train_df, processed_train_df_path, encoder)
y = train_df['relation_type'].values



y_binary = ['none' if i == 'none' else 'interaction'for i in y]

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000,stratify=y)

def vectorize_data(sentences_train,sentences_test):  

    vectorizer.fit(sentences_train)
    
    X_train = vectorizer.transform(sentences_train)
    X_test =  vectorizer.transform(sentences_test)
    
    return X_train, X_test

X_train, X_test = vectorize_data(sentences_train,sentences_test)

loaded preprocessed data


In [3]:
def classify(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f1_score(y_test, y_pred, average=None))
    print(precision_score(y_test, y_pred, average="macro"))
    print(recall_score(y_test, y_pred, average="macro"))    

from sklearn.linear_model import LogisticRegression

    
def classify_dense(model=LogisticRegression()):
    model.fit(X_train.toarray(), y_train)
    y_pred = model.predict(X_test.toarray())

    print(f1_score(y_test, y_pred, average=None))
    print(precision_score(y_test, y_pred, average="macro"))
    print(recall_score(y_test, y_pred, average="macro"))    

In [4]:
rf = RandomForestClassifier(n_estimators=100,n_jobs=-1,criterion= 'gini', max_depth= 60,
                             class_weight='balanced')

classify(rf)

[0.57281553 0.62331288 0.61176471 0.53037037 0.93559493]
0.6711937115317246
0.6415464702393192


In [5]:
lr = LogisticRegression(class_weight='balanced')
classify(rf)

[0.56296296 0.62135922 0.60465116 0.5280236  0.93489036]
0.6648543543489074
0.6383752711319601


In [6]:
mlp = MLPClassifier(activation='tanh', alpha= 0.1, hidden_layer_sizes=(30, 5), learning_rate='constant')
classify(mlp)

[0.55585831 0.62290862 0.58666667 0.51929825 0.94209961]
0.7285644735817022
0.5889583313903344


In [None]:
X_train

In [7]:
def tune_model(model, param_grid,model_name='model_hp'):
    clf = GridSearchCV(model, param_grid, cv=3,
                       scoring='f1_macro')
    clf.fit(X_train, y_train)
    with open(model_name,'w') as f:
        f.write("Best parameters set found on development set:\n")
        print("Best parameters set found on development set:")
        print()
        f.write(str(clf.best_params_))
        print(clf.best_params_)
        f.write("\nDetailed classification report:\n")

        print("Detailed classification report:")
        y_true, y_pred = y_test, clf.predict(X_test)
        f.write(str(classification_report(y_true, y_pred)))
        print(classification_report(y_true, y_pred))

### binary

In [None]:
rf_param_grid = [
  {'max_depth': [None,15,20, 30, 60, 90, 120], 'criterion': ['gini','entropy'],'min_samples_leaf':[1,5,15] }
 ]

rf = RandomForestClassifier(n_estimators=500,n_jobs=-1,
                             class_weight='balanced')


tune_model(rf,rf_param_grid,'hyperparams/rf_best.txt')

# Default

In [11]:
rf_param_grid = [
  {'max_depth': [None, 30, 60, 90, 120], 'criterion': ['gini','entropy'], }
 ]

rf = RandomForestClassifier(n_estimators=5,n_jobs=-1,
                             class_weight='balanced')

tune_model(rf,rf_param_grid,'hyper_rf.txt')


Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': 60}
Detailed classification report:
              precision    recall  f1-score   support

      advise       0.55      0.51      0.53       206
      effect       0.58      0.59      0.59       422
         int       0.67      0.51      0.58        47
   mechanism       0.43      0.51      0.47       330
        none       0.93      0.92      0.93      5943

    accuracy                           0.87      6948
   macro avg       0.63      0.61      0.62      6948
weighted avg       0.87      0.87      0.87      6948



In [None]:
mlp_param_grid = [
    {'hidden_layer_sizes': [(15,5), (30,5),(10,10,5), (20,20,5)],
     'activation':['identity','logistic', 'tanh','relu'],
     'alpha':[0.0001,0.001,0.01,0.1],
     'learning_rate':['constant','adaptive']
    }
]
mlp = MLPClassifier()
tune_model(mlp, mlp_param_grid,'hyper_mlp.txt')