In [1]:
from os import listdir
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from utils import get_entity_dict, smaller_subtree_containing_the_drugs

import numpy as np

from sklearn.datasets import make_classification

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix



# --- Run configuration -------------------------------------------------------
# Name of the output file for this run; the two paths below are derived from it.
output_path_name = "task9.2_raquel_60.txt"

output_path = f"evaluations/{output_path_name}"
# Same path as `output_path`, with ".txt" swapped for a score-log suffix.
results_path = output_path.replace(".txt", "_All_scores.log")

# Data locations.
# NOTE(review): the two /home/... entries are absolute, machine-specific paths
# and will only resolve on the machine they were written for.
datadir = "../../data/Test-DDI/DrugBank"
training_data = "/home/raquel/Documents/mai/ahlt/data/Train/All"
train_df_path = "/home/raquel/Documents/mai/ahlt/data/DF/train.csv"

# Install a blanket "ignore" filter so no warnings are shown from here on.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the training frame from the local cached CSV. The alternative source,
# currently disabled, is the CSV at `train_df_path`:
#     train_df = pd.read_csv(train_df_path, index_col=0)
train_df = pd.read_csv('saved_train_nice.csv', index_col=0)

# Disabled preprocessing step, kept for reference: it overwrites every
# sentence with the result of `smaller_subtree_containing_the_drugs` for the
# row's entity pair.
#     for index, row in train_df.iterrows():
#         new_sentence = smaller_subtree_containing_the_drugs(
#             train_df.loc[index, 'sentence_text'],
#             train_df.loc[index, ['e1', 'e2']])
#         train_df.loc[index, 'sentence_text'] = new_sentence

sentences = train_df['sentence_text'].values
y = train_df['relation_type'].values

# Collapse every relation type other than 'none' into a single
# 'interaction' class.
y_binary = ['interaction' if label != 'none' else 'none' for label in y]

# Hold out 25% of the sentences. The returned labels are the binary ones, but
# stratification is done on the original multi-class `y`.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y_binary,
    test_size=0.25,
    random_state=1000,
    stratify=y,
)


def vectorize_data(sentences_train, sentences_test):
    """Encode two sentence collections as bag-of-words count matrices.

    A ``CountVectorizer`` is fitted on ``sentences_train`` only, and that same
    fitted vectorizer is then used to transform both collections.

    Args:
        sentences_train: Sentences used to fit the vectorizer and to build
            the first returned matrix.
        sentences_test: Sentences transformed with the already-fitted
            vectorizer to build the second returned matrix.

    Returns:
        A ``(X_train, X_test)`` tuple with the transformed training and test
        sentences, in that order.
    """
    bag_of_words = CountVectorizer().fit(sentences_train)
    return bag_of_words.transform(sentences_train), bag_of_words.transform(sentences_test)

# Build the module-level feature matrices that classify(), classify_dense()
# and tune_model() read.
X_train, X_test = vectorize_data(sentences_train, sentences_test)

In [3]:
def classify(model):
    """Fit ``model`` on the training split and print test-set metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints, in order: the per-class F1 scores (``average=None``), the
    macro-averaged precision, and the macro-averaged recall.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.

    Returns:
        None. The metrics are only printed.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # (metric function, averaging mode) pairs, in the order they are printed.
    metric_table = (
        (f1_score, None),
        (precision_score, "macro"),
        (recall_score, "macro"),
    )
    for metric, averaging in metric_table:
        print(metric(y_test, predictions, average=averaging))

from sklearn.linear_model import LogisticRegression

    
def classify_dense(model=None):
    """Fit ``model`` on densified training data and print test-set metrics.

    Same reporting as ``classify`` (per-class F1, macro precision, macro
    recall), but the module-level sparse ``X_train`` / ``X_test`` are converted
    with ``.toarray()`` before being handed to the estimator.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place. When omitted, a new ``LogisticRegression()`` is
            created for this call.

    Returns:
        None. The metrics are only printed.
    """
    # A default of `LogisticRegression()` in the signature would be built once
    # at definition time and then shared (and re-fitted) by every default
    # call; create a fresh estimator per call instead.
    if model is None:
        model = LogisticRegression()

    model.fit(X_train.toarray(), y_train)
    y_pred = model.predict(X_test.toarray())

    print(f1_score(y_test, y_pred, average=None))
    print(precision_score(y_test, y_pred, average="macro"))
    print(recall_score(y_test, y_pred, average="macro"))

In [10]:
# Random-forest baseline on the sparse count features, with class weights
# balanced.
# NOTE(review): no random_state is set, so repeated runs can give slightly
# different scores from the ones recorded below.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    criterion='gini',
    max_depth=60,
    class_weight='balanced',
)

classify(rf)

[0.6243429  0.91867285]
0.7416836552039618
0.8255233172853551


In [21]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic discriminant analysis with default settings; classify_dense()
# converts the sparse matrices to dense arrays before fitting.
clf = QuadraticDiscriminantAnalysis()
classify_dense(model=clf)

[0.34769463 0.93136085]
0.8031887646205071
0.6070241925154641


In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default settings, evaluated through
# classify_dense(), which converts the sparse matrices to dense arrays first.
clf = SVC()
classify_dense(model=clf)


In [None]:

# MLP configured with the same values as the best parameters recorded in this
# notebook's MLP grid-search output.
mlp = MLPClassifier(
    activation='tanh',
    alpha=0.1,
    hidden_layer_sizes=(30, 5),
    learning_rate='constant',
)
classify(mlp)

In [28]:
# Display the training matrix repr (dimensions, dtype and storage format).
X_train

<20843x5687 sparse matrix of type '<class 'numpy.int64'>'
	with 700216 stored elements in Compressed Sparse Row format>

In [4]:
def tune_model(model, param_grid, model_name='model_hp'):
    """Grid-search ``model`` and record the best parameters and test report.

    Runs a 3-fold ``GridSearchCV`` scored with ``f1_macro`` on the module-level
    ``X_train`` / ``y_train``, predicts ``X_test`` with the fitted search
    object, and reports the best parameter set together with the
    ``classification_report`` against ``y_test``. The same information is
    printed and written to ``model_name``.

    Args:
        model: Estimator to tune.
        param_grid: Parameter grid passed unchanged to ``GridSearchCV``.
        model_name: Path of the text file the report is written to. Missing
            parent directories are created.

    Returns:
        None.
    """
    import os  # local import: the notebook only imports `listdir` from os

    # Create the report directory *before* the search. Otherwise a missing
    # directory (e.g. "hyperparams/") only surfaces as FileNotFoundError after
    # the whole grid search has run, and its result is lost.
    report_dir = os.path.dirname(model_name)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)

    clf = GridSearchCV(model, param_grid, cv=3,
                       scoring='f1_macro')
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)

    # Predict and build the report once, before the file is opened, so a
    # failure here cannot leave a truncated report on disk.
    report = classification_report(y_test, clf.predict(X_test))
    print("Detailed classification report:")
    print(report)

    with open(model_name, 'w') as f:
        f.write("Best parameters set found on development set:\n")
        f.write(str(clf.best_params_))
        f.write("\nDetailed classification report:\n")
        f.write(str(report))

### Binary classification (interaction vs. none)

In [5]:
# Random-forest search space: 7 depths x 2 criteria x 3 leaf sizes = 42
# candidate settings.
rf_param_grid = [
    {
        'max_depth': [None, 15, 20, 30, 60, 90, 120],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1, 5, 15],
    },
]

rf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    class_weight='balanced',
)

# Results are printed and also written to the given report file.
tune_model(rf, rf_param_grid, 'hyperparams/rf_binary.txt')

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 90, 'min_samples_leaf': 1}
Detailed classification report:
              precision    recall  f1-score   support

 interaction       0.55      0.73      0.63      1005
        none       0.95      0.90      0.92      5943

   micro avg       0.87      0.87      0.87      6948
   macro avg       0.75      0.81      0.78      6948
weighted avg       0.89      0.87      0.88      6948



# Default (multi-class relation types, as shown in the recorded reports below)

In [30]:
# Smaller random-forest search space: 5 depths x 2 criteria = 10 candidate
# settings, using a forest of only 5 trees.
rf_param_grid = [
    {
        'max_depth': [None, 30, 60, 90, 120],
        'criterion': ['gini', 'entropy'],
    },
]

rf = RandomForestClassifier(
    n_estimators=5,
    n_jobs=-1,
    class_weight='balanced',
)

# Results are printed and also written to the given report file.
tune_model(rf, rf_param_grid, 'hyperparams/rf.txt')


Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': 60}
Detailed classification report:
              precision    recall  f1-score   support

      advise       0.25      0.69      0.36       206
      effect       0.27      0.70      0.39       422
         int       0.07      0.64      0.13        47
   mechanism       0.17      0.52      0.25       330
        none       0.92      0.59      0.72      5943

   micro avg       0.59      0.59      0.59      6948
   macro avg       0.33      0.63      0.37      6948
weighted avg       0.82      0.59      0.66      6948



In [31]:
# MLP search space: 4 layer layouts x 4 activations x 4 alphas x 2 learning-rate
# schedules = 128 candidate settings.
mlp_param_grid = [
    {
        'hidden_layer_sizes': [(15, 5), (30, 5), (10, 10, 5), (20, 20, 5)],
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'learning_rate': ['constant', 'adaptive'],
    },
]

mlp = MLPClassifier()

# Results are printed and also written to the given report file.
tune_model(mlp, mlp_param_grid, 'hyperparams/mlp.txt')

Best parameters set found on development set:

{'activation': 'tanh', 'alpha': 0.1, 'hidden_layer_sizes': (30, 5), 'learning_rate': 'constant'}
Detailed classification report:
              precision    recall  f1-score   support

      advise       0.44      0.18      0.25       206
      effect       0.46      0.17      0.25       422
         int       0.00      0.00      0.00        47
   mechanism       0.34      0.08      0.13       330
        none       0.87      0.97      0.92      5943

   micro avg       0.85      0.85      0.85      6948
   macro avg       0.42      0.28      0.31      6948
weighted avg       0.80      0.85      0.81      6948

