## Data split

In [1]:
import os, sys, inspect  # `inspect` is no longer used by this cell; the import is kept for any cell that relies on it

# Make the repository root (the parent of the notebook directory) importable so
# that `scripts.*` modules resolve.
#
# The notebook directory is taken from the kernel's working directory, which the
# relative data path used further down ('../data/...') already depends on.
# Asking `inspect` for the file of the current frame is unreliable in a
# notebook: the "file" of a cell is a synthetic name chosen by the kernel
# (possibly inside a temporary directory), not the notebook's location.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)

# Keep the repo root first on the search path without piling up duplicate
# entries each time the cell is re-run.
if sys.path[:1] != [parentdir]:
    sys.path.insert(0, parentdir)

In [2]:
from scripts.baseline_model import MoleculeModel
from collections import defaultdict
from tqdm.notebook import tqdm

In [3]:
import pandas as pd

In [4]:
# Load the merged dataset; the path is relative to the notebook's working directory.
dataset_path = '../data/merged_final_dataset.csv'
data = pd.read_csv(dataset_path)

In [5]:
# Preview the freshly loaded frame (displayed as the cell's last expression).
data

Unnamed: 0,index,smiles,active
0,0,CC(=O)C1=CC2=CC=CC=C2OC1=O,1
1,1,C1COC2=CC=CC=C2C1O,1
2,2,C1=CC=C2C(=C1)C(=CC(=O)O2)O,1
3,3,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3O)O)O,1
4,4,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)[C@@H]4C[C@@H](C...,1
...,...,...,...
481,139,COC1=C(C=C(C=C1)CNC2=NC(=NC=C2C(=O)NCC3=NC=CC=...,1
482,140,CN1C[C@@H]2[C@@H](C1)C3=C(C=CC(=C3)Cl)OC4=CC=C...,1
483,141,CC#CC(=O)N1CCC[C@H]1C2=NC(=C3N2C=CN=C3N)C4=CC=...,1
484,142,CC1=C(SC(=N1)NC(=O)N2CCC[C@H]2C(=O)N)C3=CC(=NC...,1


In [6]:
# Name of the label column; used below to separate the target from the feature columns.
TARGET_COLUMN = 'active'

In [None]:
# Featurize every molecule: one record (dict) per SMILES string holding the
# fingerprint entries (as columns 'f<key>') and the PaDEL descriptors.
# Building per-molecule records instead of per-column lists keeps every value
# in the row of the molecule it belongs to, even when some molecule lacks a
# key that others have (the gap becomes NaN instead of shifting later values).
feature_rows = []
for smi in tqdm(data.smiles):
    mol = MoleculeModel(smi, count=False)
    representation = mol.get_mol_representation()

    row = {}
    fp = representation['fingerprint']
    for i in fp:
        row['f' + str(i)] = fp[i]
    descriptors = representation['descriptors_padel']
    for k, v in descriptors.items():
        row[k] = v
    feature_rows.append(row)

# One frame for all features, aligned positionally with the rows of `data`.
features = pd.DataFrame(feature_rows, index=data.index)

# Attach all feature columns with a single concat (inserting thousands of
# columns one at a time fragments the frame). Columns left over from an
# earlier run of this cell are dropped first so re-running stays idempotent.
data = pd.concat(
    [data.drop(columns=features.columns, errors='ignore'), features],
    axis=1,
)

  0%|          | 0/486 [00:00<?, ?it/s]

In [None]:
# Inspect the frame again now that the feature columns have been added.
data

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

y = data[TARGET_COLUMN]

# Features are everything except the label, the raw SMILES string and the
# row-counter columns carried over from the CSV ('Unnamed: 0', 'index').
# Those counters only encode a row's position in the source files, so feeding
# them to a model is leakage/noise rather than chemistry.
bookkeeping_columns = [c for c in ('Unnamed: 0', 'index') if c in data.columns]
X_raw = data.drop(columns=[TARGET_COLUMN, 'smiles', *bookkeeping_columns])

# Split BEFORE scaling so the held-out test rows cannot influence preprocessing.
# NOTE(review): the split is not stratified (stratify=None, as before); consider
# stratify=y if the class balance of the test set matters.
X_train_val_raw, X_test_raw, y_train_val, y_test = train_test_split(
    X_raw, y, train_size=0.8, random_state=42, shuffle=True, stratify=None
)

# Fit the scaler on the training portion only, then apply the same transform
# everywhere. Test values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler().fit(X_train_val_raw)
X_train_val = scaler.transform(X_train_val_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix in the original row order, kept for any cell that still
# refers to `X`.
X = scaler.transform(X_raw)

skf = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

## Model creation

In [None]:
def get_model(model):
    '''
    Create a classifier with default settings for the given model key.

    Supported keys: 'svm', 'ab', 'knn', 'dt', 'nb', 'rfc', 'xgboost' and
    'mlpclassifier'. The library that provides the classifier is imported
    lazily, so optional dependencies (e.g. xgboost) are only required when
    their key is actually requested.

    Parameters
    ----------
    model : str
        Short key identifying the classifier.

    Returns
    -------
    list
        A single-element list holding the freshly constructed classifier
        (callers take element 0).

    Raises
    ------
    RuntimeError
        If `model` is not one of the supported keys.
    '''
    from importlib import import_module

    # key -> (module that provides the class, class name)
    classifier_specs = {
        'svm': ('sklearn.svm', 'SVC'),
        'ab': ('sklearn.ensemble', 'AdaBoostClassifier'),
        'knn': ('sklearn.neighbors', 'KNeighborsClassifier'),
        'dt': ('sklearn.tree', 'DecisionTreeClassifier'),
        'nb': ('sklearn.naive_bayes', 'GaussianNB'),
        'rfc': ('sklearn.ensemble', 'RandomForestClassifier'),
        'xgboost': ('xgboost', 'XGBClassifier'),
        'mlpclassifier': ('sklearn.neural_network', 'MLPClassifier'),
    }

    try:
        module_name, class_name = classifier_specs[model]
    except (KeyError, TypeError):
        # TypeError covers unhashable keys, which are just as unknown.
        raise RuntimeError('Unknown classifier') from None

    classifier_class = getattr(import_module(module_name), class_name)
    return [classifier_class()]

## Hyperparameter grids

In [None]:
# Hyper-parameter search spaces, keyed by the short model names accepted by
# get_model(). Every option carries the 'model__' prefix because the classifier
# is searched as the pipeline step named 'model'.
parameters = {
    'svm': {
        'model__C': (1, 5, 10, 50, 100),
        'model__probability': [True],
    },
    'ab': {
        'model__n_estimators': (10, 25, 50, 100, 125, 150, 200),
    },
    'knn': {
        'model__n_neighbors': (3, 5, 10, 50, 75, 100),
        'model__leaf_size': (1, 2, 3, 5, 10, 15, 20),
        'model__weights': ['uniform', 'distance'],
    },
    'dt': {
        'model__max_depth': (10, 25, 50, 100),
        'model__min_samples_leaf': (1, 3, 5, 10),
    },
    # No tunable options are searched for this model.
    'nb': {},
    'rfc': {
        'model__max_depth': (2, 3, 5, 7, 10),
        'model__n_estimators': (50, 100, 150, 200),
        'model__min_samples_leaf': (1, 3, 5, 10),
    },
    'mlpclassifier': {
        # Each candidate is a full tuple of layer widths.
        'model__hidden_layer_sizes': (
            (100, 60, 30, 10),
            (150, 100, 50, 25, 10),
            (100, 50, 25, 10),
        ),
        'model__alpha': (0.0001, 0.001, 0.00001, 0.01),
        'model__learning_rate': ['constant', 'adaptive'],
    },
    'xgboost': {
        'model__n_estimators': (10, 25, 50, 100),
    },
}

## Evaluation

In [None]:
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE

In [None]:
# Neighbourhood sizes tried for SMOTE, searched jointly with each model's own grid.
smote_k_neighbors = (1, 3, 5, 10, 15, 20, 25)

# Best fitted pipeline (SMOTE + classifier) per model key.
best_clfs_encode = {}
for name, param in parameters.items():
    model = get_model(name)[0]

    # Extend a copy of the model's grid with the SMOTE option. Writing the key
    # into `param` itself would permanently alter the shared `parameters` dict
    # and break any later search that runs without the 'over' step.
    param_grid = {**param, 'over__k_neighbors': smote_k_neighbors}

    # Oversample the minority class up to parity; seeded so repeated runs
    # generate the same synthetic samples.
    over = SMOTE(sampling_strategy=1, random_state=42)
    pipeline = Pipeline(steps=[('over', over), ('model', model)])

    clf = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=skf, n_jobs=-1)
    clf.fit(X_train_val, y_train_val)

    print(f'Best params for {name}:', clf.best_params_)
    pipe = clf.best_estimator_['model']

    # Score the held-out test set once and reuse the predictions for all metrics.
    test_proba = pipe.predict_proba(X_test)[:, 1]
    test_pred = pipe.predict(X_test)

    print('Test ROC AUC for the best model %.2f' % roc_auc_score(y_test, test_proba))
    print('Test accuracy for the best model %.2f' % accuracy_score(y_test, test_pred))
    print('Test f1-score for the best model %.2f' % f1_score(y_test, test_pred))
    print()
    best_clfs_encode[name] = clf.best_estimator_