## Data split

In [1]:
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

In [2]:
from scripts.baseline_model import MoleculeModel
from collections import defaultdict
from tqdm.notebook import tqdm

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv('../data/lexa_with_broken_fixed.csv')

In [5]:
data

Unnamed: 0,smiles,active
0,CC(=O)C1=CC2=CC=CC=C2OC1=O,1
1,C1COC2=CC=CC=C2C1O,1
2,C1=CC=C2C(=C1)C(=CC(=O)O2)O,1
3,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3O)O)O,1
4,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)[C@@H]4C[C@@H](C...,1
...,...,...
337,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl,1
338,CC(=O)CC(C1=CC=CC=C1)C2=C(C3=CC=CC=C3OC2=O)O,1
339,CC(C1=CC=C(C=C1)C(=O)C2=CC=CS2)C(=O)OC,1
340,C1=CC=C2C(=C1)C(=CC=N2)CO,0


In [6]:
TARGET_COLUMN = 'active'

In [7]:
features = defaultdict(list)
for smi in tqdm(data.smiles):
    mol = MoleculeModel(smi, count=False)
    representation = mol.get_mol_representation()
    
#     fp = representation['fingerprint']
#     for i in fp:
#         bit = fp[i]
#         features['f' + str(i)].append(bit)
    descriptors = representation['descriptors']
    for k, v in descriptors.items():
        features[k].append(v)

for k, v in features.items():
    data[k] = pd.Series(v)

  0%|          | 0/342 [00:00<?, ?it/s]

In [8]:
data

Unnamed: 0,smiles,active,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC(=O)C1=CC2=CC=CC=C2OC1=O,1,11.287133,-0.572963,11.287133,0.100648,0.507554,188.182,180.118,188.047344,...,0,0,0,0,0,0,0,0,0,0
1,C1COC2=CC=CC=C2C1O,1,9.481343,-0.333380,9.481343,0.333380,0.606947,150.177,140.097,150.068080,...,0,0,0,0,0,0,0,0,0,0
2,C1=CC=C2C(=C1)C(=CC(=O)O2)O,1,10.777596,-0.535926,10.777596,0.039954,0.596945,162.144,156.096,162.031694,...,0,0,0,0,0,0,0,0,0,0
3,C1=CC(=CC=C1C2=COC3=C(C2=O)C=CC(=C3O)O)O,1,12.362099,-0.461244,12.362099,0.057844,0.591055,270.240,260.160,270.052823,...,0,0,0,0,0,0,0,0,0,0
4,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)[C@@H]4C[C@@H](C...,1,9.257136,0.162245,9.257136,0.162245,0.727230,286.339,268.195,286.154209,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl,1,6.037305,0.441519,6.037305,0.441519,0.756412,319.880,293.672,319.181526,...,0,0,0,0,0,0,0,0,0,0
338,CC(=O)CC(C1=CC=CC=C1)C2=C(C3=CC=CC=C3OC2=O)O,1,12.412307,-0.614534,12.412307,0.064063,0.747626,308.333,292.205,308.104859,...,0,0,0,0,0,0,0,0,0,0
339,CC(C1=CC=C(C=C1)C(=O)C2=CC=CS2)C(=O)OC,1,12.097398,-0.323250,12.097398,0.006733,0.634641,274.341,260.229,274.066365,...,0,0,0,0,0,0,0,1,0,0
340,C1=CC=C2C(=C1)C(=CC=N2)CO,0,9.007836,0.072454,9.007836,0.072454,0.686922,159.188,150.116,159.068414,...,0,0,0,0,0,0,0,0,0,0


In [9]:
data = data.dropna()

In [10]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

y = data[TARGET_COLUMN]

X = data.drop(columns = [TARGET_COLUMN, 'smiles'])
X = MinMaxScaler().fit_transform(X) # added the scaler

X_train_val,  X_test, y_train_val, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42, shuffle = True, stratify = None)

skf = StratifiedKFold(n_splits=4,  random_state=42, shuffle=True)

## Models creating

In [11]:
def get_model(model):
    '''
    Get the model from six state-of-the-art machine learning models.
    '''
    if model=='svm':
        from sklearn.svm import SVC
        names = ["Linear SVM"]
        classifiers = [
        SVC()    
        ]
    elif model=='ab':
        from sklearn.ensemble import AdaBoostClassifier
        names = ["AdaBoost"]
        classifiers = [
        AdaBoostClassifier() 
        ]
    elif model=='knn':
        from sklearn.neighbors import KNeighborsClassifier
        names = ["K-Nearest Neighbors"]
        classifiers = [
        KNeighborsClassifier()
        ]
    elif model=='dt':
        from sklearn.tree import DecisionTreeClassifier
        names = ["Decision Tree"]
        classifiers = [
        DecisionTreeClassifier()   
        ]
    elif model=='nb':
        from sklearn.naive_bayes import GaussianNB
        names = ["Naive Bayes"]
        classifiers = [
         GaussianNB()   
        ]
    elif model=='rfc':
        from sklearn.ensemble import RandomForestClassifier
        names = ["Random Forest"]
        classifiers = [
        RandomForestClassifier()
        ]
    elif model=='xgboost':
        from xgboost import XGBClassifier
        names = ["XGBoost"]
        classifiers = [
        XGBClassifier()
        ]
    elif model=='mlpclassifier':
        from sklearn.neural_network import MLPClassifier
        names = ["MLPClassifier"]
        classifiers = [
        MLPClassifier()
        ]
    else:
        raise RuntimeError('Unknown classifier')
    
    return classifiers

## Parameters list

In [12]:
parameters = {
              'svm': {'model__C': (1, 5, 10, 50, 100), 'model__probability': [True]}, 
              'ab': {'model__n_estimators': (10, 25, 50, 100)}, 
              'knn': {'model__n_neighbors': (3, 5, 10, 50, 75, 100), 'model__leaf_size': (1, 2, 3, 5, 10, 15, 20), 
                      'model__weights': ['uniform', 'distance']}, 
              'dt': {'model__max_depth': (10, 25, 50, 100), 'model__min_samples_leaf': (1, 3, 5, 10)}, 
              'nb': {}, 
              'rfc': {'model__max_depth': (2, 3, 5, 7, 10), 'model__n_estimators': (50, 100, 150, 200),
                     'model__min_samples_leaf': (1, 3, 5, 10)},
              'mlpclassifier': {'model__hidden_layer_sizes': (
                                  (100, 60, 30, 10), 
                                  (150, 100, 50, 25, 10),
                                  (100, 50, 25, 10)),
                                'model__alpha': (0.0001, 0.001, 0.00001, 0.01), 
                                'model__learning_rate': ['constant', 'adaptive'],
                               },
              'xgboost': {'model__n_estimators': (10, 25, 50, 100)}
            }

## Evaluating

In [13]:
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE

In [14]:
best_clfs_encode = {}
for name, param in parameters.items():
    model = get_model(name)[0]

#     param['over__k_neighbors'] = (1, 3, 5, 10, 15, 20, 25)
#     over = SMOTE(sampling_strategy=0.5)
#     steps = [('over', over), ('model', model)]
    
    steps = [('model', model)]

    pipeline = Pipeline(steps=steps)
    clf = GridSearchCV(pipeline, param, scoring='roc_auc', cv=skf, n_jobs=-1)
    clf.fit(X_train_val, y_train_val)

    print(f'Best params for {name}:', clf.best_params_)
    pipe = clf.best_estimator_['model']

    print('Test ROC AUC for the best model %.2f' % roc_auc_score(y_test, pipe.predict_proba(X_test)[:,1]))
    print('Test accuracy for the best model %.2f' % accuracy_score(y_test, pipe.predict(X_test)))
    print('Test f1-score for the best model %.2f' % f1_score(y_test, pipe.predict(X_test)))
    print()
    best_clfs_encode[name] = clf.best_estimator_

Best params for svm: {'model__C': 1, 'model__probability': True}
Test ROC AUC for the best model 0.78
Test accuracy for the best model 0.82
Test f1-score for the best model 0.90

Best params for ab: {'model__n_estimators': 50}
Test ROC AUC for the best model 0.81
Test accuracy for the best model 0.82
Test f1-score for the best model 0.90

Best params for knn: {'model__leaf_size': 1, 'model__n_neighbors': 75, 'model__weights': 'distance'}
Test ROC AUC for the best model 0.76
Test accuracy for the best model 0.85
Test f1-score for the best model 0.92

Best params for dt: {'model__max_depth': 10, 'model__min_samples_leaf': 10}
Test ROC AUC for the best model 0.72
Test accuracy for the best model 0.75
Test f1-score for the best model 0.84

Best params for nb: {}
Test ROC AUC for the best model 0.75
Test accuracy for the best model 0.29
Test f1-score for the best model 0.31

Best params for rfc: {'model__max_depth': 7, 'model__min_samples_leaf': 1, 'model__n_estimators': 150}
Test ROC AUC f

