In [3]:
# Models
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import itertools
from pprint import pprint
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_validate, GridSearchCV, cross_val_score, RandomizedSearchCV 
from sklearn.model_selection import cross_val_predict

from sklearn.pipeline import Pipeline

from sklearn.metrics import make_scorer

#regression metrics
from sklearn.metrics import mean_absolute_error , mean_squared_error, r2_score

#classification metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
from sklearn.model_selection._split import check_cv

In [5]:
from utils import *

## Data

In [6]:
# Endpoint labels for the training compounds, indexed by the CASRN column.
train_labels = pd.read_csv('../data/train_test_sets/train_labels.csv', index_col='CASRN')

# Training feature tables, one per molecular representation, each indexed by CASRN.
# The files are read in the order listed and unpacked onto the matching names.
(train_ecfp6_bits,
 train_ecfp6_counts,
 train_maccs,
 train_rdkit2d,
 train_mordred) = (
    pd.read_csv(csv_path, index_col='CASRN')
    for csv_path in (
        '../data/Bmodel_features/modeling_train_ecfp6_bits.csv',
        '../data/Bmodel_features/modeling_train_ecfp6_counts.csv',
        '../data/Bmodel_features/modeling_train_maccs.csv',
        '../data/Bmodel_features/modeling_train_rdkit2d.csv',
        '../data/Bmodel_features/modeling_train_mordred.csv',
    )
)

## Models

In [7]:
# Regressors, used below for the continuous logLD50_mmolkg target.
# The two tree ensembles share seed 123 and six parallel jobs.
rf_reg = RandomForestRegressor(n_jobs=6, random_state=123)
xgb_reg = XGBRegressor(objective='reg:squarederror', n_jobs=6, random_state=123)
svr = SVR()
knn_reg = KNeighborsRegressor()


# Classifiers, used below for the verytoxic / toxic / EPA_category targets.
# NOTE(review): SVC is seeded with 42 while the tree ensembles use 123 — confirm this is intended.
rf_clf = RandomForestClassifier(n_jobs=6, random_state=123)
xgb_clf = XGBClassifier(n_jobs=6, random_state=123)
knn_clf = KNeighborsClassifier()
svc = SVC(random_state=42)

## Search Space

[kNN](https://scikit-learn.org/stable/modules/neighbors.html)

In [7]:
# kNN search spaces. Every grid fixes distance-weighted voting and varies the
# neighbourhood size; they differ in how distance is measured.

# Descriptor features (used with rdkit2d / mordred below): Minkowski power p = 1 or 2.
knn_grid_parameters_des = dict(
    n_neighbors=[5, 9, 15, 19, 25, 35, 45, 55, 71],
    weights=['distance'],
    p=[1, 2],
)

# Bit-style fingerprints (used with ECFP6 bits / MACCS below).
# This grid additionally tries n_neighbors = 23.
knn_grid_parameters_fp = dict(
    n_neighbors=[5, 9, 15, 19, 23, 25, 35, 45, 55, 71],
    weights=['distance'],
    metric=['jaccard', 'dice', 'rogerstanimoto'],
)

# Count fingerprints (used with ECFP6 counts below).
knn_grid_parameters_fpcounts = dict(
    n_neighbors=[5, 9, 15, 19, 25, 35, 45, 55, 71],
    weights=['distance'],
    metric=['hamming', 'canberra', 'braycurtis'],
)

[SVM](https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html)

Intuitively, the `gamma` parameter defines how far the influence of a single training example reaches, with low values meaning ‘far’ and high values meaning ‘close’. The `gamma` parameter can be seen as the inverse of the radius of influence of samples selected by the model as support vectors.

The `C` parameter trades off correct classification of training examples against maximization of the decision function’s margin. For larger values of `C`, a smaller margin will be accepted if the decision function is better at classifying all training points correctly. A lower `C` will encourage a larger margin, therefore a simpler decision function, at the cost of training accuracy. In other words, `C` behaves as a regularization parameter in the SVM.

In [24]:
# Full SVM search space, given as two sub-grids so that gamma is only varied
# together with the RBF kernel; the linear sub-grid varies C alone.
svm_grid_parameters = [
    dict(
        C=[0.01, 0.1, 1, 10, 100, 200, 400, 1000],
        kernel=['linear'],
    ),
    dict(
        C=[0.01, 0.1, 1, 10, 100, 200, 400, 1000],
        gamma=[100, 10, 1, 0.1, 0.01, 0.001],
        kernel=['rbf'],
    ),
]

In [35]:
# Smaller gamma/C grid with no kernel entry.
# NOTE(review): this rebinds svm_grid_parameters, replacing the two-kernel list assigned
# in the previous cell. Run top to bottom, the SVM searches further down would receive
# this dict, yet their saved outputs list linear-kernel and gamma=100 candidates that
# only the earlier list contains — confirm which grid is intended and give the other
# its own name.
svm_grid_parameters = dict(
    gamma=[10, 1, 0.1, 0.01],
    C=[1, 10, 100],
)

In [100]:
# Minimal 2 x 2 gamma/C grid. It names no kernel, so whatever kernel the
# estimator was constructed with is the one searched.
svm_grid_parameters_1 = dict(
    gamma=[1, 0.1],
    C=[1, 10],
)

In [8]:
# Minimal two-kernel grid: C alone for the linear kernel, C together with
# gamma for the RBF kernel.
svm_grid_parameters_2 = [
    dict(C=[1, 10], kernel=['linear']),
    dict(C=[1, 10], gamma=[1, 0.1], kernel=['rbf']),
]

[RF](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [28]:
# Ensemble sizes: two evenly spaced values from 500 to 1500 (i.e. the endpoints).
n_estimators = list(map(int, np.linspace(start=500, stop=1500, num=2)))
# Feature-subset rule applied at each split.
max_features = ['log2', 'sqrt']
# Tree depth caps: six evenly spaced values from 5 to 80, plus None for no cap.
max_depth = list(map(int, np.linspace(5, 80, num=6))) + [None]
# Smallest node size that may still be split.
min_samples_split = [2, 5, 10]
# Smallest number of samples allowed in a leaf.
min_samples_leaf = [2, 4, 6]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the random-forest search space from the lists above.
rf_grid_parameters = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

pprint(rf_grid_parameters)

{'bootstrap': [True, False],
 'max_depth': [5, 20, 35, 50, 65, 80, None],
 'max_features': ['log2', 'sqrt'],
 'min_samples_leaf': [2, 4, 6],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [500, 1500]}


[xgb](https://xgboost.readthedocs.io/en/stable/parameter.html)

In [52]:
# Show every parameter currently set on xgb_reg. xgb_reg is constructed with explicit
# random_state, n_jobs and objective, so those three entries are overrides rather than
# library defaults.
# NOTE(review): the saved output lists 'objective': 'reg:linear', which does not match
# the 'reg:squarederror' passed where xgb_reg is created — the output looks stale;
# re-run this cell to confirm.
print('Default Parameters :\n')
pprint(xgb_reg.get_params())

Default Parameters :

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'importance_type': 'gain',
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 6,
 'nthread': None,
 'objective': 'reg:linear',
 'random_state': 123,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}


In [73]:
# XGBoost search space: step size, tree depth, minimum child weight, split penalty
# (gamma), row and column subsampling fractions, and number of boosting rounds.
xgb_grid_parameters = dict(
    learning_rate=[0.01, 0.1],
    max_depth=[3, 6, 10],
    min_child_weight=[1, 3, 5],
    gamma=[0, 1, 5],
    # row fraction per tree: 0.6 to 1.0 in steps of 0.1
    subsample=[0.6, 0.7, 0.8, 0.9, 1.0],
    # column fraction per tree: 0.5 to 1.0 in steps of 0.1
    colsample_bytree=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    n_estimators=[500, 1500],
)

In [74]:
xgb_grid_parameters

{'learning_rate': [0.01, 0.1],
 'max_depth': [3, 6, 10],
 'min_child_weight': [1, 3, 5],
 'gamma': [0, 1, 5],
 'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
 'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
 'n_estimators': [500, 1500]}

In [17]:
def result_model_selection(results, name):
    """Tabulate the per-candidate scores of a finished hyper-parameter search.

    Parameters
    ----------
    results : object exposing a ``cv_results_`` mapping with the keys
        'params', 'mean_test_score', 'std_test_score' and 'rank_test_score'.
    name : label written into the 'model' column of every row.

    Returns
    -------
    pandas.DataFrame with one row per entry of ``cv_results_['params']`` and the
    columns 'model', 'params', 'mean score', 'std score', 'rank'.
    """
    cv = results.cv_results_
    candidates = cv['params']
    columns = {
        'model': [name] * len(candidates),
        'params': candidates,
        'mean score': cv['mean_test_score'],
        'std score': cv['std_test_score'],
        'rank': cv['rank_test_score'],
    }
    return pd.DataFrame(columns)

# kNN

## Endpoint 1: VeryToxic

In [7]:
# Load the saved encoder for the 'verytoxic' target (presumably a fitted LabelEncoder —
# confirm); it is passed to prepare_input in the cells below.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
encoder_verytoxic = joblib.load('../data/label_encoders/encoder_verytoxic.joblib')

In [8]:
%%time
# Build the inputs for the 'verytoxic' target from the ECFP6 bit features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'verytoxic', encoder = encoder_verytoxic)

# Search knn_grid_parameters_fp with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
vt_knn_ecfp6bits = model_selection(knn_clf, knn_grid_parameters_fp, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 25, 'weights': 'distance'}
Best score: 0.8364398520846357
Grid scores on development set:

0.800 (+/-0.102) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.820 (+/-0.103) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.830 (+/-0.110) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.831 (+/-0.115) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.834 (+/-0.117) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.836 (+/-0.118) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.830 (+/-0.125) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.829 (+/-0.126) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.830 (+/-0.122) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.825 (+/-0.131) for {'metric': 'jaccard', 'n_neighbors': 71, 'wei

In [9]:
%%time
# Build the inputs for the 'verytoxic' target from the MACCS features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'verytoxic', encoder = encoder_verytoxic)

# Search knn_grid_parameters_fp with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
vt_knn_maccs = model_selection(knn_clf, knn_grid_parameters_fp, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 35, 'weights': 'distance'}
Best score: 0.839530791592507
Grid scores on development set:

0.804 (+/-0.108) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.824 (+/-0.102) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.834 (+/-0.101) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.837 (+/-0.099) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.838 (+/-0.098) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.839 (+/-0.098) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.838 (+/-0.095) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.833 (+/-0.101) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.833 (+/-0.103) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.829 (+/-0.107) for {'metric': 'jaccard', 'n_neighbors': 71, 'weig

In [10]:
%%time
# Build the inputs for the 'verytoxic' target from the ECFP6 count features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'verytoxic', encoder = encoder_verytoxic)

# Search knn_grid_parameters_fpcounts with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
vt_knn_ecfp6count = model_selection(knn_clf, knn_grid_parameters_fpcounts, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'braycurtis', 'n_neighbors': 35, 'weights': 'distance'}
Best score: 0.8387612550176445
Grid scores on development set:

0.714 (+/-0.048) for {'metric': 'hamming', 'n_neighbors': 5, 'weights': 'distance'}
0.718 (+/-0.054) for {'metric': 'hamming', 'n_neighbors': 9, 'weights': 'distance'}
0.723 (+/-0.056) for {'metric': 'hamming', 'n_neighbors': 15, 'weights': 'distance'}
0.720 (+/-0.058) for {'metric': 'hamming', 'n_neighbors': 19, 'weights': 'distance'}
0.724 (+/-0.070) for {'metric': 'hamming', 'n_neighbors': 25, 'weights': 'distance'}
0.724 (+/-0.073) for {'metric': 'hamming', 'n_neighbors': 35, 'weights': 'distance'}
0.719 (+/-0.076) for {'metric': 'hamming', 'n_neighbors': 45, 'weights': 'distance'}
0.713 (+/-0.087) for {'metric': 'hamming', 'n_neighbors': 55, 'weights': 'distance'}
0.721 (+/-0.086) for {'metric': 'hamming', 'n_neighbors': 71, 'weights': 'distance'}
0.729 (+/-0.039) for {'metric': 'canberra', 'n_neighbors': 5

In [11]:
%%time
# Build the inputs for the 'verytoxic' target from the RDKit 2D features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'verytoxic', encoder = encoder_verytoxic)

# Search knn_grid_parameters_des with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
vt_knn_rdkit2d = model_selection(knn_clf, knn_grid_parameters_des, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
Best score: 0.8262497861356659
Grid scores on development set:

0.789 (+/-0.117) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.776 (+/-0.113) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.810 (+/-0.116) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.792 (+/-0.118) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.821 (+/-0.117) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.804 (+/-0.128) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.822 (+/-0.119) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.810 (+/-0.130) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.826 (+/-0.121) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.812 (+/-0.129) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.824 (+/-0.125) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.807 (+/-0.138) for {'n_neighbors': 35, 'p': 2, 'w

In [12]:
%%time
# Build the inputs for the 'verytoxic' target from the Mordred features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'verytoxic', encoder = encoder_verytoxic)

# Search knn_grid_parameters_des with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
vt_knn_mordred = model_selection(knn_clf, knn_grid_parameters_des, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 45, 'p': 1, 'weights': 'distance'}
Best score: 0.8258538199032514
Grid scores on development set:

0.810 (+/-0.103) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.799 (+/-0.110) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.824 (+/-0.107) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.809 (+/-0.117) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.822 (+/-0.126) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.820 (+/-0.126) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.823 (+/-0.128) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.823 (+/-0.124) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.825 (+/-0.126) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.820 (+/-0.126) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.823 (+/-0.128) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.818 (+/-0.129) for {'n_neighbors': 35, 'p': 2, 'w

## Endpoint 2: toxic

In [25]:
# Load the saved encoder for the 'toxic' target (presumably a fitted LabelEncoder —
# confirm); it is passed to prepare_input in the cells below.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
encoder_toxic = joblib.load('../data/label_encoders/encoder_toxic.joblib')

In [28]:
%%time
# Build the inputs for the 'toxic' target from the ECFP6 bit features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'toxic', encoder = encoder_toxic)

# Search knn_grid_parameters_fp with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
t_knn_ecfp6bits = model_selection(knn_clf, knn_grid_parameters_fp, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 19, 'weights': 'distance'}
Best score: 0.7763432868206271
Grid scores on development set:

0.760 (+/-0.064) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.770 (+/-0.055) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.775 (+/-0.051) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.774 (+/-0.055) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.773 (+/-0.060) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.772 (+/-0.061) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.766 (+/-0.067) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.764 (+/-0.067) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.761 (+/-0.065) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.756 (+/-0.065) for {'metric': 'jaccard', 'n_neighbors': 71, 'wei

In [29]:
%%time
# Build the inputs for the 'toxic' target from the MACCS features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'toxic', encoder = encoder_toxic)

# Search knn_grid_parameters_fp with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
t_knn_maccs = model_selection(knn_clf, knn_grid_parameters_fp, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'rogerstanimoto', 'n_neighbors': 23, 'weights': 'distance'}
Best score: 0.799649224423121
Grid scores on development set:

0.791 (+/-0.082) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.798 (+/-0.081) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.798 (+/-0.085) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.798 (+/-0.087) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.796 (+/-0.086) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.796 (+/-0.086) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.792 (+/-0.086) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.790 (+/-0.081) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.787 (+/-0.081) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.784 (+/-0.078) for {'metric': 'jaccard', 'n_neighbors':

In [41]:
%%time
# Build the inputs for the 'toxic' target from the ECFP6 count features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'toxic', encoder = encoder_toxic)

# Search knn_grid_parameters_fpcounts with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
t_knn_ecfp6count = model_selection(knn_clf, knn_grid_parameters_fpcounts, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'braycurtis', 'n_neighbors': 19, 'weights': 'distance'}
Best score: 0.7775241942906674
Grid scores on development set:

0.694 (+/-0.080) for {'metric': 'hamming', 'n_neighbors': 5, 'weights': 'distance'}
0.700 (+/-0.082) for {'metric': 'hamming', 'n_neighbors': 9, 'weights': 'distance'}
0.699 (+/-0.069) for {'metric': 'hamming', 'n_neighbors': 15, 'weights': 'distance'}
0.693 (+/-0.070) for {'metric': 'hamming', 'n_neighbors': 19, 'weights': 'distance'}
0.690 (+/-0.074) for {'metric': 'hamming', 'n_neighbors': 25, 'weights': 'distance'}
0.685 (+/-0.073) for {'metric': 'hamming', 'n_neighbors': 35, 'weights': 'distance'}
0.689 (+/-0.085) for {'metric': 'hamming', 'n_neighbors': 45, 'weights': 'distance'}
0.680 (+/-0.090) for {'metric': 'hamming', 'n_neighbors': 55, 'weights': 'distance'}
0.681 (+/-0.079) for {'metric': 'hamming', 'n_neighbors': 71, 'weights': 'distance'}
0.719 (+/-0.073) for {'metric': 'canberra', 'n_neighbors': 5

In [42]:
%%time
# Build the inputs for the 'toxic' target from the RDKit 2D features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'toxic', encoder = encoder_toxic)

# Search knn_grid_parameters_des with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
t_knn_rdkit2d = model_selection(knn_clf, knn_grid_parameters_des, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
Best score: 0.8029854704696326
Grid scores on development set:

0.795 (+/-0.076) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.776 (+/-0.078) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.801 (+/-0.080) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.782 (+/-0.082) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.803 (+/-0.071) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.780 (+/-0.079) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.801 (+/-0.068) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.778 (+/-0.075) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.798 (+/-0.068) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.776 (+/-0.071) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.792 (+/-0.069) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.771 (+/-0.065) for {'n_neighbors': 35, 'p': 2, 'w

In [43]:
%%time
# Build the inputs for the 'toxic' target from the Mordred features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'toxic', encoder = encoder_toxic)

# Search knn_grid_parameters_des with knn_clf (GridSearch=True, cv=5), scored by ROC AUC.
t_knn_mordred = model_selection(knn_clf, knn_grid_parameters_des, a, c, scoring = 'roc_auc', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
Best score: 0.8041736775310927
Grid scores on development set:

0.794 (+/-0.072) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.783 (+/-0.075) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.803 (+/-0.071) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.791 (+/-0.083) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.804 (+/-0.070) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.790 (+/-0.081) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.802 (+/-0.074) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.789 (+/-0.079) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.800 (+/-0.071) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.788 (+/-0.075) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.797 (+/-0.069) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.784 (+/-0.073) for {'n_neighbors': 35, 'p': 2, 'w

## Endpoint 3: EPA

In [56]:
# Load the saved encoder for the 'EPA_category' target (presumably a fitted LabelEncoder —
# confirm); it is passed to prepare_input in the cells below.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
encoder_epa = joblib.load('../data/label_encoders/encoder_epa.joblib')

In [57]:
%%time
# Build the inputs for the 'EPA_category' target from the ECFP6 bit features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'EPA_category', encoder = encoder_epa)

# Search knn_grid_parameters_fp with knn_clf (GridSearch=True, cv=5), scored by weighted F1.
epa_knn_ecfp6bits = model_selection(knn_clf, knn_grid_parameters_fp, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 9, 'weights': 'distance'}
Best score: 0.5270581333255882
Grid scores on development set:

0.525 (+/-0.025) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.526 (+/-0.036) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.523 (+/-0.046) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.513 (+/-0.057) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.510 (+/-0.054) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.509 (+/-0.061) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.493 (+/-0.067) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.482 (+/-0.058) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.470 (+/-0.061) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.457 (+/-0.065) for {'metric': 'jaccard', 'n_neighbors': 71, 'weig

In [58]:
%%time
# Build the inputs for the 'EPA_category' target from the MACCS features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'EPA_category', encoder = encoder_epa)

# Search knn_grid_parameters_fp with knn_clf (GridSearch=True, cv=5), scored by weighted F1.
epa_knn_maccs = model_selection(knn_clf, knn_grid_parameters_fp, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'rogerstanimoto', 'n_neighbors': 15, 'weights': 'distance'}
Best score: 0.5428375675642761
Grid scores on development set:

0.532 (+/-0.047) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
0.539 (+/-0.052) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
0.533 (+/-0.068) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
0.533 (+/-0.072) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
0.530 (+/-0.070) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
0.531 (+/-0.070) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
0.522 (+/-0.079) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
0.516 (+/-0.079) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
0.508 (+/-0.071) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
0.499 (+/-0.064) for {'metric': 'jaccard', 'n_neighbors'

In [59]:
%%time
# Build the inputs for the 'EPA_category' target from the ECFP6 count features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'EPA_category', encoder = encoder_epa)

# Search knn_grid_parameters_fpcounts with knn_clf (GridSearch=True, cv=5), scored by weighted F1.
epa_knn_ecfp6count = model_selection(knn_clf, knn_grid_parameters_fpcounts, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'braycurtis', 'n_neighbors': 9, 'weights': 'distance'}
Best score: 0.526256890725965
Grid scores on development set:

0.449 (+/-0.054) for {'metric': 'hamming', 'n_neighbors': 5, 'weights': 'distance'}
0.443 (+/-0.074) for {'metric': 'hamming', 'n_neighbors': 9, 'weights': 'distance'}
0.425 (+/-0.093) for {'metric': 'hamming', 'n_neighbors': 15, 'weights': 'distance'}
0.409 (+/-0.088) for {'metric': 'hamming', 'n_neighbors': 19, 'weights': 'distance'}
0.409 (+/-0.082) for {'metric': 'hamming', 'n_neighbors': 25, 'weights': 'distance'}
0.408 (+/-0.064) for {'metric': 'hamming', 'n_neighbors': 35, 'weights': 'distance'}
0.405 (+/-0.051) for {'metric': 'hamming', 'n_neighbors': 45, 'weights': 'distance'}
0.404 (+/-0.043) for {'metric': 'hamming', 'n_neighbors': 55, 'weights': 'distance'}
0.390 (+/-0.039) for {'metric': 'hamming', 'n_neighbors': 71, 'weights': 'distance'}
0.478 (+/-0.045) for {'metric': 'canberra', 'n_neighbors': 5, 

In [60]:
%%time
# Build the inputs for the 'EPA_category' target from the RDKit 2D features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'EPA_category', encoder = encoder_epa)

# Search knn_grid_parameters_des with knn_clf (GridSearch=True, cv=5), scored by weighted F1.
epa_knn_rdkit2d = model_selection(knn_clf, knn_grid_parameters_des, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: 0.5360856666601568
Grid scores on development set:

0.533 (+/-0.043) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.523 (+/-0.041) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.536 (+/-0.055) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.522 (+/-0.052) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.532 (+/-0.071) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.512 (+/-0.062) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.528 (+/-0.065) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.507 (+/-0.061) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.522 (+/-0.072) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.504 (+/-0.073) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.510 (+/-0.084) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.492 (+/-0.079) for {'n_neighbors': 35, 'p': 2, 'we

In [61]:
%%time
# Build the inputs for the 'EPA_category' target from the Mordred features.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'EPA_category', encoder = encoder_epa)

# Search knn_grid_parameters_des with knn_clf (GridSearch=True, cv=5), scored by weighted F1.
epa_knn_mordred = model_selection(knn_clf, knn_grid_parameters_des, a, c, scoring = 'f1_weighted', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: 0.5379409379014389
Grid scores on development set:

0.537 (+/-0.039) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.528 (+/-0.044) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.538 (+/-0.038) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
0.529 (+/-0.053) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
0.534 (+/-0.050) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
0.528 (+/-0.060) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
0.527 (+/-0.064) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
0.523 (+/-0.061) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
0.526 (+/-0.065) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
0.512 (+/-0.062) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
0.509 (+/-0.066) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
0.499 (+/-0.061) for {'n_neighbors': 35, 'p': 2, 'we

## Endpoint 4: logLD50

In [50]:
%%time
# Build the inputs for the 'logLD50_mmolkg' target from the ECFP6 bit features; no encoder is passed.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'logLD50_mmolkg', encoder = None)

# Search knn_grid_parameters_fp with knn_reg (GridSearch=True, cv=5), scored by negative mean squared error.
ld50_knn_ecfp6bits = model_selection(knn_reg, knn_grid_parameters_fp, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'dice', 'n_neighbors': 15, 'weights': 'distance'}
Best score: -0.5328170551774813
Grid scores on development set:

-0.554 (+/-0.155) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
-0.542 (+/-0.169) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
-0.537 (+/-0.176) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
-0.542 (+/-0.173) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
-0.546 (+/-0.172) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
-0.549 (+/-0.171) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
-0.559 (+/-0.168) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
-0.566 (+/-0.168) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
-0.576 (+/-0.172) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
-0.589 (+/-0.184) for {'metric': 'jaccard', 'n_neighbors

In [51]:
%%time
# Build the inputs for the 'logLD50_mmolkg' target from the MACCS features; no encoder is passed.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'logLD50_mmolkg', encoder = None)

# Search knn_grid_parameters_fp with knn_reg (GridSearch=True, cv=5), scored by negative mean squared error.
ld50_knn_maccs = model_selection(knn_reg, knn_grid_parameters_fp, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'rogerstanimoto', 'n_neighbors': 9, 'weights': 'distance'}
Best score: -0.4895571637640469
Grid scores on development set:

-0.507 (+/-0.086) for {'metric': 'jaccard', 'n_neighbors': 5, 'weights': 'distance'}
-0.501 (+/-0.107) for {'metric': 'jaccard', 'n_neighbors': 9, 'weights': 'distance'}
-0.504 (+/-0.137) for {'metric': 'jaccard', 'n_neighbors': 15, 'weights': 'distance'}
-0.509 (+/-0.146) for {'metric': 'jaccard', 'n_neighbors': 19, 'weights': 'distance'}
-0.516 (+/-0.149) for {'metric': 'jaccard', 'n_neighbors': 23, 'weights': 'distance'}
-0.520 (+/-0.149) for {'metric': 'jaccard', 'n_neighbors': 25, 'weights': 'distance'}
-0.532 (+/-0.152) for {'metric': 'jaccard', 'n_neighbors': 35, 'weights': 'distance'}
-0.543 (+/-0.155) for {'metric': 'jaccard', 'n_neighbors': 45, 'weights': 'distance'}
-0.552 (+/-0.156) for {'metric': 'jaccard', 'n_neighbors': 55, 'weights': 'distance'}
-0.561 (+/-0.157) for {'metric': 'jaccard', 'n_

In [52]:
%%time
# Build the inputs for the 'logLD50_mmolkg' target from the ECFP6 count features; no encoder is passed.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'logLD50_mmolkg', encoder = None)

# Search knn_grid_parameters_fpcounts with knn_reg (GridSearch=True, cv=5), scored by negative mean squared error.
ld50_knn_ecfp6count = model_selection(knn_reg, knn_grid_parameters_fpcounts, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)

Best parameters set found on development set: {'metric': 'braycurtis', 'n_neighbors': 15, 'weights': 'distance'}
Best score: -0.5298963472744054
Grid scores on development set:

-0.766 (+/-0.252) for {'metric': 'hamming', 'n_neighbors': 5, 'weights': 'distance'}
-0.771 (+/-0.274) for {'metric': 'hamming', 'n_neighbors': 9, 'weights': 'distance'}
-0.788 (+/-0.325) for {'metric': 'hamming', 'n_neighbors': 15, 'weights': 'distance'}
-0.805 (+/-0.353) for {'metric': 'hamming', 'n_neighbors': 19, 'weights': 'distance'}
-0.828 (+/-0.382) for {'metric': 'hamming', 'n_neighbors': 25, 'weights': 'distance'}
-0.858 (+/-0.413) for {'metric': 'hamming', 'n_neighbors': 35, 'weights': 'distance'}
-0.883 (+/-0.436) for {'metric': 'hamming', 'n_neighbors': 45, 'weights': 'distance'}
-0.901 (+/-0.449) for {'metric': 'hamming', 'n_neighbors': 55, 'weights': 'distance'}
-0.915 (+/-0.459) for {'metric': 'hamming', 'n_neighbors': 71, 'weights': 'distance'}
-0.761 (+/-0.259) for {'metric': 'canberra', 'n_ne

In [53]:
%%time
# Build the inputs for the 'logLD50_mmolkg' target from the RDKit 2D features; no encoder is passed.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'logLD50_mmolkg', encoder = None)

# Search knn_grid_parameters_des with knn_reg (GridSearch=True, cv=5), scored by negative mean squared error.
ld50_knn_rdkit2d = model_selection(knn_reg, knn_grid_parameters_des, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: -0.49749231245615383
Grid scores on development set:

-0.511 (+/-0.130) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
-0.550 (+/-0.161) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
-0.497 (+/-0.131) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
-0.539 (+/-0.164) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
-0.506 (+/-0.140) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
-0.539 (+/-0.167) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
-0.512 (+/-0.141) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
-0.543 (+/-0.162) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
-0.519 (+/-0.151) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-0.553 (+/-0.164) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
-0.534 (+/-0.151) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
-0.566 (+/-0.162) for {'n_neighbors': 3

In [54]:
%%time
# Build the inputs for the 'logLD50_mmolkg' target from the Mordred features; no encoder is passed.
# prepare_input comes from outside this notebook (presumably utils); only a and c are used below.
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'logLD50_mmolkg', encoder = None)

# Search knn_grid_parameters_des with knn_reg (GridSearch=True, cv=5), scored by negative mean squared error.
ld50_knn_mordred = model_selection(knn_reg, knn_grid_parameters_des, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = True)

Best parameters set found on development set: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Best score: -0.4715369782401039
Grid scores on development set:

-0.477 (+/-0.083) for {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
-0.497 (+/-0.090) for {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
-0.472 (+/-0.119) for {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
-0.492 (+/-0.110) for {'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
-0.481 (+/-0.139) for {'n_neighbors': 15, 'p': 1, 'weights': 'distance'}
-0.496 (+/-0.129) for {'n_neighbors': 15, 'p': 2, 'weights': 'distance'}
-0.487 (+/-0.139) for {'n_neighbors': 19, 'p': 1, 'weights': 'distance'}
-0.503 (+/-0.130) for {'n_neighbors': 19, 'p': 2, 'weights': 'distance'}
-0.494 (+/-0.135) for {'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
-0.508 (+/-0.131) for {'n_neighbors': 25, 'p': 2, 'weights': 'distance'}
-0.510 (+/-0.135) for {'n_neighbors': 35, 'p': 1, 'weights': 'distance'}
-0.520 (+/-0.129) for {'n_neighbors': 35

# SVM

## Endpoint 1: verytoxic

## Endpoint 1: toxic

In [9]:
# Load the stored encoder for the 'toxic' endpoint; it is passed to prepare_input
# in the SVM cells below (presumably a fitted label encoder -- confirm).
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
encoder_toxic = joblib.load('../data/label_encoders/encoder_toxic.joblib')

In [27]:
%%time
# Tune `svc` on the train_ecfp6_bits features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='toxic',
    encoder=encoder_toxic,
)

t_svc_ecfp6bits = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
    n_jobs=6,
)



Best parameters set found on development set: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.7613998340826534
Grid scores on development set:

0.740 (+/-0.067) for {'C': 0.01, 'kernel': 'linear'}
0.719 (+/-0.050) for {'C': 0.1, 'kernel': 'linear'}
0.685 (+/-0.033) for {'C': 1, 'kernel': 'linear'}
0.666 (+/-0.022) for {'C': 10, 'kernel': 'linear'}
0.662 (+/-0.023) for {'C': 100, 'kernel': 'linear'}
0.662 (+/-0.025) for {'C': 200, 'kernel': 'linear'}
0.662 (+/-0.024) for {'C': 400, 'kernel': 'linear'}
0.662 (+/-0.024) for {'C': 1000, 'kernel': 'linear'}
0.500 (+/-0.001) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
0.501 (+/-0.002) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
0.670 (+/-0.069) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
0.702 (+/-0.085) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.716 (+/-0.090) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
0.711 (+/-0.076) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
0.500 (+/-0.001) for {'C': 0.1, 'gamma': 100, 'ker

In [10]:
%%time
# Tune `svc` on the train_maccs features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='toxic',
    encoder=encoder_toxic,
)

t_svc_maccs = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
    n_jobs=1,
)

Best parameters set found on development set: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Best score: 0.8139581855724844
Grid scores on development set:

0.748 (+/-0.078) for {'C': 0.01, 'kernel': 'linear'}
0.749 (+/-0.084) for {'C': 0.1, 'kernel': 'linear'}
0.746 (+/-0.083) for {'C': 1, 'kernel': 'linear'}
0.745 (+/-0.082) for {'C': 10, 'kernel': 'linear'}
0.745 (+/-0.082) for {'C': 100, 'kernel': 'linear'}
0.745 (+/-0.082) for {'C': 200, 'kernel': 'linear'}
0.745 (+/-0.082) for {'C': 400, 'kernel': 'linear'}
0.745 (+/-0.082) for {'C': 1000, 'kernel': 'linear'}
0.509 (+/-0.006) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
0.614 (+/-0.070) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
0.753 (+/-0.109) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
0.737 (+/-0.122) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.715 (+/-0.126) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
0.706 (+/-0.116) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
0.509 (+/-0.006) for {'C': 0.1, 'gamma': 100, 'kern

In [28]:
%%time
# Tune `svc` on the train_ecfp6_counts features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='toxic',
    encoder=encoder_toxic,
)

t_svc_ecfp6counts = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
    n_jobs=6,
)



Best parameters set found on development set: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.7891994176406041
Grid scores on development set:

0.748 (+/-0.044) for {'C': 0.01, 'kernel': 'linear'}
0.718 (+/-0.033) for {'C': 0.1, 'kernel': 'linear'}
0.679 (+/-0.022) for {'C': 1, 'kernel': 'linear'}
0.666 (+/-0.019) for {'C': 10, 'kernel': 'linear'}
0.662 (+/-0.021) for {'C': 100, 'kernel': 'linear'}
0.662 (+/-0.022) for {'C': 200, 'kernel': 'linear'}
0.661 (+/-0.021) for {'C': 400, 'kernel': 'linear'}
0.661 (+/-0.022) for {'C': 1000, 'kernel': 'linear'}
0.500 (+/-0.000) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
0.500 (+/-0.001) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
0.608 (+/-0.030) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
0.714 (+/-0.035) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.695 (+/-0.082) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
0.680 (+/-0.070) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
0.500 (+/-0.000) for {'C': 0.1, 'gamma': 100, 'ker

In [11]:
%%time
# Tune `svc` on the train_rdkit2d features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='toxic',
    encoder=encoder_toxic,
)

t_svc_rdkit2d = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
    n_jobs=1,
)

Best parameters set found on development set: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Best score: 0.8121959351290623
Grid scores on development set:

0.693 (+/-0.038) for {'C': 0.01, 'kernel': 'linear'}
0.729 (+/-0.038) for {'C': 0.1, 'kernel': 'linear'}
0.751 (+/-0.052) for {'C': 1, 'kernel': 'linear'}
0.756 (+/-0.058) for {'C': 10, 'kernel': 'linear'}
0.756 (+/-0.057) for {'C': 100, 'kernel': 'linear'}
0.757 (+/-0.056) for {'C': 200, 'kernel': 'linear'}
0.757 (+/-0.056) for {'C': 400, 'kernel': 'linear'}
0.758 (+/-0.055) for {'C': 1000, 'kernel': 'linear'}
0.725 (+/-0.089) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
0.759 (+/-0.078) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
0.720 (+/-0.066) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
0.686 (+/-0.049) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.679 (+/-0.045) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
0.672 (+/-0.047) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
0.728 (+/-0.088) for {'C': 0.1, 'gamma': 100, 'kernel

In [12]:
%%time
# Tune `svc` on the train_mordred features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='toxic',
    encoder=encoder_toxic,
)

t_svc_mordred = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=True,
    n_jobs=1,
)

Best parameters set found on development set: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best score: 0.8212399984635705
Grid scores on development set:

0.742 (+/-0.065) for {'C': 0.01, 'kernel': 'linear'}
0.771 (+/-0.065) for {'C': 0.1, 'kernel': 'linear'}
0.778 (+/-0.062) for {'C': 1, 'kernel': 'linear'}
0.772 (+/-0.054) for {'C': 10, 'kernel': 'linear'}
0.765 (+/-0.057) for {'C': 100, 'kernel': 'linear'}
0.763 (+/-0.060) for {'C': 200, 'kernel': 'linear'}
0.762 (+/-0.062) for {'C': 400, 'kernel': 'linear'}
0.760 (+/-0.068) for {'C': 1000, 'kernel': 'linear'}
0.609 (+/-0.085) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
0.750 (+/-0.089) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
0.743 (+/-0.067) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
0.720 (+/-0.096) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.711 (+/-0.074) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
0.709 (+/-0.070) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
0.616 (+/-0.089) for {'C': 0.1, 'gamma': 100, 'ker

## Endpoint 2: EPA

In [29]:
# Load the stored encoder for the 'EPA_category' endpoint; it is passed to
# prepare_input in the SVM cells below (presumably a fitted label encoder -- confirm).
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
encoder_epa = joblib.load('../data/label_encoders/encoder_epa.joblib')

In [30]:
%%time
# Tune `svc` on the train_ecfp6_bits features for the 'EPA_category' endpoint
# (scoring: weighted F1, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_svc_ecfp6bits = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
    n_jobs=6,
)



Best parameters set found on development set: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.5016354482846775
Grid scores on development set:

0.450 (+/-0.049) for {'C': 0.01, 'kernel': 'linear'}
0.488 (+/-0.043) for {'C': 0.1, 'kernel': 'linear'}
0.438 (+/-0.021) for {'C': 1, 'kernel': 'linear'}
0.422 (+/-0.019) for {'C': 10, 'kernel': 'linear'}
0.423 (+/-0.024) for {'C': 100, 'kernel': 'linear'}
0.425 (+/-0.028) for {'C': 200, 'kernel': 'linear'}
0.422 (+/-0.027) for {'C': 400, 'kernel': 'linear'}
0.421 (+/-0.029) for {'C': 1000, 'kernel': 'linear'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.1, 'gamma': 100, 'ke

In [31]:
%%time
# Tune `svc` on the train_ecfp6_counts features for the 'EPA_category' endpoint
# (scoring: weighted F1, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_svc_ecfp6counts = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
    n_jobs=6,
)



Best parameters set found on development set: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Best score: 0.5228362406712951
Grid scores on development set:

0.464 (+/-0.047) for {'C': 0.01, 'kernel': 'linear'}
0.490 (+/-0.037) for {'C': 0.1, 'kernel': 'linear'}
0.446 (+/-0.025) for {'C': 1, 'kernel': 'linear'}
0.424 (+/-0.022) for {'C': 10, 'kernel': 'linear'}
0.418 (+/-0.033) for {'C': 100, 'kernel': 'linear'}
0.418 (+/-0.030) for {'C': 200, 'kernel': 'linear'}
0.420 (+/-0.028) for {'C': 400, 'kernel': 'linear'}
0.417 (+/-0.023) for {'C': 1000, 'kernel': 'linear'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 100, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 10, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
0.334 (+/-0.001) for {'C': 0.1, 'gamma': 100, 'ke

In [21]:
%%time
# Tune `svc` on the train_maccs features for the 'EPA_category' endpoint
# (scoring: weighted F1, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
# NOTE(review): the recorded output of this cell lists only 'C'/'gamma' candidates,
# while the ecfp6 runs also list 'kernel'; svm_grid_parameters apparently differed
# between executions -- re-run before comparing scores across feature sets.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_svc_maccs = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'C': 10, 'gamma': 0.1}
Best score: 0.5387228642293098
Grid scores on development set:

0.334 (+/-0.001) for {'C': 0.1, 'gamma': 10}
0.336 (+/-0.006) for {'C': 0.1, 'gamma': 1}
0.367 (+/-0.041) for {'C': 0.1, 'gamma': 0.1}
0.334 (+/-0.001) for {'C': 0.1, 'gamma': 0.01}
0.349 (+/-0.012) for {'C': 1, 'gamma': 10}
0.362 (+/-0.038) for {'C': 1, 'gamma': 1}
0.532 (+/-0.087) for {'C': 1, 'gamma': 0.1}
0.443 (+/-0.019) for {'C': 1, 'gamma': 0.01}
0.349 (+/-0.012) for {'C': 10, 'gamma': 10}
0.375 (+/-0.038) for {'C': 10, 'gamma': 1}
0.539 (+/-0.038) for {'C': 10, 'gamma': 0.1}
0.526 (+/-0.042) for {'C': 10, 'gamma': 0.01}
0.349 (+/-0.012) for {'C': 100, 'gamma': 10}
0.375 (+/-0.038) for {'C': 100, 'gamma': 1}
0.538 (+/-0.032) for {'C': 100, 'gamma': 0.1}
0.509 (+/-0.041) for {'C': 100, 'gamma': 0.01}
0.349 (+/-0.012) for {'C': 500, 'gamma': 10}
0.375 (+/-0.038) for {'C': 500, 'gamma': 1}
0.538 (+/-0.034) for {'C': 500, 'gamma': 0.1}
0.482 (+/-0.035

In [22]:
%%time
# Tune `svc` on the train_rdkit2d features for the 'EPA_category' endpoint
# (scoring: weighted F1, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_svc_rdkit2d = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'C': 10, 'gamma': 1}
Best score: 0.5411610907931664
Grid scores on development set:

0.337 (+/-0.005) for {'C': 0.1, 'gamma': 10}
0.350 (+/-0.021) for {'C': 0.1, 'gamma': 1}
0.335 (+/-0.002) for {'C': 0.1, 'gamma': 0.1}
0.334 (+/-0.001) for {'C': 0.1, 'gamma': 0.01}
0.456 (+/-0.093) for {'C': 1, 'gamma': 10}
0.502 (+/-0.073) for {'C': 1, 'gamma': 1}
0.363 (+/-0.025) for {'C': 1, 'gamma': 0.1}
0.335 (+/-0.001) for {'C': 1, 'gamma': 0.01}
0.480 (+/-0.082) for {'C': 10, 'gamma': 10}
0.541 (+/-0.053) for {'C': 10, 'gamma': 1}
0.467 (+/-0.054) for {'C': 10, 'gamma': 0.1}
0.351 (+/-0.013) for {'C': 10, 'gamma': 0.01}
0.481 (+/-0.082) for {'C': 100, 'gamma': 10}
0.512 (+/-0.053) for {'C': 100, 'gamma': 1}
0.522 (+/-0.050) for {'C': 100, 'gamma': 0.1}
0.432 (+/-0.029) for {'C': 100, 'gamma': 0.01}
0.482 (+/-0.084) for {'C': 500, 'gamma': 10}
0.504 (+/-0.046) for {'C': 500, 'gamma': 1}
0.528 (+/-0.053) for {'C': 500, 'gamma': 0.1}
0.464 (+/-0.056) 

In [12]:
%%time
# Tune `svc` on the train_mordred features for the 'EPA_category' endpoint
# (scoring: weighted F1, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_svc_mordred = model_selection(
    svc,
    svm_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=True,
)



Best parameters set found on development set: {'C': 10, 'gamma': 1}
Best score: 0.5373111143740519
Grid scores on development set:

0.334 (+/-0.001) for {'C': 0.1, 'gamma': 10}
0.356 (+/-0.008) for {'C': 0.1, 'gamma': 1}
0.336 (+/-0.003) for {'C': 0.1, 'gamma': 0.1}
0.334 (+/-0.001) for {'C': 0.1, 'gamma': 0.01}
0.348 (+/-0.018) for {'C': 1, 'gamma': 10}
0.524 (+/-0.082) for {'C': 1, 'gamma': 1}
0.450 (+/-0.079) for {'C': 1, 'gamma': 0.1}
0.338 (+/-0.005) for {'C': 1, 'gamma': 0.01}
0.366 (+/-0.039) for {'C': 10, 'gamma': 10}
0.537 (+/-0.054) for {'C': 10, 'gamma': 1}
0.534 (+/-0.056) for {'C': 10, 'gamma': 0.1}
0.425 (+/-0.071) for {'C': 10, 'gamma': 0.01}
0.366 (+/-0.039) for {'C': 100, 'gamma': 10}
0.533 (+/-0.052) for {'C': 100, 'gamma': 1}
0.534 (+/-0.047) for {'C': 100, 'gamma': 0.1}
0.485 (+/-0.074) for {'C': 100, 'gamma': 0.01}
0.366 (+/-0.039) for {'C': 500, 'gamma': 10}
0.533 (+/-0.052) for {'C': 500, 'gamma': 1}
0.508 (+/-0.076) for {'C': 500, 'gamma': 0.1}
0.528 (+/-0.036) 

In [18]:
# Post-process the EPA / SVC / Mordred search result, tagged with the label 'epa_svc_mordred'.
# NOTE(review): result_model_selection is defined outside this section; the `df_` prefix
# suggests it returns a DataFrame summary of the search -- confirm.
df_epa_svc_mordred = result_model_selection(epa_svc_mordred, 'epa_svc_mordred')

## Endpoint 3: logLD50

In [36]:
%%time
# Tune `svr` on the train_ecfp6_bits features for the 'logLD50_mmolkg' target
# (scoring: negative mean squared error, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and target returned by
# prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_svr_ecfp6bits = model_selection(
    svr,
    svm_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
    n_jobs=6,
)

Best parameters set found on development set: {'C': 1, 'gamma': 0.01}
Best score: -0.5501342304980378
Grid scores on development set:

-0.827 (+/-0.309) for {'C': 1, 'gamma': 10}
-0.827 (+/-0.309) for {'C': 1, 'gamma': 1}
-0.627 (+/-0.226) for {'C': 1, 'gamma': 0.1}
-0.550 (+/-0.191) for {'C': 1, 'gamma': 0.01}
-0.823 (+/-0.278) for {'C': 10, 'gamma': 10}
-0.823 (+/-0.277) for {'C': 10, 'gamma': 1}
-0.613 (+/-0.202) for {'C': 10, 'gamma': 0.1}
-0.582 (+/-0.184) for {'C': 10, 'gamma': 0.01}
-0.823 (+/-0.278) for {'C': 100, 'gamma': 10}
-0.823 (+/-0.277) for {'C': 100, 'gamma': 1}
-0.613 (+/-0.202) for {'C': 100, 'gamma': 0.1}
-0.665 (+/-0.141) for {'C': 100, 'gamma': 0.01}
CPU times: user 1min, sys: 550 ms, total: 1min
Wall time: 14min 5s


In [37]:
%%time
# Tune `svr` on the train_ecfp6_counts features for the 'logLD50_mmolkg' target
# (scoring: negative mean squared error, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and target returned by
# prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_svr_ecfp6counts = model_selection(
    svr,
    svm_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
    n_jobs=6,
)

Best parameters set found on development set: {'C': 10, 'gamma': 0.01}
Best score: -0.5225856535556385
Grid scores on development set:

-0.827 (+/-0.309) for {'C': 1, 'gamma': 10}
-0.827 (+/-0.309) for {'C': 1, 'gamma': 1}
-0.738 (+/-0.287) for {'C': 1, 'gamma': 0.1}
-0.528 (+/-0.208) for {'C': 1, 'gamma': 0.01}
-0.823 (+/-0.278) for {'C': 10, 'gamma': 10}
-0.823 (+/-0.278) for {'C': 10, 'gamma': 1}
-0.726 (+/-0.264) for {'C': 10, 'gamma': 0.1}
-0.523 (+/-0.193) for {'C': 10, 'gamma': 0.01}
-0.823 (+/-0.278) for {'C': 100, 'gamma': 10}
-0.823 (+/-0.278) for {'C': 100, 'gamma': 1}
-0.726 (+/-0.264) for {'C': 100, 'gamma': 0.1}
-0.546 (+/-0.191) for {'C': 100, 'gamma': 0.01}
CPU times: user 1min 2s, sys: 173 ms, total: 1min 2s
Wall time: 13min 55s


In [23]:
%%time
# Tune `svr` on the train_maccs features for the 'logLD50_mmolkg' target
# (scoring: negative mean squared error, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and target returned by
# prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_svr_maccs = model_selection(
    svr,
    svm_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'C': 10, 'gamma': 0.1}
Best score: -0.45288626135182813
Grid scores on development set:

-0.837 (+/-0.322) for {'C': 0.1, 'gamma': 10}
-0.818 (+/-0.304) for {'C': 0.1, 'gamma': 1}
-0.573 (+/-0.176) for {'C': 0.1, 'gamma': 0.1}
-0.623 (+/-0.217) for {'C': 0.1, 'gamma': 0.01}
-0.812 (+/-0.279) for {'C': 1, 'gamma': 10}
-0.766 (+/-0.262) for {'C': 1, 'gamma': 1}
-0.464 (+/-0.141) for {'C': 1, 'gamma': 0.1}
-0.522 (+/-0.141) for {'C': 1, 'gamma': 0.01}
-0.808 (+/-0.254) for {'C': 10, 'gamma': 10}
-0.754 (+/-0.243) for {'C': 10, 'gamma': 1}
-0.453 (+/-0.084) for {'C': 10, 'gamma': 0.1}
-0.478 (+/-0.163) for {'C': 10, 'gamma': 0.01}
-0.808 (+/-0.254) for {'C': 100, 'gamma': 10}
-0.754 (+/-0.243) for {'C': 100, 'gamma': 1}
-0.457 (+/-0.081) for {'C': 100, 'gamma': 0.1}
-0.554 (+/-0.177) for {'C': 100, 'gamma': 0.01}
-0.808 (+/-0.254) for {'C': 500, 'gamma': 10}
-0.754 (+/-0.243) for {'C': 500, 'gamma': 1}
-0.457 (+/-0.081) for {'C': 500, 'gamma':

In [24]:
%%time
# Tune `svr` on the train_rdkit2d features for the 'logLD50_mmolkg' target
# (scoring: negative mean squared error, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and target returned by
# prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_svr_rdkit2d = model_selection(
    svr,
    svm_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
)

Best parameters set found on development set: {'C': 1, 'gamma': 1}
Best score: -0.4950215680734803
Grid scores on development set:

-0.729 (+/-0.293) for {'C': 0.1, 'gamma': 10}
-0.579 (+/-0.175) for {'C': 0.1, 'gamma': 1}
-0.680 (+/-0.267) for {'C': 0.1, 'gamma': 0.1}
-0.786 (+/-0.330) for {'C': 0.1, 'gamma': 0.01}
-0.588 (+/-0.221) for {'C': 1, 'gamma': 10}
-0.495 (+/-0.148) for {'C': 1, 'gamma': 1}
-0.592 (+/-0.171) for {'C': 1, 'gamma': 0.1}
-0.689 (+/-0.268) for {'C': 1, 'gamma': 0.01}
-0.576 (+/-0.207) for {'C': 10, 'gamma': 10}
-0.500 (+/-0.181) for {'C': 10, 'gamma': 1}
-0.533 (+/-0.140) for {'C': 10, 'gamma': 0.1}
-0.624 (+/-0.180) for {'C': 10, 'gamma': 0.01}
-0.576 (+/-0.208) for {'C': 100, 'gamma': 10}
-0.597 (+/-0.228) for {'C': 100, 'gamma': 1}
-0.533 (+/-0.174) for {'C': 100, 'gamma': 0.1}
-0.579 (+/-0.151) for {'C': 100, 'gamma': 0.01}
-0.576 (+/-0.208) for {'C': 500, 'gamma': 10}
-0.682 (+/-0.246) for {'C': 500, 'gamma': 1}
-0.630 (+/-0.245) for {'C': 500, 'gamma': 0.1

In [25]:
%%time
# Tune `svr` on the train_mordred features for the 'logLD50_mmolkg' target
# (scoring: negative mean squared error, cv=5).
# NOTE(review): `a` and `c` are presumably the feature matrix and target returned by
# prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='logLD50_mmolkg',
    encoder=None,
)

ld50_svr_mordred = model_selection(
    svr,
    svm_grid_parameters,
    a,
    c,
    scoring='neg_mean_squared_error',
    cv=5,
    GridSearch=True,
)



Best parameters set found on development set: {'C': 10, 'gamma': 0.1}
Best score: -0.4613227665771701
Grid scores on development set:

-0.837 (+/-0.332) for {'C': 0.1, 'gamma': 10}
-0.578 (+/-0.175) for {'C': 0.1, 'gamma': 1}
-0.584 (+/-0.171) for {'C': 0.1, 'gamma': 0.1}
-0.687 (+/-0.277) for {'C': 0.1, 'gamma': 0.01}
-0.788 (+/-0.294) for {'C': 1, 'gamma': 10}
-0.473 (+/-0.155) for {'C': 1, 'gamma': 1}
-0.496 (+/-0.099) for {'C': 1, 'gamma': 0.1}
-0.601 (+/-0.176) for {'C': 1, 'gamma': 0.01}
-0.776 (+/-0.267) for {'C': 10, 'gamma': 10}
-0.471 (+/-0.140) for {'C': 10, 'gamma': 1}
-0.461 (+/-0.103) for {'C': 10, 'gamma': 0.1}
-0.549 (+/-0.128) for {'C': 10, 'gamma': 0.01}
-0.776 (+/-0.267) for {'C': 100, 'gamma': 10}
-0.476 (+/-0.140) for {'C': 100, 'gamma': 1}
-0.538 (+/-0.164) for {'C': 100, 'gamma': 0.1}
-0.508 (+/-0.120) for {'C': 100, 'gamma': 0.01}
-0.776 (+/-0.267) for {'C': 500, 'gamma': 10}
-0.480 (+/-0.133) for {'C': 500, 'gamma': 1}
-0.683 (+/-0.274) for {'C': 500, 'gamma': 

# Random Forest Model

## Endpoint 1: toxic

In [29]:
# Load the stored encoder for the 'toxic' endpoint; it is passed to prepare_input
# in the Random Forest cells below (presumably a fitted label encoder -- confirm).
# NOTE(review): the same file is also loaded in the SVM section; presumably repeated so
# this section can be run on its own.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
encoder_toxic = joblib.load('../data/label_encoders/encoder_toxic.joblib')

In [33]:
%%time
# Tune `rf_clf` on the train_ecfp6_bits features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): GridSearch=False with n_iter presumably selects a randomized search over
# n_iter sampled candidates -- confirm in model_selection. n_iter is 20 here while the
# maccs/rdkit2d/mordred 'toxic' searches use 30 -- confirm this is intentional.
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='toxic',
    encoder=encoder_toxic,
)

t_rf_ecfp6bits = model_selection(
    rf_clf,
    rf_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=20,
    n_jobs=1,
)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
Best score: 0.7957568767251008
Grid scores on development set:

0.776 (+/-0.070) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
0.788 (+/-0.066) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
0.723 (+/-0.078) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
0.720 (+/-0.078) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': False}
0.788 (+/-0.072) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 80, 'bootstrap': True}
0.7

In [31]:
%%time
# Tune `rf_clf` on the train_maccs features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): GridSearch=False with n_iter=30 presumably selects a randomized search
# over 30 sampled candidates -- confirm in model_selection.
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='toxic',
    encoder=encoder_toxic,
)

t_rf_maccs = model_selection(
    rf_clf,
    rf_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': False}
Best score: 0.8223225294394823
Grid scores on development set:

0.818 (+/-0.066) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
0.813 (+/-0.065) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
0.807 (+/-0.065) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
0.813 (+/-0.066) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
0.822 (+/-0.067) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}


In [30]:
%%time
# Tune `rf_clf` on the train_rdkit2d features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): GridSearch=False with n_iter=30 presumably selects a randomized search
# over 30 sampled candidates -- confirm in model_selection.
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='toxic',
    encoder=encoder_toxic,
)

t_rf_rdkit2d = model_selection(
    rf_clf,
    rf_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 65, 'bootstrap': False}
Best score: 0.8386576205979399
Grid scores on development set:

0.839 (+/-0.058) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 65, 'bootstrap': False}
0.837 (+/-0.059) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
0.829 (+/-0.056) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': True}
0.832 (+/-0.059) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
0.838 (+/-0.057) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False

In [32]:
%%time
# Tune `rf_clf` on the train_mordred features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): GridSearch=False with n_iter=30 presumably selects a randomized search
# over 30 sampled candidates -- confirm in model_selection.
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_mordred,
    target='toxic',
    encoder=encoder_toxic,
)

t_rf_mordred = model_selection(
    rf_clf,
    rf_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
Best score: 0.8351086174639999
Grid scores on development set:

0.832 (+/-0.059) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': False}
0.790 (+/-0.060) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
0.830 (+/-0.060) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
0.782 (+/-0.064) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False}
0.834 (+/-0.061) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}
0.

In [34]:
%%time
# Tune `rf_clf` on the train_ecfp6_counts features for the 'toxic' endpoint
# (scoring: ROC AUC, cv=5).
# NOTE(review): GridSearch=False with n_iter=20 presumably selects a randomized search
# over 20 sampled candidates -- confirm in model_selection.
# NOTE(review): the result name lacks the trailing "s" used by t_svc_ecfp6counts; it is
# kept as-is because later cells may reference it.
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='toxic',
    encoder=encoder_toxic,
)

t_rf_ecfp6count = model_selection(
    rf_clf,
    rf_grid_parameters,
    a,
    c,
    scoring='roc_auc',
    cv=5,
    GridSearch=False,
    n_iter=20,
    n_jobs=1,
)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': True}
Best score: 0.7978769614922325
Grid scores on development set:

0.779 (+/-0.072) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
0.773 (+/-0.074) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': False}
0.787 (+/-0.068) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
0.736 (+/-0.078) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': True}
0.725 (+/-0.071) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
0.767

## Endpoint 2: EPA

In [35]:
# Load the stored encoder for the 'EPA_category' endpoint; it is passed to prepare_input
# in the Random Forest cells below (presumably a fitted label encoder -- confirm).
# NOTE(review): the same file is also loaded in the SVM section; presumably repeated so
# this section can be run on its own.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
encoder_epa = joblib.load('../data/label_encoders/encoder_epa.joblib')

In [82]:
%%time
# Tune `rf_clf` on the train_ecfp6_bits features for the 'EPA_category' endpoint
# (scoring: weighted F1, cv=5).
# NOTE(review): GridSearch=False with n_iter=30 presumably selects a randomized search
# over 30 sampled candidates -- confirm in model_selection.
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_bits,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_rf_ecfp6bits = model_selection(
    rf_clf,
    rf_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
Best score: 0.505328515223785
Grid scores on development set:

0.334 (+/-0.001) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': True}
0.378 (+/-0.022) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
0.463 (+/-0.069) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': True}
0.380 (+/-0.022) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 65, 'bootstrap': True}
0.463 (+/-0.069) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': False}
0.4

In [43]:
%%time
# Tune `rf_clf` on the train_maccs features for the 'EPA_category' endpoint
# (scoring: weighted F1, cv=5).
# NOTE(review): GridSearch=False with n_iter=30 presumably selects a randomized search
# over 30 sampled candidates -- confirm in model_selection.
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_maccs,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_rf_maccs = model_selection(
    rf_clf,
    rf_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}
Best score: 0.5401209830917464
Grid scores on development set:

0.367 (+/-0.034) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False}
0.532 (+/-0.075) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}
0.499 (+/-0.079) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': True}
0.525 (+/-0.073) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': False}
0.392 (+/-0.032) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': False}
0.51

In [44]:
%%time
# Tune `rf_clf` on the train_ecfp6_counts features for the 'EPA_category' endpoint
# (scoring: weighted F1, cv=5).
# NOTE(review): GridSearch=False with n_iter=30 presumably selects a randomized search
# over 30 sampled candidates -- confirm in model_selection.
# NOTE(review): the result name lacks the trailing "s" used by epa_svc_ecfp6counts; it is
# kept as-is because later cells may reference it.
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_ecfp6_counts,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_rf_ecfp6count = model_selection(
    rf_clf,
    rf_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': False}
Best score: 0.501145603112801
Grid scores on development set:

0.458 (+/-0.050) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 80, 'bootstrap': False}
0.336 (+/-0.002) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False}
0.465 (+/-0.058) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
0.416 (+/-0.035) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
0.381 (+/-0.020) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
0.4

In [45]:
%%time
# Tune `rf_clf` on the train_rdkit2d features for the 'EPA_category' endpoint
# (scoring: weighted F1, cv=5).
# NOTE(review): GridSearch=False with n_iter=30 presumably selects a randomized search
# over 30 sampled candidates -- confirm in model_selection.
# NOTE(review): `a` and `c` are presumably the feature matrix and encoded labels
# returned by prepare_input (defined outside this section) -- confirm.
a, b, c, d, e = prepare_input(
    train_labels,
    train_rdkit2d,
    target='EPA_category',
    encoder=encoder_epa,
)

epa_rf_rdkit2d = model_selection(
    rf_clf,
    rf_grid_parameters,
    a,
    c,
    scoring='f1_weighted',
    cv=5,
    GridSearch=False,
    n_iter=30,
    n_jobs=1,
)

  'precision', 'predicted', average, warn_for)


Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
Best score: 0.5466101521057932
Grid scores on development set:

0.516 (+/-0.059) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
0.510 (+/-0.066) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}
0.517 (+/-0.060) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': True}
0.510 (+/-0.065) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 35, 'bootstrap': True}
0.518 (+/-0.069) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
0.521

In [46]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'EPA_category', encoder = encoder_epa)

epa_rf_mordred = model_selection(rf_clf, rf_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: 0.5444247485651312
Grid scores on development set:

0.526 (+/-0.074) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
0.532 (+/-0.071) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': True}
0.525 (+/-0.076) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 65, 'bootstrap': False}
0.525 (+/-0.075) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': False}
0.533 (+/-0.075) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': False}
0

## Endpoint 3: logLD50

In [47]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_ecfp6bits = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: -0.5136635794907726
Grid scores on development set:

-0.579 (+/-0.187) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
-0.560 (+/-0.169) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': True}
-0.543 (+/-0.167) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
-0.565 (+/-0.169) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}
-0.610 (+/-0.197) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 50, 'bootstrap': 

In [48]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_maccs = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
Best score: -0.44620082987210974
Grid scores on development set:

-0.490 (+/-0.127) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': True}
-0.456 (+/-0.110) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
-0.474 (+/-0.117) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 80, 'bootstrap': False}
-0.505 (+/-0.133) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
-0.509 (+/-0.135) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 20, 'bootstrap':

In [49]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_ecfp6count = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best score: -0.5082676513713261
Grid scores on development set:

-0.634 (+/-0.201) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': True}
-0.723 (+/-0.228) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': False}
-0.556 (+/-0.169) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': False}
-0.556 (+/-0.170) for {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': True}
-0.642 (+/-0.208) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': 

In [50]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_rdkit2d = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': False}
Best score: -0.4552310603566815
Grid scores on development set:

-0.500 (+/-0.154) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': None, 'bootstrap': True}
-0.467 (+/-0.138) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': False}
-0.602 (+/-0.189) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
-0.478 (+/-0.143) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
-0.602 (+/-0.190) for {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True

In [51]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'logLD50_mmolkg', encoder = None)

ld50_rf_mordred = model_selection(rf_reg, rf_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'n_estimators': 1500, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 65, 'bootstrap': False}
Best score: -0.4551508147810823
Grid scores on development set:

-0.581 (+/-0.165) for {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
-0.464 (+/-0.132) for {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
-0.599 (+/-0.171) for {'n_estimators': 1500, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 5, 'bootstrap': False}
-0.476 (+/-0.135) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 35, 'bootstrap': False}
-0.486 (+/-0.143) for {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 80, 'bootstrap': Tru

# XGBoost

## Endpoint 1: logLD50

In [83]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'logLD50_mmolkg', encoder = None)

ld50_xgb_ecfp6bits = model_selection(xgb_reg, xgb_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30, n_jobs=1)

Best parameters set found on development set: {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
Best score: -0.5021409167738714
Grid scores on development set:

-0.548 (+/-0.176) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.6}
-0.540 (+/-0.168) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
-0.544 (+/-0.152) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
-0.543 (+/-0.185) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7}
-0.530 (+/-0.167) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, '

In [84]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'logLD50_mmolkg', encoder = None)

ld50_xgb_maccs = model_selection(xgb_reg, xgb_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30, n_jobs=1)

Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
Best score: -0.4233172282014326
Grid scores on development set:

-0.506 (+/-0.144) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.5}
-0.483 (+/-0.138) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.5}
-0.513 (+/-0.144) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 1.0}
-0.559 (+/-0.163) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
-0.490 (+/-0.141) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 

In [85]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'logLD50_mmolkg', encoder = None)

ld50_xgb_ecfp6count = model_selection(xgb_reg, xgb_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30, n_jobs=1)

Best parameters set found on development set: {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
Best score: -0.4815733590546198
Grid scores on development set:

-0.507 (+/-0.167) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.8}
-0.549 (+/-0.177) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 1.0}
-0.491 (+/-0.132) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.8}
-0.560 (+/-0.185) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 1.0}
-0.552 (+/-0.189) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, '

In [86]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'logLD50_mmolkg', encoder = None)

ld50_xgb_rdkit2d = model_selection(xgb_reg, xgb_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30, n_jobs=1)

Best parameters set found on development set: {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
Best score: -0.45077808405783387
Grid scores on development set:

-0.515 (+/-0.134) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
-0.499 (+/-0.131) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
-0.490 (+/-0.138) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.9}
-0.463 (+/-0.110) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7}
-0.472 (+/-0.129) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6,

In [87]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'logLD50_mmolkg', encoder = None)

ld50_xgb_mordred = model_selection(xgb_reg, xgb_grid_parameters, a, c, scoring = 'neg_mean_squared_error', cv=5, GridSearch = False, n_iter=30, n_jobs=1)

Best parameters set found on development set: {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.8}
Best score: -0.4365641335401894
Grid scores on development set:

-0.462 (+/-0.116) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.5}
-0.476 (+/-0.111) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
-0.453 (+/-0.119) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6}
-0.474 (+/-0.105) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
-0.455 (+/-0.111) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10,

## Endpoint 2: toxic

In [88]:
# Load the persisted encoder passed as `encoder=` for the 'toxic' target below.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
encoder_toxic = joblib.load(
    '../data/label_encoders/encoder_toxic.joblib'
)

In [89]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'toxic', encoder = encoder_toxic)

t_xgb_ecfp6bits = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.9}
Best score: 0.7755507035878748
Grid scores on development set:

0.759 (+/-0.055) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.6}
0.766 (+/-0.066) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
0.768 (+/-0.059) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.5}
0.763 (+/-0.056) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
0.765 (+/-0.059) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 10, 'learn

In [90]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'toxic', encoder = encoder_toxic)

t_xgb_maccs = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.8}
Best score: 0.8157050296733598
Grid scores on development set:

0.797 (+/-0.078) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7}
0.781 (+/-0.069) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.9}
0.807 (+/-0.072) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.5}
0.792 (+/-0.077) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.7}
0.811 (+/-0.077) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learni

In [91]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'toxic', encoder = encoder_toxic)

t_xgb_rdkit2d = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.6}
Best score: 0.8354318675778982
Grid scores on development set:

0.814 (+/-0.062) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
0.820 (+/-0.061) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 1.0}
0.822 (+/-0.060) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.7}
0.819 (+/-0.061) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 1.0}
0.831 (+/-0.060) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_

In [92]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'toxic', encoder = encoder_toxic)

t_xgb_mordred = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
Best score: 0.8362956248572742
Grid scores on development set:

0.805 (+/-0.061) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.5}
0.831 (+/-0.066) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.8}
0.812 (+/-0.064) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
0.821 (+/-0.060) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 1.0}
0.803 (+/-0.062) for {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 3, 'learni

In [93]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'toxic', encoder = encoder_toxic)

t_xgb_ecfp6count = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'roc_auc', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
Best score: 0.7909498211135538
Grid scores on development set:

0.777 (+/-0.057) for {'subsample': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.7}
0.762 (+/-0.059) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.7}
0.775 (+/-0.056) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.5}
0.745 (+/-0.060) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 1.0}
0.771 (+/-0.064) for {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learn

## Endpoint 3: EPA

In [94]:
# Load the persisted encoder passed as `encoder=` for the 'EPA_category' target.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
# NOTE(review): encoder_epa is already referenced by earlier cells in this
# notebook, so it is presumably also loaded further up -- confirm and de-duplicate.
encoder_epa = joblib.load(
    '../data/label_encoders/encoder_epa.joblib'
)

In [95]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_bits, target = 'EPA_category', encoder = encoder_epa)

epa_xgb_ecfp6bits = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.9}
Best score: 0.5257901007118562
Grid scores on development set:

0.514 (+/-0.053) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.5}
0.492 (+/-0.058) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.5}
0.474 (+/-0.051) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.9}
0.468 (+/-0.051) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.5}
0.490 (+/-0.060) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learn

In [96]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_maccs, target = 'EPA_category', encoder = encoder_epa)

epa_xgb_maccs = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
Best score: 0.5425233859942723
Grid scores on development set:

0.526 (+/-0.027) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
0.532 (+/-0.027) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 1.0}
0.543 (+/-0.043) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
0.507 (+/-0.037) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.8}
0.539 (+/-0.041) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'l

In [97]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_ecfp6_counts, target = 'EPA_category', encoder = encoder_epa)

epa_xgb_ecfp6count = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.8, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 1.0}
Best score: 0.5237005463544709
Grid scores on development set:

0.500 (+/-0.057) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.7}
0.521 (+/-0.051) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.5}
0.514 (+/-0.041) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
0.455 (+/-0.053) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
0.453 (+/-0.053) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learni

In [98]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_rdkit2d, target = 'EPA_category', encoder = encoder_epa)

epa_xgb_rdkit2d = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.8}
Best score: 0.5529601386031521
Grid scores on development set:

0.545 (+/-0.057) for {'subsample': 1.0, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.6}
0.543 (+/-0.046) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.9}
0.546 (+/-0.052) for {'subsample': 0.9, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.8}
0.508 (+/-0.053) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 0.6}
0.546 (+/-0.055) for {'subsample': 0.6, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 6, 'learn

In [99]:
%%time
a,b,c,d,e = prepare_input(train_labels, train_mordred, target = 'EPA_category', encoder = encoder_epa)

epa_xgb_mordred = model_selection(xgb_clf, xgb_grid_parameters, a, c, scoring = 'f1_weighted', cv=5, GridSearch = False, n_iter=30,n_jobs=1)

Best parameters set found on development set: {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.8}
Best score: 0.5497431119851961
Grid scores on development set:

0.544 (+/-0.054) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 0.6}
0.521 (+/-0.068) for {'subsample': 0.8, 'n_estimators': 1500, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.8}
0.532 (+/-0.054) for {'subsample': 0.6, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 5, 'colsample_bytree': 0.8}
0.539 (+/-0.073) for {'subsample': 0.7, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 0.8}
0.541 (+/-0.057) for {'subsample': 0.7, 'n_estimators': 1500, 'min_child_weight': 5, 'max_depth': 10, 'learnin