In [1]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.model_selection import train_test_split
from DataPreparation import DataPreparation

In [14]:
# Relative folders used by the notebook: `data_path` holds the input CSVs read
# below; `pics` is presumably the figure folder (not used in the visible cells).
pics, data_path = './images', './data'

In [15]:
# Read balanced_dataframe.csv from the data folder. index_col=None means no CSV
# column is promoted to the index, so pandas assigns a default integer index.
df = pd.read_csv(
    f'{data_path}/balanced_dataframe.csv',
    index_col=None,
)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,MaxAbsEStateIndex,MinAbsEStateIndex,qed,SPS,MaxPartialCharge,MinPartialCharge,FpDensityMorgan3,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,...,fr_Nhpyrrole,fr_aldehyde,fr_amidine,fr_aryl_methyl,fr_benzene,fr_benzodiazepine,fr_diazo,fr_halogen,fr_hdrzine,label
0,10.337,-1.273,23.375,505.329,79.919,10.124,9.104,-0.051,3.459,1.375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,10.335,-1.272,23.375,505.329,79.919,10.124,9.103,-0.051,3.451,1.367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,10.333,-1.271,23.375,505.329,79.919,10.124,9.103,-0.051,3.449,1.361,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,10.356,-1.235,25.219,440.504,16.563,9.898,5.825,-0.051,3.347,1.323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,6.09,0.226,12.286,362.614,79.919,10.104,9.102,0.553,3.02,1.976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [3]:
# Target vector: the 'label' column. Feature matrix: every other column.
y = df['label']
X = df.drop(columns=['label'])

In [4]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# Class counts in the held-out set, shown as the cell output.
y_test.value_counts()

0    915
1    915
Name: label, dtype: int64

In [5]:
# Hand the four split pieces to the project-local DataPreparation helper.
data_preparation = DataPreparation(X_train, X_test, y_train, y_test)

# Overwrite the split variables with whatever clean_dataset() returns.
# NOTE(review): the cleaning rules live in the DataPreparation module, not in
# this notebook — confirm there what is dropped or transformed.
X_train, X_test, y_train, y_test = data_preparation.clean_dataset()
# Keep the helper's feature_names attribute for later use
# (presumably the columns that survived cleaning — verify in DataPreparation).
feats=data_preparation.feature_names

Index(['MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'SPS',
       'MaxPartialCharge', 'MinPartialCharge', 'BCUT2D_MWHI', 'BCUT2D_MWLOW',
       'BCUT2D_CHGHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW',
       'AvgIpc', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1',
       'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v',
       'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'PEOE_VSA1',
       'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA2', 'PEOE_VSA3',
       'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA8', 'SMR_VSA2', 'SMR_VSA3',
       'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9',
       'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5',
       'SlogP_VSA6', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA5',
       'VSA_EState1', 'VSA_EState6', 'VSA_EState7', 'NumAliphaticCarbocycles',
       'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumHeteroatoms',
       'NumRotatableBonds', 

In [7]:
# Format floats with three decimals so the printed leaderboard stays compact.
pd.set_option("display.precision", 3)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# Benchmark LazyPredict's classifier suite on the cleaned split: fit on the
# training data, score on the test data, with default metrics only.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Print the per-model score table.
print(models)

 97%|████████████████████████████████████████▌ | 28/29 [00:13<00:00,  3.55it/s]

[LightGBM] [Info] Number of positive: 2135, number of negative: 2135
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001868 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11984
[LightGBM] [Info] Number of data points in the train set: 4270, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


100%|██████████████████████████████████████████| 29/29 [00:13<00:00,  2.16it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
XGBClassifier                     0.977              0.977    0.977     0.977   
RandomForestClassifier            0.977              0.977    0.977     0.977   
ExtraTreesClassifier              0.977              0.977    0.977     0.977   
LGBMClassifier                    0.974              0.974    0.974     0.974   
SVC                               0.970              0.970    0.970     0.970   
BaggingClassifier                 0.969              0.969    0.969     0.969   
LabelPropagation                  0.966              0.966    0.966     0.966   
LabelSpreading                    0.966              0.966    0.966     0.966   
DecisionTreeClassifier            0.948              0.948    0.948     0.948   
KNeighborsClassifier              0.946              0.946    0.946     0.946   
LogisticRegression          




## After feature selection N = 20

In [20]:
# Rebind `df` to the feature-selected dataset stored in the data folder.
df = pd.read_csv(
    f'{data_path}/feat_sel_data.csv',
)
# Display the frame as the cell output (pandas truncates long frames).
df

Unnamed: 0,PEOE_VSA6,HallKierAlpha,BCUT2D_MWHI,PEOE_VSA1,AvgIpc,Chi0v,fr_HOCCN,Chi3v,SlogP_VSA3,qed,...,MinAbsEStateIndex,fr_COO,MinPartialCharge,BCUT2D_MWLOW,SlogP_VSA2,PEOE_VSA8,MaxPartialCharge,SlogP_VSA12,SMR_VSA3,label
0,22.066803,24.503014,-0.051090,6.010465,23.209642,4.983979,1.0,24.793308,0.116048,23.375000,...,-1.272753,0.0,10.123678,3.459112,8.746772,164.56,79.918731,30.819914,18.544404,1
1,22.066803,24.503014,-0.051090,6.010465,23.209642,4.983979,1.0,24.793308,0.117146,23.375000,...,-1.271941,0.0,10.124302,3.450909,8.762836,164.56,79.918731,30.820274,24.191582,1
2,22.066803,24.503014,-0.051090,6.010465,23.209642,4.983979,1.0,24.793308,0.117998,23.375000,...,-1.271281,0.0,10.124439,3.448581,8.760205,164.56,79.918731,30.823484,24.191582,1
3,11.163878,19.519035,-0.051059,0.000000,17.692941,4.983979,1.0,20.056445,0.553914,25.218750,...,-1.235039,0.0,9.897992,3.346646,11.029353,151.57,16.562573,32.991820,23.656974,1
4,22.711555,9.967957,0.552888,22.711555,10.429763,9.967957,0.0,4.417151,0.898357,12.285714,...,0.226017,1.0,10.103988,3.020157,10.627217,64.94,79.918731,0.681553,15.452396,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6095,10.216621,14.535057,0.492694,0.000000,5.131558,9.778516,0.0,0.000000,0.579532,11.304348,...,-0.411430,0.0,10.208623,2.836069,1.322729,79.78,32.133549,3.654509,17.494178,0
6096,0.000000,14.630917,-0.845814,0.000000,0.000000,13.212334,0.0,18.221247,-0.018440,29.375000,...,-3.541177,0.0,10.227966,2.894325,4.457043,98.88,32.233449,8.479265,48.334870,0
6097,0.000000,20.357340,-0.667596,0.000000,0.000000,4.983979,0.0,19.428958,1.136124,22.542857,...,-0.204735,0.0,9.704724,3.393030,0.963588,108.18,35.495692,10.247890,35.129773,0
6098,5.687386,24.981162,-0.126220,5.687386,5.156436,4.794537,0.0,14.325937,0.505647,18.250000,...,-0.464767,0.0,10.296962,3.346443,0.925978,92.59,32.166684,11.569946,31.459713,0


In [21]:
# Split the feature-selected frame into predictors (all columns except
# 'label') and the target ('label').
X, y = df.drop(columns=['label']), df['label']

In [9]:
# Stratified 70/30 split of the feature-selected data, seeded so the split is
# repeatable. The feature matrix is `X`, built from `df` in the preceding cell;
# the name `R` used here before is never defined in this notebook, so the cell
# raised NameError on a fresh kernel (Restart & Run All).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [10]:
# Run the project-local DataPreparation helper on the feature-selected split.
data_preparation = DataPreparation(X_train, X_test, y_train, y_test)
# Overwrite the split variables with whatever clean_dataset() returns.
# NOTE(review): the cleaning rules are defined in the DataPreparation module,
# not in this notebook — confirm there what is dropped or transformed.
X_train, X_test, y_train, y_test = data_preparation.clean_dataset()


Index(['PEOE_VSA6', 'HallKierAlpha', 'BCUT2D_MWHI', 'PEOE_VSA1', 'Chi0v',
       'AvgIpc', 'SlogP_VSA3', 'fr_HOCCN', 'Chi3v', 'BCUT2D_MWLOW',
       'NumAliphaticCarbocycles', 'qed', 'MinAbsEStateIndex', 'fr_COO',
       'EState_VSA1', 'SlogP_VSA12', 'MinPartialCharge', 'SMR_VSA3',
       'PEOE_VSA8', 'fr_benzodiazepine'],
      dtype='object')


In [11]:
# Show the shape of the training features returned by clean_dataset().
X_train.shape

(4270, 20)

### Only 20 features left after cleaning

In [12]:
# Three-decimal float formatting for the printed results table.
pd.set_option("display.precision", 3)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# Re-run the LazyPredict classifier benchmark, this time on the cleaned
# feature-selected split, with default metrics only.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Print the per-model score table.
print(models)

100%|██████████████████████████████████████████| 29/29 [00:08<00:00,  3.30it/s]

[LightGBM] [Info] Number of positive: 2135, number of negative: 2135
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000454 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3538
[LightGBM] [Info] Number of data points in the train set: 4270, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
ExtraTreesClassifier              0.978              0.978    0.978     0.978   
RandomForestClassifier            0.974              0.974    0.974     0.974   
LGBMClassifier                    0.969              0.969    0.969     0.969   
XGBClassifier                     0.968              0.968    0.968     0.968   
LabelPropagation                  0.964              0.964    0.964     0.964 


