In [1]:
from rdkit.Chem import AllChem
import warnings
from sklearn.metrics import fbeta_score, make_scorer
import random
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import pandas as pd
import math
from rdkit import Chem
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
sns.set(font_scale=1.2)
import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import rdkit.Chem.rdMolDescriptors as d
import rdkit.Chem.Fragments as f
import rdkit.Chem.Lipinski as l
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
#from wordcloud import WordCloud



In [2]:
def feature_selection(train, corr_threshold=0.95, radius=2, n_bits=124):
    """Turn a frame of SMILES strings into a numeric feature matrix.

    Steps:
      1. drop the ``INDEX`` column;
      2. parse every ``SMILES`` string with RDKit and derive five scalar
         descriptors (``NumAtoms``, ``HeavyAtomCount``, ``CalcExactMolWt``,
         ``fr_Al_COO``, ``HsNumAtoms``);
      3. display the correlation matrix of the numeric columns and drop every
         column whose absolute correlation with an *earlier* column exceeds
         ``corr_threshold``;
      4. append a Morgan bit-vector fingerprint as columns ``fp_0 .. fp_{n_bits-1}``;
      5. drop the ``SMILES`` column.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``INDEX`` and ``SMILES`` columns. The caller's frame is
        not modified.
    corr_threshold : float, default 0.95
        Absolute-correlation cut-off used in step 3.
    radius : int, default 2
        Morgan fingerprint radius.
    n_bits : int, default 124
        Morgan fingerprint length.

    Returns
    -------
    pandas.DataFrame
        Same rows and index as ``train``; any other input columns
        (e.g. ``ACTIVE``) are kept unless removed by the correlation filter.

    Raises
    ------
    ValueError
        If RDKit cannot parse one or more SMILES strings.

    Notes
    -----
    The correlation filter is computed on whatever frame is passed in, so two
    different inputs can end up with different descriptor columns.
    Relies on ``display`` being available (IPython / Jupyter).
    """
    train = train.drop(labels=['INDEX'], axis=1)

    # Keep the parsed molecules outside the frame so the frame stays purely
    # tabular and the correlation below never sees an object column.
    mols = train['SMILES'].apply(Chem.MolFromSmiles)
    unparsed = mols.isna()  # MolFromSmiles yields None when parsing fails
    if unparsed.any():
        raise ValueError(
            "{} SMILES string(s) could not be parsed by RDKit (first at index {!r})".format(
                int(unparsed.sum()), unparsed.idxmax()
            )
        )

    train['NumAtoms'] = mols.apply(lambda mol: mol.GetNumAtoms())
    train['HeavyAtomCount'] = mols.apply(lambda mol: l.HeavyAtomCount(mol))
    train['CalcExactMolWt'] = mols.apply(lambda mol: d.CalcExactMolWt(mol))
    train['fr_Al_COO'] = mols.apply(lambda mol: f.fr_Al_COO(mol))
    train['HsNumAtoms'] = mols.apply(lambda mol: Chem.AddHs(mol).GetNumAtoms())

    # Restrict to numeric columns explicitly: DataFrame.corr() no longer skips
    # non-numeric columns silently in recent pandas releases.
    correlation_matrix = train.select_dtypes(include='number').corr()
    display(correlation_matrix)

    # A column is redundant when it is strongly correlated with any column
    # that precedes it (strict lower triangle of the matrix).
    strength = correlation_matrix.abs().to_numpy()
    correlated_features = [
        name
        for position, name in enumerate(correlation_matrix.columns)
        if (strength[position, :position] > corr_threshold).any()
    ]
    train = train.drop(labels=correlated_features, axis=1)

    # Build the fingerprint frame on the same index as `train`, so the
    # column-wise concat cannot drop or misalign rows.
    finger = pd.DataFrame(
        [np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)) for mol in mols],
        index=train.index,
    )
    finger.columns = ['fp_' + str(bit) for bit in finger.columns]
    train = pd.concat([train, finger], axis=1, join='inner')

    return train.drop(labels=['SMILES'], axis=1)

In [3]:
def split_data(df):
    """Split ``df`` into training and validation parts (70 % / 30 %).

    The ``ACTIVE`` column is used as the label; every other column is a feature.
    The split is seeded (``random_state=41``).

    Returns
    -------
    tuple
        ``(train_features, val_features, train_labels, val_labels)``.
    """
    features = df.drop(labels=['ACTIVE'], axis=1)
    target = df['ACTIVE']
    parts = train_test_split(features, target, test_size=0.3, random_state=41)
    return tuple(parts)


In [4]:
def create_preprocess(df):
    """Fit and apply the preprocessing chain for the two atom-count columns.

    ``NumAtoms`` and ``HsNumAtoms`` are, in this order, mean-imputed, min-max
    scaled and discretised into 10 ordinal bins. Each transformer is fitted on
    the output of the previous one. ``df`` is modified in place.

    Returns
    -------
    tuple
        ``(df, imputer, scaler, discretizer)`` — the same frame object plus the
        three fitted transformers, ready to be passed to ``apply_preprocess``.
    """
    atom_columns = ['NumAtoms', 'HsNumAtoms']

    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    scaler = MinMaxScaler()
    est = KBinsDiscretizer(n_bins=10, encode='ordinal')

    # Fit each step on the current state of the columns, then overwrite them
    # with the transformed values before moving on to the next step.
    for step in (imp_mean, scaler, est):
        step.fit(df[atom_columns])
        df[atom_columns] = step.transform(df[atom_columns])

    return df, imp_mean, scaler, est

In [5]:
def apply_preprocess(df,imp_mean,scaler,est):
    """Apply already-fitted transformers to ``NumAtoms`` and ``HsNumAtoms``.

    The transformers are applied in the order imputer -> scaler -> discretizer,
    each one consuming the output of the previous one. ``df`` is modified in
    place and also returned.
    """
    atom_columns = ['NumAtoms', 'HsNumAtoms']
    for fitted_step in (imp_mean, scaler, est):
        df[atom_columns] = fitted_step.transform(df[atom_columns])
    return df


In [6]:
# Load the labelled molecules; the CSV is read from the working directory.
train = pd.read_csv("training_smiles.csv")
# Build descriptor + fingerprint features (this also displays the correlation matrix).
train_df=feature_selection(train)
# Hold out 30 % of the rows for validation (see split_data).
train_df, val_df, train_labels, val_labels=split_data(train_df)
# Fit the imputer / scaler / discretizer on the training split only ...
train_df,imp_mean,scaler,est=create_preprocess(train_df)
# ... and reuse the fitted transformers on the validation split.
# apply_preprocess mutates val_df in place; its return value is only shown as the cell output.
apply_preprocess(val_df,imp_mean,scaler,est)



Unnamed: 0,ACTIVE,NumAtoms,HeavyAtomCount,CalcExactMolWt,fr_Al_COO,HsNumAtoms
ACTIVE,1.0,-0.03175,-0.03175,-0.035982,-0.008681,-0.040342
NumAtoms,-0.03175,1.0,0.999995,0.963085,-0.029442,0.91065
HeavyAtomCount,-0.03175,0.999995,1.0,0.963089,-0.02944,0.910652
CalcExactMolWt,-0.035982,0.963085,0.963089,1.0,-0.023483,0.857853
fr_Al_COO,-0.008681,-0.029442,-0.02944,-0.023483,1.0,-0.009786
HsNumAtoms,-0.040342,0.91065,0.910652,0.857853,-0.009786,1.0


Unnamed: 0,NumAtoms,fr_Al_COO,HsNumAtoms,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,fp_6,...,fp_114,fp_115,fp_116,fp_117,fp_118,fp_119,fp_120,fp_121,fp_122,fp_123
87715,7.0,0,6.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
90167,8.0,0,8.0,0,1,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
142945,2.0,0,2.0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,1,1,0
80044,3.0,0,1.0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
117189,3.0,0,2.0,0,1,1,1,0,1,0,...,0,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9829,7.0,0,6.0,1,0,1,1,0,1,1,...,0,0,0,0,0,0,1,0,1,0
114685,7.0,0,5.0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,1,1,1,0
39320,5.0,0,4.0,0,0,1,1,0,1,0,...,0,0,0,0,0,1,1,0,0,1
85588,5.0,0,5.0,0,0,0,1,0,1,1,...,0,0,0,1,0,0,0,0,1,0


In [7]:
def RandomForest(train_df, val_df, train_labels, val_labels):
    """Fit a depth-2 random forest and print its ROC AUC on ``val_df``.

    The AUC is computed from the last column of ``predict_proba``.
    Nothing is returned.
    """
    forest = RandomForestClassifier(max_depth=2, random_state=0)
    forest.fit(train_df, train_labels)

    last_class_scores = forest.predict_proba(val_df)[:, -1]
    auc = roc_auc_score(val_labels, last_class_scores)
    print("Test set AUC: {}".format(auc))
    

In [8]:
# Baseline 1: shallow random forest, scored on the validation split.
RandomForest(train_df, val_df, train_labels, val_labels)

Test set AUC: 0.8146733008726227


In [27]:
def DecisionTree(train_df, val_df, train_labels, val_labels):
    """Fit an unpruned decision tree and print its ROC AUC on ``val_df``.

    The tree is seeded (``random_state=0``) so that re-running the cell
    reproduces the same score. The AUC is computed from the last column of
    ``predict_proba``. Nothing is returned.
    """
    dt = tree.DecisionTreeClassifier(random_state=0)
    dt.fit(train_df, train_labels)
    probb = dt.predict_proba(val_df)
    print("Test set AUC: {}".format(roc_auc_score(val_labels, probb[:, -1])))

In [28]:
# Baseline 2: single decision tree, scored on the validation split.
DecisionTree(train_df, val_df, train_labels, val_labels)

Test set AUC: 0.580432833249747


In [29]:
def KNN(train_df, val_df, train_labels, val_labels):
    """Fit a k-nearest-neighbours classifier and print its ROC AUC on ``val_df``.

    ``k`` is the integer part of the square root of the number of training
    rows. The AUC is computed from the last column of ``predict_proba``.
    Nothing is returned.
    """
    sample_count, _ = train_df.shape
    k = int(math.sqrt(sample_count))

    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(train_df, train_labels)

    last_class_scores = model.predict_proba(val_df)[:, -1]
    print("Test set AUC: {}".format(roc_auc_score(val_labels, last_class_scores)))
    

In [30]:
# Baseline 3: k-nearest neighbours, scored on the validation split.
KNN(train_df, val_df, train_labels, val_labels)

Test set AUC: 0.8539300770878197


In [13]:
def MLP(train_df, val_df, train_labels, val_labels):
    """Fit a default-architecture MLP and print its ROC AUC on ``val_df``.

    The network is seeded (``random_state=1``) and capped at 300 iterations.
    The AUC is computed from the last column of ``predict_proba``.
    Nothing is returned.
    """
    network = MLPClassifier(random_state=1, max_iter=300)
    network.fit(train_df, train_labels)

    last_class_scores = network.predict_proba(val_df)[:, -1]
    auc = roc_auc_score(val_labels, last_class_scores)
    print("Test set AUC: {}".format(auc))
    

In [14]:
# Baseline 4: multi-layer perceptron, scored on the validation split.
MLP(train_df, val_df, train_labels, val_labels)

Test set AUC: 0.877371481545931


In [33]:
def crossvalidation_rf(train_df, val_df, train_labels, val_labels):
    """Display the per-fold ROC AUC of a random forest under 10-fold CV.

    Only ``train_df`` and ``train_labels`` are used; ``val_df`` and
    ``val_labels`` are accepted so the signature matches the other model
    helpers in this notebook.

    The forest is seeded so repeated runs give the same scores. ``cv=10`` is
    passed to ``cross_val_score`` directly; a shuffled ``KFold`` object that
    was previously built here was never handed to it and has been removed.
    """
    clf = RandomForestClassifier(random_state=0)
    display(cross_val_score(clf, train_df, train_labels, cv=10, scoring='roc_auc'))

In [34]:
# 10-fold cross-validated ROC AUC of the random forest on the training split.
crossvalidation_rf(train_df, val_df, train_labels, val_labels)

array([0.89383198, 0.76770321, 0.85216682, 0.86567745, 0.831549  ,
       0.80202889, 0.79503774, 0.86129365, 0.89184733, 0.86066364])

In [37]:
def crossvalidation_dt(train_df, val_df, train_labels, val_labels):
    """Display the per-fold ROC AUC of a decision tree under 10-fold CV.

    Only ``train_df`` and ``train_labels`` are used; ``val_df`` and
    ``val_labels`` are accepted so the signature matches the other model
    helpers in this notebook.

    The tree is seeded so repeated runs give the same scores. ``cv=10`` is
    passed to ``cross_val_score`` directly; a shuffled ``KFold`` object that
    was previously built here was never handed to it and has been removed.
    """
    clf = tree.DecisionTreeClassifier(random_state=0)
    display(cross_val_score(clf, train_df, train_labels, cv=10, scoring='roc_auc'))

In [38]:
# 10-fold cross-validated ROC AUC of the decision tree on the training split.
crossvalidation_dt(train_df, val_df, train_labels, val_labels)

array([0.54393326, 0.52956211, 0.54446373, 0.59220679, 0.62287809,
       0.54616873, 0.57658164, 0.54504226, 0.54451173, 0.52874204])

In [39]:
def crossvalidation_knn(train_df, val_df, train_labels, val_labels):
    """Display the per-fold ROC AUC of a k-NN classifier under 10-fold CV.

    Only ``train_df`` and ``train_labels`` are used; ``val_df`` and
    ``val_labels`` are accepted so the signature matches the other model
    helpers in this notebook.

    ``cv=10`` is passed to ``cross_val_score`` directly; a shuffled ``KFold``
    object that was previously built here was never handed to it and has been
    removed.
    """
    num_rows = train_df.shape[0]
    # NOTE(review): num_rows / 10 is the size of one held-out fold, whereas
    # each fit sees the remaining 9/10 of the rows — confirm this k is intended.
    neigh = KNeighborsClassifier(n_neighbors=int(math.sqrt(num_rows / 10)))
    display(cross_val_score(neigh, train_df, train_labels, cv=10, scoring='roc_auc'))

In [40]:
# 10-fold cross-validated ROC AUC of k-nearest neighbours on the training split.
crossvalidation_knn(train_df, val_df, train_labels, val_labels)

array([0.88053385, 0.84712728, 0.90856029, 0.92544217, 0.84485014,
       0.77942055, 0.89060541, 0.85298754, 0.83178294, 0.8901005 ])

In [41]:
def crossvalidation_mlp(train_df, val_df, train_labels, val_labels):
    """Display the per-fold ROC AUC of an MLP under 10-fold CV.

    Only ``train_df`` and ``train_labels`` are used; ``val_df`` and
    ``val_labels`` are accepted so the signature matches the other model
    helpers in this notebook.

    ``cv=10`` is passed to ``cross_val_score`` directly; a shuffled ``KFold``
    object that was previously built here was never handed to it and has been
    removed.
    """
    clf = MLPClassifier(random_state=1, max_iter=300)
    display(cross_val_score(clf, train_df, train_labels, cv=10, scoring='roc_auc'))

In [42]:
# 10-fold cross-validated ROC AUC of the MLP on the training split.
crossvalidation_mlp(train_df, val_df, train_labels, val_labels)

array([0.89257812, 0.83007511, 0.89310559, 0.92053072, 0.88265577,
       0.77307659, 0.85621142, 0.86986351, 0.88468518, 0.88889626])

In [43]:
def withgridsearch(train_df, val_df, train_labels, val_labels):
    """Grid-search random-forest hyper-parameters by 10-fold ROC AUC.

    Displays the best mean cross-validated score and the parameters that
    achieved it. Only ``train_df`` and ``train_labels`` are used; ``val_df``
    and ``val_labels`` are accepted so the signature matches the other model
    helpers in this notebook. Nothing is returned.
    """
    clf = RandomForestClassifier()
    # 4 x 5 x 2 = 40 candidates, i.e. 400 fits with cv=10.
    p = {'n_estimators': [50, 100, 150, 200], 'max_depth': [5, 10, 15, 20, 25], 'random_state': [0, 25]}
    # The fits are independent, so spread them over all available cores;
    # this changes run time only, not the selected parameters.
    g = GridSearchCV(clf, param_grid=p, cv=10, scoring='roc_auc', n_jobs=-1).fit(train_df, train_labels)
    display(g.best_score_)
    display(g.best_params_)

In [44]:
# Random-forest grid search; the recorded run of this cell was interrupted before it finished.
withgridsearch(train_df, val_df, train_labels, val_labels)

KeyboardInterrupt: 

In [102]:
def lol_test(train_df, val_df, train_labels, val_labels, preprocessors=None):
    """Train an MLP on the training split and predict labels for ``test_smiles.csv``.

    The test molecules go through the same feature pipeline as the training
    matrix: ``feature_selection`` followed by the *fitted* imputer / scaler /
    discretizer, and finally a column selection that matches ``train_df``
    exactly. Without the second step the model would be fed raw atom counts
    although it was trained on binned ones.

    Parameters
    ----------
    train_df, train_labels
        Preprocessed training features and their labels.
    val_df, val_labels
        Unused; accepted so the signature matches the other model helpers.
    preprocessors : tuple, optional
        ``(imputer, scaler, discretizer)`` as returned by ``create_preprocess``.
        When omitted, the notebook-level ``imp_mean``, ``scaler`` and ``est``
        fitted in the preprocessing cell are used.

    Returns
    -------
    numpy.ndarray
        Predicted class label for every row of the test file (also displayed).

    Raises
    ------
    KeyError
        If the test feature frame lacks a column that ``train_df`` has.
    """
    if preprocessors is None:
        preprocessors = (imp_mean, scaler, est)

    test_df = pd.read_csv("test_smiles.csv")
    test_features = apply_preprocess(feature_selection(test_df), *preprocessors)
    # Same columns in the same order as the training matrix; extra columns
    # are dropped and a missing one fails loudly here.
    test_features = test_features[list(train_df.columns)]

    clf = MLPClassifier(random_state=1, max_iter=300).fit(train_df, train_labels)
    probn = clf.predict(test_features)
    display(probn)
    return probn
    

In [103]:
# Predict labels for the molecules in test_smiles.csv; res holds the returned label array.
res=lol_test(train_df, val_df, train_labels, val_labels)

Unnamed: 0,NumAtoms,HeavyAtomCount,CalcExactMolWt,fr_Al_COO,HsNumAtoms
NumAtoms,1.0,1.0,0.964,-0.041588,0.911128
HeavyAtomCount,1.0,1.0,0.964,-0.041588,0.911128
CalcExactMolWt,0.964,0.964,1.0,-0.034315,0.858954
fr_Al_COO,-0.041588,-0.041588,-0.034315,1.0,-0.026585
HsNumAtoms,0.911128,0.911128,0.858954,-0.026585,1.0


array([0., 0., 0., ..., 0., 0., 0.])

In [104]:
# Persist the test-set predictions returned by lol_test to output.txt.
np.savetxt('output.txt',res)

In [82]:
# Sanity check: inspect the preprocessed training features.
display(train_df)

Unnamed: 0,NumAtoms,fr_Al_COO,HsNumAtoms,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,fp_6,...,fp_114,fp_115,fp_116,fp_117,fp_118,fp_119,fp_120,fp_121,fp_122,fp_123
7846,3.0,0,4.0,1,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
129892,1.0,0,1.0,0,0,1,1,0,1,1,...,0,0,0,0,0,0,1,1,1,0
103892,3.0,0,3.0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,1,0
134784,0.0,0,1.0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,1,1,0
145198,8.0,0,6.0,0,1,1,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53491,1.0,0,2.0,0,0,1,1,1,1,1,...,0,0,0,0,0,1,0,1,1,1
106817,9.0,0,9.0,0,0,1,0,1,1,1,...,0,0,1,0,0,1,1,0,1,1
61324,3.0,0,1.0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,1,0
132003,3.0,0,2.0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0


In [83]:
# Sanity check: inspect the preprocessed validation features.
display(val_df)

Unnamed: 0,NumAtoms,fr_Al_COO,HsNumAtoms,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,fp_6,...,fp_114,fp_115,fp_116,fp_117,fp_118,fp_119,fp_120,fp_121,fp_122,fp_123
87715,7.0,0,6.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,1,0
90167,8.0,0,8.0,0,1,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
142945,2.0,0,2.0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,1,1,0
80044,3.0,0,1.0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
117189,3.0,0,2.0,0,1,1,1,0,1,0,...,0,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9829,7.0,0,6.0,1,0,1,1,0,1,1,...,0,0,0,0,0,0,1,0,1,0
114685,7.0,0,5.0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,1,1,1,0
39320,5.0,0,4.0,0,0,1,1,0,1,0,...,0,0,0,0,0,1,1,0,0,1
85588,5.0,0,5.0,0,0,0,1,0,1,1,...,0,0,0,1,0,0,0,0,1,0


In [84]:
# Sanity check: inspect the predicted test labels.
display(res)

array([0., 0., 0., ..., 0., 0., 0.])

In [9]:
# Search space for the MLP pipeline: a single grid of
# 2 layer layouts x 3 activations x 3 alphas x 2 early-stopping settings = 36 candidates.
# Keys of the form ``estimator__<name>`` address parameters of the pipeline's
# ``estimator`` step; ``scaler`` / ``estimator`` replace the steps themselves.
GRID = [
    dict(
        scaler=[StandardScaler()],
        estimator=[MLPClassifier(random_state=0)],
        estimator__solver=['adam'],
        estimator__learning_rate_init=[0.0001],
        estimator__max_iter=[300],
        estimator__hidden_layer_sizes=[
            (500, 400, 300, 200, 100),
            (400, 400, 400, 400, 400),
        ],
        estimator__activation=['logistic', 'tanh', 'relu'],
        estimator__alpha=[0.0001, 0.001, 0.005],
        estimator__early_stopping=[True, False],
    )
]

# Two-step pipeline whose ``scaler`` slot starts empty and is filled in by the grid.
PIPELINE = Pipeline(steps=[('scaler', None), ('estimator', MLPClassifier())])

In [None]:
# Select the MLP configuration by ROC AUC — the metric every other evaluation
# in this notebook reports — instead of plain accuracy, which on a class-imbalanced
# target can be maximised by always predicting the majority class.
grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
                            scoring='roc_auc',
                            n_jobs=-1, cv=5, refit=True, verbose=1,
                            return_train_score=False)

grid_search.fit(train_df, train_labels)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [11]:
def randomoversampling(train_df, val_df, train_labels, val_labels):
    """Train an MLP on a randomly over-sampled training set and print its ROC AUC.

    Only the training data is resampled (seeded with ``random_state=0``);
    ``val_df`` is scored as-is. The AUC is computed from the last column of
    ``predict_proba``. Nothing is returned.
    """
    sampler = RandomOverSampler(random_state=0)
    balanced_X, balanced_y = sampler.fit_resample(train_df, train_labels)

    network = MLPClassifier(random_state=1, max_iter=300)
    network.fit(balanced_X, balanced_y)

    last_class_scores = network.predict_proba(val_df)[:, -1]
    print("Test set AUC: {}".format(roc_auc_score(val_labels, last_class_scores)))
    

In [12]:
# MLP trained on a randomly over-sampled training split, scored on the validation split.
randomoversampling(train_df, val_df, train_labels, val_labels)

Test set AUC: 0.8761057889334755


In [17]:
def SMOTE_analysis(train_df, val_df, train_labels, val_labels):
    """Train an MLP on a Borderline-SMOTE-resampled training set and print its ROC AUC.

    Only the training data is resampled; ``val_df`` is scored as-is. The
    sampler is seeded (``random_state=0``, matching ``randomoversampling``) so
    that re-running the cell reproduces the same synthetic samples and score.
    The AUC is computed from the last column of ``predict_proba``.
    Nothing is returned.
    """
    sampler = BorderlineSMOTE(random_state=0)
    X_resampled, y_resampled = sampler.fit_resample(train_df, train_labels)
    clf = MLPClassifier(random_state=1, max_iter=300).fit(X_resampled, y_resampled)
    probd = clf.predict_proba(val_df)
    print("Test set AUC: {}".format(roc_auc_score(val_labels, probd[:, -1])))

In [18]:
# MLP trained on a Borderline-SMOTE-resampled training split, scored on the validation split.
SMOTE_analysis(train_df, val_df, train_labels, val_labels)

Test set AUC: 0.8519043575676606
