<a href="https://colab.research.google.com/github/RohithOfRivia/SMILES-Toxicity-Prediction/blob/main/28-03-23-k3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [70]:
!pip install rdkit
!pip install -U mlxtend
!pip install Boruta
!pip install skfeature-chappers

from skfeature.function.similarity_based import fisher_score

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from boruta import BorutaPy

import joblib
import sys
sys.modules['sklearn.externals.joblib'] = joblib
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import BorderlineSMOTE

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import MACCSkeys

from sklearn.cluster import KMeans

from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.ensemble import (GradientBoostingClassifier, 
                              HistGradientBoostingClassifier, IsolationForest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import resample

from sklearn.metrics import f1_score, recall_score

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_selection import SelectKBest, chi2

from xgboost import XGBClassifier
import time
import os
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Preprocessing and pipeline

In [71]:
# Raw-file URLs of the training and test CSVs in the project's GitHub repository.
train_data_url, test_data_url = (
    "https://raw.githubusercontent.com/RohithOfRivia/SMILES-Toxicity-Prediction/main/Data/train_II.csv",
    "https://raw.githubusercontent.com/RohithOfRivia/SMILES-Toxicity-Prediction/main/Data/test_II.csv",
)

# Only the training table is loaded in this cell; test_data_url is read later.
df = pd.read_csv(filepath_or_buffer=train_data_url)

In [72]:
#transforming each compound into their canonical SMILES format. Optional.
def canonicalSmiles(smile):
    """Return the RDKit canonical SMILES string for ``smile``.

    Parameters
    ----------
    smile : str
        SMILES string to canonicalise.

    Returns
    -------
    str
        Canonical SMILES of ``smile``. If parsing or conversion fails, the
        canonical form of a hard-coded fallback SMILES is returned instead.

    Notes
    -----
    NOTE(review): the fallback silently substitutes a different compound for
    any input that cannot be converted -- confirm this is intended for inputs
    other than the one known-bad SMILES in the data.
    """
    fallback = "[Na+].[Na+].F[Si--](F)(F)(F)(F)F"
    try:
        return Chem.MolToSmiles(Chem.MolFromSmiles(smile))
    except Exception:
        # `except Exception` (rather than a bare `except`) keeps the fallback
        # while letting KeyboardInterrupt / SystemExit propagate.
        return Chem.MolToSmiles(Chem.MolFromSmiles(fallback))

In [73]:
#Read data and split up the given features
class FileReadTransform(BaseEstimator, TransformerMixin):
    """Split the combined ``"<SMILES>;<assay>"`` identifier into two columns.

    The identifier is read from the ``Id`` column when it exists and from the
    ``x`` column otherwise (the training and test files name it differently).
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X, test=False):
        """Return a copy of ``X`` with ``SMILES`` and ``assay`` columns added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding the combined identifier in ``Id`` or ``x``.
        test : bool, optional
            Unused; kept so existing callers that pass it keep working.

        Returns
        -------
        pandas.DataFrame
            New frame (the input is left untouched) without the rows whose
            SMILES is ``"F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]"``. The index is not
            reset.

        Raises
        ------
        KeyError
            If ``X`` has neither an ``Id`` nor an ``x`` column.
        """
        # Work on a copy so the caller's DataFrame is not modified in place.
        X = X.copy()

        # Pick the identifier column explicitly instead of relying on a KeyError.
        id_col = 'Id' if 'Id' in X.columns else 'x'
        X['SMILES'] = X[id_col].apply(lambda value: value.split(';')[0])
        X['assay'] = X[id_col].apply(lambda value: value.split(';')[1])

        print("FileReadTransform done")

        # Deleting invalid compound from the data
        X = X.loc[X.SMILES != "F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]"]
        return X
    
    
class CanonicalGenerator(BaseEstimator, TransformerMixin):
    """Rewrite the ``SMILES`` column with the output of ``canonicalSmiles``."""

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Canonicalise ``X['SMILES']`` and return ``X``.

        The column is overwritten on the frame that was passed in, and that
        same frame object is returned.
        """
        canonical = X['SMILES'].apply(canonicalSmiles)
        X['SMILES'] = canonical
        print("CanonicalGenerator done")
        return X


class Scaler(BaseEstimator, TransformerMixin):
    """Standardise the trailing 208 columns and keep the first four unchanged.

    NOTE: a second class named ``Scaler`` is defined further down in this
    notebook; once that definition runs it replaces this one.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return the first four columns of ``X`` next to the scaled features.

        A new ``StandardScaler`` is fitted on the data passed to this call.
        """
        feature_cols = X.columns[-208:]
        ss = StandardScaler()
        # Keep X's index on the scaled frame: with a fresh RangeIndex the
        # column-wise concat below aligns on labels and would misalign rows
        # (or introduce NaNs) whenever X is not indexed 0..n-1.
        scaled = pd.DataFrame(
            ss.fit_transform(X[feature_cols]),
            columns=feature_cols,
            index=X.index,
        )

        return pd.concat([X[X.columns[:4]], scaled], axis=1)

    
class FingerprintGenerator(BaseEstimator, TransformerMixin):
    """Append Morgan fingerprint bit columns for every compound in ``SMILES``.

    Parameters
    ----------
    radius : int, default 2
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    n_bits : int, default 256
        Length of the bit vector; the generated columns are named
        ``x1`` ... ``x<n_bits>``.
    """

    def __init__(self, radius=2, n_bits=256):
        self.radius = radius
        self.n_bits = n_bits

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` merged with one fingerprint column per bit.

        Each distinct SMILES string is fingerprinted once and the result is
        joined back onto ``X`` with ``pd.merge(..., on='SMILES')``.
        """
        # unique() keeps first-appearance order and replaces the former
        # `value in list` check, which made the loop quadratic.
        unique_smiles = X['SMILES'].unique()

        fps = [
            AllChem.GetMorganFingerprintAsBitVect(
                Chem.MolFromSmiles(smiles), self.radius, nBits=self.n_bits
            ).ToList()
            for smiles in unique_smiles
        ]

        # Combining all compounds and fingerprints into one dataframe
        cols = ["x" + str(i) for i in range(1, self.n_bits + 1)]
        smiles_df = pd.DataFrame({'SMILES': unique_smiles})
        fingerprints = pd.DataFrame(columns=cols, data=fps)
        df = pd.concat([smiles_df, fingerprints], axis=1)

        print("FingerprintGenerator done")
        return pd.merge(X, df, on='SMILES')


class FingerprintGeneratorM(BaseEstimator, TransformerMixin):
    """Append MACCS-key bit columns for every compound in ``SMILES``.

    The keys come from ``MACCSkeys.GenMACCSKeys`` and are stored in columns
    ``x1`` ... ``x167``.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` merged with one column per MACCS key bit.

        Each distinct SMILES string is processed once and the result is joined
        back onto ``X`` with ``pd.merge(..., on='SMILES')``.
        """
        # unique() keeps first-appearance order and replaces the former
        # `value in list` check, which made the loop quadratic.
        unique_smiles = X['SMILES'].unique()

        fps = [
            MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles)).ToList()
            for smiles in unique_smiles
        ]

        # Combining all compounds and fingerprints into one dataframe
        cols = ["x" + str(i) for i in range(1, 168)]
        smiles_df = pd.DataFrame({'SMILES': unique_smiles})
        fingerprints = pd.DataFrame(columns=cols, data=fps)
        df = pd.concat([smiles_df, fingerprints], axis=1)

        print("FingerprintGenerator done")
        return pd.merge(X, df, on='SMILES')


#Feature reduction with variance threshold 
class VarianceThresh(BaseEstimator, TransformerMixin):
    """Drop low-variance feature columns while keeping the identifier columns."""

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X, thresh=.8):
        """Filter the feature columns of ``X`` by variance.

        A ``VarianceThreshold`` with threshold ``thresh * (1 - thresh)`` is
        fitted on the non-identifier columns of ``X`` during this call.

        Returns
        -------
        tuple
            ``(frame, selector)``: the identifier columns followed by the
            surviving feature columns, and the fitted selector.
        """
        # An 'x' column marks the test layout; otherwise the training layout is assumed.
        if 'x' in X.columns:
            meta_cols = ["x", "assay", "SMILES"]
        else:
            meta_cols = ["Id", "Expected", "assay", "SMILES"]
        features = X.drop(columns=meta_cols)

        selector = VarianceThreshold(threshold=(thresh * (1 - thresh))).fit(features)

        # Boolean-mask selection keeps the original column names.
        kept = features.loc[:, selector.get_support()]

        return pd.concat([X[meta_cols], kept], axis=1), selector


#Scale descriptors 
class Scaler(BaseEstimator, TransformerMixin):
    """Standardise every feature column and leave the identifier columns as-is."""

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return the identifier columns followed by the standardised features.

        A new ``StandardScaler`` is fitted on the frame passed to this call.
        The returned frame has a fresh 0..n-1 index.
        """
        # An 'Id' column marks the training layout; otherwise the test layout is assumed.
        if 'Id' in X.columns:
            meta_cols = ["Id", "Expected", "assay", "SMILES"]
        else:
            meta_cols = ["x", "assay", "SMILES"]

        features = X.drop(columns=meta_cols)
        standardised = StandardScaler().fit_transform(features)
        scaled_df = pd.DataFrame(standardised, columns=features.columns)

        return pd.concat([X[meta_cols].reset_index(drop=True), scaled_df], axis=1)


#Label-encode descriptors
class Encode(BaseEstimator, TransformerMixin):
    """Label-encode every feature column and leave the identifier columns as-is.

    NOTE(review): the previous body called an undefined ``scaler`` and raised
    ``NameError`` on every call, so the intended encoding is inferred here:
    each non-identifier column is encoded independently with its own
    ``LabelEncoder`` -- confirm this matches the intended use.
    """

    # Class-level encoder kept so existing references to ``Encode.encoder``
    # still resolve; transform() builds a fresh encoder per column.
    encoder = LabelEncoder()

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return the identifier columns followed by the encoded features.

        The returned frame has a fresh 0..n-1 index.
        """
        # An 'Id' column marks the training layout; otherwise the test layout is assumed.
        if 'Id' in X.columns:
            cols = ["Id", "Expected", "assay", "SMILES"]
        else:
            cols = ["x", "assay", "SMILES"]

        temp_df = X.drop(columns=cols)

        # LabelEncoder accepts a single 1-D column, so encode column by column.
        X_encoded = pd.DataFrame(
            {name: LabelEncoder().fit_transform(temp_df[name]) for name in temp_df.columns},
            index=pd.RangeIndex(len(temp_df)),
        )

        return pd.concat([X[cols].reset_index(drop=True), X_encoded], axis=1)


# Generating descriptors

In [74]:
class DescriptorGenerator(BaseEstimator, TransformerMixin):
    """Append one column per RDKit descriptor listed in ``Descriptors._descList``."""

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` merged with the descriptor columns.

        Descriptors are calculated once per distinct SMILES string and joined
        back onto ``X`` with ``pd.merge(..., on='SMILES')``.
        """
        # Initializing descriptor calculator
        calc = MoleculeDescriptors.MolecularDescriptorCalculator(
            [x[0] for x in Descriptors._descList]
        )
        desc_names = calc.GetDescriptorNames()

        # unique() keeps first-appearance order and replaces the former
        # `value in list` check, which made the loop quadratic.
        unique_smiles = X['SMILES'].unique()
        descriptors = [
            calc.CalcDescriptors(Chem.MolFromSmiles(compound))
            for compound in unique_smiles
        ]

        # Combining SMILES and generated descriptors (descriptor columns first)
        df = pd.DataFrame(descriptors, columns=desc_names)
        temp_df = pd.DataFrame({"SMILES": unique_smiles})
        df = pd.concat([df, temp_df], axis=1)

        print("DescriptorGenerator done")
        return pd.merge(X, df, on='SMILES')

In [75]:
# Feature generation: split the identifier, canonicalise SMILES, then add
# fingerprint and descriptor columns.
feature_generation_pipeline = Pipeline([
    ('read', FileReadTransform()),
    ('canon', CanonicalGenerator()),
    ('fpr', FingerprintGenerator()),
    ('desc', DescriptorGenerator()),
])

# Variance-threshold selection; only constructed here, not applied in this cell.
feature_selector_pipeline = Pipeline([('vtr', VarianceThresh())])

# The same pipeline object is run on the training frame and on the test CSV.
df_processed = feature_generation_pipeline.fit_transform(X=df)
test_processed = feature_generation_pipeline.fit_transform(X=pd.read_csv(test_data_url))

FileReadTransform done
CanonicalGenerator done
FingerprintGenerator done
DescriptorGenerator done
FileReadTransform done
CanonicalGenerator done
FingerprintGenerator done
DescriptorGenerator done


## Feature Selection

In [76]:
# Remove the eight BCUT2D_* descriptor columns from the processed training frame.
df_processed = df_processed.drop(
    labels=[
        'BCUT2D_MWHI', 'BCUT2D_MWLOW',
        'BCUT2D_CHGHI', 'BCUT2D_CHGLO',
        'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW',
        'BCUT2D_MRHI', 'BCUT2D_MRLOW',
    ],
    axis='columns',
)

In [77]:
# Discard every row that still contains a missing value.
df_processed = df_processed.dropna()

# Stratified 80/20 split. Note that both `train` and `X_test` are full frames
# that still include the 'Expected' label column.
train, X_test, y_train, y_test = train_test_split(
    df_processed,
    df_processed['Expected'],
    test_size=0.2,
    random_state=0,
    stratify=df_processed['Expected'],
)

In [78]:

# Display the training part of the split.
train

Unnamed: 0,Id,Expected,SMILES,assay,x1,x2,x3,x4,x5,x6,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
68926,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-];2,2,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,0
74478,CCCCCCCCCCCC(=O)N(C)CC(=O)O;1857,2,CCCCCCCCCCCC(=O)N(C)CC(=O)O,1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,0
16625,CC(C)[C@@H](C1=CC=C(C=C1)Cl)C(=O)O[C@H](C#N)C2...,1,CC(C)[C@H](C(=O)O[C@H](C#N)c1cccc(Oc2ccccc2)c1...,1387,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68659,CCOCCOCCOCCO;2453,2,CCOCCOCCOCCO,2453,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,0
40143,COP(=O)(OC)OC=C(Cl)Cl;1619,2,COP(=O)(OC)OC=C(Cl)Cl,1619,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21458,CC(C)N(C(C)C)C(=O)SCC(=C(Cl)Cl)Cl;11,2,CC(C)N(C(=O)SCC(Cl)=C(Cl)Cl)C(C)C,11,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
42040,CCCCC(CN1C=NC=N1)(C2=C(C=C(C=C2)Cl)Cl)O;1681,2,CCCCC(O)(Cn1cncn1)c1ccc(Cl)cc1Cl,1681,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35371,CO[C@H]1[C@@H](C[C@@H]2CN3CCC4=C([C@H]3C[C@@H]...,2,COC(=O)[C@H]1[C@H]2C[C@@H]3c4[nH]c5cc(OC)ccc5c...,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37179,CCCCOCCOCCOCC1=CC2=C(C=C1CCC)OCO2;1375,1,CCCCOCCOCCOCc1cc2c(cc1CCC)OCO2,1375,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,0


In [79]:
# Display the training part of the split (same expression as the previous cell).
train

Unnamed: 0,Id,Expected,SMILES,assay,x1,x2,x3,x4,x5,x6,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
68926,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-];2,2,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,0
74478,CCCCCCCCCCCC(=O)N(C)CC(=O)O;1857,2,CCCCCCCCCCCC(=O)N(C)CC(=O)O,1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,0
16625,CC(C)[C@@H](C1=CC=C(C=C1)Cl)C(=O)O[C@H](C#N)C2...,1,CC(C)[C@H](C(=O)O[C@H](C#N)c1cccc(Oc2ccccc2)c1...,1387,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68659,CCOCCOCCOCCO;2453,2,CCOCCOCCOCCO,2453,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,0
40143,COP(=O)(OC)OC=C(Cl)Cl;1619,2,COP(=O)(OC)OC=C(Cl)Cl,1619,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21458,CC(C)N(C(C)C)C(=O)SCC(=C(Cl)Cl)Cl;11,2,CC(C)N(C(=O)SCC(Cl)=C(Cl)Cl)C(C)C,11,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
42040,CCCCC(CN1C=NC=N1)(C2=C(C=C(C=C2)Cl)Cl)O;1681,2,CCCCC(O)(Cn1cncn1)c1ccc(Cl)cc1Cl,1681,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35371,CO[C@H]1[C@@H](C[C@@H]2CN3CCC4=C([C@H]3C[C@@H]...,2,COC(=O)[C@H]1[C@H]2C[C@@H]3c4[nH]c5cc(OC)ccc5c...,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37179,CCCCOCCOCCOCC1=CC2=C(C=C1CCC)OCO2;1375,1,CCCCOCCOCCOCc1cc2c(cc1CCC)OCO2,1375,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,0


In [80]:
# Pairwise correlations between the numeric columns of the training split.
# Non-numeric columns are excluded explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises on strings.
corr = train.select_dtypes(include=['number', 'bool']).corr()
corr

Unnamed: 0,Expected,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
Expected,1.000000,-0.015700,-0.005324,-0.026867,-0.002286,-0.012145,-0.010981,-0.034100,-0.012699,-0.009811,...,-0.000168,0.030823,0.006046,-0.002575,-0.010532,-0.016597,-0.009889,-0.017880,-0.006848,0.032591
x1,-0.015700,1.000000,-0.008843,0.064111,0.073885,0.074542,0.061279,0.042562,-0.071645,-0.008091,...,-0.074734,0.208330,0.002843,-0.030406,0.055855,0.043637,-0.019013,0.006981,-0.077245,0.158053
x2,-0.005324,-0.008843,1.000000,-0.047461,0.049433,0.055327,0.024001,0.020192,-0.001141,0.203555,...,0.061403,-0.014212,-0.002660,-0.009432,0.067033,-0.041582,-0.031942,0.051348,-0.054019,-0.055899
x3,-0.026867,0.064111,-0.047461,1.000000,0.011315,0.296553,0.094786,0.059346,-0.055444,-0.006824,...,0.048047,-0.009349,-0.027671,-0.011773,-0.009554,-0.005395,-0.017325,0.068698,-0.051734,0.025205
x4,-0.002286,0.073885,0.049433,0.011315,1.000000,0.035184,0.048659,-0.014111,0.064543,0.061667,...,0.007029,0.055064,-0.024219,0.005273,0.050847,0.025469,-0.014553,0.016374,0.050776,-0.012560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fr_thiazole,-0.016597,0.043637,-0.041582,-0.005395,0.025469,0.008419,0.088189,0.099717,-0.013099,-0.025988,...,0.110097,-0.012730,-0.008815,-0.009680,-0.003820,1.000000,0.154675,-0.008120,-0.018682,-0.022660
fr_thiocyan,-0.009889,-0.019013,-0.031942,-0.017325,-0.014553,-0.019113,-0.010505,0.063963,-0.016198,-0.015382,...,0.378320,-0.011942,-0.005084,-0.005583,-0.002203,0.154675,1.000000,-0.004684,0.029536,-0.013070
fr_thiophene,-0.017880,0.006981,0.051348,0.068698,0.016374,0.017600,0.010942,0.062177,-0.024834,-0.005910,...,-0.019733,0.017696,-0.007795,-0.008560,0.250950,-0.008120,-0.004684,1.000000,-0.023703,0.007081
fr_unbrch_alkane,-0.006848,-0.077245,-0.054019,-0.051734,0.050776,-0.053193,-0.034848,-0.050821,0.042313,-0.011250,...,-0.034790,-0.056161,-0.025729,-0.028256,-0.010288,-0.018682,0.029536,-0.023703,1.000000,-0.064651


In [87]:
# Fit a 200-feature SelectKBest on everything except the label and the two
# text columns; the transformed array is only displayed, not stored.
kfeatures = SelectKBest(k=200)
kfeatures.fit_transform(
    X=train.drop(columns=['Expected', 'SMILES', 'Id']),
    y=train['Expected'],
)

array([['2', 0, 0, ..., 1, 0, 0],
       ['1857', 0, 0, ..., 0, 0, 0],
       ['1387', 0, 0, ..., 0, 0, 0],
       ...,
       ['2', 0, 0, ..., 0, 0, 0],
       ['1375', 0, 0, ..., 0, 0, 0],
       ['1857', 0, 0, ..., 0, 0, 0]], dtype=object)

In [92]:
# Integer positions of the selected columns within the frame given to the selector.
cols_idxs = kfeatures.get_support(indices=True)

# SMILES and the label first, followed by the selected feature columns.
kbest = pd.concat(
    objs=[
        train.loc[:, ['SMILES', 'Expected']],
        train.drop(columns=['Expected', 'SMILES', 'Id']).iloc[:, cols_idxs],
    ],
    axis='columns',
)
kbest

Unnamed: 0,SMILES,Expected,assay,x3,x7,x15,x16,x18,x19,x22,...,fr_bicyclic,fr_halogen,fr_imidazole,fr_phenol,fr_phenol_noOrthoHbond,fr_piperdine,fr_pyridine,fr_quatN,fr_sulfonamd,fr_urea
68926,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],2,2,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
74478,CCCCCCCCCCCC(=O)N(C)CC(=O)O,2,1857,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16625,CC(C)[C@H](C(=O)O[C@H](C#N)c1cccc(Oc2ccccc2)c1...,1,1387,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
68659,CCOCCOCCOCCO,2,2453,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40143,COP(=O)(OC)OC=C(Cl)Cl,2,1619,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21458,CC(C)N(C(=O)SCC(Cl)=C(Cl)Cl)C(C)C,2,11,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,0
42040,CCCCC(O)(Cn1cncn1)c1ccc(Cl)cc1Cl,2,1681,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
35371,COC(=O)[C@H]1[C@H]2C[C@@H]3c4[nH]c5cc(OC)ccc5c...,2,2,0,0,0,0,0,0,0,...,6,0,0,0,0,1,0,0,0,0
37179,CCCCOCCOCCOCc1cc2c(cc1CCC)OCO2,1,1375,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [93]:
# Column labels of `kbest`; reused below to select the same columns from other frames.
kbest_cols = kbest.columns
kbest_cols

Index(['SMILES', 'Expected', 'assay', 'x3', 'x7', 'x15', 'x16', 'x18', 'x19',
       'x22',
       ...
       'fr_bicyclic', 'fr_halogen', 'fr_imidazole', 'fr_phenol',
       'fr_phenol_noOrthoHbond', 'fr_piperdine', 'fr_pyridine', 'fr_quatN',
       'fr_sulfonamd', 'fr_urea'],
      dtype='object', length=202)

In [94]:
# Training split restricted to the columns listed in kbest_cols.
train[kbest_cols]

Unnamed: 0,SMILES,Expected,assay,x3,x7,x15,x16,x18,x19,x22,...,fr_bicyclic,fr_halogen,fr_imidazole,fr_phenol,fr_phenol_noOrthoHbond,fr_piperdine,fr_pyridine,fr_quatN,fr_sulfonamd,fr_urea
68926,CCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],2,2,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
74478,CCCCCCCCCCCC(=O)N(C)CC(=O)O,2,1857,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16625,CC(C)[C@H](C(=O)O[C@H](C#N)c1cccc(Oc2ccccc2)c1...,1,1387,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
68659,CCOCCOCCOCCO,2,2453,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40143,COP(=O)(OC)OC=C(Cl)Cl,2,1619,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21458,CC(C)N(C(=O)SCC(Cl)=C(Cl)Cl)C(C)C,2,11,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,0
42040,CCCCC(O)(Cn1cncn1)c1ccc(Cl)cc1Cl,2,1681,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
35371,COC(=O)[C@H]1[C@H]2C[C@@H]3c4[nH]c5cc(OC)ccc5c...,2,2,0,0,0,0,0,0,0,...,6,0,0,0,0,1,0,0,0,0
37179,CCCCOCCOCCOCc1cc2c(cc1CCC)OCO2,1,1375,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [97]:
# The assay identifier is cast to int64 before it is used as a model input.
train['assay'] = train['assay'].astype('int64')

# Fit on the selected columns; the label is recoded as 2 -> 0 and 1 -> 1.
model = XGBClassifier(seed=20, max_depth=8, n_estimators=600)
model.fit(
    X=train[kbest_cols].drop(columns=['SMILES', 'Expected']),
    y=train['Expected'].map({2: 0, 1: 1}),
)

In [102]:
# Same dtype conversion as applied to the training split.
X_test['assay'] = X_test['assay'].astype('int64')

preds = model.predict(X_test[kbest_cols].drop(columns=['SMILES', 'Expected']))

# scikit-learn metrics take (y_true, y_pred) in that order; the labels are
# recoded the same way as for training (2 -> 0, 1 -> 1).
f1_score(y_test.map({2: 0, 1: 1}), preds)

0.6407766990291263

In [112]:
# Cast the assay identifier to int64 in both the full training frame and the test frame.
df_processed['assay'] = df_processed['assay'].astype('int64')
test_processed['assay'] = test_processed['assay'].astype('int64')

# Refit on all rows of df_processed; the label is recoded as 2 -> 0 and 1 -> 1.
model = XGBClassifier(seed=20, max_depth=10, n_estimators=800)
model.fit(
    X=df_processed[kbest_cols].drop(columns=['SMILES', 'Expected']),
    y=df_processed['Expected'].map({2: 0, 1: 1}),
)

In [113]:
# The test frame has no 'Expected' column, so take the training selection
# minus the label. Deriving the list from kbest_cols keeps the column set and
# order identical to what the model was fitted on, instead of maintaining a
# hand-copied list that goes stale whenever the selection changes.
testkbest = [col for col in kbest_cols if col != 'Expected']

test_preds = model.predict(test_processed[testkbest].drop(columns=['SMILES']))
np.unique(test_preds, return_counts=True)

(array([0, 1]), array([9650, 1344]))

In [114]:
# Submission frame: the test identifiers plus the predictions mapped back to
# the original label values (0 -> 2, 1 -> 1).
res = pd.DataFrame({'Id': test_processed['x']})
res['Predicted'] = pd.Series(test_preds, index=res.index).map({0: 2, 1: 1})
res

Unnamed: 0,Id,Predicted
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,2
1,CC1=CC(=C(C=C1)C(C)(C)C)O;2451,2
2,CC1=CC(=C(C=C1)C(C)(C)C)O;2442,1
3,CC1=CC(=C(C=C1)C(C)(C)C)O;32,2
4,CC1=CC(=C(C=C1)C(C)(C)C)O;1382,2
...,...,...
10989,CC(=CC(=O)C)C;1856,2
10990,CCCCCCCCCC[N+](C)(C)CC1=CC=CC=C1.[Cl-];1848,2
10991,CC1=C(C(=O)N(N1C)C2=CC=CC=C2)N(C)CS(=O)(=O)[O-...,2
10992,COC1=CC=CC(=C1)C=O;2,2


In [116]:
from google.colab import files

# Write the predictions without the index column, then pass the file to
# Colab's download helper.
res.to_csv(path_or_buf='28-03-23-k3.csv', index=False, encoding='utf-8-sig')
files.download('28-03-23-k3.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>