<a href="https://colab.research.google.com/github/RohithOfRivia/SMILES-Toxicity-Prediction/blob/main/Notebooks/1902_F1-49_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install rdkit
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (GradientBoostingClassifier, 
                              HistGradientBoostingClassifier)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import resample


from sklearn.metrics import f1_score, recall_score

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
import time
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [104]:
train_data_url = "https://raw.githubusercontent.com/RohithOfRivia/SMILES-Toxicity-Prediction/main/Data/train_II.csv"
test_data_url = "https://raw.githubusercontent.com/RohithOfRivia/SMILES-Toxicity-Prediction/main/Data/test_II.csv"

df = pd.read_csv(train_data_url)

In [36]:
#transforming each compound into their canonical SMILES format. Optional.
def canonicalSmiles(smile):
    try:
        return Chem.MolToSmiles(Chem.MolFromSmiles(smile))
    except:
        return(Chem.MolToSmiles(Chem.MolFromSmiles("[Na+].[Na+].F[Si--](F)(F)(F)(F)F")))

In [37]:
#Read data and split up the given features
class FileReadTransform(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    #training and test data are slightly different, hence passing optional test param
    def transform(self, X, test=False):
        
        try:
          # if test == False:
            X['SMILES'] = X['Id'].apply(lambda x: x.split(';')[0])
            X['assay'] = X['Id'].apply(lambda x: x.split(';')[1])
        
        except KeyError:
            X['SMILES'] = X['x'].apply(lambda x: x.split(';')[0])
            X['assay'] = X['x'].apply(lambda x: x.split(';')[1])
          
        print("FileReadTransform done")
        
        #correct smiles for this compound found through https://www.molport.com/shop/index
        #X["SMILES"] = X["SMILES"].replace({"F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]":"[Na+].[Na+].F[Si--](F)(F)(F)(F)F"})
        
        #Deleting invalid compound from the data
        X = X.loc[X.SMILES != "F[Si-2](F)(F)(F)(F)F.[Na+].[Na+]"]
        return X
    
    
class CanonicalGenerator(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['SMILES'] = X['SMILES'].apply(canonicalSmiles)
        print("CanonicalGenerator done")
        return X

    
class FingerprintGenerator(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
          #tracks each unique compound and its fingerprints
          tracker = []
          fps = []
          assays = []
          unique = len(X['SMILES'].unique())
          counter = 0

          for index, columns in X[["SMILES", "assay"]].iterrows():

              #skip if already in tracker
              if columns[0] in tracker:
                  continue

              #append each unique compound and thier respective fingerprints
              else:
                  tracker.append(columns[0])
                  assays.append(columns[1])

                  mol = Chem.MolFromSmiles(columns[0])
                  fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
                  fps.append(fp.ToList())

                  counter += 1

                  # print(f"compound {counter}/{unique}...

          #Combining all compounds, assays and fingerprints into one dataframe 
          cols = a = ["x" + str(i) for i in range (1, 1025)]
          smiles_df = pd.DataFrame(columns=['SMILES'], data=tracker)
          fingerprints = pd.DataFrame(columns=cols, data=fps)

          df = pd.concat([smiles_df, fingerprints], axis=1)

          print("FingerprintGenerator done")
          return pd.merge(X, df, on='SMILES') 


#Feature reduction with variance threshold 
class VarianceThresh(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, thresh=.8):
      
      #Looks to columns to determine whether X is training or testing data 
      cols = X.columns
      if 'x' in cols:
        temp_df = X.drop(columns=["x", "assay", "SMILES"])
        cols = ["x", "assay", "SMILES"]
      else:
        temp_df = X.drop(columns=["Id", "Expected","assay", "SMILES"])
        cols = ["Id", "Expected","assay", "SMILES"]

      #Selecting features based on the variance threshold
      selector = VarianceThreshold(threshold=(thresh * (1 - thresh))) 
      selector.fit(temp_df)

      #This line transforms the data while keeping the column names 
      temp_df = temp_df.loc[:, selector.get_support()]

      #Attaching the ids, assays, smiles etc. that is still required for model
      return pd.concat([X[cols], temp_df], axis=1) , selector

      

## Generating descriptors

In [38]:
class DescriptorGenerator(BaseEstimator, TransformerMixin):

  def fit(self, X, y=None):
        return self
    
  def transform(self, X):
    #Initializing descriptor calculator
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()

    #Tracking each unique compound and generating descriptors 
    tracker = []
    descriptors = []
    for compound in X['SMILES']:

      if compound in tracker:
        continue

      else:
        tracker.append(compound)

        mol = Chem.MolFromSmiles(compound)
        current_descriptors = calc.CalcDescriptors(mol)
        descriptors.append(current_descriptors)

    # Combining X, SMILES, and generated descriptors 
    df = pd.DataFrame(descriptors,columns=desc_names)
    temp_df = pd.DataFrame(tracker, columns=["SMILES"])
    df = pd.concat([df, temp_df], axis=1)

    print("DescriptorGenerator done")
    return pd.merge(X, df, on='SMILES')

In [39]:
feature_generation_pipeline = Pipeline(steps=[
    ('read', FileReadTransform()),
     ('canon', CanonicalGenerator()),
     ('fpr', FingerprintGenerator()),
     ('desc', DescriptorGenerator())
     ])

feature_selector_pipeline = Pipeline(steps=[
    ('vtr', VarianceThresh()),
     ])
df_processed = feature_generation_pipeline.fit_transform(df)
df_processed

FileReadTransform done
CanonicalGenerator done
FingerprintGenerator done
DescriptorGenerator done


Unnamed: 0,Id,Expected,SMILES,assay,x1,x2,x3,x4,x5,x6,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1644,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1630,2,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1630,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;29,2,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,29,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1618,2,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1618,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1638,2,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1638,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75372,C1=CC=C(C=C1)NC(=S)N;1852,1,NC(=S)Nc1ccccc1,1852,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75373,CN1CN(CN(C1)C)C;2,2,CN1CN(C)CN(C)C1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75374,CCCCC1CCC(=O)O1;1852,1,CCCCC1CCC(=O)O1,1852,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
75375,CCOC(=O)CCC1=CC=CC=C1;2,2,CCOC(=O)CCc1ccccc1,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
df_processedV, selector = feature_selector_pipeline.fit_transform(df_processed)
df_processedV

Unnamed: 0,Id,Expected,assay,SMILES,x2,x34,x65,x81,x115,x129,...,fr_benzene,fr_bicyclic,fr_ester,fr_ether,fr_halogen,fr_methoxy,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_unbrch_alkane
0,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1644,2,1644,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1,0,0,0,1,0,...,2,0,0,0,3,0,0,2,2,0
1,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1630,2,1630,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1,0,0,0,1,0,...,2,0,0,0,3,0,0,2,2,0
2,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;29,2,29,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1,0,0,0,1,0,...,2,0,0,0,3,0,0,2,2,0
3,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1618,2,1618,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1,0,0,0,1,0,...,2,0,0,0,3,0,0,2,2,0
4,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1638,2,1638,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1,0,0,0,1,0,...,2,0,0,0,3,0,0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75372,C1=CC=C(C=C1)NC(=S)N;1852,1,1852,NC(=S)Nc1ccccc1,0,0,1,0,0,1,...,1,0,0,0,0,0,1,0,0,0
75373,CN1CN(CN(C1)C)C;2,2,2,CN1CN(C)CN(C)C1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75374,CCCCC1CCC(=O)O1;1852,1,1852,CCCCC1CCC(=O)O1,0,1,0,1,0,0,...,0,0,1,1,0,0,0,0,0,0
75375,CCOC(=O)CCC1=CC=CC=C1;2,2,2,CCOC(=O)CCc1ccccc1,0,1,1,1,0,0,...,1,0,1,1,0,0,0,0,0,0


In [44]:
for i in df_processedV.columns:
  print(i)

Id
Expected
assay
SMILES
x2
x34
x65
x81
x115
x129
x176
x295
x357
x379
x390
x562
x651
x660
x696
x727
x808
x850
x876
x894
x927
x936
MaxEStateIndex
MinEStateIndex
MaxAbsEStateIndex
MinAbsEStateIndex
MolWt
HeavyAtomMolWt
ExactMolWt
NumValenceElectrons
FpDensityMorgan2
FpDensityMorgan3
BCUT2D_MWHI
BCUT2D_MWLOW
BCUT2D_MRHI
BCUT2D_MRLOW
BalabanJ
BertzCT
Chi0
Chi0n
Chi0v
Chi1
Chi1n
Chi1v
Chi2n
Chi2v
Chi3n
Chi3v
Chi4n
Chi4v
HallKierAlpha
Ipc
Kappa1
Kappa2
Kappa3
LabuteASA
PEOE_VSA1
PEOE_VSA10
PEOE_VSA11
PEOE_VSA12
PEOE_VSA13
PEOE_VSA14
PEOE_VSA2
PEOE_VSA3
PEOE_VSA4
PEOE_VSA5
PEOE_VSA6
PEOE_VSA7
PEOE_VSA8
PEOE_VSA9
SMR_VSA1
SMR_VSA10
SMR_VSA2
SMR_VSA3
SMR_VSA4
SMR_VSA5
SMR_VSA6
SMR_VSA7
SMR_VSA9
SlogP_VSA1
SlogP_VSA10
SlogP_VSA11
SlogP_VSA12
SlogP_VSA2
SlogP_VSA3
SlogP_VSA4
SlogP_VSA5
SlogP_VSA6
SlogP_VSA7
SlogP_VSA8
TPSA
EState_VSA1
EState_VSA10
EState_VSA11
EState_VSA2
EState_VSA3
EState_VSA4
EState_VSA5
EState_VSA6
EState_VSA7
EState_VSA8
EState_VSA9
VSA_EState1
VSA_EState10
VSA_EState2
VSA_E

In [52]:
df_processedV.groupby(['Expected', 'assay']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Id,SMILES,x2,x34,x65,x81,x115,x129,x176,x295,...,fr_benzene,fr_bicyclic,fr_ester,fr_ether,fr_halogen,fr_methoxy,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_unbrch_alkane
Expected,assay,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,11,23,23,23,23,23,23,23,23,23,23,...,23,23,23,23,23,23,23,23,23,23
1,1372,110,110,110,110,110,110,110,110,110,110,...,110,110,110,110,110,110,110,110,110,110
1,1373,121,121,121,121,121,121,121,121,121,121,...,121,121,121,121,121,121,121,121,121,121
1,1374,110,110,110,110,110,110,110,110,110,110,...,110,110,110,110,110,110,110,110,110,110
1,1375,68,68,68,68,68,68,68,68,68,68,...,68,68,68,68,68,68,68,68,68,68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,5,255,255,255,255,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,255,255
2,6,283,283,283,283,283,283,283,283,283,283,...,283,283,283,283,283,283,283,283,283,283
2,7,262,262,262,262,262,262,262,262,262,262,...,262,262,262,262,262,262,262,262,262,262
2,8,267,267,267,267,267,267,267,267,267,267,...,267,267,267,267,267,267,267,267,267,267


In [56]:
df_processedV.loc[df_processedV['Expected'] == 1].groupby("assay").count().sort_values("Id")

Unnamed: 0_level_0,Id,Expected,SMILES,x2,x34,x65,x81,x115,x129,x176,...,fr_benzene,fr_bicyclic,fr_ester,fr_ether,fr_halogen,fr_methoxy,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_unbrch_alkane
assay,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1622,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1646,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1611,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1635,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
8,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1507,434,434,434,434,434,434,434,434,434,434,...,434,434,434,434,434,434,434,434,434,434
1850,514,514,514,514,514,514,514,514,514,514,...,514,514,514,514,514,514,514,514,514,514
1857,597,597,597,597,597,597,597,597,597,597,...,597,597,597,597,597,597,597,597,597,597
1856,611,611,611,611,611,611,611,611,611,611,...,611,611,611,611,611,611,611,611,611,611


In [95]:
currentassay = 1857
test = df_processed.loc[df_processedV['assay'] == str(currentassay)]


Unnamed: 0,Id,Expected,SMILES,assay,x1,x2,x3,x4,x5,x6,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
32,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1857,1,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1857,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
146,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1857,1,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,0
249,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;1857,2,O=[N+]([O-])NC1=NCCN1Cc1ccc(Cl)nc1,1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
289,[Na+].[I-];1857,2,[I-].[Na+],1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
358,CCCSP(=O)(OCC)SCCC;1857,1,CCCSP(=O)(OCC)SCCC,1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74763,C(NC(=O)NC1C(=O)NC(=O)N1CO)NC(=O)NC2C(=O)NC(=O...,2,O=C(NCNC(=O)NC1C(=O)NC(=O)N1CO)NC1C(=O)NC(=O)N1CO,1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
74794,CC(C)(C1=CC(=C(C=C1)O)C2=CC=CC=C2)C3=CC(=C(C=C...,2,CC(C)(c1ccc(O)c(-c2ccccc2)c1)c1ccc(O)c(-c2cccc...,1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74889,CC1=NN(C(=O)C1N=NC2=CC=CC=C2)C3=C(C=CC(=C3)S(=...,2,CC1=NN(c2cc(S(=O)(=O)[O-])ccc2Cl)C(=O)C1N=Nc1c...,1857,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
74982,C1CCC(C1)O;1857,1,OC1CCCC1,1857,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
test.groupby('Expected').count()

Unnamed: 0_level_0,Id,SMILES,assay,x1,x2,x3,x4,x5,x6,x7,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
Expected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,597,597,597,597,597,597,597,597,597,597,...,597,597,597,597,597,597,597,597,597,597
2,944,944,944,944,944,944,944,944,944,944,...,944,944,944,944,944,944,944,944,944,944


In [63]:
vtr = VarianceThresh()
testV, selector = vtr.fit_transform(test)
testV

Unnamed: 0,Id,Expected,assay,SMILES,x2,x34,x65,x81,x148,x176,...,fr_aryl_methyl,fr_benzene,fr_bicyclic,fr_ester,fr_ether,fr_halogen,fr_methoxy,fr_phenol,fr_phenol_noOrthoHbond,fr_unbrch_alkane
32,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1857,1,1857,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1,0,0,0,0,1,...,0,2,0,0,0,3,0,2,2,0
146,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1857,1,1857,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,12
249,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;1857,2,1857,O=[N+]([O-])NC1=NCCN1Cc1ccc(Cl)nc1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
289,[Na+].[I-];1857,2,1857,[I-].[Na+],0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
358,CCCSP(=O)(OCC)SCCC;1857,1,1857,CCCSP(=O)(OCC)SCCC,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74763,C(NC(=O)NC1C(=O)NC(=O)N1CO)NC(=O)NC2C(=O)NC(=O...,2,1857,O=C(NCNC(=O)NC1C(=O)NC(=O)N1CO)NC1C(=O)NC(=O)N1CO,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
74794,CC(C)(C1=CC(=C(C=C1)O)C2=CC=CC=C2)C3=CC(=C(C=C...,2,1857,CC(C)(c1ccc(O)c(-c2ccccc2)c1)c1ccc(O)c(-c2cccc...,0,1,1,0,0,1,...,0,4,0,0,0,0,0,2,2,0
74889,CC1=NN(C(=O)C1N=NC2=CC=CC=C2)C3=C(C=CC(=C3)S(=...,2,1857,CC1=NN(c2cc(S(=O)(=O)[O-])ccc2Cl)C(=O)C1N=Nc1c...,0,1,1,0,0,1,...,0,2,0,0,0,1,0,0,0,0
74982,C1CCC(C1)O;1857,1,1857,OC1CCCC1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [66]:
model = HistGradientBoostingClassifier()
# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
X = testV.drop(columns=['Id', 'Expected','assay', "SMILES"])
n_scores = cross_val_score(model, X, testV['Expected'], scoring="f1", cv=cv, n_jobs=-1)
# report performance
print('F1: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

F1: 0.657 (0.021)


In [105]:
d = FileReadTransform()
can = CanonicalGenerator()
f = FingerprintGenerator()

df = d.fit_transform(df)
df = can.fit_transform(df)
df = f.fit_transform(df)

test = df.loc[df['assay'] == str(currentassay)]
test

FileReadTransform done
CanonicalGenerator done
FingerprintGenerator done


Unnamed: 0,Id,Expected,SMILES,assay,x1,x2,x3,x4,x5,x6,...,x1015,x1016,x1017,x1018,x1019,x1020,x1021,x1022,x1023,x1024
32,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1857,1,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1857,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
146,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1857,1,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
249,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;1857,2,O=[N+]([O-])NC1=NCCN1Cc1ccc(Cl)nc1,1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
289,[Na+].[I-];1857,2,[I-].[Na+],1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
358,CCCSP(=O)(OCC)SCCC;1857,1,CCCSP(=O)(OCC)SCCC,1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74763,C(NC(=O)NC1C(=O)NC(=O)N1CO)NC(=O)NC2C(=O)NC(=O...,2,O=C(NCNC(=O)NC1C(=O)NC(=O)N1CO)NC1C(=O)NC(=O)N1CO,1857,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
74794,CC(C)(C1=CC(=C(C=C1)O)C2=CC=CC=C2)C3=CC(=C(C=C...,2,CC(C)(c1ccc(O)c(-c2ccccc2)c1)c1ccc(O)c(-c2cccc...,1857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74889,CC1=NN(C(=O)C1N=NC2=CC=CC=C2)C3=C(C=CC(=C3)S(=...,2,CC1=NN(c2cc(S(=O)(=O)[O-])ccc2Cl)C(=O)C1N=Nc1c...,1857,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
74982,C1CCC(C1)O;1857,1,OC1CCCC1,1857,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [106]:
vtr = VarianceThresh()
testV, selector = vtr.fit_transform(test)
testV

Unnamed: 0,Id,Expected,assay,SMILES,x2,x34,x65,x81,x148,x176,...,x357,x390,x651,x696,x727,x808,x850,x876,x894,x927
32,C1=CC(=CC=C1C(C2=CC=C(C=C2)O)C(Cl)(Cl)Cl)O;1857,1,1857,Oc1ccc(C(c2ccc(O)cc2)C(Cl)(Cl)Cl)cc1,1,0,0,0,0,1,...,1,0,0,0,1,1,1,0,0,0
146,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-];1857,1,1857,CCCCCCCCCC[N+](C)(C)CCCCCCCCCC.[Cl-],0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
249,C1CN(C(=N1)N[N+](=O)[O-])CC2=CN=C(C=C2)Cl;1857,2,1857,O=[N+]([O-])NC1=NCCN1Cc1ccc(Cl)nc1,0,0,0,1,0,0,...,1,0,1,0,1,0,1,0,0,1
289,[Na+].[I-];1857,2,1857,[I-].[Na+],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
358,CCCSP(=O)(OCC)SCCC;1857,1,1857,CCCSP(=O)(OCC)SCCC,0,1,0,1,0,0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74763,C(NC(=O)NC1C(=O)NC(=O)N1CO)NC(=O)NC2C(=O)NC(=O...,2,1857,O=C(NCNC(=O)NC1C(=O)NC(=O)N1CO)NC1C(=O)NC(=O)N1CO,0,0,0,1,0,0,...,1,0,1,0,0,1,0,0,1,0
74794,CC(C)(C1=CC(=C(C=C1)O)C2=CC=CC=C2)C3=CC(=C(C=C...,2,1857,CC(C)(c1ccc(O)c(-c2ccccc2)c1)c1ccc(O)c(-c2cccc...,0,1,1,0,0,1,...,1,1,0,0,1,1,1,1,0,0
74889,CC1=NN(C(=O)C1N=NC2=CC=CC=C2)C3=C(C=CC(=C3)S(=...,2,1857,CC1=NN(c2cc(S(=O)(=O)[O-])ccc2Cl)C(=O)C1N=Nc1c...,0,1,1,0,0,1,...,1,1,1,0,1,0,1,1,0,0
74982,C1CCC(C1)O;1857,1,1857,OC1CCCC1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [117]:


# create random forest classifier model
model = HistGradientBoostingClassifier(random_state=1)
X_train, X_test, y_train, y_test = train_test_split(testV.drop(columns=['Expected', "Id", "SMILES", "assay"]), testV["Expected"].values, test_size=0.3, random_state=5, stratify=testV['Expected'])

model.fit(X_train, y_train)

y_predRF = model.predict(X_test)
   
print(f"model: {model} Accuracy: {accuracy_score(y_test, y_predRF)} F1 Score: {f1_score(y_test, y_predRF, zero_division=1)}")

model: HistGradientBoostingClassifier(random_state=1) Accuracy: 0.6479481641468683 F1 Score: 0.48580441640378547


In [142]:
encode_map = {
    2: 0,
    1: 1
}


testV["exp"] = testV['Expected'].replace(encode_map)
testV["exp"]

32       1
146      1
249      0
289      0
358      1
        ..
74763    0
74794    0
74889    0
74982    1
75126    0
Name: exp, Length: 1541, dtype: int64

In [143]:
X_train, X_test, y_train, y_test = train_test_split(testV.drop(columns=['exp', 'Expected', "Id", "SMILES", "assay"]), testV["exp"].values, test_size=0.3, random_state=5, stratify=testV['exp'])

In [144]:
# Make device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [145]:
import torch
from torch import nn
from torch import optim

In [168]:
xtrain = torch.from_numpy(X_train.values).type(torch.float)
ytrain = torch.from_numpy(y_train).type(torch.float)
xtest = torch.from_numpy(X_test.values).type(torch.float)
ytest = torch.from_numpy(y_test).type(torch.float)

xtrain[:5]

tensor([[1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
        [0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0.]])

In [147]:
xtrain.shape

torch.Size([1078, 17])

In [294]:
class model(nn.Module):
  def __init__(self):
        super().__init__()
        # Number of input features is 17.
        self.layer_1 = nn.Linear(17, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)

  def forward(self, inputs):
      x = self.relu(self.layer_1(inputs))
      x = self.batchnorm1(x)
      x = self.relu(self.layer_2(x))
      x = self.batchnorm2(x)
      x = self.dropout(x)
      x = self.layer_out(x)
      
      return x

In [295]:
model1 = model().to(device)
model1

model(
  (layer_1): Linear(in_features=17, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [296]:
model1.state_dict()

OrderedDict([('layer_1.weight',
              tensor([[ 0.1810, -0.0153,  0.1663,  ...,  0.0359, -0.1683,  0.2055],
                      [ 0.1463,  0.0279,  0.1156,  ...,  0.1713,  0.2415, -0.0107],
                      [-0.1000, -0.1302, -0.1266,  ..., -0.0106, -0.1463, -0.0443],
                      ...,
                      [ 0.1673,  0.0083,  0.1231,  ..., -0.1124,  0.2322, -0.0413],
                      [-0.0818,  0.1147, -0.1996,  ..., -0.2263,  0.0902,  0.0090],
                      [ 0.1489,  0.1214, -0.1946,  ..., -0.0204, -0.2352, -0.1027]],
                     device='cuda:0')),
             ('layer_1.bias',
              tensor([ 0.0740, -0.1033, -0.0797, -0.1002,  0.2085,  0.2053, -0.1892, -0.0748,
                       0.2197, -0.0140, -0.1181,  0.0293,  0.1919, -0.0309, -0.2289, -0.0811,
                       0.1207, -0.0358,  0.1714,  0.0834,  0.0021,  0.0926, -0.0327, -0.2079,
                      -0.2079, -0.1908, -0.0386, -0.1754,  0.0355,  0.0278,  0.1845,

In [297]:
untrained = model1(xtest.to(device))

untrained[:5]

tensor([[-0.0723],
        [-0.0769],
        [-0.3718],
        [-0.7349],
        [-0.2018]], device='cuda:0', grad_fn=<SliceBackward0>)

In [298]:
def accuracy_fn(y_true, y_pred):
    # correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    # acc = (correct / len(y_pred)) * 100 
    return f1_score(y_true, y_pred)

In [299]:
accuracy_fn(untrained[:5].round().squeeze().cpu().detach(), ytest[:5].cpu().detach().squeeze().numpy())

ValueError: ignored

In [300]:
untrained[:5].cpu().detach().squeeze().numpy()
# ytest[:5].cpu().detach().squeeze().numpy()

array([-0.07225721, -0.07685688, -0.3718426 , -0.7349011 , -0.20176879],
      dtype=float32)

In [301]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model1.parameters(), lr=0.01)

In [302]:
torch.manual_seed(42)

# Set the number of epochs
epochs = 1000

# Put data to target device
xtrain, ytrain = xtrain.to(device), ytrain.to(device)
xtest, ytest = xtest.to(device), ytest.to(device)

# Build training and evaluation loop
for epoch in range(epochs):
    ### Training
    model1.train()

    # 1. Forward pass (model outputs raw logits)
    y_logits = model1(xtrain).squeeze() # squeeze to remove extra `1` dimensions, this won't work unless model and data are on same device 
    y_pred = torch.round(torch.sigmoid(y_logits)) # turn logits -> pred probs -> pred labls
  
    # 2. Calculate loss/accuracy
    # loss = loss_fn(torch.sigmoid(y_logits), # Using nn.BCELoss you need torch.sigmoid()
    #                y_train) 
    loss = criterion(y_logits, # Using nn.BCEWithLogitsLoss works with raw logits
                   ytrain) 
    acc = accuracy_fn(y_true=ytrain.cpu().detach().squeeze().numpy(), 
                      y_pred=y_pred.cpu().detach().squeeze().numpy()) 

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Loss backwards
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
    model1.eval()
    with torch.inference_mode():
        # 1. Forward pass
        test_logits = model1(xtest).squeeze() 
        test_pred = torch.round(torch.sigmoid(test_logits))
        # 2. Caculate loss/accuracy
        test_loss = criterion(test_logits,
                            ytest)
        test_acc = accuracy_fn(y_true=ytest.detach().cpu().numpy(),
                               y_pred=test_pred.cpu())

    # Print out what's happening every 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss}, Accuracy: {acc}% | Test loss: {test_loss}, Test acc: {test_acc}%")

Epoch: 0 | Loss: 0.7345202565193176, Accuracy: 0.47295864262990456% | Test loss: 0.6998975276947021, Test acc: 0.5705024311183144%
Epoch: 10 | Loss: 0.49832624197006226, Accuracy: 0.66078184110971% | Test loss: 0.6388767957687378, Test acc: 0.14925373134328357%
Epoch: 20 | Loss: 0.4089822471141815, Accuracy: 0.707152496626181% | Test loss: 0.683180570602417, Test acc: 0.11282051282051282%
Epoch: 30 | Loss: 0.3445320725440979, Accuracy: 0.7770859277708592% | Test loss: 0.7190486192703247, Test acc: 0.25641025641025644%
Epoch: 40 | Loss: 0.3045940101146698, Accuracy: 0.7959442332065907% | Test loss: 0.82828289270401, Test acc: 0.3107569721115538%
Epoch: 50 | Loss: 0.28352099657058716, Accuracy: 0.8051282051282053% | Test loss: 1.0504859685897827, Test acc: 0.3321033210332104%
Epoch: 60 | Loss: 0.2937595546245575, Accuracy: 0.783375314861461% | Test loss: 1.2211146354675293, Test acc: 0.4180064308681672%
Epoch: 70 | Loss: 0.2830210030078888, Accuracy: 0.797011207970112% | Test loss: 1.341

In [8]:
# model = HistGradientBoostingClassifier()
# # define the evaluation procedure
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate the model and collect the scores
# X = df_processed.drop(columns=['Id', 'Expected', "SMILES"])
# n_scores = cross_val_score(model, X, df_processed['Expected'], scoring="f1", cv=cv, n_jobs=-1)
# # report performance
# print('F1: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(df_fp.drop(columns=['Expected', "Id", "SMILES"]), df_fp["Expected"].values, test_size=0.3, stratify=df_fp['Expected'],random_state=5)

# # RFmodel = RandomForestClassifier(n_estimators=500, random_state=1, max_depth=7, )
# # RFmodel.fit(X_train, y_train)
# # y_predRF = RFmodel.predict(X_test)

# KNN = KNeighborsClassifier()
# KNN.fit(X_train, y_train)
# y_predRF = KNN.predict(X_test)

# # from sklearn.linear_model import SGDClassifier
# # lr = SGDClassifier(loss='perceptron', random_state=5)
# # lr.fit(X_train, y_train)

# # lr_params = {'alpha' : [10**(-x) for x in range(7)],
# #              'penalty' : ['l1', 'l2', 'elasticnet'],
# #              'l1_ratio' : [0.15, 0.25, 0.5, 0.75]}

# # y_predRF = lr.predict(X_train)

# # Print the accuracy score
# print(accuracy_score(y_test, y_predRF))
# print(f1_score(y_test, y_predRF, zero_division=0))

In [None]:
# # Id = X['x'].values

# #Switch off the line below when using training data
# X = df_fp.drop(["Expected", "Id", "SMILES"], axis=1)

# #Switch off the line below when using testing data
# # X = df_fp.drop(["x", "SMILES"], axis=1)

# # Initialize the scaler
# scaler = StandardScaler()

# # Fit the scaler to the data
# scaler.fit(X)

# # Scale the data
# X_scaled = scaler.transform(X)


In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, df_fp["Expected"].values, test_size=0.3, stratify=df_fp['Expected'],random_state=5)

# # RFmodel = RandomForestClassifier(n_estimators=500, random_state=1, max_depth=7, )
# # RFmodel.fit(X_train, y_train)
# # y_predRF = RFmodel.predict(X_test)

# hgbm = HistGradientBoostingClassifier(random_state=42)
# hgbm.fit(X_train, y_train)
# hgbm.score(X_test, y_test)


# print(accuracy_score(y_test, y_predRF))
# print(f1_score(y_test, y_predRF, zero_division=0))

In [None]:
# y_predRF = hgbm.predict(X_test)
# print(f1_score(y_test, y_predRF, zero_division=0))