AI-QSAR model for carcinogenicity: single task classification model

Author: Chi-Yun Chen; Advisor: Zhoumeng Lin; Date created: Nov.13.2024

Description: Traditional machine learning model development

## Install necessary libraries

In [None]:
!pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable
[0m

In [None]:
# Install Python packages
!pip install -q rdkit-pypi==2023.3.1b1
!pip install scikit-optimize==0.10.2
!pip install seaborn

[0mDefaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0m

In [None]:
# Import basic Python packages
import pandas as pd
import numpy as np
import seaborn as sns
import time

# Molecular Descriptors Calculation
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, Descriptors, DataStructs
from rdkit.DataStructs import ExplicitBitVect
from rdkit.ML.Descriptors import MoleculeDescriptors

# Modeling
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler        # Feature scaling for RDKit descriptors
from sklearn.decomposition import PCA
from pandas.core.common import random_state
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Import the SMILES contained dataset

In [None]:
df = pd.read_csv('SMILES_cancer df.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 957 entries, 0 to 956
Data columns (total 84 columns):
 #   Column                                             Non-Null Count  Dtype 
---  ------                                             --------------  ----- 
 0   CAS                                                957 non-null    object
 1   Alimentary system                                  957 non-null    int64 
 2   Endocrine system                                   957 non-null    int64 
 3   Exocrine system                                    957 non-null    int64 
 4   Haematolymphoid system                             957 non-null    int64 
 5   Hepatobiliary system                               957 non-null    int64 
 6   Reproductive system                                957 non-null    int64 
 7   Respiratory system                                 957 non-null    int64 
 8   Skin and soft tissue                               957 non-null    int64 
 9   Special Senses System

In [None]:
# Collect every row whose SMILES string occurs more than once;
# keep=False flags all copies of a repeated SMILES, not only the later ones
duplicates = df[df.duplicated(subset='SMILES', keep=False)]

# Show the duplicated chemicals (they are removed from the dataset further below)
print(duplicates)

            CAS  Alimentary system  Endocrine system  Exocrine system  \
23     156-59-2                  0                 0                0   
24     156-60-5                  0                 0                0   
47   60348-60-9                  0                 0                0   
652  12789-03-6                  0                 1                0   
661    319-84-6                  0                 0                0   
662    319-85-7                  0                 0                0   
663  32534-81-9                  0                 1                0   
675     60-57-1                  0                 0                0   
676    608-73-1                  0                 0                0   
687     72-20-8                  0                 0                0   
867     57-74-9                  0                 0                0   
869     58-89-9                  0                 0                0   

     Haematolymphoid system  Hepatobiliary system 

In [None]:
# The output shows 12 rows that share 5 duplicated SMILES strings, so the 957 compounds correspond to 950 unique structures

# Check and remove the Y of duplicate SMILES
Because these duplicated SMILES carry different labels (y) and in vitro data, all 12 duplicated chemicals are removed from the original dataframe.

In [None]:
# Drop every row of df whose CAS number belongs to one of the duplicated-SMILES rows,
# i.e. all copies of a duplicated structure are removed instead of keeping one of them
df_finalized = df[~df['CAS'].isin(duplicates['CAS'])]
# Renumber the remaining rows from 0 so the index is contiguous again
df_finalized = df_finalized.reset_index(drop=True)
df_finalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Data columns (total 84 columns):
 #   Column                                             Non-Null Count  Dtype 
---  ------                                             --------------  ----- 
 0   CAS                                                945 non-null    object
 1   Alimentary system                                  945 non-null    int64 
 2   Endocrine system                                   945 non-null    int64 
 3   Exocrine system                                    945 non-null    int64 
 4   Haematolymphoid system                             945 non-null    int64 
 5   Hepatobiliary system                               945 non-null    int64 
 6   Reproductive system                                945 non-null    int64 
 7   Respiratory system                                 945 non-null    int64 
 8   Skin and soft tissue                               945 non-null    int64 
 9   Special Senses System

In [None]:
print(df_finalized)

            CAS  Alimentary system  Endocrine system  Exocrine system  \
0      101-55-3                  0                 0                0   
1      101-68-8                  0                 0                0   
2    10102-45-1                  0                 0                0   
3     1024-57-3                  0                 0                0   
4      106-44-5                  0                 0                0   
..          ...                ...               ...              ...   
940     99-56-9                  0                 0                0   
941     99-57-0                  0                 0                0   
942     99-59-2                  0                 0                0   
943     99-99-0                  0                 0                0   
944    999-81-5                  0                 0                0   

     Haematolymphoid system  Hepatobiliary system  Reproductive system  \
0                         0                     0

# Fingerprints preparing

In [None]:
# Define a function that transforms SMILES string into 2D RDKIT descriptors
def cal_rdkit_descr(smiles):
    """Compute the RDKit 2D descriptors for a collection of SMILES strings.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings of the input compounds. The same object is used as the
        index of the returned frame.

    Returns
    -------
    pd.DataFrame
        One row per input SMILES and one column per descriptor listed in
        ``Descriptors._descList``; column names are prefixed with ``rdkit_``.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed by RDKit.
    """
    # Parse up front so an invalid structure is reported by its SMILES instead of
    # failing later inside AddHs with an opaque argument error.
    mols = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
        mols.append(mol)

    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()

    # Descriptors are calculated on the molecule with explicit hydrogens added.
    mol_descriptors = [calc.CalcDescriptors(Chem.AddHs(mol)) for mol in mols]

    return pd.DataFrame(mol_descriptors, index=smiles, columns=["rdkit_" + str(i) for i in desc_names])


# Define a function that transforms a SMILES string into an FCFP (if use_features = TRUE) or--
# --the Extended-Connectivity Fingerprints (ECFP) descriptors (if use_features = FALSE)

def cal_ECFP6_descr(smiles,
            R = 3,               # Morgan radius; 3 corresponds to ECFP6 (2 is common for similarity search, 3 for machine learning)
            nBits = 2**10,       # fingerprint length, 1024 bits here (2048 is another widely used size)
            use_features = False,
            use_chirality = False):
    '''
    Compute Morgan bit-vector fingerprints (ECFP, or FCFP when use_features is True).

    Inputs:
    - smiles...iterable of SMILES strings of the input compounds; also used as the index of the result
    - R....maximum radius of circular substructures; the default of 3 gives ECFP6
    - nBits....number of bits in the fingerprint; defaults to 1024
    - use_features...if true then use pharmacophoric atom features (FCFPs), if false then use standard DAYLIGHT atom features (ECFP)
    - use_chirality...if true then append tetrahedral chirality flags to atom features
    Outputs:
    - pd.DataFrame...one row per SMILES, nBits columns of 0/1 named ECFP6_Bit_<i>
      (the column prefix is fixed and does not change with R or use_features)
    Raises:
    - ValueError...if a SMILES string cannot be parsed by RDKit
    '''
    ecfp_descriptors = []
    for smi in smiles:
        mol = AllChem.MolFromSmiles(smi)
        if mol is None:
            # Fail with the offending structure instead of an opaque argument error
            raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
        ecfp = AllChem.GetMorganFingerprintAsBitVect(mol,
                                radius = R,
                                nBits = nBits,
                                useFeatures = use_features,
                                useChirality = use_chirality)
        # Iterating the bit vector yields its 0/1 bits in order
        ecfp_descriptors.append(list(ecfp))

    return pd.DataFrame(ecfp_descriptors, index = smiles, columns=[f'ECFP6_Bit_{i}' for i in range(nBits)])


# Define a function that transforms a SMILES string into an FCFP (if use_features = TRUE)
def cal_FCFP6_descr(smiles,
            R = 3,
            nBits = 2**10, # nBits = 1024
            use_features = True,
            use_chirality = False):
    '''
    Compute feature-based Morgan bit-vector fingerprints (FCFP with the default use_features = True).

    Inputs:
    - smiles...iterable of SMILES strings of the input compounds; also used as the index of the result
    - R....maximum radius of circular substructures; the default of 3 gives FCFP6
    - nBits....number of bits in the fingerprint; defaults to 1024
    - use_features...if true then use pharmacophoric atom features (FCFPs)
    - use_chirality...if true then append tetrahedral chirality flags to atom features
    Outputs:
    - pd.DataFrame...one row per SMILES, nBits columns of 0/1 named FCFP6_Bit_<i>
    Raises:
    - ValueError...if a SMILES string cannot be parsed by RDKit
    '''
    fcfp_descriptors = []
    for smi in smiles:
        mol = AllChem.MolFromSmiles(smi)
        if mol is None:
            # Fail with the offending structure instead of an opaque argument error
            raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
        fcfp = AllChem.GetMorganFingerprintAsBitVect(mol,
                                radius = R,
                                nBits = nBits,
                                useFeatures = use_features,
                                useChirality = use_chirality)
        # Iterating the bit vector yields its 0/1 bits in order
        fcfp_descriptors.append(list(fcfp))

    return pd.DataFrame(fcfp_descriptors, index = smiles, columns=[f'FCFP6_Bit_{i}' for i in range(nBits)])


# Define a function that transforms a SMILES string into an MACCS fingerprints

def cal_MACCS_descr(smiles):
    '''
    Compute MACCS key fingerprints for a collection of SMILES strings.

    Inputs:
    - smiles...iterable of SMILES strings of the input compounds; also used as the index of the result
    Outputs:
    - pd.DataFrame...one row per SMILES, 167 columns of 0/1 named MACCS_Bit_0 ... MACCS_Bit_166
    Raises:
    - ValueError...if a SMILES string cannot be parsed by RDKit
    '''
    MACCS_descriptors = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Fail with the offending structure instead of an opaque argument error
            raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
        fp = MACCSkeys.GenMACCSKeys(mol)
        # Iterating the bit vector yields its 0/1 bits in order
        MACCS_descriptors.append(list(fp))

    return pd.DataFrame(MACCS_descriptors, index = smiles, columns=[f'MACCS_Bit_{i}' for i in range(167)])

In [None]:
# Calculating RDKit descriptors
rdkit_descrs = cal_rdkit_descr(df_finalized['SMILES'])
# Check if there are any missing values
missing_summary = rdkit_descrs.isnull().sum()
missing_summary = missing_summary[missing_summary > 0]

print("Columns with missing values and their counts:")
print(missing_summary)

Columns with missing values and their counts:
rdkit_MaxPartialCharge       14
rdkit_MinPartialCharge       14
rdkit_MaxAbsPartialCharge    14
rdkit_MinAbsPartialCharge    14
rdkit_BCUT2D_MWHI            59
rdkit_BCUT2D_MWLOW           59
rdkit_BCUT2D_CHGHI           59
rdkit_BCUT2D_CHGLO           59
rdkit_BCUT2D_LOGPHI          59
rdkit_BCUT2D_LOGPLOW         59
rdkit_BCUT2D_MRHI            59
rdkit_BCUT2D_MRLOW           59
dtype: int64


In [None]:
# Recalculate the RDKit descriptors and discard the 12 descriptor columns
# (partial-charge and BCUT2D families) that contain missing values
rdkit_descrs = cal_rdkit_descr(df_finalized['SMILES']).drop(
    columns=[
        'rdkit_MaxPartialCharge',
        'rdkit_MinPartialCharge',
        'rdkit_MaxAbsPartialCharge',
        'rdkit_MinAbsPartialCharge',
        'rdkit_BCUT2D_MWHI',
        'rdkit_BCUT2D_MWLOW',
        'rdkit_BCUT2D_CHGHI',
        'rdkit_BCUT2D_CHGLO',
        'rdkit_BCUT2D_LOGPHI',
        'rdkit_BCUT2D_LOGPLOW',
        'rdkit_BCUT2D_MRHI',
        'rdkit_BCUT2D_MRLOW',
    ]
)
rdkit_descrs

Unnamed: 0_level_0,rdkit_MaxEStateIndex,rdkit_MinEStateIndex,rdkit_MaxAbsEStateIndex,rdkit_MinAbsEStateIndex,rdkit_qed,rdkit_MolWt,rdkit_HeavyAtomMolWt,rdkit_ExactMolWt,rdkit_NumValenceElectrons,rdkit_NumRadicalElectrons,...,rdkit_fr_sulfide,rdkit_fr_sulfonamd,rdkit_fr_sulfone,rdkit_fr_term_acetylene,rdkit_fr_tetrazole,rdkit_fr_thiazole,rdkit_fr_thiocyan,rdkit_fr_thiophene,rdkit_fr_unbrch_alkane,rdkit_fr_urea
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1=CC=C(C=C1)OC2=CC=C(C=C2)Br,7.814480,-0.628907,7.814480,0.054712,0.772870,249.107,240.035,247.983677,70,0,...,0,0,0,0,0,0,0,0,0,0
C1=CC(=CC=C1CC2=CC=C(C=C2)N=C=O)N=C=O,10.471720,-3.021449,10.471720,0.662247,0.617540,250.257,240.177,250.074228,92,0,...,0,0,0,0,0,0,0,0,0,0
[N+](=O)([O-])[O-].[Tl+],8.250000,-1.750000,8.250000,0.000000,0.333331,266.387,266.387,266.962245,26,2,...,0,0,0,0,0,0,0,0,0,0
C12C(C(C3C1O3)Cl)C4(C(=C(C2(C4(Cl)Cl)Cl)Cl)Cl)Cl,8.818441,-2.713642,8.818441,0.418935,0.440550,389.320,384.280,385.816009,100,0,...,0,0,0,0,0,0,0,0,0,0
CC1=CC=C(C=C1)O,7.418449,-2.735790,7.418449,0.580556,0.535935,108.140,100.076,108.057515,42,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1=CC(=C(C=C1[N+](=O)[O-])N)N,10.684692,-1.071708,10.684692,0.159174,0.353370,153.141,146.085,153.053826,58,0,...,0,0,0,0,0,0,0,0,0,0
C1=CC(=C(C=C1[N+](=O)[O-])N)O,10.622830,-1.052150,10.622830,0.169954,0.271725,154.125,148.077,154.037842,58,0,...,0,0,0,0,0,0,0,0,0,0
COC1=C(C=C(C=C1)[N+](=O)[O-])N,10.755498,-3.059947,10.755498,0.269954,0.408307,168.152,160.088,168.053492,64,0,...,0,0,0,0,0,0,0,0,0,0
CC1=CC=C(C=C1)[N+](=O)[O-],10.566282,-2.852286,10.566282,0.807315,0.437912,137.138,130.082,137.047678,52,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
has_missing = rdkit_descrs.isnull().any().any()
print(f"Does the DataFrame have missing values? {has_missing}")

Does the DataFrame have missing values? False


In [None]:
# Calculating ECFP6 fingerprints
ECFP6_descrs = cal_ECFP6_descr(df_finalized['SMILES'])
ECFP6_descrs

Unnamed: 0_level_0,ECFP6_Bit_0,ECFP6_Bit_1,ECFP6_Bit_2,ECFP6_Bit_3,ECFP6_Bit_4,ECFP6_Bit_5,ECFP6_Bit_6,ECFP6_Bit_7,ECFP6_Bit_8,ECFP6_Bit_9,...,ECFP6_Bit_1014,ECFP6_Bit_1015,ECFP6_Bit_1016,ECFP6_Bit_1017,ECFP6_Bit_1018,ECFP6_Bit_1019,ECFP6_Bit_1020,ECFP6_Bit_1021,ECFP6_Bit_1022,ECFP6_Bit_1023
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1=CC=C(C=C1)OC2=CC=C(C=C2)Br,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1=CC(=CC=C1CC2=CC=C(C=C2)N=C=O)N=C=O,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
[N+](=O)([O-])[O-].[Tl+],0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
C12C(C(C3C1O3)Cl)C4(C(=C(C2(C4(Cl)Cl)Cl)Cl)Cl)Cl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
CC1=CC=C(C=C1)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1=CC(=C(C=C1[N+](=O)[O-])N)N,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1=CC(=C(C=C1[N+](=O)[O-])N)O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COC1=C(C=C(C=C1)[N+](=O)[O-])N,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CC1=CC=C(C=C1)[N+](=O)[O-],0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
has_missing = ECFP6_descrs.isnull().any().any()
print(f"Does the DataFrame have missing values? {has_missing}")

Does the DataFrame have missing values? False


In [None]:
# Calculating FCFP6 fingerprints
FCFP6_descrs = cal_FCFP6_descr(df_finalized['SMILES'])
FCFP6_descrs

Unnamed: 0_level_0,FCFP6_Bit_0,FCFP6_Bit_1,FCFP6_Bit_2,FCFP6_Bit_3,FCFP6_Bit_4,FCFP6_Bit_5,FCFP6_Bit_6,FCFP6_Bit_7,FCFP6_Bit_8,FCFP6_Bit_9,...,FCFP6_Bit_1014,FCFP6_Bit_1015,FCFP6_Bit_1016,FCFP6_Bit_1017,FCFP6_Bit_1018,FCFP6_Bit_1019,FCFP6_Bit_1020,FCFP6_Bit_1021,FCFP6_Bit_1022,FCFP6_Bit_1023
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1=CC=C(C=C1)OC2=CC=C(C=C2)Br,0,0,1,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
C1=CC(=CC=C1CC2=CC=C(C=C2)N=C=O)N=C=O,1,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
[N+](=O)([O-])[O-].[Tl+],1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C12C(C(C3C1O3)Cl)C4(C(=C(C2(C4(Cl)Cl)Cl)Cl)Cl)Cl,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
CC1=CC=C(C=C1)O,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1=CC(=C(C=C1[N+](=O)[O-])N)N,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1=CC(=C(C=C1[N+](=O)[O-])N)O,0,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COC1=C(C=C(C=C1)[N+](=O)[O-])N,1,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
CC1=CC=C(C=C1)[N+](=O)[O-],1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
has_missing = FCFP6_descrs.isnull().any().any()
print(f"Does the DataFrame have missing values? {has_missing}")

Does the DataFrame have missing values? False


In [None]:
# Calculating MACCS fingerprints
MACCS_descrs = cal_MACCS_descr(df_finalized['SMILES'])
MACCS_descrs

Unnamed: 0_level_0,MACCS_Bit_0,MACCS_Bit_1,MACCS_Bit_2,MACCS_Bit_3,MACCS_Bit_4,MACCS_Bit_5,MACCS_Bit_6,MACCS_Bit_7,MACCS_Bit_8,MACCS_Bit_9,...,MACCS_Bit_157,MACCS_Bit_158,MACCS_Bit_159,MACCS_Bit_160,MACCS_Bit_161,MACCS_Bit_162,MACCS_Bit_163,MACCS_Bit_164,MACCS_Bit_165,MACCS_Bit_166
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C1=CC=C(C=C1)OC2=CC=C(C=C2)Br,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,1,1,0
C1=CC(=CC=C1CC2=CC=C(C=C2)N=C=O)N=C=O,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
[N+](=O)([O-])[O-].[Tl+],0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,1
C12C(C(C3C1O3)Cl)C4(C(=C(C2(C4(Cl)Cl)Cl)Cl)Cl)Cl,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,1,1,0
CC1=CC=C(C=C1)O,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C1=CC(=C(C=C1[N+](=O)[O-])N)N,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
C1=CC(=C(C=C1[N+](=O)[O-])N)O,0,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
COC1=C(C=C(C=C1)[N+](=O)[O-])N,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
CC1=CC=C(C=C1)[N+](=O)[O-],0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


In [None]:
has_missing = MACCS_descrs.isnull().any().any()
print(f"Does the DataFrame have missing values? {has_missing}")

Does the DataFrame have missing values? False


# Feature scaling
Only the RDKit descriptors are non-binary, so they are the only features that are scaled; the fingerprint bits are already 0/1.

In [None]:
# Rescale every RDKit descriptor column to a fixed range (e.g., [0, 1])
# NOTE(review): the scaler is fitted on all compounds, before any train/test split,
# so held-out compounds contribute to the min/max used for scaling — consider
# fitting it on the training split only.
scaler = MinMaxScaler()
X_rdkit_descrs = rdkit_descrs
X_rdkit_descrs_scal = pd.DataFrame(
    scaler.fit_transform(X_rdkit_descrs),
    columns=list(rdkit_descrs.columns),
)
X_rdkit_descrs_scal

Unnamed: 0,rdkit_MaxEStateIndex,rdkit_MinEStateIndex,rdkit_MaxAbsEStateIndex,rdkit_MinAbsEStateIndex,rdkit_qed,rdkit_MolWt,rdkit_HeavyAtomMolWt,rdkit_ExactMolWt,rdkit_NumValenceElectrons,rdkit_NumRadicalElectrons,...,rdkit_fr_sulfide,rdkit_fr_sulfonamd,rdkit_fr_sulfone,rdkit_fr_term_acetylene,rdkit_fr_tetrazole,rdkit_fr_thiazole,rdkit_fr_thiocyan,rdkit_fr_thiophene,rdkit_fr_unbrch_alkane,rdkit_fr_urea
0,0.252207,0.871866,0.252207,0.010834,0.816642,0.125322,0.125410,0.124737,0.099359,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.413434,0.688662,0.413434,0.131135,0.648200,0.126015,0.125498,0.125997,0.134615,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.278632,0.786021,0.278632,0.000000,0.339999,0.135731,0.141770,0.136176,0.028846,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.313122,0.712232,0.313122,0.082955,0.456269,0.209780,0.214961,0.207811,0.147436,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.228178,0.710536,0.228178,0.114959,0.559707,0.040410,0.038520,0.040401,0.054487,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,0.426356,0.837960,0.426356,0.031519,0.361730,0.067517,0.067084,0.067521,0.080128,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.422603,0.839457,0.422603,0.033653,0.273193,0.068109,0.068321,0.068114,0.080128,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.430652,0.685714,0.430652,0.053455,0.421305,0.076559,0.075777,0.076562,0.089744,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
943,0.419172,0.701616,0.419172,0.159860,0.453409,0.057877,0.057149,0.057874,0.070513,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split dataframe to X and y
Y holds the organ-system labels; X holds the in vitro assay columns (the CAS identifier is not used as a feature).

In [None]:
# Single task: one target per organ system
# (the double brackets keep each target as a one-column DataFrame; they are flattened to 1D later)
Ya_2d = df_finalized.loc[:, ['Alimentary system']]
Yen_2d = df_finalized.loc[:, ['Endocrine system']]
Yex_2d = df_finalized.loc[:, ['Exocrine system']]
Yha_2d = df_finalized.loc[:, ['Haematolymphoid system']]
Yhe_2d = df_finalized.loc[:, ['Hepatobiliary system']]
Yrep_2d = df_finalized.loc[:, ['Reproductive system']]
Yres_2d = df_finalized.loc[:, ['Respiratory system']]
Ysk_2d = df_finalized.loc[:, ['Skin and soft tissue']]
Ysp_2d = df_finalized.loc[:, ['Special Senses System']]
Yu_2d = df_finalized.loc[:, ['Urinary system']]

In [None]:
# Combine all 10 organ-system columns into a single "whole" label using the "any" condition:
# 1 if the chemical is positive for at least one organ system, otherwise 0.
# The columns are selected by name; the earlier positional slice iloc[:, 1:10] is half-open
# and therefore spanned only 9 columns, leaving the last organ system out of the label.
df_finalized['whole'] = (
    df_finalized[[
        'Alimentary system',
        'Endocrine system',
        'Exocrine system',
        'Haematolymphoid system',
        'Hepatobiliary system',
        'Reproductive system',
        'Respiratory system',
        'Skin and soft tissue',
        'Special Senses System',
        'Urinary system',
    ]].sum(axis=1) > 0
).astype(int)

In [None]:
Y_2d = df_finalized.loc[:, ['whole']]

In [None]:
# Convert Y as a column vector to a 1D array
Ya = Ya_2d.values.ravel()
Yen = Yen_2d.values.ravel()
Yex = Yex_2d.values.ravel()
Yha = Yha_2d.values.ravel()
Yhe = Yhe_2d.values.ravel()
Yrep = Yrep_2d.values.ravel()
Yres = Yres_2d.values.ravel()
Ysk = Ysk_2d.values.ravel()
Ysp = Ysp_2d.values.ravel()
Yu = Yu_2d.values.ravel()
Y = Y_2d.values.ravel()
print(Ya.shape)
print(Yen.shape)
print(Yex.shape)
print(Yha.shape)
print(Yhe.shape)
print(Yrep.shape)
print(Yres.shape)
print(Ysk.shape)
print(Ysp.shape)
print(Yu.shape)
print(Y.shape)

(945,)
(945,)
(945,)
(945,)
(945,)
(945,)
(945,)
(945,)
(945,)
(945,)
(945,)


In [None]:
# X features: in vitro assays
# Positional slice covering columns 11..81 of df_finalized (82 - 11 = 71 columns)
# NOTE(review): this depends on the column order of the input CSV — confirm, or select the assay columns by name
X_vitro = df_finalized.iloc[:, 11:82]
X_vitro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Data columns (total 71 columns):
 #   Column                                             Non-Null Count  Dtype
---  ------                                             --------------  -----
 0   tox21-ache-p3_ache-inhibitor_1                     945 non-null    int64
 1   tox21-ache-p5_ache-inhibitor_1                     945 non-null    int64
 2   tox21-ahr-p1_ahr-agonist_1                         945 non-null    int64
 3   tox21-ap1-agonist-p1_ap1-agonist_1                 945 non-null    int64
 4   tox21-ar-bla-agonist-p1_ar-agonist_1               945 non-null    int64
 5   tox21-ar-bla-antagonist-p1_ar-antagonist_1         945 non-null    int64
 6   tox21-are-bla-p1_nrf2-agonist_1                    945 non-null    int64
 7   tox21-aromatase-p1_aromatase-inhibitor_1           945 non-null    int64
 8   tox21-car-agonist-p1_car-agonist_1                 945 non-null    int64
 9   tox21-car-antagonist-p1_car-anta

In [None]:
# Concatenating dataframes horizontally (along columns): in vitro assays and descriptors
# The fingerprint frames are indexed by SMILES, so reset_index(drop=True) swaps that index
# for 0..n-1 and lets the rows line up with X_vitro when concatenated
# X_rdkit: X_vitro and X_rdkit_descrs_scal
X_rdkit = pd.concat([X_vitro, X_rdkit_descrs_scal.reset_index(drop=True)], axis=1)
# X_ECFP6: X_vitro and ECFP6_descrs
X_ECFP6 = pd.concat([X_vitro, ECFP6_descrs.reset_index(drop=True)], axis=1)
# X_FCFP6: X_vitro and FCFP6_descrs
X_FCFP6 = pd.concat([X_vitro, FCFP6_descrs.reset_index(drop=True)], axis=1)
# X_MACCS: X_vitro and MACCS_descrs
X_MACCS = pd.concat([X_vitro, MACCS_descrs.reset_index(drop=True)], axis=1)
# X_All: X_vitro and four descriptors
X_All = pd.concat([X_vitro, X_rdkit_descrs_scal.reset_index(drop=True), ECFP6_descrs.reset_index(drop=True), FCFP6_descrs.reset_index(drop=True), MACCS_descrs.reset_index(drop=True)], axis = 1)

In [None]:
print(X_rdkit.shape)
print(X_ECFP6.shape)
print(X_FCFP6.shape)
print(X_MACCS.shape)
print(X_All.shape)

(945, 268)
(945, 1095)
(945, 1095)
(945, 238)
(945, 2483)


# Build Machine Learning Modeling Pipeline

In [None]:
!pip install imblearn
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable
[0m

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split, StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
import warnings
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, balanced_accuracy_score, roc_curve
from sklearn.metrics import (precision_recall_curve, auc, matthews_corrcoef, f1_score, precision_score, recall_score)
import os
import matplotlib.pyplot as plt
import matplotlib.cm as cm  # Import the colormap module
from scipy.stats import randint, uniform
from itertools import product
from collections import Counter

import joblib
from joblib import dump

In [None]:
# Define the logistic regression model with class weights
# random_state is fixed (same seed as the data split and the search) so that the
# solvers that shuffle the data give the same fit on every run
log_reg = LogisticRegression(class_weight='balanced', solver='liblinear', max_iter = 5000, random_state=42)

# Define hyperparameter search spaces (one per algorithm).
# scipy.stats conventions used below:
#   randint(low, high)  -> integers drawn from [low, high)
#   uniform(loc, scale) -> floats drawn from [loc, loc + scale]

# AdaBoost
Ada_parm_search = dict(
    n_estimators=randint(2, 150),
    learning_rate=uniform(0.005, 1.5),
)

# KNN
KNN_parm_search = dict(
    n_neighbors=randint(1, 20),                      # Number of neighbors
    weights=['uniform', 'distance'],                 # Weight function used in prediction
    metric=['euclidean', 'manhattan', 'minkowski'],  # Distance metric
)

# RandomForest
rf_parm_search = dict(
    n_estimators=randint(1, 200),       # Number of trees in the forest
    max_depth=randint(2, 20),           # Maximum depth of the trees
    min_samples_split=randint(2, 15),   # Minimum number of samples required to split an internal node
    min_samples_leaf=randint(1, 15),    # Minimum number of samples required to be a leaf node
    max_features=['sqrt', 'log2'],
)

# SVC
svc_parm_search = dict(
    C=uniform(0.1, 100),                            # Regularization parameter
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],    # Kernel type for the algorithm
    gamma=['scale', 'auto'],                        # Kernel coefficient
    degree=randint(2, 5),                           # Degree of the polynomial kernel function
)

# LogisticRegression
LR_parm_search = dict(
    C=uniform(0.01, 100),
    solver=['lbfgs', 'liblinear', 'newton-cg', 'saga'],
)

# XGBoost
xgb_parm_search = dict(
    n_estimators=randint(40, 500),         # Number of boosted trees
    max_depth=randint(3, 20),              # Maximum depth of trees
    learning_rate=uniform(0.01, 0.3),      # Step size shrinkage
    subsample=uniform(0.5, 0.2),           # Fraction of samples used for fitting, i.e. 0.5 to 0.7
    colsample_bytree=uniform(0.5, 0.2),    # Fraction of features used for fitting, i.e. 0.5 to 0.7
    gamma=uniform(0, 10),                  # Minimum loss reduction required to make a further partition
    scale_pos_weight=uniform(1, 10),       # Adjust weight of positive class
)

# CatBoost: a gradient boosting model similar to XGBoost and LightGBM, but it often handles categorical and imbalanced data better
catboost_parm_search = dict(
    iterations=randint(50, 240),           # Number of boosting iterations
    depth=randint(3, 10),                  # Depth of the tree
    learning_rate=uniform(0.01, 0.3),      # Learning rate
    l2_leaf_reg=uniform(1, 10),            # L2 regularization
    bagging_temperature=uniform(0, 1),     # Bagging temperature
    border_count=randint(40, 260),         # Number of splits for numerical features
)

# Define models: name -> (estimator, hyperparameter search space)
# Estimators with a stochastic component are seeded with random_state=42, matching the
# seed already used for the data split, SMOTE and the randomized search, so repeated
# runs of the notebook give the same fitted models.
models = {
    'Ada': (AdaBoostClassifier(algorithm='SAMME', random_state=42), Ada_parm_search),
    'KNN': (KNeighborsClassifier(), KNN_parm_search),   # takes no random_state
    'RF': (RandomForestClassifier(random_state=42), rf_parm_search),
    'SVC': (SVC(probability=True, random_state=42), svc_parm_search),    # Enable probability estimates for SVC
    'wLR': (log_reg, LR_parm_search),
    'XGB': (XGBClassifier(random_state=42), xgb_parm_search),
    'CatBoost': (CatBoostClassifier(verbose=0, thread_count=5, random_state=42), catboost_parm_search)
    }

# Scenario I: without feature selection

In [None]:
# Lookup tables from a short name to the feature matrix / label vector it stands for
features_dict = dict(
    RDKit=X_rdkit,
    ECFP6=X_ECFP6,
    FCFP6=X_FCFP6,
    MACCS=X_MACCS,
    All_included=X_All,
)

labels_dict = dict(
    EN=Yen,
    EX=Yex,
    HE=Yhe,
    RES=Yres,
    U=Yu,
    Whole=Y,
)

# Split data into target Y and features X
Y: each organ system;
X: descriptors and in vitro assays

80% train, 20% test

Cross validation by using StratifiedKFold to ensure that each fold maintains the same proportion of classes as in the original dataset

In [None]:
# Hyperparameter tuning for the 7 algorithms defined in `models`
def tuning_hyper(X, y, model_name):
    model, parm_search = models[model_name]

    X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
    test_indices = df_finalized.loc[X_test.index, 'CAS']    # Save original test indices

# Apply SMOTE to training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    cv_accuracy_mean = []
    cv_accuracy_sd = []
    cv_precision_mean = []
    cv_precision_sd = []
    cv_recall_mean = []
    cv_recall_sd = []
    cv_roc_auc_mean = []
    cv_roc_auc_sd  = []

    random_search = RandomizedSearchCV(
        model,
        param_distributions=parm_search,
        n_iter=100,
        cv=skf,
        scoring='f1_weighted',   # Trade-off between precision and recall; useful when dealing with class imbalances
        n_jobs=-1,
        random_state=42,
        error_score='raise'  # Raises an error for debugging
        )

    # Fit model
    random_search.fit(X_resampled, y_resampled)

    # Store results
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    y_hat_train = best_model.predict(X_resampled)
    y_hat_test = best_model.predict(X_test)

    # Calculate probability predictions for positive class if supported
    y_prob_train, y_prob_test = None, None
    if hasattr(best_model, "predict_proba"):
        y_prob_train = best_model.predict_proba(X_resampled)[:, 1]
        y_prob_test = best_model.predict_proba(X_test)[:, 1]

    # Calculate confusion matrix on test set
    TN1, FP1, FN1, TP1 = confusion_matrix(y_resampled, y_hat_train).ravel()
    tn2, fp2, fn2, tp2 = confusion_matrix(y_test, y_hat_test).ravel()
    # Calculate metrics on cross-validation using best model
    accuracy_skfolds = cross_val_score(best_model, X_resampled, y_resampled, cv = skf, n_jobs=-1, scoring='accuracy')
    precision_skfolds = cross_val_score(best_model, X_resampled, y_resampled, cv = skf, n_jobs=-1, scoring='precision')
    recall_skfolds = cross_val_score(best_model, X_resampled, y_resampled, cv = skf, n_jobs=-1, scoring='recall')
    roc_auc_skfolds = cross_val_score(best_model, X_resampled, y_resampled, cv = skf, n_jobs=-1, scoring='roc_auc')

    accuracy_train = accuracy_score(y_resampled, y_hat_train)
    accuracy_test = accuracy_score(y_test, y_hat_test)
    balanced_accuracy_train = balanced_accuracy_score(y_resampled, y_hat_train)
    balanced_accuracy_test = balanced_accuracy_score(y_test, y_hat_test)
    roc_auc_train = roc_auc_score(y_resampled, y_prob_train)
    roc_auc_test = roc_auc_score(y_test, y_prob_test)
    mcc_train = matthews_corrcoef(y_resampled, y_hat_train)   # Matthews Correlation Coefficient (MCC)
    mcc_test = matthews_corrcoef(y_test, y_hat_test)
    f1_weighted_train = f1_score(y_resampled, y_hat_train, average='weighted')   # F1 Score (Weighted) for imbalanced class weighting.
    f1_weighted_test = f1_score(y_test, y_hat_test, average='weighted')

    # Calculate PR-AUC (Precision-Recall AUC) if probabilities are available
    pr_auc_test = None
    if y_prob_test is not None:
        precision, recall, _ = precision_recall_curve(y_test, y_prob_test)
        pr_auc_test = auc(recall, precision)

    results = {
        'TruePos': [TP1],
        'TrueNeg': [TN1],
        'FalsePos': [FP1],
        'FalseNeg': [FN1],
        'tp2': [tp2],
        'tn2': [tn2],
        'fp2': [fp2],
        'fn2': [fn2],
        'cv_accuracy_mean': [accuracy_skfolds.mean()],
        'cv_accuracy_sd': [accuracy_skfolds.std()],
        'cv_precision_mean': [precision_skfolds.mean()],
        'cv_precision_sd': [precision_skfolds.std()],
        'cv_recall_mean': [recall_skfolds.mean()],
        'cv_recall_sd': [recall_skfolds.std()],
        'cv_roc_auc_mean': [roc_auc_skfolds.mean()],
        'cv_roc_auc_sd': [roc_auc_skfolds.std()],
        'accuracy_train': [accuracy_train],
        'accuracy_test': [accuracy_test],
        'balanced_accuracy_train': [balanced_accuracy_train],
        'balanced_accuracy_test': [balanced_accuracy_test],
        'roc_auc_train': [roc_auc_train],
        'roc_auc_test': [roc_auc_test],
        'mcc_train': [mcc_train],
        'mcc_test': [mcc_test],
        'f1_weighted_train': [f1_weighted_train],
        'f1_weighted_test': [f1_weighted_test],
        'pr_auc_test': [pr_auc_test]  # PR-AUC for test set
    }

    return best_model, best_params, results, X_resampled, X_test, y_resampled, y_test, y_hat_train, y_hat_test, test_indices, y_prob_test

In [None]:
# Define label and feature types: each short name is paired with its data object, then the
# pairs are split into the parallel name/data lists used by the loops below
_named_features = [
    ("RDKit", X_rdkit),
    ("ECFP6", X_ECFP6),
    ("FCFP6", X_FCFP6),
    ("MACCS", X_MACCS),
    ("All_included", X_All),
]
_named_labels = [
    ("EN", Yen),
    ("EX", Yex),
    ("HE", Yhe),
    ("RES", Yres),
    ("U", Yu),
    ("Whole", Y),
]

feature_types = [name for name, _data in _named_features]
label_types = [name for name, _data in _named_labels]

features = [data for _name, data in _named_features]
labels = [data for _name, data in _named_labels]

In [None]:
# Adjust warnings filtering (optional)
warnings.filterwarnings("ignore", category=UserWarning, module='joblib')

# Prepare an empty list to store all results
all_results = []

# Scenario I: tune and evaluate every model in `models` for every combination of
# label set and descriptor set (no feature selection), collecting one result row per run.
# `label_types`/`labels` and `feature_types`/`features` are the parallel lists defined in the previous cell.
for label_type, y in zip(label_types, labels):
    print(f"========== Label Type: {label_type} ==========")

    for feature_type, X in zip(feature_types, features):
        print(f"========== Feature Type: {feature_type} ==========")

        for model_name, (model, _) in models.items():
            print(f"Model: {model_name}")
            best_model, best_params, results, X_resampled, X_test, y_resampled, y_test, y_hat_train, y_hat_test, test_indices, y_prob_test = tuning_hyper(X=X, y=y, model_name=model_name)

            # Persist the tuned model so it can be reloaded without re-running the search
            dump(best_model, f"{model_name}_best_model_{label_type}_{feature_type}.joblib")

            # Flatten the results dictionary into a row format for the DataFrame
            row = {
                'Label Type': label_type,
                'Feature Type': feature_type,
                'Model': model_name,
                'Best Params': best_params,
                'TruePos_train': results['TruePos'][0],
                'TrueNeg_train': results['TrueNeg'][0],
                'FalsePos_train': results['FalsePos'][0],
                'FalseNeg_train': results['FalseNeg'][0],
                'TruePos_test': results['tp2'][0],
                'TrueNeg_test': results['tn2'][0],
                'FalsePos_test': results['fp2'][0],
                'FalseNeg_test': results['fn2'][0],
                'CV_accuracy_mean': results['cv_accuracy_mean'][0],
                'CV_accuracy_sd': results['cv_accuracy_sd'][0],
                'CV_precision_mean': results['cv_precision_mean'][0],
                'CV_precision_sd': results['cv_precision_sd'][0],
                'CV_recall_mean': results['cv_recall_mean'][0],
                'CV_recall_sd': results['cv_recall_sd'][0],
                'CV_ROC AUC_mean': results['cv_roc_auc_mean'][0],
                'CV_ROC AUC_sd': results['cv_roc_auc_sd'][0],
                'Accuracy Train': results['accuracy_train'][0],
                'Accuracy Test': results['accuracy_test'][0],
                'Balanced Accuracy Train': results['balanced_accuracy_train'][0],
                'Balanced Accuracy Test': results['balanced_accuracy_test'][0],
                'MCC Train': results['mcc_train'][0],
                'MCC Test': results['mcc_test'][0],
                'F1 Train': results['f1_weighted_train'][0],
                'F1 Test': results['f1_weighted_test'][0],
                'ROC AUC Train': results['roc_auc_train'][0],
                'ROC AUC Test': results['roc_auc_test'][0],
                'PR-AUC Test': results['pr_auc_test'][0],
                'test_chem': test_indices.tolist(),
                'y_test': y_test.tolist(),
                'y_pred_test': y_hat_test.tolist(),
                # tuning_hyper leaves y_prob_test as None for models without predict_proba
                'y_prob_test': y_prob_test.tolist() if y_prob_test is not None else None
            }

            # Append the row to the list of all results
            all_results.append(row)

# Convert the list of results into a DataFrame
df_results = pd.DataFrame(all_results)

# Optionally, save the DataFrame to a CSV file or display it
df_results.to_csv('Model_results.csv', index=False)
print(df_results)

Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost












Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost












Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost













Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost








Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost








Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost









Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost







Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost










Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost












Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost








Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



















    Label Type  Feature Type     Model  \
0           EN         RDKit       Ada   
1           EN         RDKit       KNN   
2           EN         RDKit        RF   
3           EN         RDKit       SVC   
4           EN         RDKit       wLR   
..         ...           ...       ...   
205      Whole  All_included        RF   
206      Whole  All_included       SVC   
207      Whole  All_included       wLR   
208      Whole  All_included       XGB   
209      Whole  All_included  CatBoost   

                                           Best Params  TruePos_train  \
0    {'learning_rate': 1.146177572925346, 'n_estima...            573   
1    {'metric': 'manhattan', 'n_neighbors': 2, 'wei...            625   
2    {'max_depth': 10, 'max_features': 'log2', 'min...            638   
3    {'C': 95.40718470239531, 'degree': 2, 'gamma':...            638   
4          {'C': 91.50596755437809, 'solver': 'lbfgs'}            585   
..                                                 ...   

# Scenario II: Feature selection

# Feature selection

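Use SelectKBest with chi-squared scores and choose the optimal k from the cumulative feature importance: k is the number of features needed for their scores to reach a target share of the total score (the threshold is set in the cell below). This dynamically adjusts the number of selected features for each combination of descriptor set and label.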

In [None]:
# Examine label and feature types: rebuild the parallel name/data lists from (name, data) pairs
feature_types, features = map(list, zip(
    ("RDKit", X_rdkit),
    ("ECFP6", X_ECFP6),
    ("FCFP6", X_FCFP6),
    ("MACCS", X_MACCS),
    ("All_included", X_All),
))
label_types, labels = map(list, zip(
    ("EN", Yen),
    ("EX", Yex),
    ("HE", Yhe),
    ("RES", Yres),
    ("U", Yu),
    ("Whole", Y),
))

In [None]:
# Target cumulative importance threshold: keep the smallest set of top-ranked features whose
# chi-squared scores add up to at least this share of the total score
cumulative_importance_threshold = 0.3

# Dictionary to store the selected features for each combination
selected_features = {}

csv_data = []

# Loop through each combination of features (X) and labels (y)
for i, X_data in enumerate(features):
    for j, y_data in enumerate(labels):

        # Split training and test sets (same split settings as the model-tuning functions,
        # so feature scores are computed from training rows only)
        X_train, X_test, y_train, y_test = train_test_split(
            X_data, y_data, test_size=0.2, random_state=42, stratify=y_data
        )

        # Remove features with zero variance in the training split, remembering where the
        # surviving columns sit in X_data so selector indices can be mapped back later
        variance = X_train.var()
        kept_positions = np.flatnonzero((variance != 0).to_numpy())
        X_train = X_train.iloc[:, kept_positions]

        # Apply SelectKBest with k='all' to get scores for all features
        selector = SelectKBest(score_func=chi2, k='all')
        selector.fit(X_train, y_train)

        # Rank the scores from best to worst BEFORE accumulating them; a cumulative sum in
        # column order would make k depend on column position instead of feature importance
        scores = np.nan_to_num(selector.scores_, nan=0.0)   # undefined scores carry no importance
        ranked_scores = np.sort(scores)[::-1]
        total_score = ranked_scores.sum()
        if total_score > 0:
            cumulative_importance = np.cumsum(ranked_scores) / total_score
            # First rank at which the cumulative share reaches the threshold (+1: index -> count)
            optimal_k = int(np.searchsorted(cumulative_importance, cumulative_importance_threshold)) + 1
            optimal_k = min(optimal_k, len(ranked_scores))   # guard against rounding at the top end
        else:
            # No feature carries any score, so there is no ranking to cut; keep every feature
            cumulative_importance = np.zeros_like(ranked_scores)
            optimal_k = len(ranked_scores)

        # Re-apply SelectKBest with optimal k for feature selection
        selector = SelectKBest(score_func=chi2, k=optimal_k)
        selector.fit(X_train, y_train)

        # Indices of the selected features, relative to the variance-filtered X_train
        top_features_indices = selector.get_support(indices=True)

        # Translate them to column positions in the unfiltered X_data; using the filtered
        # indices directly would pick the wrong columns whenever a column was dropped above
        original_positions = kept_positions[top_features_indices]

        # Apply the selected features to the entire dataset (X_data)
        selected_features_for_combo = X_data.iloc[:, original_positions]

        # Get the names of the top features
        top_features_names = X_data.columns[original_positions]

        # Store the selected features in the dictionary with a descriptive key
        key = f"{feature_types[i]}_to_{label_types[j]}"
        selected_features[key] = {
            "features": selected_features_for_combo,
            "feature_names": top_features_names,
            "scores": selector.scores_[top_features_indices]
        }

        # Dynamically create a variable for the top features only
        top_feature_variable_name = f"{key}_top_features"
        globals()[top_feature_variable_name] = selected_features_for_combo

        # Print the created variable name and number of features selected
        print(f"\nCreated variable: {top_feature_variable_name}, Optimal number of features: {optimal_k}")

        # Append the data for CSV
        csv_data.append({
            "Variable Name": top_feature_variable_name,
            "Number of Selected Features": optimal_k,
            "Feature Names": list(top_features_names),
            "Scores": list(selector.scores_[top_features_indices])
        })

# Convert the data into a DataFrame
csv_df = pd.DataFrame(csv_data)

# Save DataFrame to CSV
csv_df.to_csv("Selected features.csv", index=False)

print("CSV file 'Selected features.csv' has been saved.")


Created variable: RDKit_to_EN_top_features, Optimal number of features: 46

Created variable: RDKit_to_EX_top_features, Optimal number of features: 54

Created variable: RDKit_to_HE_top_features, Optimal number of features: 29

Created variable: RDKit_to_RES_top_features, Optimal number of features: 63

Created variable: RDKit_to_U_top_features, Optimal number of features: 56

Created variable: RDKit_to_Whole_top_features, Optimal number of features: 33

Created variable: ECFP6_to_EN_top_features, Optimal number of features: 247

Created variable: ECFP6_to_EX_top_features, Optimal number of features: 333

Created variable: ECFP6_to_HE_top_features, Optimal number of features: 255

Created variable: ECFP6_to_RES_top_features, Optimal number of features: 363

Created variable: ECFP6_to_U_top_features, Optimal number of features: 340

Created variable: ECFP6_to_Whole_top_features, Optimal number of features: 285

Created variable: FCFP6_to_EN_top_features, Optimal number of features: 259

# SMOTE applied

In [None]:
# Collects one result row per (label, feature set, model) run
fea_results = []

# (output column, key in the `results` dict returned by tuning_hyper), in output column order
_metric_columns = [
    ('TruePos_train', 'TruePos'),
    ('TrueNeg_train', 'TrueNeg'),
    ('FalsePos_train', 'FalsePos'),
    ('FalseNeg_train', 'FalseNeg'),
    ('TruePos_test', 'tp2'),
    ('TrueNeg_test', 'tn2'),
    ('FalsePos_test', 'fp2'),
    ('FalseNeg_test', 'fn2'),
    ('CV_accuracy_mean', 'cv_accuracy_mean'),
    ('CV_accuracy_sd', 'cv_accuracy_sd'),
    ('CV_precision_mean', 'cv_precision_mean'),
    ('CV_precision_sd', 'cv_precision_sd'),
    ('CV_recall_mean', 'cv_recall_mean'),
    ('CV_recall_sd', 'cv_recall_sd'),
    ('CV_ROC AUC_mean', 'cv_roc_auc_mean'),
    ('CV_ROC AUC_sd', 'cv_roc_auc_sd'),
    ('Accuracy Train', 'accuracy_train'),
    ('Accuracy Test', 'accuracy_test'),
    ('Balanced Accuracy Train', 'balanced_accuracy_train'),
    ('Balanced Accuracy Test', 'balanced_accuracy_test'),
    ('MCC Train', 'mcc_train'),
    ('MCC Test', 'mcc_test'),
    ('F1 Train', 'f1_weighted_train'),
    ('F1 Test', 'f1_weighted_test'),
    ('ROC AUC Train', 'roc_auc_train'),
    ('ROC AUC Test', 'roc_auc_test'),
    ('PR-AUC Test', 'pr_auc_test'),
]

# Scenario II: tune and evaluate every model on the pre-selected features of each
# label / descriptor-set combination
for label_type, y in zip(label_types, labels):
    print(f"========== Label Type: {label_type} ==========")

    for feature_type in feature_types:
        # Selected features stored by the feature-selection cell under "<feature>_to_<label>"
        key = f"{feature_type}_to_{label_type}"
        X_selected_features = selected_features[key]["features"]

        print(f"========== Feature Type: {feature_type} ==========")

        for model_name, (model, _) in models.items():
            print(f"Model: {model_name}")
            (best_model, best_params, results,
             X_resampled, X_test, y_resampled, y_test,
             y_hat_train, y_hat_test, test_indices, y_prob_test) = tuning_hyper(
                X=X_selected_features, y=y, model_name=model_name)

            dump(best_model, f"{model_name}_feature_model_{label_type}_{feature_type}.joblib")

            # Identification columns first, then the scalar metrics, then the per-chemical lists
            row = {
                'Label Type': label_type,
                'Feature Type': feature_type,
                'Model': model_name,
                'Best Params': best_params,
            }
            row.update(
                (column, results[metric_key][0]) for column, metric_key in _metric_columns
            )
            row['test_chem'] = test_indices.tolist()
            row['y_test'] = y_test.tolist()
            row['y_pred_test'] = y_hat_test.tolist()
            if y_prob_test is None:
                row['y_prob_test'] = None
            else:
                row['y_prob_test'] = y_prob_test.tolist()  # Convert y_prob_test to list format

            fea_results.append(row)

# One DataFrame row per run
df_TOPresults = pd.DataFrame(fea_results)

# Save the table to CSV and show it
df_TOPresults.to_csv('Model_results_top10X.csv', index=False)
print(df_TOPresults)

Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB




Model: CatBoost








Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





Model: Ada




Model: KNN




Model: RF
Model: SVC
Model: wLR
Model: XGB




Model: CatBoost



















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost









Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR




Model: XGB




Model: CatBoost

















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost











Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost







Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost













Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB




Model: CatBoost












Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





Model: Ada
Model: KNN




Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost












Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost









Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost


















Model: Ada




Model: KNN
Model: RF
Model: SVC
Model: wLR




Model: XGB




Model: CatBoost



















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost









Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost


















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB




Model: CatBoost













Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost












Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost














Model: Ada




Model: KNN




Model: RF
Model: SVC
Model: wLR
Model: XGB




Model: CatBoost












    Label Type  Feature Type     Model  \
0           EN         RDKit       Ada   
1           EN         RDKit       KNN   
2           EN         RDKit        RF   
3           EN         RDKit       SVC   
4           EN         RDKit       wLR   
..         ...           ...       ...   
205      Whole  All_included        RF   
206      Whole  All_included       SVC   
207      Whole  All_included       wLR   
208      Whole  All_included       XGB   
209      Whole  All_included  CatBoost   

                                           Best Params  TruePos_train  \
0    {'learning_rate': 1.4801347113422643, 'n_estim...            505   
1    {'metric': 'manhattan', 'n_neighbors': 1, 'wei...            637   
2    {'max_depth': 18, 'max_features': 'sqrt', 'min...            592   
3    {'C': 78.06910002727692, 'degree': 2, 'gamma':...            583   
4     {'C': 3.4488521115218393, 'solver': 'liblinear'}            402   
..                                                 ...   

# No SMOTE application

In [None]:
# Hyperparameter tuning without resampling, for any algorithm registered in `models`
def tuning_hyper2(X, y, model_name):
    """Tune one model with a randomized search and score it on a stratified hold-out split.

    The training split is used exactly as produced by ``train_test_split``;
    this function applies no resampling.

    Parameters
    ----------
    X : feature table
        Must expose ``.index`` after splitting, because the test rows are
        looked up in the global ``df_finalized`` by index label.
    y : labels
        Used for stratification and scoring. The four-way unpacking of the
        confusion matrix below requires a binary problem.
    model_name : str
        Key into the global ``models`` mapping, whose values are
        ``(estimator, parameter_distributions)`` pairs.

    Returns
    -------
    tuple
        ``(best_model, best_params, results, X_train, X_test, y_train, y_test,
        y_hat_train, y_hat_test, test_indices, y_prob_test)``.
        ``results`` maps metric names to one-element lists. ``y_prob_test`` and
        the ROC-AUC / PR-AUC entries are ``None`` when the tuned estimator has
        no ``predict_proba``.
    """
    model, parm_search = models[model_name]

    # 80/20 hold-out split, stratified on y so both parts keep the class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)
    # CAS identifiers of the held-out rows.
    # NOTE(review): assumes X carries the same index labels as df_finalized — confirm.
    test_indices = df_finalized.loc[X_test.index, 'CAS']

    # The same splitter is used for the search and for the CV metrics below
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    random_search = RandomizedSearchCV(
        model,
        param_distributions=parm_search,
        n_iter=100,
        cv=skf,
        scoring='f1_weighted',   # Trade-off between precision and recall; useful when dealing with class imbalances
        n_jobs=-1,
        random_state=42,
        error_score='raise'  # Raises an error for debugging
        )

    # Fit model
    random_search.fit(X_train, y_train)

    # Best configuration found by the search, and its hard predictions
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    y_hat_train = best_model.predict(X_train)
    y_hat_test = best_model.predict(X_test)

    # Positive-class probabilities (column 1), only if the estimator supports them
    y_prob_train, y_prob_test = None, None
    if hasattr(best_model, "predict_proba"):
        y_prob_train = best_model.predict_proba(X_train)[:, 1]
        y_prob_test = best_model.predict_proba(X_test)[:, 1]

    # Confusion matrices for the training split (upper-case) and the test split (lower-case)
    TN1, FP1, FN1, TP1 = confusion_matrix(y_train, y_hat_train).ravel()
    tn2, fp2, fn2, tp2 = confusion_matrix(y_test, y_hat_test).ravel()

    # Cross-validated metrics of the best configuration on the training split
    accuracy_skfolds = cross_val_score(best_model, X_train, y_train, cv = skf, n_jobs=-1, scoring='accuracy')
    precision_skfolds = cross_val_score(best_model, X_train, y_train, cv = skf, n_jobs=-1, scoring='precision')
    recall_skfolds = cross_val_score(best_model, X_train, y_train, cv = skf, n_jobs=-1, scoring='recall')
    roc_auc_skfolds = cross_val_score(best_model, X_train, y_train, cv = skf, n_jobs=-1, scoring='roc_auc')

    # Metrics that only need hard predictions
    accuracy_train = accuracy_score(y_train, y_hat_train)
    accuracy_test = accuracy_score(y_test, y_hat_test)
    balanced_accuracy_train = balanced_accuracy_score(y_train, y_hat_train)
    balanced_accuracy_test = balanced_accuracy_score(y_test, y_hat_test)
    mcc_train = matthews_corrcoef(y_train, y_hat_train)   # Matthews Correlation Coefficient (MCC)
    mcc_test = matthews_corrcoef(y_test, y_hat_test)
    f1_weighted_train = f1_score(y_train, y_hat_train, average='weighted')   # F1 Score (Weighted) for imbalanced class weighting.
    f1_weighted_test = f1_score(y_test, y_hat_test, average='weighted')

    # Metrics that need probabilities: ROC AUC and PR-AUC (Precision-Recall AUC).
    # They stay None when the estimator has no predict_proba, instead of
    # passing None into roc_auc_score.
    roc_auc_train, roc_auc_test, pr_auc_test = None, None, None
    if y_prob_test is not None:
        roc_auc_train = roc_auc_score(y_train, y_prob_train)
        roc_auc_test = roc_auc_score(y_test, y_prob_test)
        precision, recall, _ = precision_recall_curve(y_test, y_prob_test)
        pr_auc_test = auc(recall, precision)

    # Every value is wrapped in a one-element list; callers read results[key][0]
    results = {
        'TruePos': [TP1],
        'TrueNeg': [TN1],
        'FalsePos': [FP1],
        'FalseNeg': [FN1],
        'tp2': [tp2],
        'tn2': [tn2],
        'fp2': [fp2],
        'fn2': [fn2],
        'cv_accuracy_mean': [accuracy_skfolds.mean()],
        'cv_accuracy_sd': [accuracy_skfolds.std()],
        'cv_precision_mean': [precision_skfolds.mean()],
        'cv_precision_sd': [precision_skfolds.std()],
        'cv_recall_mean': [recall_skfolds.mean()],
        'cv_recall_sd': [recall_skfolds.std()],
        'cv_roc_auc_mean': [roc_auc_skfolds.mean()],
        'cv_roc_auc_sd': [roc_auc_skfolds.std()],
        'accuracy_train': [accuracy_train],
        'accuracy_test': [accuracy_test],
        'balanced_accuracy_train': [balanced_accuracy_train],
        'balanced_accuracy_test': [balanced_accuracy_test],
        'roc_auc_train': [roc_auc_train],
        'roc_auc_test': [roc_auc_test],
        'mcc_train': [mcc_train],
        'mcc_test': [mcc_test],
        'f1_weighted_train': [f1_weighted_train],
        'f1_weighted_test': [f1_weighted_test],
        'pr_auc_test': [pr_auc_test]  # PR-AUC for test set
    }

    return best_model, best_params, results, X_train, X_test, y_train, y_test, y_hat_train, y_hat_test, test_indices, y_prob_test

# Without in vitro data (descriptor only)

In [None]:
# Give every descriptor table a fresh positional index so the tables line up row by row.
# reset_index returns a new frame, so each result has to be assigned back.
X_rdkit_descrs_scal = X_rdkit_descrs_scal.reset_index(drop=True)
ECFP6_descrs = ECFP6_descrs.reset_index(drop=True)
FCFP6_descrs = FCFP6_descrs.reset_index(drop=True)
MACCS_descrs = MACCS_descrs.reset_index(drop=True)
# All: the four descriptor tables joined column-wise
All = pd.concat([X_rdkit_descrs_scal, ECFP6_descrs, FCFP6_descrs, MACCS_descrs], axis = 1)

In [None]:
# Sanity check: print the shape of each descriptor table, then of the combined one
for descriptor_table in (X_rdkit_descrs_scal, ECFP6_descrs, FCFP6_descrs, MACCS_descrs, All):
    print(descriptor_table.shape)

(945, 197)
(945, 1024)
(945, 1024)
(945, 167)
(945, 2412)


In [None]:
# Pair each display name with its data, then split the pairs back into the
# parallel lists (names / data) that the sweep cells iterate over.
named_feature_sets = [
    ("RDKit", X_rdkit_descrs_scal),
    ("ECFP6", ECFP6_descrs),
    ("FCFP6", FCFP6_descrs),
    ("MACCS", MACCS_descrs),
    ("All_included", All),
]
named_label_sets = [
    ("EN", Yen),
    ("EX", Yex),
    ("HE", Yhe),
    ("RES", Yres),
    ("U", Yu),
    ("Whole", Y),
]

feature_types = [name for name, _ in named_feature_sets]
label_types = [name for name, _ in named_label_sets]

features = [table for _, table in named_feature_sets]
labels = [target for _, target in named_label_sets]

In [None]:
# Silence UserWarnings coming from the joblib module (optional)
warnings.filterwarnings("ignore", category=UserWarning, module='joblib')

# (output column, key in the `results` dict) pairs, listed in output column order
metric_columns = [
    ('TruePos_train', 'TruePos'),
    ('TrueNeg_train', 'TrueNeg'),
    ('FalsePos_train', 'FalsePos'),
    ('FalseNeg_train', 'FalseNeg'),
    ('TruePos_test', 'tp2'),
    ('TrueNeg_test', 'tn2'),
    ('FalsePos_test', 'fp2'),
    ('FalseNeg_test', 'fn2'),
    ('CV_accuracy_mean', 'cv_accuracy_mean'),
    ('CV_accuracy_sd', 'cv_accuracy_sd'),
    ('CV_precision_mean', 'cv_precision_mean'),
    ('CV_precision_sd', 'cv_precision_sd'),
    ('CV_recall_mean', 'cv_recall_mean'),
    ('CV_recall_sd', 'cv_recall_sd'),
    ('CV_ROC AUC_mean', 'cv_roc_auc_mean'),
    ('CV_ROC AUC_sd', 'cv_roc_auc_sd'),
    ('Accuracy Train', 'accuracy_train'),
    ('Accuracy Test', 'accuracy_test'),
    ('Balanced Accuracy Train', 'balanced_accuracy_train'),
    ('Balanced Accuracy Test', 'balanced_accuracy_test'),
    ('MCC Train', 'mcc_train'),
    ('MCC Test', 'mcc_test'),
    ('F1 Train', 'f1_weighted_train'),
    ('F1 Test', 'f1_weighted_test'),
    ('ROC AUC Train', 'roc_auc_train'),
    ('ROC AUC Test', 'roc_auc_test'),
    ('PR-AUC Test', 'pr_auc_test'),
]

# One row per (label set, descriptor set, model) combination
QSAR_results = []

# Sweep every label set x descriptor-only feature set x model in `models`:
# tune with tuning_hyper (defined outside this cell), persist the tuned
# estimator and collect its metrics.
for label_type, y in zip(label_types, [Yen, Yex, Yhe, Yres, Yu, Y]):
    print(f"========== Label Type: {label_type} ==========")

    for feature_type, X in zip(feature_types, [X_rdkit_descrs_scal, ECFP6_descrs, FCFP6_descrs, MACCS_descrs, All]):
        print(f"========== Feature Type: {feature_type} ==========")

        for model_name, (model, _) in models.items():
            print(f"Model: {model_name}")
            (best_model, best_params, results,
             X_resampled, X_test, y_resampled, y_test,
             y_hat_train, y_hat_test,
             test_indices, y_prob_test) = tuning_hyper(X=X, y=y, model_name=model_name)
            dump(best_model, f"{model_name}_best_model QSAR_{label_type}_{feature_type}.joblib")

            # Identification columns first
            row = {
                'Label Type': label_type,
                'Feature Type': feature_type,
                'Model': model_name,
                'Best Params': best_params,
            }
            # Each metric is read from position 0 of its entry in `results`
            row.update({column: results[key][0] for column, key in metric_columns})
            # Per-chemical test-set details, converted to plain lists
            row.update({
                'test_chem': test_indices.tolist(),
                'y_test': y_test.tolist(),
                'y_pred_test': y_hat_test.tolist(),
                'y_prob_test': None if y_prob_test is None else y_prob_test.tolist(),
            })

            QSAR_results.append(row)

# Turn the collected rows into a table
QSAR_results = pd.DataFrame(QSAR_results)

# Save the table to CSV and show it
QSAR_results.to_csv('QSARModel_results.csv', index=False)
print(QSAR_results)

Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





























































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost







































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost













Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost

























































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost








































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost














Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost









Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost

























































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost














Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



















































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost























































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost


























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost

































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



















































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost








































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
























































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






























    Label Type  Feature Type     Model  \
0           EN         RDKit       Ada   
1           EN         RDKit       KNN   
2           EN         RDKit        RF   
3           EN         RDKit       SVC   
4           EN         RDKit       wLR   
..         ...           ...       ...   
205      Whole  All_included        RF   
206      Whole  All_included       SVC   
207      Whole  All_included       wLR   
208      Whole  All_included       XGB   
209      Whole  All_included  CatBoost   

                                           Best Params  TruePos_train  \
0    {'learning_rate': 1.0717242986570266, 'n_estim...            575   
1    {'metric': 'manhattan', 'n_neighbors': 2, 'wei...            627   
2    {'max_depth': 18, 'max_features': 'log2', 'min...            631   
3    {'C': 95.40718470239531, 'degree': 2, 'gamma':...            633   
4      {'C': 97.38555188414593, 'solver': 'newton-cg'}            520   
..                                                 ...   

# NO SMOTE for all

# All: Descriptor + in vitro data

In [None]:
# Silence UserWarnings coming from the joblib module (optional)
warnings.filterwarnings("ignore", category=UserWarning, module='joblib')

# (output column, key in the `results` dict) pairs, listed in output column order
metric_columns = [
    ('TruePos_train', 'TruePos'),
    ('TrueNeg_train', 'TrueNeg'),
    ('FalsePos_train', 'FalsePos'),
    ('FalseNeg_train', 'FalseNeg'),
    ('TruePos_test', 'tp2'),
    ('TrueNeg_test', 'tn2'),
    ('FalsePos_test', 'fp2'),
    ('FalseNeg_test', 'fn2'),
    ('CV_accuracy_mean', 'cv_accuracy_mean'),
    ('CV_accuracy_sd', 'cv_accuracy_sd'),
    ('CV_precision_mean', 'cv_precision_mean'),
    ('CV_precision_sd', 'cv_precision_sd'),
    ('CV_recall_mean', 'cv_recall_mean'),
    ('CV_recall_sd', 'cv_recall_sd'),
    ('CV_ROC AUC_mean', 'cv_roc_auc_mean'),
    ('CV_ROC AUC_sd', 'cv_roc_auc_sd'),
    ('Accuracy Train', 'accuracy_train'),
    ('Accuracy Test', 'accuracy_test'),
    ('Balanced Accuracy Train', 'balanced_accuracy_train'),
    ('Balanced Accuracy Test', 'balanced_accuracy_test'),
    ('MCC Train', 'mcc_train'),
    ('MCC Test', 'mcc_test'),
    ('F1 Train', 'f1_weighted_train'),
    ('F1 Test', 'f1_weighted_test'),
    ('ROC AUC Train', 'roc_auc_train'),
    ('ROC AUC Test', 'roc_auc_test'),
    ('PR-AUC Test', 'pr_auc_test'),
]

# One row per (label set, feature set, model) combination
all_results1 = []

# Sweep every label set x feature set x model in `models`, tuning with
# tuning_hyper2 (no resampling), persisting the tuned estimator and
# collecting its metrics.
for label_type, y in zip(label_types, [Yen, Yex, Yhe, Yres, Yu, Y]):
    print(f"========== Label Type: {label_type} ==========")

    for feature_type, X in zip(feature_types, [X_rdkit, X_ECFP6, X_FCFP6, X_MACCS, X_All]):
        print(f"========== Feature Type: {feature_type} ==========")

        for model_name, (model, _) in models.items():
            print(f"Model: {model_name}")
            (best_model, best_params, results,
             X_train, X_test, y_train, y_test,
             y_hat_train, y_hat_test,
             test_indices, y_prob_test) = tuning_hyper2(X=X, y=y, model_name=model_name)

            dump(best_model, f"{model_name}_noSMOTE_{label_type}_{feature_type}.joblib")

            # Identification columns first
            row = {
                'Label Type': label_type,
                'Feature Type': feature_type,
                'Model': model_name,
                'Best Params': best_params,
            }
            # Each metric is read from position 0 of its entry in `results`
            row.update({column: results[key][0] for column, key in metric_columns})
            # Per-chemical test-set details, converted to plain lists
            row.update({
                'test_chem': test_indices.tolist(),
                'y_test': y_test.tolist(),
                'y_pred_test': y_hat_test.tolist(),
                'y_prob_test': None if y_prob_test is None else y_prob_test.tolist(),
            })

            all_results1.append(row)

# Turn the collected rows into a table
df_noresults = pd.DataFrame(all_results1)

# Save the table to CSV and show it
df_noresults.to_csv('Model_results_nosmote.csv', index=False)
print(df_noresults)

Model: Ada
Model: KNN


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






































Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
















Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
















Model: Ada
Model: KNN


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost







































































































Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
































Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






















Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost















Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






























Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost










































































































Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost

























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost

























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost









Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost








































































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost











Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost

























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



































































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost


























Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






















Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost















Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
























Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost







































































































Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost


























  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost










































































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost

































    Label Type  Feature Type     Model  \
0           EN         RDKit       Ada   
1           EN         RDKit       KNN   
2           EN         RDKit        RF   
3           EN         RDKit       SVC   
4           EN         RDKit       wLR   
..         ...           ...       ...   
205      Whole  All_included        RF   
206      Whole  All_included       SVC   
207      Whole  All_included       wLR   
208      Whole  All_included       XGB   
209      Whole  All_included  CatBoost   

                                           Best Params  TruePos_train  \
0    {'learning_rate': 1.2308333003018237, 'n_estim...             24   
1    {'metric': 'euclidean', 'n_neighbors': 8, 'wei...              2   
2    {'max_depth': 8, 'max_features': 'sqrt', 'min_...             16   
3    {'C': 85.21366715168568, 'degree': 4, 'gamma':...             31   
4      {'C': 9.551011649041131, 'solver': 'newton-cg'}            102   
..                                                 ...   

In [None]:
# Silence UserWarnings coming from the joblib module (optional)
warnings.filterwarnings("ignore", category=UserWarning, module='joblib')

# (output column, key in the `results` dict) pairs, listed in output column order
metric_columns = [
    ('TruePos_train', 'TruePos'),
    ('TrueNeg_train', 'TrueNeg'),
    ('FalsePos_train', 'FalsePos'),
    ('FalseNeg_train', 'FalseNeg'),
    ('TruePos_test', 'tp2'),
    ('TrueNeg_test', 'tn2'),
    ('FalsePos_test', 'fp2'),
    ('FalseNeg_test', 'fn2'),
    ('CV_accuracy_mean', 'cv_accuracy_mean'),
    ('CV_accuracy_sd', 'cv_accuracy_sd'),
    ('CV_precision_mean', 'cv_precision_mean'),
    ('CV_precision_sd', 'cv_precision_sd'),
    ('CV_recall_mean', 'cv_recall_mean'),
    ('CV_recall_sd', 'cv_recall_sd'),
    ('CV_ROC AUC_mean', 'cv_roc_auc_mean'),
    ('CV_ROC AUC_sd', 'cv_roc_auc_sd'),
    ('Accuracy Train', 'accuracy_train'),
    ('Accuracy Test', 'accuracy_test'),
    ('Balanced Accuracy Train', 'balanced_accuracy_train'),
    ('Balanced Accuracy Test', 'balanced_accuracy_test'),
    ('MCC Train', 'mcc_train'),
    ('MCC Test', 'mcc_test'),
    ('F1 Train', 'f1_weighted_train'),
    ('F1 Test', 'f1_weighted_test'),
    ('ROC AUC Train', 'roc_auc_train'),
    ('ROC AUC Test', 'roc_auc_test'),
    ('PR-AUC Test', 'pr_auc_test'),
]

# One row per (label set, descriptor set, model) combination
QSAR_results1 = []

# Sweep every label set x descriptor-only feature set x model in `models`,
# tuning with tuning_hyper2 (no resampling), persisting the tuned estimator
# and collecting its metrics.
for label_type, y in zip(label_types, [Yen, Yex, Yhe, Yres, Yu, Y]):
    print(f"========== Label Type: {label_type} ==========")

    for feature_type, X in zip(feature_types, [X_rdkit_descrs_scal, ECFP6_descrs, FCFP6_descrs, MACCS_descrs, All]):
        print(f"========== Feature Type: {feature_type} ==========")

        for model_name, (model, _) in models.items():
            print(f"Model: {model_name}")
            (best_model, best_params, results,
             X_train, X_test, y_train, y_test,
             y_hat_train, y_hat_test,
             test_indices, y_prob_test) = tuning_hyper2(X=X, y=y, model_name=model_name)
            dump(best_model, f"{model_name}_noSMOTE_QSAR_{label_type}_{feature_type}.joblib")

            # Identification columns first
            row = {
                'Label Type': label_type,
                'Feature Type': feature_type,
                'Model': model_name,
                'Best Params': best_params,
            }
            # Each metric is read from position 0 of its entry in `results`
            row.update({column: results[key][0] for column, key in metric_columns})
            # Per-chemical test-set details, converted to plain lists
            row.update({
                'test_chem': test_indices.tolist(),
                'y_test': y_test.tolist(),
                'y_pred_test': y_hat_test.tolist(),
                'y_prob_test': None if y_prob_test is None else y_prob_test.tolist(),
            })

            QSAR_results1.append(row)

# Turn the collected rows into a table
QSAR_noresults = pd.DataFrame(QSAR_results1)

# Save the table to CSV and show it
QSAR_noresults.to_csv('QSARModel_results_nosmote.csv', index=False)
print(QSAR_noresults)

Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




















Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



































Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost













Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost





















































































Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
























Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost










Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






























Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost













Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




























































































Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost











Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost












Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




















































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost


























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost








Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost

















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




















































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




























Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost











Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost



















Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost















Model: Ada


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






















































































Model: Ada
Model: KNN
Model: RF


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






















Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost












Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost






























Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost




















































































Model: Ada
Model: KNN
Model: RF
Model: SVC
Model: wLR
Model: XGB
Model: CatBoost
























    Label Type  Feature Type     Model  \
0           EN         RDKit       Ada   
1           EN         RDKit       KNN   
2           EN         RDKit        RF   
3           EN         RDKit       SVC   
4           EN         RDKit       wLR   
..         ...           ...       ...   
205      Whole  All_included        RF   
206      Whole  All_included       SVC   
207      Whole  All_included       wLR   
208      Whole  All_included       XGB   
209      Whole  All_included  CatBoost   

                                           Best Params  TruePos_train  \
0    {'learning_rate': 1.3257017585228865, 'n_estim...             19   
1    {'metric': 'euclidean', 'n_neighbors': 3, 'wei...             43   
2    {'max_depth': 4, 'max_features': 'sqrt', 'min_...              4   
3    {'C': 15.699452033620265, 'degree': 4, 'gamma'...             12   
4      {'C': 35.63978380769749, 'solver': 'liblinear'}             93   
..                                                 ...   