Install packages

In [None]:
!pip install pytdc xgboost rdkit scikit-learn pandas numpy

Import the Dataset 

In [3]:
from tdc.single_pred import Tox
from sklearn import svm

# Load the TDC single-prediction toxicity dataset named 'hERG_Karim'.
# `data` is kept around because a later cell calls data.get_split() on it.
data = Tox(name = 'hERG_Karim')
# Full dataset as a DataFrame; the stored output shows columns Drug_ID, Drug (SMILES) and Y (0/1 label).
df = data.get_data()
df  # bare last expression so the notebook renders the frame for exploration

Found local copy...
Loading...
Done!


Unnamed: 0,Drug_ID,Drug,Y
0,0,Fc1ccc(-n2cc(NCCN3CCCCC3)nn2)cc1F,1
1,1,COc1cc(N2Cc3ccc(Sc4ccc(F)cc4)nc3C2=O)ccc1OCCN1...,0
2,2,CCOC(=O)[C@H]1CC[C@@H](N2CC(NC(=O)CNc3nn(C(N)=...,0
3,3,N[C@@H](Cn1c(=O)cnc2ccc(F)cc21)C1CCC(NCc2ccc3c...,0
4,4,O=C(NC1COc2cccc(-c3ccnc(CO)c3)c2C1)c1ccc(OCC(F...,0
...,...,...,...
13440,13440,Cc1csc(NC(=O)c2sc3nc4c(c(C(F)(F)F)c3c2N)CCC4)n1,0
13441,13441,Cc1cccc(-c2n[nH]cc2-c2ccc3ncccc3n2)n1,0
13442,13442,Cc1ccccc1-n1c(Cn2cnc3c(N)ncnc32)nc2cccc(C)c2c1=O,0
13443,13443,Cc1ccccc1-n1c(Cn2ncc3c(N)ncnc32)nc2cccc(C)c2c1=O,0


Import Necessary Packages and Functions

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

from rdkit.Chem import MACCSkeys
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops



Feature Engineering

In [None]:
# Data conversion: take the SMILES column ('Drug') and the label column ('Y')
# from the 'train' and 'valid' partitions returned by the TDC split.
split = data.get_split()
smiles_train, Y_train = split['train']['Drug'], split['train']['Y']
smiles_valid, Y_valid = split['valid']['Drug'], split['valid']['Y']

# Compute MACCS Keys
def compute_maccs_fingerprint(smiles):
    """Return the MACCS-keys fingerprint of a molecule as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    numpy.ndarray
        The RDKit MACCS bit vector converted with ``np.array``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside RDKit.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    return np.array(MACCSkeys.GenMACCSKeys(mol))

# Compute additional molecular descriptors
def compute_molecular_descriptors(smiles):
    """Return seven RDKit descriptors for a molecule as a NumPy array.

    The values appear in this fixed order:
    ``MolWt``, ``MolLogP``, ``TPSA``, ``NumRotatableBonds``,
    ``NumAromaticRings``, ``NumHDonors``, ``NumHAcceptors``.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.

    Returns
    -------
    numpy.ndarray
        1-D array holding the seven descriptor values in the order above.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of inside a descriptor call.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Order matters: downstream feature columns follow this sequence.
    descriptor_functions = (
        Descriptors.MolWt,              # molecular weight
        Descriptors.MolLogP,            # LogP (octanol-water partition coefficient)
        Descriptors.TPSA,               # topological polar surface area
        Descriptors.NumRotatableBonds,  # number of rotatable bonds
        Descriptors.NumAromaticRings,   # number of aromatic rings (a count, not a flag)
        Descriptors.NumHDonors,         # hydrogen-bond donors
        Descriptors.NumHAcceptors,      # hydrogen-bond acceptors
    )
    return np.array([compute(mol) for compute in descriptor_functions])

#Compute Morgan fingerprints
def compute_morgan_fingerprint(smiles, radius=2, nBits=1024):
    """Return the Morgan bit-vector fingerprint of a molecule as a NumPy array.

    Parameters
    ----------
    smiles : str
        SMILES string describing the molecule.
    radius : int, optional
        Morgan radius passed to RDKit (default 2).
    nBits : int, optional
        Length of the bit vector requested from RDKit (default 1024).

    Returns
    -------
    numpy.ndarray
        The RDKit bit vector converted with ``np.array``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of deep inside RDKit.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    # nBits is passed by keyword so it cannot be mistaken for another
    # positional argument of the RDKit call.
    # NOTE(review): recent RDKit releases appear to flag this function as
    # deprecated in favour of rdFingerprintGenerator -- confirm before upgrading.
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    return np.array(fingerprint)

# Compute all
def compute_combined_fingerprints(smiles):
    """Build the full feature vector for one molecule.

    The result is the concatenation, in this order, of the Morgan
    fingerprint (helper defaults), the MACCS fingerprint and the
    molecular descriptors computed from ``smiles``.
    """
    feature_blocks = (
        compute_morgan_fingerprint(smiles),     # Morgan bits first
        compute_maccs_fingerprint(smiles),      # then MACCS keys
        compute_molecular_descriptors(smiles),  # then scalar descriptors
    )
    return np.concatenate(feature_blocks)

# Featurise every training SMILES, then stack the per-molecule vectors
# into a single 2-D array (one row per molecule).
train_features = smiles_train.apply(compute_combined_fingerprints)
X_train_combined = np.stack(train_features.to_numpy())

# Same featurisation for the validation SMILES
valid_features = smiles_valid.apply(compute_combined_fingerprints)
X_valid_combined = np.stack(valid_features.to_numpy())

# Standardise the features: the scaler is fitted on the training matrix only
# and then applied unchanged to both matrices.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train_combined)
X_train_scaled = scaler.transform(X_train_combined)
X_valid_scaled = scaler.transform(X_valid_combined)

Train and Evaluate Models

In [None]:
# Fit a seeded 100-tree random forest on the scaled training features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, Y_train)

# Validation predictions: hard labels feed accuracy and the report,
# the second probability column feeds ROC AUC.
y_pred_rf, y_prob_rf = rf_model.predict(X_valid_scaled), rf_model.predict_proba(X_valid_scaled)
print("Random Forest Accuracy:", accuracy_score(Y_valid, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(Y_valid, y_pred_rf))
print("Random Forest AUC: ", roc_auc_score(Y_valid, y_prob_rf[:, 1]))

Random Forest Accuracy: 0.8608630952380952
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.86      0.86       683
           1       0.85      0.86      0.86       661

    accuracy                           0.86      1344
   macro avg       0.86      0.86      0.86      1344
weighted avg       0.86      0.86      0.86      1344

Random Forest AUC:  0.9254512551416174


In [14]:
# Fit a polynomial-kernel SVM on the scaled training features.
# Best parameters found from randomized search: {'C': 1.6599452033620266, 'gamma': 0.06808361216819946, 'kernel': 'poly'}
# (C and gamma below are those values rounded.)
svm_model = SVC(
    C=1.7,
    gamma=0.07,
    kernel='poly',
    probability=True,  # needed so predict_proba is available for the AUC
    random_state=42,
).fit(X_train_scaled, Y_train)

# Validation predictions: hard labels feed accuracy and the report,
# the second probability column feeds ROC AUC.
y_pred_svm, y_prob_svm = svm_model.predict(X_valid_scaled), svm_model.predict_proba(X_valid_scaled)
print("SVM Accuracy:", accuracy_score(Y_valid, y_pred_svm))
print("SVM Classification Report:\n", classification_report(Y_valid, y_pred_svm))
print("SVM AUC: ", roc_auc_score(Y_valid, y_prob_svm[:, 1]))

SVM Accuracy: 0.8333333333333334
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83       683
           1       0.81      0.86      0.84       661

    accuracy                           0.83      1344
   macro avg       0.83      0.83      0.83      1344
weighted avg       0.83      0.83      0.83      1344

SVM AUC:  0.9127868728998833


In [12]:
# Fit XGBoost on the scaled training features.
# Best parameters found from randomized search: {'subsample': 0.8, 'reg_lambda': 3, 'reg_alpha': 1, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.3, 'gamma': 0, 'colsample_bytree': 0.6}
model = XGBClassifier(
    n_estimators=200,
    max_depth=9,
    learning_rate=0.3,
    subsample=0.8,
    colsample_bytree=0.6,
    gamma=0,
    reg_alpha=1,
    reg_lambda=3,
    random_state=42,
).fit(X_train_scaled, Y_train)

# Validation predictions: hard labels feed accuracy and the report,
# the second probability column feeds ROC AUC.
y_pred_xg, y_prob_xg = model.predict(X_valid_scaled), model.predict_proba(X_valid_scaled)
print("XGBoost Validation accuracy:", accuracy_score(Y_valid, y_pred_xg))
print("XGBoost Classification report:\n", classification_report(Y_valid, y_pred_xg))
print("XGBoost AUC: ", roc_auc_score(Y_valid, y_prob_xg[:, 1]))

XGBoost Validation accuracy: 0.8660714285714286
XGBoost Classification report:
               precision    recall  f1-score   support

           0       0.87      0.86      0.87       683
           1       0.86      0.87      0.86       661

    accuracy                           0.87      1344
   macro avg       0.87      0.87      0.87      1344
weighted avg       0.87      0.87      0.87      1344

XGBoost AUC:  0.9245608167225221


In [None]:
# Build a seeded AdaBoost ensemble (n_estimators can be adjusted) and fit it
# on the scaled training features.
adaboost = AdaBoostClassifier(random_state=42, n_estimators=50).fit(X_train_scaled, Y_train)

# Validation predictions: hard labels feed accuracy and the report,
# the second probability column feeds ROC AUC.
y_pred_ada, y_prob_ada = adaboost.predict(X_valid_scaled), adaboost.predict_proba(X_valid_scaled)
print("Adaboost Accuracy:", accuracy_score(Y_valid, y_pred_ada))
print("Adaboost Classification Report:\n", classification_report(Y_valid, y_pred_ada))
print("Adaboost AUC: ", roc_auc_score(Y_valid, y_prob_ada[:, 1]))

Adaboost Accuracy: 0.7142857142857143
Adaboost Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.69      0.71       683
           1       0.70      0.74      0.72       661

    accuracy                           0.71      1344
   macro avg       0.72      0.71      0.71      1344
weighted avg       0.72      0.71      0.71      1344

Adaboost AUC:  0.8015573812250394
