### BBBP
As a membrane separating circulating blood from the brain's extracellular fluid, the blood-brain barrier (BBB) is the protective layer that blocks most foreign drugs. Thus, the ability of a drug to penetrate the barrier and reach its site of action is a crucial challenge in the development of drugs for the central nervous system. Source: MoleculeNet.

In [10]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from tdc.single_pred.adme import ADME
from tdc import Evaluator
from rdkit import Chem
from rdkit.Chem import AllChem
import pickle
import pandas as pd

In [2]:
class Featurizer:
    """Abstract base for callables that turn a DataFrame into model inputs.

    Attributes:
        y_column: Name of the DataFrame column holding the target values.
        smiles_col: Name of the DataFrame column holding SMILES strings.

    Any additional keyword arguments are stored on the instance as attributes.
    """

    def __init__(self, y_column, smiles_col='Drug', **kwargs):
        self.y_column = y_column
        self.smiles_col = smiles_col
        # Extra options are written straight into the instance namespace.
        vars(self).update(kwargs)

    def __call__(self, df):
        """Featurize ``df``; concrete subclasses must override this."""
        raise NotImplementedError

class ECFPFeaturizer(Featurizer):
    """Featurizer that converts SMILES strings into Morgan fingerprint bit vectors.

    Args:
        y_column: Name of the DataFrame column holding the labels.
        radius: Radius passed to the Morgan fingerprint call.
        length: Number of bits (``nBits``) in each fingerprint.
        **kwargs: Forwarded to ``Featurizer.__init__`` (e.g. ``smiles_col``).
    """

    def __init__(self, y_column, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)

    def __call__(self, df):
        """Featurize every row of ``df``.

        Args:
            df: DataFrame containing the ``smiles_col`` and ``y_column`` columns.

        Returns:
            Tuple ``(fingerprints, labels)`` of NumPy arrays, in row order.

        Raises:
            ValueError: If a SMILES string cannot be parsed by RDKit.
        """
        fingerprints = []
        labels = []
        for idx, row in df.iterrows():
            y = row[self.y_column]
            smiles = row[self.smiles_col]
            mol = Chem.MolFromSmiles(smiles)
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with a readable message instead of inside the fingerprint call.
            if mol is None:
                raise ValueError(
                    f"Could not parse SMILES {smiles!r} at row index {idx!r}"
                )
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.length)
            fingerprints.append(fp)
            labels.append(y)
        return np.array(fingerprints), np.array(labels)


In [3]:
def train(X_train, y_train, X_valid, y_valid, n_estimators=100, random_state=42):
    """Fit a random forest classifier and print its validation metrics.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_valid: Validation feature matrix.
        y_valid: Validation labels.
        n_estimators: Number of trees in the forest (default 100).
        random_state: Seed passed to the classifier (default 42).

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)

    # Evaluate on the validation set: column 1 of predict_proba is used as the
    # score for ROC AUC, hard labels from predict are used for accuracy.
    val_predictions = model.predict_proba(X_valid)[:, 1]
    val_preds_class = model.predict(X_valid)
    roc_auc_val = roc_auc_score(y_valid, val_predictions)
    accuracy_val = accuracy_score(y_valid, val_preds_class)
    print(f'Validation ROC AUC: {roc_auc_val}, Accuracy: {accuracy_val}')

    return model


In [4]:
def predict(model, X_test):
    """Return the model's scores and hard labels for ``X_test``.

    Args:
        model: Object exposing ``predict_proba`` and ``predict``.
        X_test: Feature matrix to score.

    Returns:
        Tuple ``(proba, labels)`` where ``proba`` is column 1 of
        ``model.predict_proba(X_test)`` and ``labels`` is ``model.predict(X_test)``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    hard_labels = model.predict(X_test)
    return positive_scores, hard_labels


In [5]:
# Fetch the 'bbb_martins' dataset and partition it with the scaffold split method
data = ADME('bbb_martins')
split = data.get_split(method='scaffold')

# Fingerprint featurizer that reads its labels from the 'Y' column
featurizer = ECFPFeaturizer(y_column='Y')

# Featurize the partitions in train -> valid -> test order
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = (
    featurizer(split[part]) for part in ('train', 'valid', 'test')
)


Found local copy...
Loading...
Done!
100%|████████████████████████████████████████████████████████████████████████████| 2030/2030 [00:00<00:00, 2055.13it/s]


In [6]:
# Fit the classifier; train() also prints the validation metrics
model = train(X_train, y_train, X_valid, y_valid)

# Score the test split
predictions_proba, predictions_class = predict(model, X_test)

# Test-set metrics: ROC AUC from the probabilities, accuracy from the hard labels
roc_auc_test, accuracy_test = (
    roc_auc_score(y_test, predictions_proba),
    accuracy_score(y_test, predictions_class),
)

print(f'Test ROC AUC: {roc_auc_test}, Accuracy: {accuracy_test}')


Validation ROC AUC: 0.836410512308716, Accuracy: 0.8916256157635468
Test ROC AUC: 0.873964196372733, Accuracy: 0.8817733990147784


In [15]:
# Persist the fitted classifier to disk so it can be reloaded later
with open('BBBP_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)


In [12]:
class Featurizer:
    """Abstract base for callables that turn a DataFrame into model inputs.

    Attributes:
        y_column: Name of the label column, or ``None`` when no labels are used.
        smiles_col: Name of the DataFrame column holding SMILES strings.

    Any additional keyword arguments are stored on the instance as attributes.
    """

    def __init__(self, y_column=None, smiles_col='Drug', **kwargs):
        self.y_column = y_column
        self.smiles_col = smiles_col
        # Extra options are written straight into the instance namespace.
        vars(self).update(kwargs)

    def __call__(self, df):
        """Featurize ``df``; concrete subclasses must override this."""
        raise NotImplementedError

# ECFP Featurizer for molecular fingerprint generation
class ECFPFeaturizer(Featurizer):
    """Featurizer that converts SMILES strings into Morgan fingerprint bit vectors.

    Labels are optional, so the same featurizer can be used for training
    frames and for label-free prediction frames.

    Args:
        y_column: Name of the label column, or ``None`` to skip labels.
        radius: Radius passed to the Morgan fingerprint call.
        length: Number of bits (``nBits``) in each fingerprint.
        **kwargs: Forwarded to ``Featurizer.__init__`` (e.g. ``smiles_col``).
    """

    def __init__(self, y_column=None, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)

    def __call__(self, df):
        """Featurize every row of ``df``.

        Args:
            df: DataFrame containing the ``smiles_col`` column and, optionally,
                the ``y_column`` column.

        Returns:
            Tuple ``(fingerprints, labels)``. ``fingerprints`` is a NumPy array
            in row order; ``labels`` is a NumPy array, or ``None`` when no
            labels were collected.

        Raises:
            ValueError: If a SMILES string cannot be parsed by RDKit.
        """
        # Whether labels are available does not change from row to row, so
        # decide once instead of re-checking inside the loop.
        has_labels = bool(self.y_column) and self.y_column in df.columns

        fingerprints = []
        labels = []
        for idx, row in df.iterrows():
            smiles = row[self.smiles_col]
            mol = Chem.MolFromSmiles(smiles)
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with a readable message instead of inside the fingerprint call.
            if mol is None:
                raise ValueError(
                    f"Could not parse SMILES {smiles!r} at row index {idx!r}"
                )
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.length)
            fingerprints.append(fp)

            # Only collect a label when the label column exists
            # (not needed for single-SMILES prediction).
            if has_labels:
                labels.append(row[self.y_column])

        fingerprints = np.array(fingerprints)
        labels = np.array(labels) if labels else None
        return fingerprints, labels


# Load the trained model from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('BBBP_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Initialize the featurizer (without a y_column for prediction)
featurizer = ECFPFeaturizer(smiles_col='Drug')  # y_column not needed for predictions

# Example SMILES string for prediction. This is not ethanol (ethanol is "CCO").
# NOTE(review): looks like a tetrahydrocannabinol-type structure — confirm.
smiles_string = "CCCCCC1=CC2=C(C3C=C(CCC3C(O2)(C)C)C)C(=C1)O"

# Featurize and predict inline. The predict_single_smiles helper is only defined
# in a later cell, so calling it here raises NameError on a fresh kernel
# (Restart & Run All).
single_df = pd.DataFrame({featurizer.smiles_col: [smiles_string]})
X_single, _labels = featurizer(single_df)  # labels are not needed for prediction
predicted_probability = model.predict_proba(X_single)[:, 1][0]  # Probability for class 1
predicted_class = model.predict(X_single)[0]  # Predicted class label

# Output the predictions
print(f'Predicted Class: {predicted_class}')
print(f'Predicted Probability for Class 1: {predicted_probability}')

Predicted Class: 1
Predicted Probability for Class 1: 0.96


In [13]:
# Function to predict for a single SMILES string
def predict_single_smiles(smiles, model, featurizer):
    """Predict the class and class-1 probability for one SMILES string.

    Args:
        smiles: SMILES string of the molecule to score.
        model: Object exposing ``predict_proba`` and ``predict``.
        featurizer: Callable with a ``smiles_col`` attribute that maps a
            DataFrame to a ``(features, labels)`` tuple.

    Returns:
        Tuple ``(predicted_class, probability)`` for the single molecule, where
        ``probability`` is taken from column 1 of ``predict_proba``.
    """
    # The featurizer works on DataFrames, so wrap the string in a one-row frame.
    single_row = pd.DataFrame({featurizer.smiles_col: [smiles]})

    # The second element of the featurizer output (labels) is not used here.
    features, _unused_labels = featurizer(single_row)

    positive_proba = model.predict_proba(features)[:, 1]
    predicted_label = model.predict(features)

    return predicted_label[0], positive_proba[0]

In [14]:
# Reload the pickled classifier from disk
with open('BBBP_model.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

# Featurizer built without a y_column, since no labels are needed for prediction
featurizer = ECFPFeaturizer(smiles_col='Drug')

# Molecule to score, given as a SMILES string
smiles_string = "CCCCCC1=CC2=C(C3C=C(CCC3C(O2)(C)C)C)C(=C1)O"

# Run the single-molecule prediction helper
predicted_class, predicted_probability = predict_single_smiles(smiles_string, model, featurizer)

# Report the predicted label and the class-1 probability
print(f'Predicted Class: {predicted_class}')
print(f'Predicted Probability for Class 1: {predicted_probability}')

Predicted Class: 1
Predicted Probability for Class 1: 0.96
