In [6]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from rdkit import Chem
from rdkit.Chem import AllChem
from tdc.single_pred.adme import ADME
from tdc import Evaluator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split

In [8]:
class Featurizer:
    """Abstract base for callables that featurize a DataFrame.

    Stores the names of the label column and the SMILES column; subclasses
    implement ``__call__`` to do the actual featurization.

    Parameters
    ----------
    y_column : label column name.
    smiles_col : SMILES column name (defaults to ``'X'``).
    **kwargs : any extra keyword arguments are stored as instance attributes.
    """

    def __init__(self, y_column, smiles_col='X', **kwargs):
        # Named parameters first, then the free-form extras, all written
        # straight into the instance dictionary.
        vars(self).update({'y_column': y_column, 'smiles_col': smiles_col, **kwargs})

    def __call__(self, df):
        """Featurize ``df``; must be overridden by subclasses."""
        raise NotImplementedError()

class ECFPFeaturizer(Featurizer):
    """Featurizer that turns SMILES strings into Morgan bit-vector fingerprints.

    Parameters
    ----------
    y_column : name of the label column read from the input frame.
    radius : Morgan fingerprint radius passed to RDKit (default 2).
    length : number of bits in each fingerprint (default 1024).
    **kwargs : forwarded to ``Featurizer`` (e.g. ``smiles_col``).
    """

    def __init__(self, y_column, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)

    def __call__(self, df):
        """Compute one fingerprint and collect one label per row of ``df``.

        Returns
        -------
        (fingerprints, labels) : two NumPy arrays built from the per-row
            fingerprints and the values of ``self.y_column``, in row order.

        Raises
        ------
        KeyError : if the SMILES or label column is missing from ``df``.
        ValueError : if a SMILES entry is not a string or cannot be parsed.
        """
        fingerprints = []
        labels = []
        # Iterate the two needed columns directly instead of materialising a
        # Series per row with iterrows().
        for idx, smiles, y in zip(df.index, df[self.smiles_col], df[self.y_column]):
            # MolFromSmiles signals a parse failure by returning None; without
            # this guard the fingerprint call fails with an opaque error.
            mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
            if mol is None:
                raise ValueError(
                    f"Could not parse SMILES {smiles!r} at row {idx!r}"
                )
            # NOTE(review): newer RDKit releases appear to flag this call as
            # deprecated in favour of rdFingerprintGenerator — confirm before
            # upgrading RDKit.
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.length)
            fingerprints.append(fp)
            labels.append(y)
        fingerprints = np.array(fingerprints)
        labels = np.array(labels)
        return fingerprints, labels


In [10]:
# Read the raw dataset from disk.
data = pd.read_csv('half_life_obach.csv')

# Separate the regression target ('Y') from the remaining columns.
y = data['Y']
X = data.drop(columns=['Y'])

# 70 / 15 / 15 split: hold out 30 % first, then cut that hold-out in half
# to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# The featurizer reads labels from the frame it is given, so put 'Y' back
# onto each split.
for split_frame, split_target in (
    (X_train, y_train),
    (X_valid, y_valid),
    (X_test, y_test),
):
    split_frame['Y'] = split_target.values

# Fingerprint featurizer: labels come from 'Y', SMILES strings from 'X'.
featurizer = ECFPFeaturizer(y_column='Y', smiles_col='X')

# Turn every split into (fingerprints, labels).
X_train_featurized, y_train_featurized = featurizer(X_train)
X_valid_featurized, y_valid_featurized = featurizer(X_valid)
X_test_featurized, y_test_featurized = featurizer(X_test)

In [11]:
def train(X_train, y_train, X_valid, y_valid):
    """Fit a random-forest regressor and report validation metrics.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_valid, y_valid : validation features and targets used only for the
        printed RMSE / MAE / R2 report.

    Returns
    -------
    The fitted ``RandomForestRegressor`` (100 trees, ``random_state=42``).
    """
    # RandomForestRegressor for regression task
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model on the validation set
    valid_predictions = model.predict(X_valid)
    # RMSE is taken as sqrt(MSE): the `squared=False` keyword was deprecated
    # and later removed from scikit-learn's mean_squared_error, so relying on
    # it raises TypeError on current releases.
    valid_rmse = np.sqrt(mean_squared_error(y_valid, valid_predictions))
    valid_mae = mean_absolute_error(y_valid, valid_predictions)
    valid_r2 = r2_score(y_valid, valid_predictions)

    print(f'Validation RMSE: {valid_rmse}, MAE: {valid_mae}, R2: {valid_r2}')
    return model


In [12]:
def predict(model, X_test):
    """Return ``model.predict(X_test)`` unchanged.

    ``model`` only needs to expose a ``predict`` method; ``X_test`` is handed
    to it as-is.
    """
    return model.predict(X_test)


In [13]:
# Fit on the training split; train() also prints the validation metrics.
model = train(X_train_featurized, y_train_featurized, X_valid_featurized, y_valid_featurized)

# Make predictions on the held-out test split.
predictions = predict(model, X_test_featurized)

# Evaluate predictions. RMSE is computed as sqrt(MSE) because the
# `squared=False` keyword was deprecated and later removed from
# scikit-learn's mean_squared_error.
rmse = np.sqrt(mean_squared_error(y_test_featurized, predictions))
mae = mean_absolute_error(y_test_featurized, predictions)
r2 = r2_score(y_test_featurized, predictions)

print(f'RMSE: {rmse}, MAE: {mae}, R2: {r2}')


Validation RMSE: 43.33834027400233, MAE: 16.1747538, R2: -5.822653712019516
RMSE: 119.55044038666611, MAE: 36.252982508250824, R2: 0.24216582201107717


In [14]:
# Persist the fitted model to disk so later cells can reload it for inference.
with open('Excretion_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


In [15]:
class Pre_Featurizer:
    """Abstract base for prediction-time featurizers.

    Stores the (optional) label column name and the SMILES column name;
    subclasses implement ``__call__``.

    Parameters
    ----------
    y_column : label column name (may be ``None``).
    smiles_col : SMILES column name (defaults to ``'Drug'``).
    **kwargs : any extra keyword arguments are stored as instance attributes.
    """

    def __init__(self, y_column, smiles_col='Drug', **kwargs):
        # Named parameters first, then the free-form extras, all written
        # straight into the instance dictionary.
        vars(self).update({'y_column': y_column, 'smiles_col': smiles_col, **kwargs})

    def __call__(self, df):
        """Featurize ``df``; must be overridden by subclasses."""
        raise NotImplementedError()

class Pre_ECFPFeaturizer(Pre_Featurizer):
    """Morgan bit-vector featurizer whose label column is optional.

    Parameters
    ----------
    y_column : label column name, or ``None`` when only features are needed.
    radius : Morgan fingerprint radius passed to RDKit (default 2).
    length : number of bits in each fingerprint (default 1024).
    **kwargs : forwarded to ``Pre_Featurizer`` (e.g. ``smiles_col``).
    """

    def __init__(self, y_column=None, radius=2, length=1024, **kwargs):
        self.radius = radius
        self.length = length
        super().__init__(y_column, **kwargs)

    def __call__(self, df):
        """Compute one fingerprint per row of ``df``.

        Returns
        -------
        (fingerprints, labels) : ``fingerprints`` is a NumPy array built from
            the per-row fingerprints. ``labels`` is a NumPy array of the
            ``y_column`` values when that column is configured, present in
            ``df`` and non-empty; otherwise ``None``.

        Raises
        ------
        KeyError : if the SMILES column is missing from ``df``.
        ValueError : if a SMILES entry is not a string or cannot be parsed.
        """
        # Whether labels are available does not depend on the row, so decide
        # once instead of re-checking inside the loop.
        has_labels = bool(self.y_column) and self.y_column in df.columns

        fingerprints = []
        for idx, smiles in zip(df.index, df[self.smiles_col]):
            # MolFromSmiles signals a parse failure by returning None; without
            # this guard the fingerprint call fails with an opaque error.
            mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
            if mol is None:
                raise ValueError(
                    f"Could not parse SMILES {smiles!r} at row {idx!r}"
                )
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.length)
            fingerprints.append(fp)

        labels = list(df[self.y_column]) if has_labels else []

        fingerprints = np.array(fingerprints)
        labels = np.array(labels) if labels else None
        return fingerprints, labels

In [17]:
# Reload the model that was pickled earlier in this notebook.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('Excretion_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

def predict_single_smiles(smiles, model, featurizer):
    """Predict the model output for one SMILES string.

    Parameters
    ----------
    smiles : the SMILES string to score.
    model : object exposing ``predict``; its first returned element is used.
    featurizer : callable taking a DataFrame and returning a
        ``(features, labels)`` pair; its ``smiles_col`` attribute names the
        column the SMILES string is placed in.

    Returns
    -------
    The first (and only) prediction produced by ``model.predict``.
    """
    # The featurizer works on DataFrames, so wrap the string in a one-row frame.
    single_row = pd.DataFrame([[smiles]], columns=[featurizer.smiles_col])

    # Labels are irrelevant at prediction time; keep only the features.
    features, _labels = featurizer(single_row)

    # Regression model: predict() yields one value per input row.
    return model.predict(features)[0]

# Prediction-time featurizer with its default radius / length (2 / 1024, the
# same defaults the training featurizer was built with); no label column needed.
featurizer = Pre_ECFPFeaturizer()

# Example molecule to score.
# NOTE(review): this SMILES is not ethanol (that would be "CCO"); it looks
# like a tetrahydrocannabinol-type structure — confirm the intended example.
smiles_string = "CCCCCC1=CC2=C(C3C=C(CCC3C(O2)(C)C)C)C(=C1)O"

# Run the single-molecule prediction and report it.
predicted_value = predict_single_smiles(
    smiles=smiles_string,
    model=model,
    featurizer=featurizer,
)

print(f'Predicted Half life Value: {predicted_value}')



Predicted Half life Value: 21.520199999999996
