In [1]:
!pip3 install PyTDC rdkit tensorflow mordred pandas scikit-learn numpy matplotlib



Collecting PyTDC
  Using cached pytdc-1.1.14.tar.gz (151 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting rdkit
  Using cached rdkit-2024.9.5-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.0 kB)
Collecting tensorflow
  Using cached tensorflow-2.18.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting mordred
  Using cached mordred-1.2.0.tar.gz (128 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pandas
  Using cached pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting numpy
  Using cached numpy-2.2.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting matplotlib
  Us

In [3]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tdc.single_pred import ADME




ModuleNotFoundError: No module named 'numpy'

Convert the SMILES strings to Morgan fingerprints so that regression models can be built on them.

In [None]:
def mol_to_fp(smiles, radius=2, nBits=1024):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    radius : int, default 2
        Radius passed to the Morgan fingerprint generator.
    nBits : int, default 1024
        Length of the returned bit vector.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``nBits`` holding the fingerprint bits.
        If RDKit cannot parse ``smiles`` an all-zero vector is returned instead
        of raising, so callers that care should validate their SMILES first.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # fallback if parsing fails
        return np.zeros(nBits, dtype=int)

    # AllChem.GetMorganFingerprintAsBitVect is deprecated in recent RDKit releases and
    # logs a deprecation warning on every call; the fingerprint-generator API is its
    # replacement. Imported here so the function does not depend on another cell's imports.
    from rdkit.Chem import rdFingerprintGenerator

    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
    fp = generator.GetFingerprint(mol)

    # ConvertToNumpyArray fills the pre-allocated array in place.
    arr = np.zeros((nBits,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

In [None]:
from tdc.single_pred import ADME
# Load the AstraZeneca lipophilicity dataset through TDC's ADME loader.
data_lipophilicity = ADME(name='Lipophilicity_AstraZeneca').get_data()

# The featurisation cell reads the 'Drug' and 'Y' columns, so fail here with a clear
# message if the loader ever returns a different schema.
missing_columns = {'Drug', 'Y'} - set(data_lipophilicity.columns)
assert not missing_columns, f"Unexpected dataset schema, missing columns: {sorted(missing_columns)}"

# Leave the frame as the last expression so the notebook shows its rich table
# rendering instead of the plain-text output of print().
data_lipophilicity.head()

In [None]:
import numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem


# Feature matrix: one fingerprint row per entry of the 'Drug' column.
X = np.array(list(map(mol_to_fp, data_lipophilicity['Drug'])))
# Regression target: lipophilicity (logD).
y = data_lipophilicity['Y'].values

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Can add more models if you want but I've already found this to be the best (and pretty fast)

# Candidate estimators, each with the hyper-parameter grid to search.
models = {
    "XGBRegressor": {
        "model": XGBRegressor(random_state=33),
        "param_grid": {
            'n_estimators': [100, 300],
            'max_depth': [3, 6, 10],
            'learning_rate': [0.01, 0.1],
        },
    },
}

best_models = {}  # model name -> best estimator found by the grid search
best_scores = {}  # model name -> test-set MSE (reported only, never used for selection)
best_params = {}  # model name -> best hyper-parameters found by the grid search
cv_scores = {}    # model name -> cross-validated MSE of the best hyper-parameters

for model_name, mp in models.items():
    print(f"=== Grid Search for {model_name} ===")
    grid_search = GridSearchCV(
        estimator=mp["model"],
        param_grid=mp["param_grid"],
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    best_estimator = grid_search.best_estimator_
    # The scorer is the *negative* MSE, so flip the sign to get an ordinary MSE.
    cv_mse = -grid_search.best_score_

    # Evaluate on the test set
    y_pred_test = best_estimator.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(test_mse)

    print("Best Parameters:", grid_search.best_params_)
    print("CV MSE:", cv_mse)
    print("Test RMSE:", test_rmse)
    print("")

    best_models[model_name] = best_estimator
    best_scores[model_name] = test_mse
    best_params[model_name] = grid_search.best_params_
    cv_scores[model_name] = cv_mse

# Choose the winner on cross-validated error only. Selecting on test-set error would
# leak the held-out data into model selection and make the reported test metrics
# optimistically biased as soon as more than one candidate is compared.
best_model_name = min(cv_scores, key=cv_scores.get)
best_model = best_models[best_model_name]

print(f"Best overall model: {best_model_name}")
print(f"Parameters: {best_params[best_model_name]}")
print(f"CV MSE: {cv_scores[best_model_name]}")
print(f"Test MSE: {best_scores[best_model_name]}")



In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the selected model on the held-out test split.
y_pred = best_model.predict(X_test)

# Compute the three regression metrics in one pass, in the order they are reported.
mse_test, mae_test, r2_test = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("Test MSE:", mse_test)
print("Test MAE:", mae_test)
print("Test R^2:", r2_test)


In [None]:
# SMILES string of the molecule to score with the selected model.
test_smiles = "COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)CSCC(=O)O"

# mol_to_fp returns an all-zero vector when RDKit cannot parse the SMILES, which would
# produce a meaningless prediction without any error, so validate the string first.
if Chem.MolFromSmiles(test_smiles) is None:
    raise ValueError(f"Could not parse SMILES: {test_smiles!r}")

# Keep the fingerprint under its own name instead of overwriting the SMILES string,
# so `test_smiles` always refers to the same kind of object.
test_fp = mol_to_fp(test_smiles)

# Wrapped in a list so the model receives a single-row batch.
best_model.predict([test_fp])