In [5]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.5-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading rdkit-2025.9.5-cp312-cp312-manylinux_2_28_x86_64.whl (36.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.7/36.7 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.5


In [1]:
# Core
import pandas as pd
import numpy as np
import joblib

# Model
from sklearn.ensemble import RandomForestRegressor

# Reproducibility: single seed for the notebook, passed as `random_state`
# to the RandomForestRegressor when the final model is built.
RANDOM_STATE = 42

In [3]:
# Load cleaned dataset
df = pd.read_csv("drd2_cleaned_dataset.csv")

# Fail fast if the columns used for featurisation and as the regression
# target are absent, instead of hitting a KeyError several cells later.
required_columns = {"canonical_smiles", "pIC50"}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"Dataset is missing required columns: {sorted(missing_columns)}"

print("Dataset shape:", df.shape)

Dataset shape: (890, 5)


In [7]:
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit import DataStructs

# Morgan fingerprint settings: radius 2 (ECFP4 equivalent) and the length
# of the fingerprint bit vector.
radius = 2
n_bits = 990

# One generator instance, shared by every call to smiles_to_morgan.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)

def smiles_to_morgan(smiles):
    """Convert a SMILES string into a Morgan fingerprint array.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule.

    Returns
    -------
    numpy.ndarray or None
        1-D integer array of length ``n_bits`` populated from the
        fingerprint produced by ``morgan_generator``; ``None`` when
        ``smiles`` is not a non-blank string or RDKit cannot parse it.
    """
    # Missing values typically reach this function as float NaN or None;
    # report them as invalid instead of handing a non-string to RDKit.
    # Blank strings carry no structure, so they are rejected as well.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fp = morgan_generator.GetFingerprint(mol)
    # Pre-allocated buffer that RDKit fills in place with the bit values.
    arr = np.zeros((n_bits,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)

    return arr

# Generate fingerprints
fingerprints = [smiles_to_morgan(smi) for smi in df["canonical_smiles"]]

# smiles_to_morgan returns None for SMILES it cannot convert. Those rows are
# dropped from both X and y so the feature matrix stays a rectangular numeric
# array and every fingerprint remains paired with its own target value.
valid_mask = np.array([fp is not None for fp in fingerprints], dtype=bool)
n_invalid = int((~valid_mask).sum())
if n_invalid:
    print(f"Warning: dropped {n_invalid} row(s) with unparseable SMILES")

X = np.array([fp for fp in fingerprints if fp is not None])
y = df["pIC50"].values[valid_mask]

assert X.shape[0] == y.shape[0], "X and y must contain the same number of samples"

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (890, 990)
y shape: (890,)


In [8]:
# Fit the final model using every row of X / y (no hold-out split in this cell)

rf_final = RandomForestRegressor(
    n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1
).fit(X, y)  # fit() returns the estimator itself

print("Final Random Forest trained on full dataset.")

Final Random Forest trained on full dataset.


In [9]:
# Persist the fitted random forest to disk with joblib
joblib.dump(rf_final, filename="rf_morgan_model.pkl")

print("Model saved as rf_morgan_model.pkl")

Model saved as rf_morgan_model.pkl


In [10]:
# Persist the fingerprint and model settings used for this training run

n_training_samples = int(X.shape[0])

model_config = {
    "model_type": "RandomForest",
    "fingerprint_type": "Morgan",
    "radius": radius,
    "n_bits": n_bits,
    "target": "pIC50",
    "training_samples": n_training_samples,
}

joblib.dump(model_config, filename="model_config.pkl")

print("Model configuration saved as model_config.pkl")
print(model_config)

Model configuration saved as model_config.pkl
{'model_type': 'RandomForest', 'fingerprint_type': 'Morgan', 'radius': 2, 'n_bits': 990, 'target': 'pIC50', 'training_samples': 890}


In [11]:
# Store the training fingerprint matrix for similarity-based confidence

joblib.dump(X, filename="training_fingerprints.pkl")

print("Training fingerprints saved as training_fingerprints.pkl")

Training fingerprints saved as training_fingerprints.pkl
