## Notebook 11: XGBoost Model - InterDILI Dataset

Training an XGBoost model on the large-scale HepaTox dataset. This represents our best and final attempt to surpass the 81% performance target.

### Setup

In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
import ast
import os
import joblib

from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem

print("Libraries imported successfully.")

Libraries imported successfully.


### Load the InterDILI Dataset

In [15]:
# Load the raw dataset. Every later cell needs `df`, so a missing file is
# treated as fatal: the hint is printed and the error is re-raised rather than
# letting the cell continue with an undefined (or stale, from an earlier run) `df`.
try:
    df = pd.read_csv('data/processed/Total_dataset.csv')
except FileNotFoundError:
    print("Error: Total_dataset.csv not found.")
    print("Please make sure you have uploaded the file to your Colab session.")
    raise
print("HepaTox dataset loaded successfully.")
print(f"Shape of the dataset: {df.shape}")

# Initial Data Cleaning
# Reassign instead of mutating in place so re-running the cell always starts
# from the freshly loaded frame.
df = df.rename(columns={'toxicity': 'dili_concern'})
# Drop rows with missing SMILES strings
df = df.dropna(subset=['smiles'])
print(f"Shape after dropping NaNs: {df.shape}")

HepaTox dataset loaded successfully.
Shape of the dataset: (1850, 3)
Shape after dropping NaNs: (1850, 3)


### Engineer a Comprehensive Feature Set
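Generate both the Morgan fingerprints and the 28 physicochemical descriptors for this new, larger dataset. Note that the six `Lipinski*` descriptor columns are computed with the same RDKit calls as their un-prefixed counterparts, so the 28 columns contain 22 distinct descriptors.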

In [16]:
# Generate Morgan Fingerprints
def generate_fingerprint(smiles, radius=2, n_bits=1024):
    """Compute a Morgan fingerprint bit vector for one SMILES string.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius passed to RDKit (default 2, the value this
            notebook has always used).
        n_bits: Length of the bit vector (default 1024, the value this
            notebook has always used).

    Returns:
        The fingerprint converted to a Python list, or None when `smiles` is
        not a string (e.g. NaN from a missing cell) or cannot be parsed.
    """
    # Non-string values would raise inside RDKit; report them the same way as
    # an unparsable SMILES so callers can drop the row.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits))

# Fingerprint every molecule, with a progress bar registered on pandas.
print("Generating Morgan Fingerprints...")
tqdm.pandas(desc="Fingerprinting")
smiles_column = df['smiles']
df['fingerprint'] = smiles_column.progress_apply(generate_fingerprint)

# Generate Physicochemical Descriptors
# Descriptor functions in output order. The order must stay in sync with the
# column names assigned to the result. The last group repeats six functions
# that already appear earlier (NOCount, NumAliphaticRings, NumAromaticRings,
# NumHAcceptors, NumHDonors, NumRotatableBonds); they are kept so the output
# keeps its established length and column order.
_DESCRIPTOR_FUNCS = (
    Descriptors.MolWt, Descriptors.MolLogP, Descriptors.TPSA,
    Descriptors.NumHAcceptors, Descriptors.NumHDonors, Descriptors.NumRotatableBonds,
    Descriptors.NOCount, Descriptors.NumAliphaticCarbocycles,
    Descriptors.NumAliphaticHeterocycles, Descriptors.NumAliphaticRings,
    Descriptors.NumAromaticCarbocycles, Descriptors.NumAromaticHeterocycles,
    Descriptors.NumAromaticRings, Descriptors.NumSaturatedCarbocycles,
    Descriptors.NumSaturatedHeterocycles, Descriptors.NumSaturatedRings,
    Descriptors.RingCount, Descriptors.MolMR, Descriptors.FractionCSP3,
    Descriptors.HeavyAtomCount, Descriptors.NHOHCount, Descriptors.NOCount,
    Descriptors.NumAliphaticRings, Descriptors.NumAromaticRings,
    Descriptors.NumHAcceptors, Descriptors.NumHDonors,
    Descriptors.NumHeteroatoms, Descriptors.NumRotatableBonds,
)


def calculate_descriptors(smiles):
    """Compute the notebook's physicochemical descriptor vector for one molecule.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A list with one value per entry of `_DESCRIPTOR_FUNCS` (28 values), in
        that order. When `smiles` is not a string or cannot be parsed, a list
        of NaN of the same length is returned so the row can be dropped later.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Length is derived from the function table so it can never drift
        # from the number of descriptors actually computed.
        return [np.nan] * len(_DESCRIPTOR_FUNCS)

    return [descriptor(mol) for descriptor in _DESCRIPTOR_FUNCS]

# Column names for the 28 values returned by calculate_descriptors, listed in
# the same order as that function's return list.
# NOTE: the six 'Lipinski*' columns are filled by the same Descriptors.* calls
# as 'NOCount', 'NumAliphaticRings', 'NumAromaticRings', 'NumHAcceptors',
# 'NumHDonors' and 'NumRotatableBonds', so they hold duplicate values.
descriptor_names = [
    'MolWt', 'MolLogP', 'TPSA', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds',
    'NOCount', 'NumAliphaticCarbocycles', 'NumAliphaticHeterocycles', 'NumAliphaticRings',
    'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings',
    'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount',
    'MolMR', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'LipinskiNOCount',
    'LipinskiNumAliphaticRings', 'LipinskiNumAromaticRings', 'LipinskiNumHAcceptors',
    'LipinskiNumHDonors', 'NumHeteroatoms', 'LipinskiNumRotatableBonds'
]

print("\nCalculating 28 physicochemical descriptors...")
# Collect plain lists and build the frame once; creating a pd.Series per row
# inside apply() is much slower and yields the same table.
descriptor_rows = [
    calculate_descriptors(smiles)
    for smiles in tqdm(df['smiles'], desc="Calculating Descriptors")
]
# dtype=float keeps every descriptor column float64, as the per-row Series did.
descriptor_features = pd.DataFrame(descriptor_rows, columns=descriptor_names, dtype=float)

# Remove descriptor columns left over from a previous run of this cell so that
# re-running it does not append a second, duplicate set of columns.
df = df.drop(columns=descriptor_names, errors='ignore')
df = pd.concat([df.reset_index(drop=True), descriptor_features.reset_index(drop=True)], axis=1)

# Final cleanup
# Only drop rows that are unusable for modelling (failed fingerprint or
# descriptors, or missing label); NaNs in unrelated columns are not a reason
# to lose a training example.
required_columns = ['fingerprint', 'dili_concern'] + descriptor_names
df = df.dropna(subset=required_columns)
print(f"\nFeature generation complete. Final dataset shape: {df.shape}")

Generating Morgan Fingerprints...


Fingerprinting:   0%|          | 0/1850 [00:00<?, ?it/s]




Calculating 28 physicochemical descriptors...




Calculating Descriptors:   0%|          | 0/1850 [00:00<?, ?it/s]


Feature generation complete. Final dataset shape: (1850, 32)


### Final Hybrid Feature Set

In [17]:
# Stack the per-molecule fingerprint lists into one 2-D array (one row each)
# and pull the descriptor columns out as a matrix in descriptor_names order.
fingerprints = np.asarray(list(df['fingerprint']))
descriptors = df[descriptor_names].to_numpy()

# Standardise the descriptors only; the fingerprint columns are left as-is.
# Note: this scaler is fitted on every row of the dataset.
scaler = StandardScaler()
scaler.fit(descriptors)
descriptors_scaled = scaler.transform(descriptors)

# Final design matrix: fingerprint columns first, scaled descriptors after.
X_hybrid = np.hstack((fingerprints, descriptors_scaled))
y = df['dili_concern'].to_numpy()

print(f"Shape of final hybrid features: {X_hybrid.shape}")

Shape of final hybrid features: (1850, 1052)


### Train and Evaluate the Model

In [None]:
# Create a train/test split
# The split is drawn on row positions (same test_size, seed and stratification
# as before, so the same rows land in each partition). Splitting positions
# rather than the pre-scaled matrix lets the scaler be fitted on the training
# rows only, so no test-set statistics leak into the features or into the
# scaler that is saved for production.
row_positions = np.arange(len(y))
train_rows, test_rows, y_train, y_test = train_test_split(
    row_positions, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler().fit(descriptors[train_rows])
X_train = np.concatenate(
    [fingerprints[train_rows], scaler.transform(descriptors[train_rows])], axis=1
)
X_test = np.concatenate(
    [fingerprints[test_rows], scaler.transform(descriptors[test_rows])], axis=1
)

# Best XGBoost parameters
# 'use_label_encoder' is intentionally absent: XGBoost reports it as an unused
# parameter and only emits a warning when it is passed.
best_xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 42,
    'n_estimators': 349,
    'max_depth': 9,
    'learning_rate': 0.013094879048120515,
    'subsample': 0.7701118471150662,
    'colsample_bytree': 0.8656908580435939,
    'gamma': 0.03486754343407765,
    'min_child_weight': 1
}

# Class weight balancing: weight positives by the negative/positive ratio of
# the training labels (falls back to 1 if there are no positives).
neg_count = np.sum(y_train == 0)
pos_count = np.sum(y_train == 1)
best_xgb_params['scale_pos_weight'] = neg_count / pos_count if pos_count > 0 else 1

print("\nTraining the final XGBoost model on the HepaTox dataset...")
final_model = xgb.XGBClassifier(**best_xgb_params)
final_model.fit(X_train, y_train)
print("Final model training complete.")

# Make predictions
y_pred = final_model.predict(X_test)
y_pred_proba = final_model.predict_proba(X_test)[:, 1]

# Calculate final metrics (reported in the results cell)
final_accuracy = accuracy_score(y_test, y_pred)
final_roc_auc = roc_auc_score(y_test, y_pred_proba)

# Save the Model & Feature Scaler
MODEL_OUTPUT_DIR = 'models'
MODEL_PATH = os.path.join(MODEL_OUTPUT_DIR, 'XGBoost22k_model.pkl')
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

# Save the model and the scaler together in a dictionary so inference code
# can apply the exact descriptor scaling the model was trained with.
production_artifact = {
    'model': final_model,
    'scaler': scaler
}

joblib.dump(production_artifact, MODEL_PATH)
print(f"\nChampion model and scaler successfully saved to '{MODEL_PATH}'")


Training the final XGBoost model on the HepaTox dataset...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Final model training complete.

Champion model successfully saved to 'models/XGBoost22k_model.pkl'


### Results and Conclusion

In [19]:
# Report the held-out metrics computed in the training cell.
print("\n--- XGBoost Model Performance ---")
print(f"Accuracy on Test Set: {final_accuracy:.3f}")
print(f"ROC AUC on Test Set:  {final_roc_auc:.3f}")

# Comparison table. Every row is built from the same column widths so the
# separator and the data row line up with the header (previously the
# separator was one character wider and the data row one character narrower).
print("\n--- Comparison ---")
metric_width = 15    # width of the "Metric" column
previous_width = 26  # width of the "Previous Best (Ensemble)" column
current_header = " HepaTox-Trained Model"

# ROC AUC of the previous best (ensemble) model, hard-coded for comparison.
ensemble_roc_auc = 0.768
previous_cell = f" {ensemble_roc_auc:.3f}"
current_cell = f" {final_roc_auc:.3f}"

print(f"{'Metric':<{metric_width}}|{' Previous Best (Ensemble)':<{previous_width}}|{current_header}")
print(f"{'-' * metric_width}|{'-' * previous_width}|{'-' * (len(current_header) + 1)}")
print(f"{'ROC AUC':<{metric_width}}|{previous_cell:<{previous_width}}|{current_cell}")


--- XGBoost Model Performance ---
Accuracy on Test Set: 0.795
ROC AUC on Test Set:  0.849

--- Comparison ---
Metric         | Previous Best (Ensemble) | HepaTox-Trained Model
----------------|--------------------------|-----------------------
ROC AUC       | 0.768                     | 0.849
