## Notebook 7: XGBoost Model Evaluation

Train and evaluate an XGBoost classifier on our molecular fingerprint data.


### Setup

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

print("Libraries imported successfully.")

Libraries imported successfully.


### Load the Processed Data

In [3]:
try:
    df = pd.read_csv('data/processed/dili_data_clean.csv')
    print("Processed data loaded successfully.")
    print(f"Shape of the dataset: {df.shape}")
except FileNotFoundError:
    print("Error: dili_data_clean.csv not found.")
    print("Please make sure you have uploaded the file to your Colab session.")

Processed data loaded successfully.
Shape of the dataset: (907, 5)


### Prepare Data for Modeling

In [4]:
# Drop rows with missing fingerprints
df.dropna(subset=['fingerprint'], inplace=True)

# Safely convert the string representation of the list back into a list of integers
df['fingerprint'] = df['fingerprint'].apply(ast.literal_eval)

# Separate features (X) and target (y)
X = np.array(df['fingerprint'].tolist())
y = df['dili_concern'].values

# Create a train/test split for final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


print(f"Data prepared. Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

Data prepared. Training set shape: (725, 1024), Test set shape: (182, 1024)


### Fine-Tune the XGBoost Model with Optuna

Use Optuna to perform a more intelligent search for the best hyperparameters.

In [5]:
# Define the objective function for Optuna to optimize
def objective(trial):
    # Define the search space for hyperparameters
    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }

    # Add class weight balancing
    neg_count = np.sum(y_train == 0)
    pos_count = np.sum(y_train == 1)
    param['scale_pos_weight'] = neg_count / pos_count if pos_count > 0 else 1

    model = xgb.XGBClassifier(**param)

    # Perform cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)

    return np.mean(scores)

# Create an Optuna study and run the optimization
# We want to maximize the ROC AUC score
study = optuna.create_study(direction='maximize')
print("Starting Optuna hyperparameter search...")
study.optimize(objective, n_trials=100) # Run for 100 trials

print("\nBest trial found:")
print(f"  Value: {study.best_value:.4f}")
print("  Params: ")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

[I 2025-08-23 02:49:16,325] A new study created in memory with name: no-name-6f819323-d08e-4559-8f30-5515d0774c40


Starting Optuna hyperparameter search...


[I 2025-08-23 02:49:20,377] Trial 0 finished with value: 0.697274587048172 and parameters: {'n_estimators': 84, 'max_depth': 10, 'learning_rate': 0.17518259762106722, 'subsample': 0.7053992715677528, 'colsample_bytree': 0.8914399719020145, 'gamma': 0.43348269398175515, 'min_child_weight': 1}. Best is trial 0 with value: 0.697274587048172.
[I 2025-08-23 02:49:22,374] Trial 1 finished with value: 0.6996357730319994 and parameters: {'n_estimators': 251, 'max_depth': 5, 'learning_rate': 0.014213700572033144, 'subsample': 0.8469223682218545, 'colsample_bytree': 0.7488629462048528, 'gamma': 0.2734402036000213, 'min_child_weight': 2}. Best is trial 1 with value: 0.6996357730319994.
[I 2025-08-23 02:49:25,157] Trial 2 finished with value: 0.6689788513373419 and parameters: {'n_estimators': 428, 'max_depth': 4, 'learning_rate': 0.078105082870901, 'subsample': 0.8586445962398741, 'colsample_bytree': 0.6561025234163058, 'gamma': 0.2756306849133796, 'min_child_weight': 5}. Best is trial 1 with val


Best trial found:
  Value: 0.7260
  Params: 
    n_estimators: 219
    max_depth: 4
    learning_rate: 0.03577298730002174
    subsample: 0.7096467716188001
    colsample_bytree: 0.6006461216221706
    gamma: 0.04075701312346882
    min_child_weight: 1


### Evaluate the Best Model

In [6]:
# Get the best parameters
best_params = study.best_params

# Add the static parameters back in
neg_count = np.sum(y_train == 0)
pos_count = np.sum(y_train == 1)
best_params['objective'] = 'binary:logistic'
best_params['eval_metric'] = 'logloss'
best_params['use_label_encoder'] = False
best_params['random_state'] = 42
best_params['scale_pos_weight'] = neg_count / pos_count if pos_count > 0 else 1

# Train the final model with the best parameters
best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate metrics
tuned_accuracy = accuracy_score(y_test, y_pred)
tuned_roc_auc = roc_auc_score(y_test, y_pred_proba)

print("\n--- Tuned XGBoost Model Performance ---")
print(f"Accuracy on Test Set: {tuned_accuracy:.3f}")
print(f"ROC AUC on Test Set:  {tuned_roc_auc:.3f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Tuned XGBoost Model Performance ---
Accuracy on Test Set: 0.753
ROC AUC on Test Set:  0.757


### Tuning Results Visualization

In [8]:
from optuna.visualization import plot_optimization_history, plot_param_importances

# Plot optimization history
fig1 = plot_optimization_history(study)
fig1.show()

# Plot hyperparameter importances
fig2 = plot_param_importances(study)
fig2.show()

### Final Comparison

In [9]:
print("\n--- Comparison ---")
print("Metric         | RandomForest (Baseline) | Tuned XGBoost Model")
print("----------------|-------------------------|---------------------")
rf_roc_auc = 0.761 #
print(f"ROC AUC       | {rf_roc_auc:.3f}                   | {tuned_roc_auc:.3f}")


--- Comparison ---
Metric         | RandomForest (Baseline) | Tuned XGBoost Model
----------------|-------------------------|---------------------
ROC AUC       | 0.761                   | 0.757
