In [11]:
!pip install xgboost
!pip install shap

Collecting shap
  Downloading shap-0.46.0-cp312-cp312-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp312-cp312-win_amd64.whl (456 kB)
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8


In [37]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

# Load and preprocess data
breast_cancer = fetch_ucirepo(id=15)
X = breast_cancer.data.features
y = breast_cancer.data.targets['Class'].replace({2:0, 4:1})

# Handle missing values properly
X = X.replace('?', np.nan).apply(pd.to_numeric)
X = X.fillna(X.mean())

# Check class distribution
class_ratio = y.value_counts(normalize=True)
print(f"Class distribution:\n{class_ratio}\n")

# Split data with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.25, 
    random_state=42, 
    stratify=y
)

# Configure XGBoost with class weighting
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.7,
    gamma=0.1,
    reg_alpha=0.5,
    reg_lambda=1.0,
    scale_pos_weight=class_ratio[0]/class_ratio[1],
    eval_metric=['logloss', 'auc', 'error'],
    early_stopping_rounds=50,
    random_state=42
)

# Train with validation set
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# Get best iteration
best_iter = xgb_model.best_iteration

# SHAP explainer
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test)

# Predictions with probability threshold adjustment
y_proba = xgb_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.3).astype(int)  # Adjusted threshold for better recall

# Calculate metrics with zero_division parameter
metrics = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred, zero_division=0),
    'Recall': recall_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred),
    'AUC-ROC': roc_auc_score(y_test, y_proba),
    'Best Iteration': best_iter
}

# Display results
print(f"\nXGBoost (Stopped at {metrics['Best Iteration']} iterations) Performance:")
print(f"- Accuracy: {metrics['Accuracy']:.4f}")
print(f"- Precision: {metrics['Precision']:.3f} | Recall: {metrics['Recall']:.3f}")
print(f"- F1-Score: {metrics['F1-Score']:.3f} | AUC-ROC: {metrics['AUC-ROC']:.3f}")

# Confusion matrix
conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=['Actual Benign (TN/FP)', 'Actual Malignant (FN/TP)'],
    columns=['Predicted Benign', 'Predicted Malignant']
)
print("\nConfusion Matrix:")
print(conf_matrix)

Class distribution:
Class
0    0.655222
1    0.344778
Name: proportion, dtype: float64


XGBoost (Stopped at 18 iterations) Performance:
- Accuracy: 0.9200
- Precision: 0.811 | Recall: 1.000
- F1-Score: 0.896 | AUC-ROC: 0.987

Confusion Matrix:
                          Predicted Benign  Predicted Malignant
Actual Benign (TN/FP)                  101                   14
Actual Malignant (FN/TP)                 0                   60
