In [1]:
# ssra_train.py

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from imblearn.over_sampling import SMOTE
import xgboost as xgb

# -----------------------------
# 1. Load dataset
# -----------------------------
# The CSV is read from the current working directory.
df = pd.read_csv("Realistic_Surgical_Risk_Data_30000.csv")

# "Risk" is the prediction target; every other column stays in X.
y = df["Risk"]
X = df.drop("Risk", axis=1)

# -----------------------------
# 2. Define feature groups
# -----------------------------
# Each group is routed to its own preprocessing pipeline further down.

# Continuous / ordinal measurements: imputed and standardised.
numeric_features = [
    "Surgery_Duration",
    "Age",
    "BMI",
    "Hb",
    "Creatinine",
    "ASA",
]

# Flag columns: only imputed, otherwise passed through unchanged.
binary_features = [
    "Smoking",
    "Cardiac_History",
    "Diabetes",
    "Hypertension",
    "Emergency",
]

# Nominal columns: imputed and one-hot encoded.
categorical_features = [
    "Sex",
    "Surgery_Type",
]

# -----------------------------
# 3. Build transformers
# -----------------------------
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Binary columns: fill gaps with the most frequent value; no scaling or encoding.
binary_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps transform() from failing on a
# category that was not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# -----------------------------
# 4. Column transformer
# -----------------------------
# Route each feature group to its pipeline. The output columns follow the
# order of the entries below: numeric, then binary, then one-hot categorical.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("bin", binary_transformer, binary_features),
    ("cat", categorical_transformer, categorical_features),
])

# -----------------------------
# 5. Train-test split
# -----------------------------
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# -----------------------------
# 6. Preprocess data
# -----------------------------
# Fit the preprocessor on the training partition only, then reuse the fitted
# statistics (imputation values, scaling parameters, one-hot categories) to
# transform the held-out partition, so nothing is learned from the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# -----------------------------
# 7. Apply SMOTE on training set
# -----------------------------
# Oversample only the training partition; the test partition keeps its
# original class balance so the evaluation below reflects real proportions.
# NOTE(review): SMOTE runs on the already-encoded matrix, so synthetic rows
# may carry interpolated (non 0/1) values in the binary and one-hot columns.
# Confirm this is acceptable for the downstream model.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_processed, y_train)

# Report per-class row counts before and after resampling.
for caption, labels in (
    ("Class distribution before SMOTE:", y_train),
    ("Class distribution after SMOTE:", y_train_res),
):
    print(caption, np.bincount(labels))

# -----------------------------
# 8. Train XGBoost model
# -----------------------------
# Multi-class gradient-boosted trees that output per-class probabilities.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and emits a "Parameters: { "use_label_encoder" } are not used."
# warning on every fit.
xgb_clf = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=3,                 # 3 classes: Low, Moderate, High
    eval_metric="mlogloss",
    random_state=42,             # fixed seed for reproducible row/column sampling
    learning_rate=0.05,
    n_estimators=300,
    max_depth=5,
    subsample=0.8,               # fraction of rows sampled per tree
    colsample_bytree=0.8,        # fraction of columns sampled per tree
)

# Fit on the SMOTE-balanced training data produced in step 7.
xgb_clf.fit(X_train_res, y_train_res)

# -----------------------------
# 9. Evaluation
# -----------------------------
# Score the held-out (non-resampled) test partition: class probabilities feed
# the ROC-AUC, hard labels feed the confusion matrix and per-class report.
y_proba = xgb_clf.predict_proba(X_test_processed)
y_pred = xgb_clf.predict(X_test_processed)

print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_test, y_pred),
)
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
)

# One-vs-rest ROC-AUC computed from the predicted class probabilities.
roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr")
print("\nROC-AUC Score:", roc_auc)

# -----------------------------
# 10. Save model + preprocessor
# -----------------------------
# Persist the fitted preprocessor and the trained classifier as separate
# files in the current working directory; both are needed at inference time
# because the model was trained on the preprocessor's output.
for artifact, filename in (
    (preprocessor, "ssra_preprocessor.pkl"),
    (xgb_clf, "ssra_xgb_model.pkl"),
):
    joblib.dump(artifact, filename)

print("\n✅ Training complete. Preprocessor and model saved!")

Class distribution before SMOTE: [13806  9904   290]
Class distribution after SMOTE: [13806 13806 13806]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Confusion Matrix:
 [[3450    1    0]
 [  12 2460    4]
 [   0   17   56]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3451
           1       0.99      0.99      0.99      2476
           2       0.93      0.77      0.84        73

    accuracy                           0.99      6000
   macro avg       0.97      0.92      0.94      6000
weighted avg       0.99      0.99      0.99      6000


ROC-AUC Score: 0.9991436719881505

✅ Training complete. Preprocessor and model saved!
