In [1]:
!pip install shap
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np

import boto3
from pyathena import connect
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup, FeatureDefinition, FeatureTypeEnum
from sagemaker.session import Session
from sagemaker import get_execution_role

from pyathena import connect

import time
import shap
import json
import joblib

import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score, precision_score, recall_score

from itertools import combinations
import pickle
import warnings

import model_methods


# S3 and Athena details
bucket_name = "group3-project-bucket"
database_name = "group_project_db"
table_name = "hospital_readmissions"
s3_output = f"s3://{bucket_name}/athena-results/"
region = "us-east-1"
s3_client = boto3.client("s3", region_name=region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


## Find the latest feature store to generate a new model!

In [3]:
glue_client = boto3.client("glue")

# List databases in AWS Glue
response = glue_client.get_databases()
print("\nAvailable Databases in Glue:")
for db in response["DatabaseList"]:
    print(f"- {db['Name']}")

# List tables in the `sagemaker_featurestore` database (if it exists)
database_name = "sagemaker_featurestore"

try:
    response = glue_client.get_tables(DatabaseName=database_name)
    print(f"\nTables in `{database_name}` database:")
    for table in response["TableList"]:
        print(f"- {table['Name']}")
except glue_client.exceptions.EntityNotFoundException:
    print(f"\nDatabase `{database_name}` not found in Glue.")


Available Databases in Glue:
- default
- group_project_db
- sagemaker_featurestore

Tables in `sagemaker_featurestore` database:
- hospital_readmissions_features_1740354579


Query Athenta Tables for Data Splitting

In [4]:
# Query the feature store in Athena

latest_table = response["TableList"][-1]
query = f"""
SELECT * 
FROM "sagemaker_featurestore"."{latest_table["Name"]}"
"""

# Connect to Athena
connection = connect(
    s3_staging_dir=f"s3://{bucket_name}/athena-results/",
    region_name="us-east-1"
)

# Retrieve all feature data
df = pd.read_sql(query, connection)

  df = pd.read_sql(query, connection)


In [5]:
df = df.drop(columns=["event_time", "write_time", "api_invocation_time", "is_deleted"])
display(df.head())


print("Base Features: ", len(df.columns))
print("Data Samples: ", len(df))

categorical_columns, num_cats = model_methods.get_categorical_columns_from_s3(bucket_name, s3_client)
print("Categorical columns: ", categorical_columns)
print("Num classes per category: ", num_cats)

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,a1ctest,change,diabetes_med,readmitted
0,1,8,48,0,16,0,1,1,4,6,0,6,1,1,0,1,1
1,1,2,1,0,9,0,1,0,4,1,2,6,1,1,1,1,1
2,1,4,58,4,15,0,0,0,4,0,0,0,1,0,0,1,0
3,1,3,68,3,13,0,0,0,0,0,0,7,1,1,0,0,0
4,1,11,56,6,44,0,0,0,4,0,0,6,1,1,0,1,0


Base Features:  17
Data Samples:  25000
Identified categorical columns: ['glucose_test', 'A1Ctest', 'age', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'change', 'diabetes_med', 'readmitted']
Category counts per categorical column: {'glucose_test': 3, 'A1Ctest': 3, 'age': 6, 'medical_specialty': 7, 'diag_1': 8, 'diag_2': 8, 'diag_3': 8, 'change': 2, 'diabetes_med': 2, 'readmitted': 2}
Categorical columns:  ['glucose_test', 'A1Ctest', 'age', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'change', 'diabetes_med', 'readmitted']
Num classes per category:  {'glucose_test': 3, 'A1Ctest': 3, 'age': 6, 'medical_specialty': 7, 'diag_1': 8, 'diag_2': 8, 'diag_3': 8, 'change': 2, 'diabetes_med': 2, 'readmitted': 2}


Train an XGB model raw, use SHAP to visual feature importance.

In [6]:
# Split data into train (50%), test (10%), production (40%)
train_df, temp_df = train_test_split(df, test_size=0.5, random_state=42)
test_df, prod_df = train_test_split(temp_df, test_size=0.8, random_state=42)

# Separate features and target variable
target_column = "readmitted"
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]
X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

# Train XGBoost model on production dataset
dmatrix_train = xgb.DMatrix(X_train, label=y_train)
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "seed": 42
}
model = xgb.train(params, dmatrix_train, num_boost_round=100)

def eval_model(model, X_test, y_test):
    """
    Evaluates the XGBoost model and returns evaluation metrics.
    Ensures all relevant metrics are logged for the model card.
    """
    # Convert test data into DMatrix
    dmatrix_test = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
    
    # Make predictions
    y_pred_proba = model.predict(dmatrix_test)
    y_pred = (y_pred_proba >= 0.5).astype(int)  # Convert probabilities to binary predictions
    
    # Compute evaluation metrics
    test_log_loss = log_loss(y_test, y_pred_proba)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_auc = roc_auc_score(y_test, y_pred_proba)
    test_precision = precision_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)

    # Print evaluation metrics
    print(f"✅ Test Log Loss: {test_log_loss:.4f}")
    print(f"✅ Test Accuracy: {test_accuracy:.4f}")
    print(f"✅ Test AUC: {test_auc:.4f}")
    print(f"✅ Test Precision: {test_precision:.4f}")
    print(f"✅ Test Recall: {test_recall:.4f}")

    return test_accuracy, test_auc, test_precision, test_recall


eval_model(model, X_test, y_test)

✅ Test Log Loss: 0.6914
✅ Test Accuracy: 0.5904
✅ Test AUC: 0.6352
✅ Test Precision: 0.5868
✅ Test Recall: 0.5149


(0.5904, 0.6351637945174584, 0.5867924528301887, 0.5149006622516556)

Extract Important Features from XGBoost Model.. create interacting features from statistitically most important features.

In [7]:
X_train_final, X_test_final = model_methods.shap_feature_engineering(model, X_train, X_test, bucket_name, s3_client)
display(X_train_final.head())
display(X_test_final.head())

print(X_train_final.dtypes)

Identified categorical columns: ['glucose_test', 'A1Ctest', 'age', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'change', 'diabetes_med', 'readmitted']
Category counts per categorical column: {'glucose_test': 3, 'A1Ctest': 3, 'age': 6, 'medical_specialty': 7, 'diag_1': 8, 'diag_2': 8, 'diag_3': 8, 'change': 2, 'diabetes_med': 2, 'readmitted': 2}




Top Features: ['n_inpatient', 'n_lab_procedures', 'n_medications', 'n_outpatient', 'time_in_hospital', 'age', 'diag_1', 'n_procedures', 'medical_specialty', 'diag_3', 'diag_2', 'n_emergency', 'diabetes_med', 'glucose_test', 'a1ctest', 'change']
Interaction features saved to s3://group3-project-bucket/config/interaction_features.json


Unnamed: 0,n_inpatient,n_lab_procedures,n_medications,n_outpatient,time_in_hospital,age,diag_1,n_procedures,medical_specialty,diag_3,...,n_inpatient_x_n_lab_procedures,n_inpatient_x_n_medications,n_inpatient_x_n_outpatient,n_inpatient_x_time_in_hospital,n_lab_procedures_x_n_medications,n_lab_procedures_x_n_outpatient,n_lab_procedures_x_time_in_hospital,n_medications_x_n_outpatient,n_medications_x_time_in_hospital,n_outpatient_x_time_in_hospital
12204,0,80,18,0,5,3,0,0,4,0,...,0,0,0,0,1440,0,400,0,90,0
2655,0,9,13,0,4,1,0,0,4,6,...,0,0,0,0,117,0,36,0,52,0
9592,0,38,29,0,12,3,6,6,4,6,...,0,0,0,0,1102,0,456,0,348,0
18228,0,31,10,0,4,2,1,0,4,0,...,0,0,0,0,310,0,124,0,40,0
18105,0,52,10,0,1,2,6,0,3,1,...,0,0,0,0,520,0,52,0,10,0


Unnamed: 0,n_inpatient,n_lab_procedures,n_medications,n_outpatient,time_in_hospital,age,diag_1,n_procedures,medical_specialty,diag_3,...,n_inpatient_x_n_lab_procedures,n_inpatient_x_n_medications,n_inpatient_x_n_outpatient,n_inpatient_x_time_in_hospital,n_lab_procedures_x_n_medications,n_lab_procedures_x_n_outpatient,n_lab_procedures_x_time_in_hospital,n_medications_x_n_outpatient,n_medications_x_time_in_hospital,n_outpatient_x_time_in_hospital
7198,0,106,40,0,12,3,6,3,4,7,...,0,0,0,0,4240,0,1272,0,480,0
4580,0,38,12,0,3,1,7,0,2,0,...,0,0,0,0,456,0,114,0,36,0
4278,0,28,15,4,2,1,0,0,4,0,...,0,0,0,0,420,112,56,60,30,8
1837,1,41,11,0,4,5,0,0,0,6,...,41,11,0,4,451,0,164,0,44,0
9770,1,64,18,0,4,3,0,0,4,1,...,64,18,0,4,1152,0,256,0,72,0


n_inpatient                            int64
n_lab_procedures                       int64
n_medications                          int64
n_outpatient                           int64
time_in_hospital                       int64
age                                    int64
diag_1                                 int64
n_procedures                           int64
medical_specialty                      int64
diag_3                                 int64
diag_2                                 int64
n_emergency                            int64
diabetes_med                           int64
glucose_test                           int64
a1ctest                                int64
change                                 int64
n_inpatient_x_n_lab_procedures         int64
n_inpatient_x_n_medications            int64
n_inpatient_x_n_outpatient             int64
n_inpatient_x_time_in_hospital         int64
n_lab_procedures_x_n_medications       int64
n_lab_procedures_x_n_outpatient        int64
n_lab_proc

In [8]:
X_prod = prod_df.drop(columns=[target_column])
y_prod = prod_df[target_column]

X_prod_final = model_methods.apply_interaction_features(X_prod, bucket_name, s3_client)
print(X_prod_final.columns)

Index(['age', 'time_in_hospital', 'n_lab_procedures', 'n_procedures',
       'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency',
       'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'glucose_test',
       'a1ctest', 'change', 'diabetes_med', 'n_inpatient_x_n_lab_procedures',
       'n_inpatient_x_n_medications', 'n_inpatient_x_n_outpatient',
       'n_inpatient_x_time_in_hospital', 'n_lab_procedures_x_n_medications',
       'n_lab_procedures_x_n_outpatient',
       'n_lab_procedures_x_time_in_hospital', 'n_medications_x_n_outpatient',
       'n_medications_x_time_in_hospital', 'n_outpatient_x_time_in_hospital'],
      dtype='object')


Bayesian Optimization Procedure to find best XGB model

In [9]:
!pip install optuna



In [10]:
import optuna
from sklearn.metrics import roc_auc_score

def xgb_objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'seed': 42
    }
    
    dtrain = xgb.DMatrix(X_train_final, label=y_train, enable_categorical=True)
    dtest = xgb.DMatrix(X_test_final, label=y_test, enable_categorical=True)
    model = xgb.train(params, dtrain, num_boost_round=100)
    preds = model.predict(dtest)
    
    return roc_auc_score(y_test, preds)

study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=100)

best_params = study.best_params
best_params['max_depth'] = int(best_params['max_depth'])



[I 2025-02-24 06:32:10,091] A new study created in memory with name: no-name-418c7899-dee5-4f36-b21d-20a3bb6143c1
[I 2025-02-24 06:32:10,437] Trial 0 finished with value: 0.6482448024521765 and parameters: {'learning_rate': 0.09950101252770646, 'max_depth': 4, 'min_child_weight': 2.3276848970939183, 'colsample_bytree': 0.4337996966164983, 'subsample': 0.9778325077669885}. Best is trial 0 with value: 0.6482448024521765.
[I 2025-02-24 06:32:10,946] Trial 1 finished with value: 0.6469768109404793 and parameters: {'learning_rate': 0.022995425078808654, 'max_depth': 7, 'min_child_weight': 7.259576241891933, 'colsample_bytree': 0.5739261561771722, 'subsample': 0.8122080337838671}. Best is trial 0 with value: 0.6482448024521765.
[I 2025-02-24 06:32:11,434] Trial 2 finished with value: 0.6240184118262153 and parameters: {'learning_rate': 0.16872294317422176, 'max_depth': 8, 'min_child_weight': 5.985516671855449, 'colsample_bytree': 0.7418986456100651, 'subsample': 0.699928718649544}. Best is t

In [11]:
# Train the best model
dmatrix_train = xgb.DMatrix(X_train_final, label=y_train, enable_categorical=True)
model = xgb.train(best_params, dmatrix_train, num_boost_round=500)

In [12]:
# Visualize Optuna Trials
fig = optuna.visualization.matplotlib.plot_optimization_history(study)
plt.savefig("figures/optuna_optimization_history.png")
plt.close()

fig = optuna.visualization.matplotlib.plot_param_importances(study)
plt.savefig("figures/optuna_param_importance.png")
plt.close()

  fig = optuna.visualization.matplotlib.plot_optimization_history(study)
  fig = optuna.visualization.matplotlib.plot_param_importances(study)


In [13]:
explainer = shap.Explainer(model, X_test_final)
shap_values = explainer(X_test_final)

# Save SHAP summary plot
plt.figure()
shap.summary_plot(shap_values, X_test_final, show=False)
plt.savefig("figures/final_shap_summary.png")
plt.close()

# Save SHAP dependence plot for the first feature
plt.figure()
shap.dependence_plot(0, shap_values.values, X_test_final, show=False)
plt.savefig("figures/final_shap_dependence_0.png")
plt.close()



<Figure size 640x480 with 0 Axes>

In [14]:
print("TEST")
acc, roc, precision, recall = eval_model(model, X_test_final, y_test)

print("TRAIN")
acc_train, roc_train, precision_train, recall_train = eval_model(model, X_train_final, y_train)

TEST
✅ Test Log Loss: 0.7142
✅ Test Accuracy: 0.6028
✅ Test AUC: 0.6358
✅ Test Precision: 0.6053
✅ Test Recall: 0.5116
TRAIN
✅ Test Log Loss: 0.4845
✅ Test Accuracy: 0.7973
✅ Test AUC: 0.8875
✅ Test Precision: 0.8377
✅ Test Recall: 0.7039


In [15]:
# Save the model
model.save_model("models/tuned_xgboost_model.model")
print("Model saved to models/tuned_xgboost_model.model")

Model saved to models/tuned_xgboost_model.model




## PUT XGBOOST MODEL IN MODEL STORE

In [16]:
import tarfile

region = boto3.Session().region_name
role = get_execution_role()
sagemaker_session = sagemaker.Session()

best_params = study.best_params

# ✅ SageMaker client
sm_client = boto3.client("sagemaker", region_name=region)
s3_client = boto3.client("s3", region_name=region)

# ✅ **Model Package Group Name (Unique Identifier)**
model_package_group_name = "xgboost-hospital-readmissions-" + str(int(time.time()))
model_package_group_description = "XGBoost model for predicting hospital readmissions."

# ✅ **Create Model Package Group**
create_model_package_group_response = sm_client.create_model_package_group(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription=model_package_group_description,
)

print("✅ ModelPackageGroup Arn:", create_model_package_group_response["ModelPackageGroupArn"])

# ✅ **Save the trained XGBoost model**
joblib.dump(model, "model.joblib")

# ✅ **Save the model as a compressed `.tar.gz` file**
tar_filename = "model.tar.gz"
with tarfile.open(tar_filename, "w:gz") as tar:
    tar.add("model.joblib")

# ✅ **Upload model to S3**
timestamp = int(time.time())
prefix = "hospital-readmissions-xgboost"
s3_key = f"{prefix}/model-{timestamp}.tar.gz"
model_s3_uri = f"s3://{bucket_name}/{s3_key}"

s3_client.upload_file(tar_filename, bucket_name, s3_key)
print(f"✅ Model artifact uploaded to: {model_s3_uri}")

# ✅ **Define SageMaker Model Package**
model_package_name = "xgboost-hospital-readmissions-package-" + str(int(time.time()))
xgboost_image_uri = "683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1"

inference_spec = {
    "Containers": [
        {
            "Image": xgboost_image_uri,
            "ModelDataUrl": model_s3_uri,
        }
    ],
    "SupportedTransformInstanceTypes": ["ml.m5.large", "ml.m5.xlarge"],
    "SupportedRealtimeInferenceInstanceTypes": ["ml.m5.large", "ml.m5.xlarge"],
    "SupportedContentTypes": ["text/csv"],
    "SupportedResponseMIMETypes": ["text/csv", "application/json"],
}

create_model_package_response = sm_client.create_model_package(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageDescription="XGBoost model for predicting hospital readmissions.",
    InferenceSpecification=inference_spec,
    ModelApprovalStatus="PendingManualApproval",
)

print("✅ Model Package ARN:", create_model_package_response["ModelPackageArn"])

# ✅ **Describe the Model Package**
response = sm_client.describe_model_package(
    ModelPackageName=create_model_package_response["ModelPackageArn"]
)

print("Model Package Details:", response)

# ✅ **Create a Model Card**
model_card_name = f"xgboost-hospital-readmissions-card-{int(time.time())}"

# ✅ Update Model Card with **full evaluation details**
model_card_content = {
    "model_overview": {
        "model_description": "XGBoost model predicting hospital readmissions.",
        "model_creator": "Your Team",
        "model_artifact": [model_s3_uri],
        "algorithm_type": "XGBoost",
        "problem_type": "Binary Classification",
        "model_owner": "Group 3",
    },
    "intended_uses": {
        "purpose_of_model": "Predict likelihood of hospital readmission within 30 days.",
        "intended_uses": "Hospital decision support, optimizing patient care.",
        "risk_rating": "Medium",
        "explanations_for_risk_rating": "Incorrect predictions may lead to unnecessary interventions or missed readmissions.",
    },
    "business_details": {
        "business_problem": "Reducing hospital readmission rates.",
        "business_stakeholders": "Healthcare providers, insurers, hospital administrators.",
    },
    "training_details": {
        "training_observations": "Model trained on historical patient admission data.",
        "training_job_details": {
            "training_datasets": [f"sagemaker_featurestore.{latest_table['Name']}"],
            "training_environment": {
                "container_image": [xgboost_image_uri],
            },
            "training_metrics": [
                {"name": "accuracy", "value": acc_train},
                {"name": "roc_auc", "value": roc_train},
                {"name": "precision", "value": precision_train},
                {"name": "recall", "value": recall_train},
            ],
            "hyper_parameters": [
                {"name": k, "value": str(v)} for k, v in best_params.items()
            ],
        },
    },
    "evaluation_details": [
        {
            "name": "evaluation-1",
            "datasets": [f"sagemaker_featurestore.{latest_table['Name']}"],
            "metric_groups": [
                {
                    "name": "readmission-metrics",
                    "metric_data": [
                        {"name": "accuracy", "type": "number", "value": acc},
                        {"name": "roc_auc", "type": "number", "value": roc},
                        {"name": "precision", "type": "number", "value": precision},
                        {"name": "recall", "type": "number", "value": recall},
                    ],
                }
            ],
            "evaluation_observation": "Evaluated on a holdout test dataset.",
        }
    ],
    "additional_information": {
        "ethical_considerations": "Model should not be used to make final medical decisions.",
        "caveats_and_recommendations": "Should be used alongside physician assessment.",
    },
}


# ✅ **Create the Model Card**
response = sm_client.create_model_card(
    ModelCardName=model_card_name,
    Content=json.dumps(model_card_content),
    ModelCardStatus="Draft",
)

print("✅ Model Card ARN:", response["ModelCardArn"])
print("✅ Model Card Name:", model_card_name)

# ✅ **Describe the Model Card**
describe_response = sm_client.describe_model_card(ModelCardName=model_card_name)
print(describe_response)

✅ ModelPackageGroup Arn: arn:aws:sagemaker:us-east-1:321261761338:model-package-group/xgboost-hospital-readmissions-1740378802
✅ Model artifact uploaded to: s3://group3-project-bucket/hospital-readmissions-xgboost/model-1740378802.tar.gz
✅ Model Package ARN: arn:aws:sagemaker:us-east-1:321261761338:model-package/xgboost-hospital-readmissions-1740378802/1
Model Package Details: {'ModelPackageGroupName': 'xgboost-hospital-readmissions-1740378802', 'ModelPackageVersion': 1, 'ModelPackageArn': 'arn:aws:sagemaker:us-east-1:321261761338:model-package/xgboost-hospital-readmissions-1740378802/1', 'ModelPackageDescription': 'XGBoost model for predicting hospital readmissions.', 'CreationTime': datetime.datetime(2025, 2, 24, 6, 33, 23, 459000, tzinfo=tzlocal()), 'InferenceSpecification': {'Containers': [{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1', 'ImageDigest': 'sha256:c764382b16cd0c921f1b2e66de8684fb999ccbd0c042c95679f0b69bc9cdd12c', 'ModelDataUrl': 's3://g