In [1]:
!pip install shap
!pip install xgboost

[0m

In [12]:
import pandas as pd
import xgboost as xgb
import shap
import boto3
from pyathena import connect
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# S3 and Athena details
bucket_name = "group3-project-bucket"
database_name = "group_project_db"
table_name = "hospital_readmissions"
s3_output = f"s3://{bucket_name}/athena-results/"
region = "us-east-1"
s3_client = boto3.client("s3", region_name=region)

Query Athenta Tables for Data Splitting

In [13]:
# Connect to Athena
connection = connect(s3_staging_dir=s3_output, region_name=region)

# Query the data
query = f"""
SELECT * 
FROM {database_name}.{table_name}
"""
df = pd.read_sql(query, connection)

  df = pd.read_sql(query, connection)


In [14]:
display(df.head())
print(len(df.columns))

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,a1ctest,change,diabetes_med,readmitted
0,3,8,72,1,18,2,0,0,4,0,7,6,1,1,0,1,0
1,3,3,34,2,13,0,0,0,5,6,6,6,1,1,0,1,0
2,1,5,45,0,18,0,0,0,4,0,0,0,1,1,1,1,1
3,3,2,36,0,12,1,0,0,4,0,6,1,1,1,1,1,1
4,2,1,42,0,7,0,0,0,3,6,0,7,1,1,0,1,0


17


Train an XGB model raw, use SHAP to visual feature importance.

In [15]:
# Split data into train (50%), test (10%), production (40%)
train_df, temp_df = train_test_split(df, test_size=0.5, random_state=42)
test_df, prod_df = train_test_split(temp_df, test_size=0.8, random_state=42)

# Separate features and target variable
target_column = "readmitted"
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]
X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

# Train XGBoost model on production dataset
dmatrix_train = xgb.DMatrix(X_train, label=y_train)
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "seed": 42
}
model = xgb.train(params, dmatrix_train, num_boost_round=100)

Extract Important Features from XGBoost Model.. create interacting features from statistitically most important features.

In [16]:
from itertools import combinations
import json

def shap_feature_engineering(model, X_train, X_test, top_k=17, shap_threshold=0.01):
    explainer = shap.Explainer(model, X_test)
    shap_values = explainer(X_test)
    
    # Save SHAP summary plot
    plt.figure()
    shap.summary_plot(shap_values, X_test, show=False)
    plt.savefig("figures/prelim_shap_summary.png")
    plt.close()

    # Save SHAP dependence plot for the first feature
    plt.figure()
    shap.dependence_plot(0, shap_values.values, X_test, show=False)
    plt.savefig("figures/prelim_shap_dependence_0.png")
    plt.close()
    
    feature_importance = pd.DataFrame({
        "feature": X_test.columns,
        "shap_importance": abs(shap_values.values).mean(axis=0)
    }).sort_values(by="shap_importance", ascending=False)

    selected_features = feature_importance[feature_importance["shap_importance"] > shap_threshold]["feature"].tolist()
    top_features = feature_importance.head(top_k)["feature"].tolist()
    print(top_features)
    
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # Create multiple interaction terms
    interaction_features = list(combinations(top_features[:5], 2))  # Generate pairwise interactions from top 5 features
    for f1, f2 in interaction_features:
        X_train_selected[f"{f1}_x_{f2}"] = X_train[f1] * X_train[f2]
        X_test_selected[f"{f1}_x_{f2}"] = X_test[f1] * X_test[f2]
        
    # Save interaction features to S3
    interaction_json = {"interaction_features": interaction_features}
    interaction_file = "figures/interaction_features.json"
    with open(interaction_file, "w") as f:
        json.dump(interaction_json, f)
    s3_client.upload_file(interaction_file, bucket_name, "config/interaction_features.json")
    print(f"Interaction features saved to s3://{bucket_name}/config/interaction_features.json")
    
    return X_train_selected, X_test_selected

def apply_interaction_features(X_new):
    interaction_key = "config/interaction_features.json"
    interaction_file = "interaction_features.json"
    s3_client.download_file(bucket_name, interaction_key, interaction_file)
    
    with open(interaction_file, "r") as f:
        interaction_data = json.load(f)
    
    for f1, f2 in interaction_data["interaction_features"]:
        X_new[f"{f1}_x_{f2}"] = X_new[f1] * X_new[f2]
    
    return X_new


In [17]:
X_train_final, X_test_final = shap_feature_engineering(model, X_train, X_test)
display(X_train_final.head())
display(X_test_final.head())

['n_inpatient', 'n_lab_procedures', 'n_medications', 'n_procedures', 'diag_1', 'n_outpatient', 'time_in_hospital', 'age', 'diag_2', 'diabetes_med', 'medical_specialty', 'n_emergency', 'diag_3', 'a1ctest', 'change', 'glucose_test']
Interaction features saved to s3://group3-project-bucket/config/interaction_features.json


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_selected[f"{f1}_x_{f2}"] = X_train[f1] * X_train[f2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_selected[f"{f1}_x_{f2}"] = X_test[f1] * X_test[f2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_selected[f"{f1}_x_{f2}"] = X_train[f1] * X_train[f2]
A value is trying to be s

Unnamed: 0,n_inpatient,n_lab_procedures,n_medications,n_procedures,diag_1,n_outpatient,time_in_hospital,age,diag_2,diabetes_med,...,n_inpatient_x_n_lab_procedures,n_inpatient_x_n_medications,n_inpatient_x_n_procedures,n_inpatient_x_diag_1,n_lab_procedures_x_n_medications,n_lab_procedures_x_n_procedures,n_lab_procedures_x_diag_1,n_medications_x_n_procedures,n_medications_x_diag_1,n_procedures_x_diag_1
12204,0,55,23,6,0,0,5,2,0,1,...,0,0,0,0,1265,330,0,138,0,0
2655,0,64,17,0,6,0,3,4,0,1,...,0,0,0,0,1088,0,384,0,102,0
9592,0,46,7,0,6,0,4,1,6,1,...,0,0,0,0,322,0,276,0,42,0
18228,0,38,12,2,0,0,2,4,1,1,...,0,0,0,0,456,76,0,24,0,0
18105,0,63,27,4,3,0,4,3,0,1,...,0,0,0,0,1701,252,189,108,81,12


Unnamed: 0,n_inpatient,n_lab_procedures,n_medications,n_procedures,diag_1,n_outpatient,time_in_hospital,age,diag_2,diabetes_med,...,n_inpatient_x_n_lab_procedures,n_inpatient_x_n_medications,n_inpatient_x_n_procedures,n_inpatient_x_diag_1,n_lab_procedures_x_n_medications,n_lab_procedures_x_n_procedures,n_lab_procedures_x_diag_1,n_medications_x_n_procedures,n_medications_x_diag_1,n_procedures_x_diag_1
7198,0,95,18,3,0,0,3,1,6,1,...,0,0,0,0,1710,285,0,54,0,0
4580,0,72,19,0,6,0,7,0,6,1,...,0,0,0,0,1368,0,432,0,114,0
4278,0,58,11,0,6,0,5,3,0,1,...,0,0,0,0,638,0,348,0,66,0
1837,0,46,6,1,2,0,4,0,2,0,...,0,0,0,0,276,46,92,6,12,2
9770,1,34,16,0,7,0,2,3,6,0,...,34,16,0,7,544,0,238,0,112,0


<Figure size 640x480 with 0 Axes>

In [20]:
X_prod = prod_df.drop(columns=[target_column])
y_prod = prod_df[target_column]

X_prod_final = apply_interaction_features(X_prod)
print(X_prod_final.columns)

Index(['age', 'time_in_hospital', 'n_lab_procedures', 'n_procedures',
       'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency',
       'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'glucose_test',
       'a1ctest', 'change', 'diabetes_med', 'n_inpatient_x_n_lab_procedures',
       'n_inpatient_x_n_medications', 'n_inpatient_x_n_procedures',
       'n_inpatient_x_diag_1', 'n_lab_procedures_x_n_medications',
       'n_lab_procedures_x_n_procedures', 'n_lab_procedures_x_diag_1',
       'n_medications_x_n_procedures', 'n_medications_x_diag_1',
       'n_procedures_x_diag_1'],
      dtype='object')


Bayesian Optimization Procedure to find best XGB model

In [21]:
!pip install optuna

[0m

In [22]:
import optuna
from sklearn.metrics import roc_auc_score

def xgb_objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'seed': 42
    }
    
    dtrain = xgb.DMatrix(X_train_final, label=y_train)
    dtest = xgb.DMatrix(X_test_final, label=y_test)
    model = xgb.train(params, dtrain, num_boost_round=100)
    preds = model.predict(dtest)
    
    return roc_auc_score(y_test, preds)

study = optuna.create_study(direction="maximize")
study.optimize(xgb_objective, n_trials=30)

best_params = study.best_params
best_params['max_depth'] = int(best_params['max_depth'])

# Train the best model
dmatrix_prod = xgb.DMatrix(X_train_final, label=y_train)
model = xgb.train(best_params, dmatrix_prod, num_boost_round=500)

[I 2025-02-10 23:52:49,562] A new study created in memory with name: no-name-8b0d029c-5148-40e3-9c80-2aee476b81fd
[I 2025-02-10 23:52:49,863] Trial 0 finished with value: 0.6564758976898903 and parameters: {'learning_rate': 0.17464435394877542, 'max_depth': 3, 'min_child_weight': 1.7105075634364395, 'colsample_bytree': 0.766896706363402, 'subsample': 0.6668713866957869}. Best is trial 0 with value: 0.6564758976898903.
[I 2025-02-10 23:52:50,107] Trial 1 finished with value: 0.6483659106460066 and parameters: {'learning_rate': 0.23508410680986302, 'max_depth': 3, 'min_child_weight': 4.748681290827516, 'colsample_bytree': 0.32236939118352426, 'subsample': 0.7364896309830595}. Best is trial 0 with value: 0.6564758976898903.
[I 2025-02-10 23:52:50,742] Trial 2 finished with value: 0.6242616658250673 and parameters: {'learning_rate': 0.1195800590633182, 'max_depth': 9, 'min_child_weight': 1.2042960261636935, 'colsample_bytree': 0.6172629488158796, 'subsample': 0.8671657622103552}. Best is t

In [23]:
# Visualize Optuna Trials
fig = optuna.visualization.matplotlib.plot_optimization_history(study)
plt.savefig("figures/optuna_optimization_history.png")
plt.close()

fig = optuna.visualization.matplotlib.plot_param_importances(study)
plt.savefig("figures/optuna_param_importance.png")
plt.close()

  fig = optuna.visualization.matplotlib.plot_optimization_history(study)
  fig = optuna.visualization.matplotlib.plot_param_importances(study)


In [25]:
explainer = shap.Explainer(model, X_test_final)
shap_values = explainer(X_test_final)

# Save SHAP summary plot
plt.figure()
shap.summary_plot(shap_values, X_test_final, show=False)
plt.savefig("figures/final_shap_summary.png")
plt.close()

# Save SHAP dependence plot for the first feature
plt.figure()
shap.dependence_plot(0, shap_values.values, X_test_final, show=False)
plt.savefig("figures/final_shap_dependence_0.png")
plt.close()



<Figure size 640x480 with 0 Axes>

In [26]:
# Save the model
model.save_model("models/tuned_xgboost_model.model")
print("Model saved to models/tuned_xgboost_model.model")

Model saved to models/tuned_xgboost_model.model


