In [2]:
!pip install shap

Collecting shap
  Using cached shap-0.46.0-cp311-cp311-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting slicer==0.0.8 (from shap)
  Using cached slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Using cached shap-0.46.0-cp311-cp311-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
Using cached slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8


In [1]:
import model_methods
import json
import joblib
import os
import shap
import xgboost as xgb
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.predictor import Predictor
import boto3
import tarfile
import io

sagemaker_session = sagemaker.Session()
sm_client = boto3.client("sagemaker")
s3_client = boto3.client("s3")
model_package_group_name = "xgboost-hospital-readmissions-1740627359"
bucket_name = "group3-project-bucket"

def load_from_prod(bucket, key):
    response = s3_client.get_object(Bucket=bucket, Key=key)
    return pd.read_csv(io.BytesIO(response['Body'].read()))

X_prod_file = "production_data/X_prod_final.csv"
y_prod_file = "production_data/y_prod.csv"
X_train_file = "production_data/X_train_final.csv"
y_train_file = "production_data/y_train.csv"


X_prod_final = load_from_prod(bucket_name, X_prod_file)
y_prod = load_from_prod(bucket_name, y_prod_file)

X_train_final = load_from_prod(bucket_name, X_train_file)
y_train = load_from_prod(bucket_name, y_train_file)

print(f"Loaded {X_prod_final.shape[0]} rows and {X_prod_final.shape[1]} features from X_prod_final.")
print(f"Loaded {y_prod.shape[0]} target values from y_prod.")

print(f"Loaded {X_train_final.shape[0]} rows and {X_train_final.shape[1]} features from X_train_final.")
print(f"Loaded {y_train.shape[0]} target values from y_train.")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Loaded 10000 rows and 22 features from X_prod_final.
Loaded 10000 target values from y_prod.
Loaded 12500 rows and 22 features from X_train_final.
Loaded 12500 target values from y_train.


In [2]:
# List all models (approved or not)
response = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    SortBy="CreationTime",
    SortOrder="Descending",
    MaxResults=5,  # Show up to 5 latest models
)

if not response["ModelPackageSummaryList"]:
    raise ValueError(f"No models exist in Model Package Group: {model_package_group_name}")

# Print available models and their approval status
for model in response["ModelPackageSummaryList"]:
    print(f"Model ARN: {model['ModelPackageArn']}")
    print(f"Approval Status: {model['ModelApprovalStatus']}")
    print(f"Creation Time: {model['CreationTime']}")
    print("-" * 50)

Model ARN: arn:aws:sagemaker:us-east-1:321261761338:model-package/xgboost-hospital-readmissions-1740627359/1
Approval Status: PendingManualApproval
Creation Time: 2025-02-27 03:36:01.326000+00:00
--------------------------------------------------


In [3]:
latest_model_arn = "arn:aws:sagemaker:us-east-1:321261761338:model-package/xgboost-hospital-readmissions-1740627359/1"
response = sm_client.update_model_package(
    ModelPackageArn=latest_model_arn,
    ModelApprovalStatus="Approved",
    ApprovalDescription="Approved for deployment",
)

In [4]:
# Get the latest approved model package
response = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    SortBy="CreationTime",
    SortOrder="Descending",
    ModelApprovalStatus="Approved",  # Only get models that are approved for deployment
    MaxResults=1,
)

if not response["ModelPackageSummaryList"]:
    raise ValueError(f"No approved models found in Model Package Group: {model_package_group_name}")

latest_model_package_arn = response["ModelPackageSummaryList"][0]["ModelPackageArn"]
print(f"✅ Using Model Package: {latest_model_package_arn}")

# Get model details to extract S3 path of the trained model
model_details = sm_client.describe_model_package(ModelPackageName=latest_model_package_arn)
model_s3_uri = model_details["InferenceSpecification"]["Containers"][0]["ModelDataUrl"]
model_image_uri = model_details["InferenceSpecification"]["Containers"][0]["Image"]

print(f"✅ Extracted Model Artifact from S3: {model_s3_uri}")

print(f"✅ Model Artifact: {model_s3_uri}")
print(f"✅ Model Container Image: {model_image_uri}")

✅ Using Model Package: arn:aws:sagemaker:us-east-1:321261761338:model-package/xgboost-hospital-readmissions-1740627359/1
✅ Extracted Model Artifact from S3: s3://group3-project-bucket/hospital-readmissions-xgboost/model-1740627360.tar.gz
✅ Model Artifact: s3://group3-project-bucket/hospital-readmissions-xgboost/model-1740627360.tar.gz
✅ Model Container Image: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1


## Mock Endpoint
CreateModel permissions are not available for any member of our group, so we have created a Mock endpoint that utilizes our latest model to simulate endpoint states as if our model was working.

In [5]:
## Download Model
import tempfile

def download_model(model_uri):
    s3 = boto3.client("s3")
    bucket_name = model_uri.split("/")[2]
    model_key = "/".join(model_uri.split("/")[3:])
    
    # Save to a temporary directory
    temp_dir = tempfile.mkdtemp()
    local_model_path = os.path.join(temp_dir, "model.tar.gz")
    
    # Download the model file
    s3.download_file(bucket_name, model_key, local_model_path)
    print(f"✅ Model downloaded: {local_model_path}")
    
    # Extract the model
    extracted_model_path = os.path.join(temp_dir, "extracted_model")
    with tarfile.open(local_model_path, "r:gz") as tar:
        tar.extractall(extracted_model_path)
    
    print(f"✅ Model extracted at: {extracted_model_path}")
    
    # Find the `model.joblib` file
    extracted_files = os.listdir(extracted_model_path)
    print(f"🔍 Extracted Files: {extracted_files}")
    
    model_file_path = os.path.join(extracted_model_path, "model.joblib")
    
    if not os.path.exists(model_file_path):
        raise FileNotFoundError("🚨 `model.joblib` file not found after extraction!")
    
    # Load the model using joblib
    model = joblib.load(model_file_path)
    print(f"✅ Model loaded successfully from: {model_file_path}")
    
    return model

class MockEndpoint():
    def __init__(self, model_s3_uri):
        self.model_s3_uri = model_s3_uri
        self.model = download_model(model_s3_uri)
    
    def predict(self, data):
        features = self.model.feature_names
        dmatrix = xgb.DMatrix(data[features])
        predictions = self.model.predict(dmatrix)
        
        return json.dumps({"predictions": predictions.tolist()})

mock_endpoint = MockEndpoint(model_s3_uri)

print(mock_endpoint.model)

✅ Model downloaded: /tmp/tmp5y207p23/model.tar.gz
✅ Model extracted at: /tmp/tmp5y207p23/extracted_model
🔍 Extracted Files: ['model.joblib']
✅ Model loaded successfully from: /tmp/tmp5y207p23/extracted_model/model.joblib
<xgboost.core.Booster object at 0x7ffa14176690>


## Mock Bias Report

In [6]:
print(X_prod_final.columns)

Index(['age', 'time_in_hospital', 'n_lab_procedures', 'n_procedures',
       'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency',
       'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'glucose_test',
       'a1ctest', 'change', 'diabetes_med', 'n_inpatient_x_n_lab_procedures',
       'n_inpatient_x_n_medications', 'n_inpatient_x_time_in_hospital',
       'n_lab_procedures_x_n_medications',
       'n_lab_procedures_x_time_in_hospital',
       'n_medications_x_time_in_hospital'],
      dtype='object')


In [7]:
# Compute quartile bin edges using training data
quantile_bins = X_train_final["n_medications_x_time_in_hospital"].quantile([0.25, 0.5, 0.75]).values
print(f"Training Set Quantile Edges: {quantile_bins}")

# Create categorical labels
quartile_labels = ["Low", "Medium-Low", "Medium-High", "High"]

# Apply training quartiles to both training & production datasets
X_train_final["facet_category"] = pd.cut(
    X_train_final["n_medications_x_time_in_hospital"], 
    bins=[-np.inf] + quantile_bins.tolist() + [np.inf], 
    labels=quartile_labels
)

X_prod_final["facet_category"] = pd.cut(
    X_prod_final["n_medications_x_time_in_hospital"], 
    bins=[-np.inf] + quantile_bins.tolist() + [np.inf], 
    labels=quartile_labels
)

# Print category distributions
print("Training Data Quartile Distribution:\n", X_train_final["facet_category"].value_counts())
print("Production Data Quartile Distribution:\n", X_prod_final["facet_category"].value_counts())

Training Set Quantile Edges: [ 26.  54. 108.]
Training Data Quartile Distribution:
 facet_category
Low            3264
High           3112
Medium-High    3090
Medium-Low     3034
Name: count, dtype: int64
Production Data Quartile Distribution:
 facet_category
Low            2718
Medium-High    2515
High           2415
Medium-Low     2352
Name: count, dtype: int64


## Bias Report

### Pre-training Bias

In [18]:
label_proportions = y_train.groupby(X_train_final["facet_category"]).mean()
display(label_proportions)

label_imbalance = y_train.value_counts(normalize=True)
print("Label Class Imbalance:\n", label_imbalance)

  label_proportions = y_train.groupby(X_train_final["facet_category"]).mean()


Unnamed: 0_level_0,readmitted
facet_category,Unnamed: 1_level_1
Low,0.413297
Medium-Low,0.462755
Medium-High,0.499029
High,0.499357


Label Class Imbalance:
 readmitted
0             0.53208
1             0.46792
Name: proportion, dtype: float64


In [15]:
dpl = label_proportions.loc["High", "readmitted"] - label_proportions.loc["Low", "readmitted"]

print(f"Difference in Proportions of Labels (DPL): {dpl:.4f}")

Difference in Proportions of Labels (DPL): 0.0861


### Post-Training Bias

In [17]:
import json

# Create empty dictionary to store predictions
category_preds = {}

# Get predictions per quartile
for category in quartile_labels:
    subset = X_prod_final[X_prod_final["facet_category"] == category]
    preds = json.loads(mock_endpoint.predict(subset))["predictions"]
    category_preds[category] = preds

# Compute acceptance rates per quartile
p_high_pred = (np.array(category_preds["High"]) > 0.5).mean()
p_low_pred = (np.array(category_preds["Low"]) > 0.5).mean()

spd = p_high_pred - p_low_pred
print(f"Statistical Parity Difference (SPD): {spd:.4f}")

di = p_high_pred / (p_low_pred + 1e-8)
print(f"Disparate Impact (DI): {di:.4f}")

y_high = y_prod.loc[X_prod_final["facet_category"] == "High"].values
y_low = y_prod.loc[X_prod_final["facet_category"] == "Low"].values

# Ensure category_preds["High"] is a NumPy array
accuracy_high = (np.round(np.array(category_preds["High"])) == y_high).mean()
accuracy_low = (np.round(np.array(category_preds["Low"])) == y_low).mean()

ad = accuracy_high - accuracy_low
print(f"Accuracy Difference (AD): {ad:.4f}")

Statistical Parity Difference (SPD): 0.1985
Disparate Impact (DI): 1.6734
Accuracy Difference (AD): -0.0402


In [22]:
import json
import boto3

# Convert Bias Metrics to JSON
bias_report = {
    "Pre-Training Bias": {
        "Label Imbalance": {
            "Class 0": round(label_imbalance.get(0, 0), 4),
            "Class 1": round(label_imbalance.get(1, 0), 4)
        },
        "Difference in Proportions of Labels (DPL)": round(dpl, 4)
    },
    "Post-Training Bias": {
        "Statistical Parity Difference (SPD)": round(spd, 4),
        "Disparate Impact (DI)": round(di, 4),
        "Accuracy Difference (AD)": round(ad, 4)
    }
}

# Convert to JSON string
bias_report_json = json.dumps(bias_report, indent=4)

# S3 Configuration
s3_filename = f"bias_reports/bias_report_{model_package_group_name}.json"
s3_uri = f"s3://{bucket_name}/{s3_filename}"

# Save to S3
s3_client.put_object(
    Bucket=bucket_name,
    Key=s3_filename,
    Body=bias_report_json
)

print(f"Bias report saved as JSON to S3: {s3_uri}")


Bias report saved as JSON to S3: s3://group3-project-bucket/bias_reports/bias_report_xgboost-hospital-readmissions-1740627359.json


In [23]:
print(bias_report_json)

{
    "Pre-Training Bias": {
        "Label Imbalance": {
            "Class 0": 0.5321,
            "Class 1": 0.4679
        },
        "Difference in Proportions of Labels (DPL)": 0.0861
    },
    "Post-Training Bias": {
        "Statistical Parity Difference (SPD)": 0.1985,
        "Disparate Impact (DI)": 1.6734,
        "Accuracy Difference (AD)": -0.0402
    }
}


# Bias Report Interpretation. Facet: n_medications_x_time_in_hospital

This bias report provides insights into **pre-training bias (data distribution) and post-training bias (model predictions).** Let's break it down.

---

## **Pre-Training Bias (Before Model Training)**

### **Label Imbalance**
| Class | Proportion |
|--------|-----------|
| **Class 0** (Not Readmitted) | **53.21%** |
| **Class 1** (Readmitted) | **46.79%** |

**Interpretation:**  
- The dataset is **fairly balanced**, but **slightly favors Class 0 (non-readmitted patients).**
- **No extreme label imbalance** (e.g., 90%–10%), so training should not be overly biased toward predicting one outcome.

---

### **Difference in Proportions of Labels (DPL)**
- **DPL = 0.0861** → The **High** quartile has a **higher proportion of positive labels (readmitted patients)** than the **Low** quartile.

**Interpretation:**  
- Patients in the **High quartile of `n_medications_x_time_in_hospital`** are **more likely to be labeled as readmitted** compared to the **Low quartile.**  
- This suggests that **hospital stay duration and number of medications are correlated with readmission risk**.
- If this correlation is **unwanted**, we may need to **adjust for this bias** in feature engineering.

---

## **Post-Training Bias (After Model Predictions)**

### **Statistical Parity Difference (SPD)**
- **SPD = 0.1985** → The model predicts **"Readmitted" (Positive Class) more often** for **patients in the High quartile** compared to those in the Low quartile.

**Interpretation:**  
- The model assigns **different readmission probabilities based on `n_medications_x_time_in_hospital` quartiles.**  
- This could indicate **a learned bias** in favor of certain patient groups.

---

### **Disparate Impact (DI)**
- **DI = 1.6734** → The **High quartile group** is **1.67 times more likely** to be predicted as "Readmitted" compared to the Low quartile.

**Interpretation:**  
- The model disproportionately predicts **readmissions for patients with longer hospital stays and more medications.**
- **If DI > 1.25**, this could indicate **potential fairness concerns** (depending on legal and ethical guidelines).
- A **DI between 0.8 and 1.25** is usually considered **fair** (depending on context).

---

### **Accuracy Difference (AD)**
- **AD = -0.0402** → The model is **4% more accurate** for patients in the Low quartile than the High quartile.

**Interpretation:**  
- **Accuracy is slightly worse for patients with longer hospital stays & more medications.**
- This suggests that the model might struggle with **complex cases requiring many medications.**
- **Potential solution**: Improve feature representation or collect more data for High quartile patients.

---

## **Final Bias Report Summary**

| Metric | Value | Interpretation |
|---------|------|---------------|
| **Class 0 Imbalance** | 53.21% | Slightly more non-readmitted patients. |
| **Class 1 Imbalance** | 46.79% | Nearly balanced, but slight bias toward non-readmission. |
| **DPL** | 0.0861 | Higher readmission rates in the High quartile group. |
| **SPD** | 0.1985 | Model predicts "Readmitted" more often for High quartile patients. |
| **DI** | 1.6734 | High quartile patients are **1.67x more likely** to be predicted as "Readmitted." |
| **AD** | -0.0402 | **4% lower accuracy** for High quartile patients. |

---

## **Recommendations**
**Monitor fairness thresholds** (DI should ideally be between **0.8 and 1.25**).  
**Investigate why the model predicts higher readmission for the High quartile group.**  
**Check feature engineering** to ensure `n_medications_x_time_in_hospital` isn’t over-weighted.  
**If necessary, balance training data** (e.g., re-weight samples or augment underrepresented groups).  

**This is a well-balanced model, but there are signs of potential bias in predictions based on hospitalization length & medications.** 🚀
