Final Project - ML Diabetes App

In [1]:
import boto3
# Print every object key stored in the project bucket.
# One list_objects_v2 call only returns a single page of results, so walk all
# pages with a paginator; otherwise keys beyond the first page are silently
# missing from the listing (the SageMaker profiler output adds many files).
s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket="diabetes-prediction-dataset13"):
    # "Contents" is absent on an empty page, hence the default.
    for obj in page.get("Contents", []):
        print(obj["Key"])

Dockerfile
Fina_Project_Diabetes_Pred.ipynb
best_diabetes_model.pkl
diabetes_prediction_dataset.csv
inference.py
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/debug-output/training_job_end.ts
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/framework/training_job_end.ts
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/system/incremental/2025112003/1763609580.algo-1.json
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/system/incremental/2025112003/1763609640.algo-1.json
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/system/incremental/2025112003/1763609700.algo-1.json
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/system/training_job_end.ts
output/sagemaker-scikit-learn-2025-11-20-03-40-24-799/debug-output/training_job_end.ts
output/sagemaker-scikit-learn-2025-11-20-03-40-24-799/output/model.tar.gz
output/sagemaker-scikit-learn-2025-11-20-03-40-24-799/profiler-output/fra

In [2]:
import boto3

# Locate the dataset in S3, verify the key exists, then download it locally.
bucket = "diabetes-prediction-dataset13"
key = "diabetes_prediction_dataset.csv"  # adjust if you store under a folder
list_prefix = ""  # narrow this (e.g. "data/") if the bucket holds many objects

# Use region-specific client (matches Studio region)
s3 = boto3.client("s3", region_name="us-east-2")

# 1) List objects to confirm the exact key.
#    A paginator is used because a single list_objects_v2 call only returns
#    one page of keys.
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket, Prefix=list_prefix):
    for obj in page.get("Contents", []):
        print(obj["Key"])

# 2) Validate existence before downloading: head_object raises if the key is
#    missing or inaccessible, which stops the cell before step 3.
s3.head_object(Bucket=bucket, Key=key)

# 3) Download once you have the correct key.
#    The local filename is derived from the key so it stays correct when the
#    key is moved under a folder prefix.
local_path = key.rsplit("/", 1)[-1]
s3.download_file(bucket, key, local_path)


Dockerfile
Fina_Project_Diabetes_Pred.ipynb
best_diabetes_model.pkl
diabetes_prediction_dataset.csv
inference.py
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/debug-output/training_job_end.ts
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/framework/training_job_end.ts
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/system/incremental/2025112003/1763609580.algo-1.json
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/system/incremental/2025112003/1763609640.algo-1.json
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/system/incremental/2025112003/1763609700.algo-1.json
output/sagemaker-scikit-learn-2025-11-20-03-32-43-485/profiler-output/system/training_job_end.ts
output/sagemaker-scikit-learn-2025-11-20-03-40-24-799/debug-output/training_job_end.ts
output/sagemaker-scikit-learn-2025-11-20-03-40-24-799/output/model.tar.gz
output/sagemaker-scikit-learn-2025-11-20-03-40-24-799/profiler-output/fra

In [3]:
# Step 1: Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import joblib
import sagemaker

# Step 2: Create SageMaker session and role
session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Step 3: S3 URI for your dataset
s3_uri = "s3://diabetes-prediction-dataset13/diabetes_prediction_dataset.csv"

# Step 4: Load directly into pandas
df = pd.read_csv(s3_uri)
print(df.head())

# Step 5: Basic Cleaning
# Normalise the gender labels, keep only Male/Female rows, then encode as 1/0.
# The filtered frame is copied explicitly so the later column assignment
# writes to an independent frame rather than to a slice of the original
# (assigning onto a filtered slice triggers pandas' SettingWithCopyWarning).
df = df.assign(gender=df['gender'].str.strip().str.title())
df = df[df['gender'].isin(['Male', 'Female'])].copy()
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

# Step 6: Features and Target
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Step 7: Train/Test Split (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 8: Preprocessing
numeric_features = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
categorical_features = ['smoking_history']

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'  # keep hypertension, heart_disease, gender
)

# Step 9: Gradient Boosting Model
# NOTE: gender is encoded in Step 5, outside this pipeline, and is passed
# through unchanged here. The saved pipeline therefore expects gender as 0/1,
# not as the raw "Male"/"Female" strings.
gb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

gb_model.fit(X_train, y_train)

# Step 10: Evaluation
y_pred = gb_model.predict(X_test)
y_proba = gb_model.predict_proba(X_test)[:, 1]  # probability of the positive class
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

# Step 11: Save Best Model
joblib.dump(gb_model, "best_diabetes_model.pkl")
print("Best model saved: Gradient Boosting")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
Accuracy: 0.9723958593789068
F1 Score: 0.8087318087318087
ROC-AUC: 0.9794863349504419
Best model saved: Gradient Boosting


In [4]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnModel
import sagemaker
import pandas as pd

# Shared SageMaker handles used by the training and deployment cells.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 locations for the project: everything lives in a single bucket.
bucket = "diabetes-prediction-dataset13"
output_path = f"s3://{bucket}/output"
training_data_s3 = f"s3://{bucket}/diabetes_prediction_dataset.csv"

# Region comes from the session rather than being hard-coded
# (should be 'us-east-2').
region = session.boto_region_name

# Echo the resolved settings so they can be checked before launching jobs.
for label, value in (
    ("Region:", region),
    ("Output path:", output_path),
    ("Training data:", training_data_s3),
):
    print(label, value)

Region: us-east-2
Output path: s3://diabetes-prediction-dataset13/output
Training data: s3://diabetes-prediction-dataset13/diabetes_prediction_dataset.csv


In [None]:
# Train Estimator

from sagemaker.sklearn.estimator import SKLearn

# Define the estimator.
# `role`, `session`, `output_path` and `training_data_s3` come from the setup
# cell above; reusing them keeps the S3 locations defined in one place
# instead of repeating the literal URIs here.
sklearn_estimator = SKLearn(
    entry_point="train.py",                # training script
    source_dir="diabetes-training",        # folder containing train.py
    role=role,
    instance_type="ml.m5.large",           # or ml.t2.medium if quota is limited
    instance_count=1,
    framework_version="0.23-1",
    py_version="py3",
    output_path=output_path,               # where model.tar.gz will be saved
    sagemaker_session=session
)

# Launch training job; the dataset is exposed to train.py as the "train" channel.
sklearn_estimator.fit({"train": training_data_s3})

In [8]:
# Deploy the trained model as an Endpoint
from sagemaker.sklearn.model import SKLearnModel

# Wrap the training job's artifact in a deployable model.
# The artifact location is read from the estimator (model.tar.gz in S3).
trained_artifact = sklearn_estimator.model_data

model = SKLearnModel(
    entry_point="inference.py",
    source_dir="diabetes-training",
    model_data=trained_artifact,
    framework_version="0.23-1",
    py_version="py3",
    role=role,
    sagemaker_session=session,
)

# Stand up a single-instance real-time endpoint for the model.
predictor = model.deploy(
    endpoint_name="diabetes-prediction-endpoint139",
    instance_type="ml.t2.medium",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2025-11-20-06-30-19-582
INFO:sagemaker:Creating endpoint-config with name diabetes-prediction-endpoint139
INFO:sagemaker:Creating endpoint with name diabetes-prediction-endpoint139


-----!

In [9]:
import boto3

# Create SageMaker client in your region
sm_client = boto3.client("sagemaker", region_name="us-east-2")

# List endpoints.
# list_endpoints returns results one page at a time, so iterate with a
# paginator; a single call would only show the first page of endpoints.
print("Active SageMaker Endpoints:")
for page in sm_client.get_paginator("list_endpoints").paginate():
    for ep in page["Endpoints"]:
        print(f"- {ep['EndpointName']} (Status: {ep['EndpointStatus']})")

Active SageMaker Endpoints:
- diabetes-prediction-endpoint139 (Status: InService)
- diabetes-prediction-endpoint136 (Status: InService)
- diabetes-prediction-endpoint135 (Status: InService)


In [10]:
import boto3

sm_client = boto3.client("sagemaker", region_name="us-east-2")

# List of endpoints you want to delete
endpoints_to_delete = [
    "diabetes-prediction-endpoint136",
    "diabetes-prediction-endpoint135",
]

# Delete each endpoint. An endpoint that no longer exists (e.g. this cell is
# re-run) is reported and skipped so the remaining endpoints are still
# processed; any other service error is re-raised.
for ep in endpoints_to_delete:
    try:
        sm_client.delete_endpoint(EndpointName=ep)
    except sm_client.exceptions.ClientError as err:
        # NOTE(review): SageMaker is expected to report a missing endpoint as
        # a ValidationException - confirm against the API reference.
        if err.response["Error"]["Code"] != "ValidationException":
            raise
        print(f"Skipped endpoint {ep}: {err.response['Error']['Message']}")
    else:
        print(f"Deleted endpoint: {ep}")

Deleted endpoint: diabetes-prediction-endpoint136
Deleted endpoint: diabetes-prediction-endpoint135


In [11]:
# Deploy the same model again under a fresh endpoint name.
endpoint_name = "diabetes-prediction-endpoint-clean"

# Hosting settings for the new endpoint.
deploy_settings = {
    "endpoint_name": endpoint_name,
    "instance_type": "ml.t2.medium",  # adjust if you have quota
    "initial_instance_count": 1,
}

predictor = model.deploy(**deploy_settings)

INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2025-11-20-06-36-11-948
INFO:sagemaker:Creating endpoint-config with name diabetes-prediction-endpoint-clean
INFO:sagemaker:Creating endpoint with name diabetes-prediction-endpoint-clean


--------!

In [14]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
import pandas as pd

# Requests are sent as CSV text; responses are decoded from JSON.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

# One sample row. Column names are for readability only: the payload is
# written without a header, so only the positional order matters.
feature_names = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "smoking_history",
    "bmi",
    "HbA1c_level",
    "blood_glucose_level",
]
# NOTE(review): HbA1c_level is 55 here, while the training rows printed
# earlier in this notebook show values between 4.8 and 6.6 - possibly 5.5
# was intended; confirm before relying on this prediction.
feature_values = ["Male", 70, 0, 1, "never", 40.0, 55, 180]

sample = pd.DataFrame([feature_values], columns=feature_names)

# Serialise to a single header-less, index-less CSV line.
payload = sample.to_csv(index=False, header=False)

# Send request
result = predictor.predict(payload)
print("Prediction:", result)

Prediction: [1]
