In [1]:
import boto3, io, pandas as pd
from sklearn.model_selection import train_test_split
import sagemaker
from sagemaker import get_execution_role

region = "us-east-1" # update if needed
bucket = "healthcare-project-data-rakshitha" # your bucket name
role = get_execution_role()
s3 = boto3.client("s3", region_name=region)

hist_key = "raw/historical/heart_attack_prediction_dataset.csv"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
#load the processed/merged CSV from S3 (produced by EMR) to confirm shape & columns.

obj = s3.get_object(Bucket=bucket, Key=hist_key)
df = pd.read_csv(io.BytesIO(obj["Body"].read()))
print("Loaded dataset:", df.shape)
df.head(2)

Loaded dataset: (8763, 26)


Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0


In [4]:
def preprocess_health_data(df):
    # Split blood pressure
    if "Blood Pressure" in df.columns:
        bp = df["Blood Pressure"].astype(str).str.split("/", n=1, expand=True)
        df["BP_Systolic"] = pd.to_numeric(bp[0], errors="coerce")
        df["BP_Diastolic"] = pd.to_numeric(bp[1], errors="coerce")
        df.drop(columns=["Blood Pressure"], inplace=True)

    # Drop identifiers
    df = df.drop(columns=["Patient ID","Country","Continent","Hemisphere"], errors="ignore")

    # One-hot encode categoricals
    df = pd.get_dummies(df, drop_first=True).fillna(0)
    return df

In [5]:
proc_df = preprocess_health_data(df)
y = proc_df["Heart Attack Risk"].astype(int)
X = proc_df.drop(columns=["Heart Attack Risk"])
final_df = pd.concat([y, X], axis=1)
train_df, test_df = train_test_split(final_df, test_size=0.2, random_state=42, stratify=y)
print("Train:", train_df.shape, "| Test:", test_df.shape)
print("\n Sample training row:")
display(train_df.head(1))

Train: (7010, 24) | Test: (1753, 24)

 Sample training row:


Unnamed: 0,Heart Attack Risk,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,...,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,BP_Systolic,BP_Diastolic,Sex_Male,Diet_Healthy,Diet_Unhealthy
1034,0,21,250,78,1,0,1,0,1,2.392506,...,273208,23.575795,176,2,5,109,76,True,True,False


In [6]:
train_key = "raw/historical/train/train.csv"
test_key = "raw/historical/test/test.csv"

def upload_csv(df, key):
    s3.put_object(Bucket=bucket, Key=key, Body=df.to_csv(index=False, header=False).encode())
    print(f"Uploaded → s3://{bucket}/{key}")

upload_csv(train_df, train_key)
upload_csv(test_df, test_key)

Uploaded → s3://healthcare-project-data-rakshitha/raw/historical/train/train.csv
Uploaded → s3://healthcare-project-data-rakshitha/raw/historical/test/test.csv


In [7]:
feature_list = list(X.columns)
with open("feature_list.txt", "w") as f:
    f.write("\n".join(feature_list))

!aws s3 cp feature_list.txt s3://{bucket}/preprocess/feature_list.txt
print(f" Uploaded feature list → s3://{bucket}/preprocess/feature_list.txt")

upload: ./feature_list.txt to s3://healthcare-project-data-rakshitha/preprocess/feature_list.txt
 Uploaded feature list → s3://healthcare-project-data-rakshitha/preprocess/feature_list.txt


In [8]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
import time

timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
output_path = f"s3://{bucket}/models/xgboost"
xgb_image = sagemaker.image_uris.retrieve("xgboost", region=region, version="1.5-1")

xgb_estimator = Estimator(
    image_uri=xgb_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size=5,
    output_path=output_path,
    base_job_name=f"xgboost-heart-attack-{timestamp}",
)

xgb_estimator.set_hyperparameters(
    objective="binary:logistic",
    num_round=100,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc"
)

train_input = f"s3://{bucket}/{train_key}"
test_input = f"s3://{bucket}/{test_key}"

print(" Starting training job.")
xgb_estimator.fit(
    {
        "train": TrainingInput(train_input, content_type="text/csv"),
        "validation": TrainingInput(test_input, content_type="text/csv")
    }
)

model_artifact = xgb_estimator.model_data
print(" Model training complete!")
print(" Model artifact stored at:", model_artifact)

INFO:sagemaker:Creating training-job with name: xgboost-heart-attack-2025-11-18-20-00-1-2025-11-18-20-00-17-594


 Starting training job.
2025-11-18 20:00:17 Starting - Starting the training job...
2025-11-18 20:00:43 Starting - Preparing the instances for training...
2025-11-18 20:01:10 Downloading - Downloading input data...
2025-11-18 20:01:45 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-18 20:02:48.525 ip-10-2-65-54.ec2.internal:8 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-18 20:02:48.547 ip-10-2-65-54.ec2.internal:8 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-18:20:02:48:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-18:20:02:48:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-18:20:02:48:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-18:20:02:48:INFO] No GPUs detected (no

In [9]:
from sagemaker.model import Model
import sagemaker, time

sagemaker_session = sagemaker.session.Session()
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
xgb_image = sagemaker.image_uris.retrieve("xgboost", region=region, version="1.5-1")

# Define the model object
xgb_model = Model(
    image_uri=xgb_image,
    model_data=model_artifact,
    role=role,
    name=f"xgb-heart-attack-{timestamp}",
    sagemaker_session=sagemaker_session,
)

# Create a custom endpoint name
endpoint_name = f"xgb-heart-attack-endpoint-{timestamp}"
print(f" Deploying XGBoost model as endpoint: {endpoint_name} .")

# Deploy using the model's .deploy() — returns None in newer SDKs,
# so we attach a Predictor manually afterward
xgb_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=endpoint_name
)

# Manually create predictor for runtime access
from sagemaker.predictor import Predictor
predictor = Predictor(endpoint_name=endpoint_name, sagemaker_session=sagemaker_session)

print("\n Model deployed successfully!")
print(" Endpoint name:", endpoint_name)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: xgb-heart-attack-2025-11-18-20-06-53


 Deploying XGBoost model as endpoint: xgb-heart-attack-endpoint-2025-11-18-20-06-53 .


INFO:sagemaker:Creating endpoint-config with name xgb-heart-attack-endpoint-2025-11-18-20-06-53
INFO:sagemaker:Creating endpoint with name xgb-heart-attack-endpoint-2025-11-18-20-06-53


-------!
 Model deployed successfully!
 Endpoint name: xgb-heart-attack-endpoint-2025-11-18-20-06-53
