In [30]:
# parameters
bucket = "verikai-heart-risk-pipeline"
prefix = "data/processed_data"
region = "us-east-1"

In [23]:
import boto3
import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from datetime import datetime, timezone
import sagemaker
import os

In [28]:
project_root = Path.cwd().parent
data_dir     = project_root / "data" / "derived"
#Find the latest phase4 CSV 
candidates = list(data_dir.glob("phase4_pruned_all_latest_*.csv"))
if not candidates:
    raise FileNotFoundError(f"No files matching phase4_pruned_all_latest_*.csv under {data_dir}")
latest_file = max(candidates, key=lambda p: p.stat().st_mtime)
print(f"⟳ Loading local CSV → {latest_file}")

df = pd.read_csv(latest_file)

train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df["heart_attack"], random_state=42
)
df = df[["heart_attack"] + [c for c in df.columns if c != "heart_attack"]]
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["heart_attack"], random_state=42)

out_dir = data_dir
train_df.to_csv(out_dir / "train.csv",      index=False, header=False)
val_df.to_csv(out_dir / "validation.csv",   index=False, header=False)
print(f"Wrote local splits → {out_dir}/train.csv and validation.csv")

⟳ Loading local CSV → /Users/rev/IUB/Projects/HeartAttackRiskPrediction/data/derived/phase4_pruned_all_latest_20250510_0445.csv
Wrote local splits → /Users/rev/IUB/Projects/HeartAttackRiskPrediction/data/derived/train.csv and validation.csv


In [11]:
# # parameters 
# bucket = "verikai-heart-risk-pipeline"
# prefix = "data/processed_data"
# region = "us-east-1"

from boto3.s3.transfer import TransferConfig
import boto3
from botocore.client import Config
from pathlib import Path

# Configure an S3 client against the regional endpoint, forcing path-style as I got SSL error
s3 = boto3.client(
    "s3",
    region_name=region,
    endpoint_url=f"https://s3.{region}.amazonaws.com",
    config=Config(
        signature_version="s3v4",
        s3={"addressing_style": "path"}
    )
)

# Multipart‐upload config for large files
transfer_cfg = TransferConfig(
    multipart_threshold=25 * 1024 * 1024,   
    multipart_chunksize=10 * 1024 * 1024,  
    max_concurrency=4
)

def upload_split(df, key_name):
    """Write a DataFrame to S3 as CSV via multipart upload."""
    buf = StringIO()
    df.to_csv(buf, index=False, header=False)
    print(f"→ Uploading to s3://{bucket}/{key_name}")
    s3.put_object(
        Bucket=bucket,
        Key=key_name,
        Body=buf.getvalue()
    )

train_key = f"{prefix}/train.csv"
val_key   = f"{prefix}/validation.csv"

upload_split(train_df, train_key)
upload_split(val_df,   val_key)

print("All splits uploaded successfully!")

→ Uploading to s3://verikai-heart-risk-pipeline/data/processed_data/train.csv
→ Uploading to s3://verikai-heart-risk-pipeline/data/processed_data/validation.csv
✅ All splits uploaded successfully!


In [61]:
print(df["heart_attack"].unique())

[0 1]


In [13]:
#Config
bucket = "verikai-heart-risk-pipeline"
role = "arn:aws:iam::904233112003:role/SageMakerExecutionRole-rev"
region = "us-east-1"
prefix = "data/processed_data"

timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
output_path = f"s3://{bucket}/models/heart_attack/{timestamp}/output"
job_name = f"xgb-heart-{timestamp}"

#  SageMaker session 
session = sagemaker.Session(boto3.Session(region_name=region))
container_uri = sagemaker.image_uris.retrieve("xgboost", region, version="1.5-1")

In [15]:
#  Estimator setup 
xgb = Estimator(
    image_uri=container_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=output_path,
    sagemaker_session=session,
    base_job_name="xgb-heart"
)

xgb.set_hyperparameters(
    objective="binary:logistic",
    eval_metric="auc",
    num_round=100,
    max_depth=6,
    eta=0.2,
    subsample=0.8,
    colsample_bytree=0.8
)

In [17]:
train_input = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/train.csv",
    content_type="text/csv"
)

val_input = TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/validation.csv",
    content_type="text/csv"
)

In [19]:
# X_train = validate_features(X_train, "data/derived/feature_list_final_curated.json") -> Schema validation

xgb.fit({"train": train_input, "validation": val_input}, job_name=job_name)

print(f"SageMaker training job submitted: {job_name}")
print(f"Output model will be in: {output_path}/{job_name}/output/")

INFO:sagemaker:Creating training-job with name: xgb-heart-20250511T182913Z


2025-05-11 18:29:25 Starting - Starting the training job...
...5-05-11 18:29:41 Starting - Preparing the instances for training
...5-05-11 18:30:04 Downloading - Downloading input data
......5-11 18:30:59 Downloading - Downloading the training image
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-05-11 18:31:55.447 ip-10-0-161-35.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-05-11 18:31:55.469 ip-10-0-161-35.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-05-11:18:31:55:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-05-11:18:31:55:INFO] Failed to parse hyperparameter eval_metric value auc to Json.[0m
[34mReturning the value itself[0m
[34m[2025-05-11:18:31:55:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-05-11:18:31:55:INFO] No GPUs detected (normal if no gpus installed)[0m
[3

In [106]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter
now = datetime.now().strftime("%y%m%d-%H%M")
tuning_job_name = f"sagemaker-xgboost-{now}"

os.makedirs("data", exist_ok=True)
with open("data/latest_sagemaker_tuning_job.txt", "w") as f:
    f.write(tuning_job_name)
xgb_hpo = HyperparameterTuner(
    estimator          = xgb,
    objective_metric_name="validation:auc",
    hyperparameter_ranges={
        "eta": ContinuousParameter(0.1, 0.3),
        "max_depth": IntegerParameter(4, 10),
    },
    max_jobs=10,
    max_parallel_jobs=2,
    base_tuning_job_name=tuning_job_name
)
xgb_hpo.fit({"train": train_input, "validation": val_input})
xgb_hpo.wait()

best_job_name = xgb_hpo.best_training_job()
print("Best training job:", best_job_name)

INFO:sagemaker:Creating hyperparameter tuning job with name: sagemaker-xgboost-25-250510-1422


.....................................................................................................................................................!
!
Best training job: sagemaker-xgboost-25-250510-1422-008-0a47af4b


In [109]:
best_estimator = sagemaker.estimator.Estimator.attach(best_job_name)
print("Best model S3 path:", best_estimator.model_data)


2025-05-10 18:33:35 Starting - Found matching resource for reuse
2025-05-10 18:33:35 Downloading - Downloading the training image
2025-05-10 18:33:35 Training - Training image download completed. Training in progress.
2025-05-10 18:33:35 Uploading - Uploading generated training model
2025-05-10 18:33:35 Completed - Resource reused by training job: sagemaker-xgboost-25-250510-1422-010-07c2e525
Best model S3 path: s3://verikai-heart-risk-pipeline/models/heart_attack/20250510T063154Z/output/sagemaker-xgboost-25-250510-1422-008-0a47af4b/output/model.tar.gz


In [112]:
# sm = boto3.client("sagemaker")
# tuning_jobs = sm.list_hyper_parameter_tuning_jobs(MaxResults=10)
# for job in tuning_jobs["HyperParameterTuningJobSummaries"]:
#     print(job["HyperParameterTuningJobName"])


sagemaker-xgboost-250510-1352
sagemaker-xgboost-250510-1327
sagemaker-xgboost-250510-1305
sagemaker-xgboost-25-250510-1422


# ───────────────────────────────────────────────────────────────
# Evaluate & MLflow-log best SageMaker model (with plots & categories)
# ───────────────────────────────────────────────────────────────


In [163]:

# import os, tempfile, tarfile, boto3, warnings
# import pandas as pd
# import xgboost as xgb
# import matplotlib.pyplot as plt
# import seaborn as sns

from io import StringIO
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay
)

import mlflow
import mlflow.xgboost
from sagemaker.estimator import Estimator

# MLflow setup
mlflow.set_tracking_uri("http://localhost:5001")
mlflow.set_experiment("Sagemaker-driven-heart-risk-model")

# Attach best estimator and download model artifact
best_job_name  = xgb_hpo.best_training_job()
best_estimator = Estimator.attach(best_job_name)
print("🛠️  Best training job:", best_job_name)

bucket      = "verikai-heart-risk-pipeline"
model_key   = best_estimator.model_data.split(f"s3://{bucket}/")[-1]

tmpdir      = tempfile.mkdtemp()
local_tar   = os.path.join(tmpdir, "model.tar.gz")
boto3.client("s3").download_file(bucket, model_key, local_tar)

# Suppress deprecation warning on tar
with tarfile.open(local_tar) as tar:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        tar.extractall(path=tmpdir)

bst = xgb.Booster()
bst.load_model(os.path.join(tmpdir, "xgboost-model"))
print("Booster loaded")

# Load and realign validation data
s3         = boto3.client("s3")
raw_prefix = "data/processed_data"
val_key    = f"{raw_prefix}/validation.csv"

# fetch canonical feature order from full original CSV
resp      = s3.list_objects_v2(Bucket=bucket, Prefix=raw_prefix + "/")
all_csv   = [o["Key"] for o in resp.get("Contents",[]) if o["Key"].endswith(".csv")]
orig_csv  = sorted([k for k in all_csv if "train.csv" not in k and "validation.csv" not in k], reverse=True)[0]
print("Using header from:", orig_csv)

header_df = pd.read_csv(StringIO(s3.get_object(Bucket=bucket, Key=orig_csv)["Body"].read().decode("utf-8")), nrows=0)
cols      = header_df.columns.tolist()
cols.remove("heart_attack")

# load val split (no header) and assign column names
val_obj = s3.get_object(Bucket=bucket, Key=val_key)
df_val  = pd.read_csv(StringIO(val_obj["Body"].read().decode("utf-8")), header=None)

if df_val.shape[1] != len(cols) + 1:
    raise ValueError(f"Validation has {df_val.shape[1]} cols; expected {len(cols)+1}")
df_val.columns = ["heart_attack", *cols]

X_val = df_val[cols].copy()
y_val = df_val["heart_attack"].astype(int).values
print(f"Loaded validation: {X_val.shape[0]:,} rows × {X_val.shape[1]:,} features")

#Handle categorical features
for col in X_val.select_dtypes(include="object"):
    X_val[col] = X_val[col].astype("category")

dval = xgb.DMatrix(X_val, enable_categorical=True)

#Predict
y_proba = bst.predict(dval)
y_pred  = (y_proba >= 0.5).astype(int)

# Start MLflow run
os.makedirs("plots", exist_ok=True)

with mlflow.start_run(run_name=best_job_name):
    # 6.1 Log hyperparameters
    for k, v in best_estimator.hyperparameters().items():
        mlflow.log_param(k, v)

    # 6.2 Log metrics
    metrics = {
        "accuracy":  accuracy_score(y_val, y_pred),
        "precision": precision_score(y_val, y_pred),
        "recall":    recall_score(y_val, y_pred),
        "f1_score":  f1_score(y_val, y_pred),
        "roc_auc":   roc_auc_score(y_val, y_proba)
    }
    for name, val in metrics.items():
        mlflow.log_metric(name, val)
    print("Logged metrics:", metrics)

    # Log model artifact
    mlflow.xgboost.log_model(bst, artifact_path="xgb-model")
    print("Logged model to MLflow")

    RocCurveDisplay.from_predictions(y_val, y_proba)
    plt.title("ROC Curve — Validation")
    roc_path = "plots/roc_curve.png"
    plt.savefig(roc_path, bbox_inches="tight")
    plt.close()
    mlflow.log_artifact(roc_path)

    # Save + log confusion matrix
    cm = confusion_matrix(y_val, y_pred)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["Pred 0", "Pred 1"],
                yticklabels=["True 0", "True 1"])
    plt.title("Confusion Matrix — Validation")
    plt.xlabel("Predicted"); plt.ylabel("Actual")
    cm_path = "plots/confusion_matrix_annotated.png"
    plt.savefig(cm_path, bbox_inches="tight")
    plt.close()
    mlflow.log_artifact(cm_path)

print(f"""
Validation metrics
------------------
Accuracy : {metrics['accuracy']:.4f}
Precision: {metrics['precision']:.4f}
Recall   : {metrics['recall']:.4f}
F1-score : {metrics['f1_score']:.4f}
ROC AUC  : {metrics['roc_auc']:.4f}
""")

2025/05/10 17:12:30 INFO mlflow.tracking.fluent: Experiment with name 'Sagemaker-driven-heart-risk-model' does not exist. Creating a new experiment.



2025-05-10 18:33:35 Starting - Found matching resource for reuse
2025-05-10 18:33:35 Downloading - Downloading the training image
2025-05-10 18:33:35 Training - Training image download completed. Training in progress.
2025-05-10 18:33:35 Uploading - Uploading generated training model
2025-05-10 18:33:35 Completed - Resource reused by training job: sagemaker-xgboost-25-250510-1422-010-07c2e525
🛠️  Best training job: sagemaker-xgboost-25-250510-1422-008-0a47af4b
✅ Booster loaded
✏️  Using header from: data/processed_data/phase4_pruned_all_latest_0445.csv
✅ Loaded validation: 86,151 rows × 44 features
✅ Logged metrics: {'accuracy': 0.9705400981996727, 'precision': 0.6721049264235445, 'recall': 0.8959488272921109, 'f1_score': 0.7680497166879913, 'roc_auc': 0.9879486119288079}


  xgb_model.save_model(model_data_path)


✅ Logged model to MLflow
🏃 View run sagemaker-xgboost-25-250510-1422-008-0a47af4b at: http://localhost:5001/#/experiments/3/runs/267c913ded474285b8b17456f34c9b99
🧪 View experiment at: http://localhost:5001/#/experiments/3

Validation metrics
------------------
Accuracy : 0.9705
Precision: 0.6721
Recall   : 0.8959
F1-score : 0.7680
ROC AUC  : 0.9879



In [None]:
import json

metadata = {
    "model_artifact_s3": f"s3://{bucket}/{model_key}",
    "training_job_name": best_job_name
}

with open("/opt/airflow/out/notebook_output_metadata.json", "w") as f:
    json.dump(metadata, f)