### Notebook 07: Model Evaluation and Model Registry



1.	Load ground truth (y_test) from test feature parquet in S3
2.	Read Batch Transform predictions from S3 output
3.	Compute evaluation metrics (RMSE, MAE, R²)
4.	Save evaluation metrics as metrics.json to S3
5.	Create / reuse SageMaker Model Package Group
6.	Register trained model artifact in SageMaker Model Registry with metrics
7.	Set model status to PendingManualApproval for governance

In [4]:
import sys

import boto3
import numpy as np
import pandas as pd

sys.path.append("..")

from config.config import (
    BUCKET_NAME,
    S3_PREFIX,
    TARGET_COLUMN,
    MODEL_NAME,
    AWS_REGION
)

# Short alias for the configured S3 bucket name; the S3 reads/writes in the
# cells below refer to it as `bucket`.
bucket = BUCKET_NAME

In [None]:
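# NOTE: disabled alternative to hard-coding `job_name` / `prefix` -- the
# commented code below loads both from the latest run-metadata JSON in S3.
# import boto3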
# import json

# s3 = boto3.client("s3", region_name=AWS_REGION)

# prefix = "nfci-xgboost-regression"

# metadata_key = f"{prefix}/run_metadata/latest.json"

# obj = s3.get_object(Bucket=bucket, Key=metadata_key)
# run_metadata = json.loads(obj["Body"].read())

# job_name = run_metadata["training_job_name"]
# prefix   = run_metadata["training_prefix"]

# print("Loaded job_name:", job_name)
# print("Loaded prefix:", prefix)

In [1]:
import os
from dotenv import load_dotenv

# Merge any variables declared in a .env file into the process environment.
load_dotenv()

# SageMaker execution role; it has to be supplied via the environment.
ROLE_ARN = os.environ.get("SAGEMAKER_ROLE_ARN")

if ROLE_ARN is None or ROLE_ARN == "":
    missing_role_message = (
        "SAGEMAKER_ROLE_ARN not found.\n"
        "Ensure it is set in .env or environment variables."
    )
    raise RuntimeError(missing_role_message)

print("Using ROLE_ARN:", ROLE_ARN)

Using ROLE_ARN: arn:aws:iam::222634372778:role/service-role/AmazonSageMaker-ExecutionRole-20250603T080776


In [None]:
# NOTE(review): this cell repeats the config import from the first code cell
# verbatim; it is only needed if that cell was not run.
import sys
sys.path.append("..")

from config.config import (
    BUCKET_NAME,
    S3_PREFIX,
    TARGET_COLUMN,
    MODEL_NAME,
    AWS_REGION
)

bucket = BUCKET_NAME

In [14]:
import sys

sys.path.append("..")  # the `config` package is imported relative to the parent directory

from config.config import BUCKET_NAME

# Identify the run being evaluated. Later cells build the batch-output and
# evaluation S3 keys from `prefix` and `job_name`.
job_name = "xgb-nfci-reg-2026-02-02-07-51-47"  # replace with your actual training job name
prefix = "nfci-xgboost-regression"  # has to match the prefix used by the training notebook
bucket = BUCKET_NAME

In [17]:
# Load the ground truth (y_test) from the test feature parquet in S3
test_s3 = f"s3://{bucket}/{S3_PREFIX['test']}/features.parquet"

# Read the parquet and, in the same step, discard the date columns if they
# are present (errors="ignore" makes the drop a no-op when they are absent).
df_test = pd.read_parquet(test_s3).drop(
    columns=["date", "Datetime"],
    errors="ignore",
)

# Target column as a plain array, ready for the metric functions
y_true = df_test[TARGET_COLUMN].values
print("y_true shape:", y_true.shape)

y_true shape: (1100,)


In [18]:
# Find the Batch Transform output file(s) for this job in S3
s3 = boto3.client("s3")
out_prefix = f"{prefix}/batch-output/{job_name}/"

# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# page of results instead of trusting one response.
paginator = s3.get_paginator("list_objects_v2")
keys = [
    obj["Key"]
    for page in paginator.paginate(Bucket=bucket, Prefix=out_prefix)
    for obj in page.get("Contents", [])  # "Contents" is absent when a page is empty
]

print("Files under batch-output:")
for k in keys:
    print("-", k)

Files under batch-output:
- nfci-xgboost-regression/batch-output/xgb-nfci-reg-2026-02-02-07-51-47/test.csv.out


In [19]:
# Pick the prediction file among the listed keys.
# Match on the ".out" suffix only: a substring test for "out" would accept
# every key here, because the "batch-output/" folder name itself contains it.
out_keys = [k for k in keys if k.endswith(".out")]

if not out_keys:
    # Fail with an actionable message instead of a bare StopIteration.
    raise FileNotFoundError(
        f"No '.out' prediction file found among {len(keys)} listed object(s); "
        "check that the Batch Transform job finished and that prefix/job_name are correct."
    )

# When several output files exist, keep the first one in listing order.
pred_key = out_keys[0]
print("Using prediction file:", pred_key)

Using prediction file: nfci-xgboost-regression/batch-output/xgb-nfci-reg-2026-02-02-07-51-47/test.csv.out


In [20]:
# Download the Batch Transform output object and decode it to text
obj = s3.get_object(Bucket=bucket, Key=pred_key)
raw_bytes = obj["Body"].read()
pred_text = raw_bytes.decode("utf-8").strip()

# The XGBoost container writes one prediction per line
y_pred = np.array(list(map(float, pred_text.splitlines())))
print("y_pred shape:", y_pred.shape)

y_pred shape: (1100,)


In [7]:
import boto3

s3 = boto3.client("s3")

# Quick check of what exists under this job's batch-output folder
out_prefix = f"{prefix}/batch-output/{job_name}/"
resp = s3.list_objects_v2(Bucket=bucket, Prefix=out_prefix)

# "Contents" is missing from the response when nothing matches the prefix
for key in (entry["Key"] for entry in resp.get("Contents", [])):
    print(key)

nfci-xgboost-regression/batch-output/xgb-nfci-reg-2026-02-02-07-51-47/test.csv.out


In [22]:
# Compute evaluation metrics (RMSE, MAE, R²) for the batch predictions
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
mae = mean_absolute_error(y_true, y_pred)
# R² may come out negative: that means the predictions fit worse than
# always predicting the mean of y_true.
r2 = r2_score(y_true, y_pred)

# Cast to built-in floats so the dict can be serialised with json later
metrics = {
    name: float(value)
    for name, value in (("rmse", rmse), ("mae", mae), ("r2", r2))
}

metrics

{'rmse': 0.0511026203345222,
 'mae': 0.038589602195566355,
 'r2': -0.2429530794626118}

In [None]:
# Side-by-side look at the first ten ground-truth values and predictions
preview = pd.DataFrame(
    {
        "y_true": y_true[:10],
        "y_pred": y_pred[:10],
    }
)
preview

Unnamed: 0,y_true,y_pred
0,-0.50232,-0.569124
1,-0.54393,-0.463803
2,-0.56916,-0.570779
3,-0.54224,-0.572127
4,-0.56609,-0.571256
5,-0.6007,-0.571896
6,-0.59566,-0.571528
7,-0.55303,-0.571861
8,-0.45946,-0.562891
9,-0.4238,-0.437722


In [24]:
# Save metrics to S3 (good for audit + Model Registry)

import json
import time

# Store the metrics next to the other artefacts of this job so they can be
# audited and referenced from the Model Registry entry.
metrics_key = f"{prefix}/evaluation/{job_name}/metrics.json"
metrics_body = json.dumps(metrics, indent=2).encode("utf-8")

s3.put_object(
    Bucket=bucket,
    Key=metrics_key,
    Body=metrics_body,
    ContentType="application/json",
)

metrics_s3_uri = f"s3://{bucket}/{metrics_key}"
print("Saved metrics to:", metrics_s3_uri)

Saved metrics to: s3://nfci-forecasting-222634372778/nfci-xgboost-regression/evaluation/xgb-nfci-reg-2026-02-02-07-51-47/metrics.json


In [27]:
# Register in SageMaker Model Registry (with metrics), creating a model package group

# Create the Model Package Group that will hold the registered model
# versions, or reuse it when it already exists (so the cell can be re-run).
sm = boto3.client("sagemaker")

model_package_group_name = f"{MODEL_NAME}-group"  # e.g., nfci-xgboost-group

try:
    sm.create_model_package_group(
        ModelPackageGroupName=model_package_group_name,
        ModelPackageGroupDescription="NFCI XGBoost regression models"
    )
    print("Created ModelPackageGroup:", model_package_group_name)
except sm.exceptions.ResourceInUse:
    print("ModelPackageGroup already exists:", model_package_group_name)
except sm.exceptions.ClientError as err:
    # An existing group can also be reported as a generic ValidationException
    # whose message says the group "already exists"; treat only that case as
    # success and re-raise everything else (permissions, throttling, ...).
    error = err.response.get("Error", {})
    already_exists = (
        error.get("Code") == "ValidationException"
        and "already exists" in error.get("Message", "").lower()
    )
    if not already_exists:
        raise
    print("ModelPackageGroup already exists:", model_package_group_name)

Created ModelPackageGroup: nfci-xgboost-group


In [28]:
# register model artifact from training group

# Ask SageMaker where the training job stored its model artifact
desc = sm.describe_training_job(TrainingJobName=job_name)
artifacts = desc["ModelArtifacts"]
model_artifact = artifacts["S3ModelArtifacts"]
print("Model artifact:", model_artifact)

Model artifact: s3://nfci-forecasting-222634372778/nfci-xgboost-regression/output/xgb-nfci-reg-2026-02-02-07-51-47/xgb-nfci-reg-2026-02-02-07-51-47/output/model.tar.gz


In [30]:
# register a model package (XGBoost image inference container):

import sagemaker
from sagemaker import image_uris

sess = sagemaker.Session()
region = sess.boto_region_name

# URI of the XGBoost 1.7-1 inference image for the session's region
inference_image = image_uris.retrieve(
    framework="xgboost", region=region, version="1.7-1", image_scope="inference",
)

# Attach the metrics.json saved earlier as the package's model-quality statistics
metrics_s3_uri = f"s3://{bucket}/{metrics_key}"
model_metrics = {
    "ModelQuality": {
        "Statistics": {"ContentType": "application/json", "S3Uri": metrics_s3_uri},
    },
}

# One container serving the trained artifact; CSV in, CSV out
container = {
    "Image": inference_image,
    "ModelDataUrl": model_artifact,
    "Environment": {},
}
inference_spec = {
    "Containers": [container],
    "SupportedContentTypes": ["text/csv"],
    "SupportedResponseMIMETypes": ["text/csv"],
}

response = sm.create_model_package(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageDescription=f"NFCI XGBoost regression - {job_name}",
    InferenceSpecification=inference_spec,
    ModelMetrics=model_metrics,
    # Registered as pending so a reviewer has to approve it
    # (or "Approved" if your rubric allows)
    ModelApprovalStatus="PendingManualApproval",
)

model_package_arn = response["ModelPackageArn"]
print(" ModelPackageArn:", model_package_arn)

 ModelPackageArn: arn:aws:sagemaker:us-east-1:222634372778:model-package/nfci-xgboost-group/2
