
1.	Load engineered train/validation/test feature splits from S3 (Parquet).
2.	Drop non-numeric columns (e.g., date, Datetime) and prepare XGBoost CSV format (target first for train/val, features only for test).
3.	Upload the generated CSV files to S3 under a training prefix.
4.	Train an XGBoost regression model in SageMaker using train + validation channels.
5.	Run Batch Transform on the test set to generate predictions and store outputs in S3.
6.	Capture and print the training job name and batch output S3 path for downstream evaluation/registry notebooks.



In [5]:
import os
from dotenv import load_dotenv

# Load .env from project root
load_dotenv()

ROLE_ARN = os.getenv("SAGEMAKER_ROLE_ARN")

if not ROLE_ARN:
    raise RuntimeError(
        "SAGEMAKER_ROLE_ARN not found.\n"
        "Ensure it is set in .env or environment variables."
    )

print("Using ROLE_ARN:", ROLE_ARN)

Using ROLE_ARN: arn:aws:iam::222634372778:role/service-role/AmazonSageMaker-ExecutionRole-20250603T080776


In [6]:
import os, sys, time
import boto3
import pandas as pd
import sagemaker
from sagemaker.inputs import TrainingInput
from time import gmtime, strftime

sys.path.append("..")
from config.config import AWS_REGION, BUCKET_NAME, S3_PREFIX, TARGET_COLUMN, MODEL_NAME, BUCKET_NAME, SAGEMAKER_ROLE_NAME

sess = sagemaker.Session()
region = sess.boto_region_name

bucket = BUCKET_NAME
prefix = "nfci-xgboost-regression"   # keep this notebook-specific prefix

print("Region:", region)
print("Bucket:", bucket)
print("Prefix:", prefix)

Region: us-east-1
Bucket: nfci-forecasting-222634372778
Prefix: nfci-xgboost-regression


In [3]:
import sagemaker
print("sagemaker version:", sagemaker.__version__)

sagemaker version: 2.256.1


In [5]:
# Load Parquet splits from S3

train_s3 = f"s3://{bucket}/{S3_PREFIX['train']}/features.parquet"
val_s3   = f"s3://{bucket}/{S3_PREFIX['validation']}/features.parquet"
test_s3  = f"s3://{bucket}/{S3_PREFIX['test']}/features.parquet"

df_train = pd.read_parquet(train_s3)
df_val   = pd.read_parquet(val_s3)
df_test  = pd.read_parquet(test_s3)

print(df_train.shape, df_val.shape, df_test.shape)
print("Target:", TARGET_COLUMN)

(6800, 80) (1100, 80) (1100, 80)
Target: NFCI


In [10]:
# Drop non-numeric columns BEFORE generating CSVs
DROP_COLS = ["date", "Datetime"]

df_train = df_train.drop(columns=DROP_COLS, errors="ignore")
df_val   = df_val.drop(columns=DROP_COLS, errors="ignore")
df_test  = df_test.drop(columns=DROP_COLS, errors="ignore")

In [11]:
## Convert to CSV in XGBoost required format (target first)
def to_xgb_train_csv(df: pd.DataFrame, target: str, out_file: str):
    feature_cols = [c for c in df.columns if c != target]
    df_xgb = df[[target] + feature_cols].copy()
    df_xgb.to_csv(out_file, index=False, header=False)
    return out_file

def to_xgb_infer_csv(df: pd.DataFrame, target: str, out_file: str):
    feature_cols = [c for c in df.columns if c != target]
    df_xgb = df[feature_cols].copy()
    df_xgb.to_csv(out_file, index=False, header=False)
    return out_file

In [12]:
train_file = "train.csv"
val_file   = "validation.csv"
test_file  = "test.csv"

to_xgb_train_csv(df_train, TARGET_COLUMN, "train.csv")
to_xgb_train_csv(df_val,   TARGET_COLUMN, "validation.csv")
to_xgb_infer_csv(df_test,  TARGET_COLUMN, "test.csv")   

train_s3_uri = sess.upload_data(train_file, key_prefix=f"{prefix}/train")
val_s3_uri   = sess.upload_data(val_file,   key_prefix=f"{prefix}/validation")
test_s3_uri  = sess.upload_data(test_file,  key_prefix=f"{prefix}/test")

print("Uploaded train:", train_s3_uri)
print("Uploaded val:", val_s3_uri)
print("Uploaded test:", test_s3_uri)

Uploaded train: s3://sagemaker-us-east-1-222634372778/nfci-xgboost-regression/train/train.csv
Uploaded val: s3://sagemaker-us-east-1-222634372778/nfci-xgboost-regression/validation/validation.csv
Uploaded test: s3://sagemaker-us-east-1-222634372778/nfci-xgboost-regression/test/test.csv


In [None]:
import boto3
import json

iam_client = boto3.client("iam")

ROLE_NAME = ROLE_ARN.split("/")[-1] 
# BUCKET_NAME = "nfci-forecasting-222634372778"

policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "AllowListBucket",
            "Effect": "Allow",
            "Action": ["s3:ListBucket"],
            "Resource": f"arn:aws:s3:::{BUCKET_NAME}"
        },
        {
            "Sid": "AllowReadWriteObjects",
            "Effect": "Allow",
            "Action": [
                "s3:GetObject",
                "s3:PutObject",
                "s3:DeleteObject",
                "s3:AbortMultipartUpload",
                "s3:ListMultipartUploadParts",
                "s3:ListBucketMultipartUploads"
            ],
            "Resource": f"arn:aws:s3:::{BUCKET_NAME}/*"
        }
    ]
}

iam_client.put_role_policy(
    RoleName=ROLE_NAME,
    PolicyName="SageMakerS3TrainingArtifactsAccess",
    PolicyDocument=json.dumps(policy_document)
)

print("Inline policy attached successfully")

Inline policy attached successfully


In [14]:
# Training job

job_name = "xgb-nfci-reg-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_location = f"s3://{bucket}/{prefix}/output/{job_name}"

image = sagemaker.image_uris.retrieve(
    framework="xgboost", region=region, version="1.7-1"
)

sm_estimator = sagemaker.estimator.Estimator(
    image_uri=image,
    role=ROLE_ARN,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=output_location,
    sagemaker_session=sess,
)

sm_estimator.set_hyperparameters(
    objective="reg:squarederror",
    eval_metric="rmse",
    max_depth=5,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    num_round=300,
)

train_data = TrainingInput(train_s3_uri, content_type="text/csv")
validation_data = TrainingInput(val_s3_uri, content_type="text/csv")

sm_estimator.fit({"train": train_data, "validation": validation_data}, job_name=job_name, logs=True)

INFO:sagemaker:Creating training-job with name: xgb-nfci-reg-2026-02-02-07-51-47


2026-02-02 07:49:09 Starting - Starting the training job...
2026-02-02 07:49:26 Starting - Preparing the instances for training...
2026-02-02 07:50:12 Downloading - Downloading the training image......
  import pkg_resources
[2026-02-02 07:51:14.338 ip-10-0-167-41.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2026-02-02 07:51:14.399 ip-10-0-167-41.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2026-02-02:07:51:14:INFO] Imported framework sagemaker_xgboost_container.training
[2026-02-02:07:51:14:INFO] Failed to parse hyperparameter eval_metric value rmse to Json.
Returning the value itself
[2026-02-02:07:51:14:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.
Returning the value itself
[2026-02-02:07:51:14:INFO] No GPUs detected (normal if no gpus installed)
[2026-02-02:07:51:14:INFO] Running XGBoost Sagemaker in algorithm mode
[2026-02-02:07:51:14:INFO] Determined 0 GPU(s) available on the instance.
[

In [15]:
# Take the trained model artifact from sm_estimator and Prepare to write predictions to the given S3 path
transformer = sm_estimator.transformer(
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=f"s3://{bucket}/{prefix}/batch-output/{job_name}",
)

transformer.transform(test_s3_uri, content_type="text/csv", split_type="Line")
transformer.wait()
print("Batch output:", transformer.output_path)
local_test_csv = "test.csv"   

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2026-02-02-07-54-45-582
INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2026-02-02-07-54-46-259


  import pkg_resources
[2026-02-02:07:57:01:INFO] No GPUs detected (normal if no gpus installed)
[2026-02-02:07:57:01:INFO] No GPUs detected (normal if no gpus installed)
[2026-02-02:07:57:01:INFO] nginx config: 
worker_processes auto;
daemon off;
pid /tmp/nginx.pid;
error_log  /dev/stderr;
worker_rlimit_nofile 4096;
events {
  worker_connections 2048;
}
  import pkg_resources
[2026-02-02:07:57:01:INFO] No GPUs detected (normal if no gpus installed)
[2026-02-02:07:57:01:INFO] No GPUs detected (normal if no gpus installed)
[2026-02-02:07:57:01:INFO] nginx config: 
worker_processes auto;
daemon off;
pid /tmp/nginx.pid;
error_log  /dev/stderr;
worker_rlimit_nofile 4096;
events {
  worker_connections 2048;
}
http {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(

#### ### record the details after model training as metadata in s3

In [None]:
# import json
# import boto3

# s3 = boto3.client("s3", region_name=AWS_REGION)

# run_metadata = {
#     "training_job_name": job_name,
#     "training_prefix": prefix,
#     "bucket": bucket,
#     "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
# }

# metadata_key = f"{prefix}/run_metadata/latest.json"

# s3.put_object(
#     Bucket=bucket,
#     Key=metadata_key,
#     Body=json.dumps(run_metadata, indent=2).encode("utf-8"),
#     ContentType="application/json"
# )

# print("Saved run metadata to:")
# print(f"s3://{bucket}/{metadata_key}")