# Run XGBoost training with Profiler + MLflow tracking


In [1]:

import boto3
import sagemaker
from time import gmtime, strftime

from sagemaker.xgboost import XGBoost
from sagemaker.inputs import TrainingInput
from sagemaker.debugger import ProfilerRule, rule_configs, ProfilerConfig

# --- Session/Role/Region/Bucket ---
# A single SageMaker session is the source of truth for role, region and bucket.
sess = sagemaker.Session()
role = sagemaker.get_execution_role(sagemaker_session=sess)
# Take the region from the SageMaker session instead of a second, independent
# boto3 session, so the region and the default bucket always come from the
# same configuration.
region = sess.boto_region_name
bucket = sess.default_bucket()

print("Region:", region)
print("Bucket:", bucket)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Region: us-east-1
Bucket: sagemaker-us-east-1-423623839320


In [2]:
# --- Load processed train/val/test S3 URI from previous step ---
%store -r processed_train_data_s3_uri
%store -r processed_validation_data_s3_uri
%store -r processed_test_data_s3_uri

print("Train data S3:      ", processed_train_data_s3_uri)
print("Validation data S3: ", processed_validation_data_s3_uri)
print("Test data S3:       ", processed_test_data_s3_uri)


Train data S3:       s3://sagemaker-us-east-1-423623839320/sagemaker-scikit-learn-2025-12-03-07-46-41-610/output/retail-train
Validation data S3:  s3://sagemaker-us-east-1-423623839320/sagemaker-scikit-learn-2025-12-03-07-46-41-610/output/retail-validation
Test data S3:        s3://sagemaker-us-east-1-423623839320/sagemaker-scikit-learn-2025-12-03-07-46-41-610/output/retail-test


In [3]:
# --- Training channels ---
# Each channel wraps one processed S3 URI and is declared as CSV.
train_input, val_input, test_input = (
    TrainingInput(s3_data=uri, content_type="text/csv")
    for uri in (
        processed_train_data_s3_uri,
        processed_validation_data_s3_uri,
        processed_test_data_s3_uri,
    )
)

# --- Debugger Profiler configuration ---
# Collect system metrics every 500 ms (0.5 s).
profiler_config = ProfilerConfig(system_monitor_interval_millis=500)

# Built-in rule that produces the profiler report for the training job.
rules = [ProfilerRule.sagemaker(rule_configs.ProfilerReport())]

In [13]:


# --- Create XGBoost Estimator (script mode) ---
# A UTC timestamp (second resolution) gives each run its own S3 output prefix.
timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
output_path = f"s3://{bucket}/retail-demand/xgboost-model-{timestamp}/"

xgb_estimator = XGBoost(
    # Training code
    entry_point="training.py",  # our own script (with MLflow + saving feature_columns.json)
    source_dir="src",  # training.py + inference.py live in this folder
    framework_version="1.7-1",
    py_version="py3",
    # Hyperparameters are passed to training.py as argparse arguments
    hyperparameters=dict(
        max_depth=6,
        n_estimators=250,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=1.0,
        reg_lambda=1.0,
    ),
    # Infrastructure
    role=role,
    instance_count=1,
    # Switch to GPU ml.g4dn.xlarge if you have quota; otherwise ml.m5.2xlarge works
    instance_type="ml.m5.2xlarge",
    output_path=output_path,
    # Debugger / Profiler
    profiler_config=profiler_config,
    rules=rules,
    disable_profiler=False,
    enable_sagemaker_metrics=True,
)

# --- Launch training job ---
# Channel names map to the TrainingInput objects built in the previous cell.
xgb_estimator.fit(
    inputs=dict(
        train=train_input,
        validation=val_input,
        test=test_input,
    )
)



INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.2xlarge.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-12-06-13-01-48-299


2025-12-06 13:01:50 Starting - Starting the training job...
2025-12-06 13:02:12 Starting - Preparing the instances for trainingProfilerReport: InProgress
...
2025-12-06 13:02:52 Downloading - Downloading the training image......
  import pkg_resources
[2025-12-06 13:03:47.794 ip-10-2-77-224.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2025-12-06 13:03:47.863 ip-10-2-77-224.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2025-12-06:13:03:48:INFO] Imported framework sagemaker_xgboost_container.training
[2025-12-06:13:03:48:INFO] No GPUs detected (normal if no gpus installed)
[2025-12-06:13:03:48:INFO] Invoking user training script.
[2025-12-06:13:03:48:INFO] Module training does not provide a setup.py. 
Generating setup.py
[2025-12-06:13:03:48:INFO] Generating setup.cfg
[2025-12-06:13:03:48:INFO] Generating MANIFEST.in
[2025-12-06:13:03:48:INFO] Installing module with the following command:
/miniconda3/bin/python3 -m pip install . 

In [14]:
# Capture the job name and model artifact location from the fitted estimator.
training_job_name, model_artifact = (
    xgb_estimator.latest_training_job.job_name,
    xgb_estimator.model_data,
)

print("Training job name:", training_job_name)
print("Model artifact S3:", model_artifact)



Training job name: sagemaker-xgboost-2025-12-06-13-01-48-299
Model artifact S3: s3://sagemaker-us-east-1-423623839320/retail-demand/xgboost-model-2025-12-06-13-01-48/sagemaker-xgboost-2025-12-06-13-01-48-299/output/model.tar.gz


In [15]:
# Persist these values for later use in the Evaluate / Deploy steps
%store training_job_name
%store model_artifact


Stored 'training_job_name' (str)
Stored 'model_artifact' (str)
