# รัน SKLearnProcessor เพื่อทำ Feature Transformation + Feature Store

In [1]:
from time import gmtime, strftime

import boto3
import pandas as pd
import sagemaker
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

# Shared SageMaker context for the rest of the notebook: the session, its
# default S3 bucket, the execution role, and the region of the default boto3 session.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Echo the resolved environment so the reader can confirm where the job will run.
for label, value in (("Region:", region), ("Bucket:", bucket)):
    print(label, value)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Region: us-east-1
Bucket: sagemaker-us-east-1-423623839320


In [2]:
# Raw input data (CSV จาก step Ingestion)
raw_input_data_s3_uri = f"s3://{bucket}/retail-demand-forecasting/csv/"
!aws s3 ls $raw_input_data_s3_uri

2025-12-01 07:07:27      37284 retail-demand-forecasting.csv


In [3]:
# Hyper-parameters for splitting
processing_instance_type = "ml.c5.2xlarge"
processing_instance_count = 1
train_split_percentage = 0.90
validation_split_percentage = 0.05
test_split_percentage = 0.05
balance_dataset = True  # ตอนนี้แค่ส่งต่อเป็น argument เฉย ๆ

# สร้างชื่อ Feature Group และ Offline Store prefix (มี timestamp)
timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
feature_group_name = f"retail-demand-feature-group-{timestamp}"
feature_store_offline_prefix = f"s3://{bucket}/feature-store/retail-demand/offline-store-{timestamp}"

print("Feature Group Name:", feature_group_name)
print("Feature Store Offline Prefix:", feature_store_offline_prefix)

# เก็บค่าไว้ใช้ในโน้ตบุ๊กอื่น
%store feature_group_name
%store feature_store_offline_prefix



Feature Group Name: retail-demand-feature-group-2025-12-03-07-45-13
Feature Store Offline Prefix: s3://sagemaker-us-east-1-423623839320/feature-store/retail-demand/offline-store-2025-12-03-07-45-13
Stored 'feature_group_name' (str)
Stored 'feature_store_offline_prefix' (str)


# SKLearnProcessor

In [6]:
# Create the SKLearnProcessor that will run the preprocessing script.
# Instance type and count come from the configuration cell above.
processor = SKLearnProcessor(
    role=role,
    framework_version="0.23-1",
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    # Hard stop for the job: 7200 seconds = 2 hours.
    max_runtime_in_seconds=7200,
    # Exposes the region to the container as an environment variable
    # (presumably so AWS clients created inside the script resolve it - verify in the script).
    env={"AWS_DEFAULT_REGION": region},
)



In [7]:
# Raw CSV prefix (s3://.../retail-demand-forecasting/csv/) mounted into the container.
raw_data_input = ProcessingInput(
    input_name="raw-input-data",
    source=raw_input_data_s3_uri,
    destination="/opt/ml/processing/input/data/",
    s3_data_distribution_type="ShardedByS3Key",
)

# One output channel per dataset split: (output name, container directory to upload).
output_channels = [
    ("retail-train", "/opt/ml/processing/output/retail_product/train"),
    ("retail-validation", "/opt/ml/processing/output/retail_product/validation"),
    ("retail-test", "/opt/ml/processing/output/retail_product/test"),
]
split_outputs = [
    ProcessingOutput(
        output_name=channel_name,
        source=container_dir,
        s3_upload_mode="EndOfJob",
    )
    for channel_name, container_dir in output_channels
]

# Command-line arguments for the script; every value is passed as a string.
# NOTE(review): balance_dataset is sent as "True"/"False" - confirm the script
# parses that text into a boolean instead of treating any non-empty string as true.
script_arguments = [
    "--train-split-percentage", str(train_split_percentage),
    "--validation-split-percentage", str(validation_split_percentage),
    "--test-split-percentage", str(test_split_percentage),
    "--balance-dataset", str(balance_dataset),
    "--feature-store-offline-prefix", feature_store_offline_prefix,  # must be of the form s3://bucket/prefix
    "--feature-group-name", feature_group_name,
]

# Launch the job and block until it finishes, streaming its logs into this cell.
processor.run(
    code="preprocess-scikit-retail-feature-store.py",
    inputs=[raw_data_input],
    outputs=split_outputs,
    arguments=script_arguments,
    logs=True,
    wait=True,
)


INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2025-12-03-07-46-41-610


.........Collecting sagemaker==2.24.1
  Downloading sagemaker-2.24.1.tar.gz (397 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 397.4/397.4 kB 36.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting attrs
  Downloading attrs-24.2.0-py3-none-any.whl (63 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.0/63.0 kB 10.9 MB/s eta 0:00:00
Collecting google-pasta
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.5/57.5 kB 10.0 MB/s eta 0:00:00
Collecting protobuf3-to-dict>=0.1.5
  Downloading protobuf3-to-dict-0.1.5.tar.gz (3.5 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting smdebug_rulesconfig==1.0.1
  Downloading smdebug_rulesconfig-1.0.1-py2.py3-none-any.whl (20 kB)
Collecting packaging>=20.0
  Downloading packaging-24.0-py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
# ดึงชื่อ Processing Job
processing_job_name = processor.jobs[-1].describe()["ProcessingJobName"]
print("Processing job name:", processing_job_name)

running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    processing_job_name=processing_job_name, sagemaker_session=sess
)
processing_job_description = running_processor.describe()

# ดึง S3 URI ของ train / val / test ที่ออกจาก Processing Job
output_config = processing_job_description["ProcessingOutputConfig"]

for output in output_config["Outputs"]:
    if output["OutputName"] == "retail-train":
        processed_train_data_s3_uri = output["S3Output"]["S3Uri"]
    elif output["OutputName"] == "retail-validation":
        processed_validation_data_s3_uri = output["S3Output"]["S3Uri"]
    elif output["OutputName"] == "retail-test":
        processed_test_data_s3_uri = output["S3Output"]["S3Uri"]

print("Train S3 URI:      ", processed_train_data_s3_uri)
print("Validation S3 URI: ", processed_validation_data_s3_uri)
print("Test S3 URI:       ", processed_test_data_s3_uri)

# เก็บไว้ใช้ในขั้น Train (Step 5)
%store processed_train_data_s3_uri
%store processed_validation_data_s3_uri
%store processed_test_data_s3_uri


Processing job name: sagemaker-scikit-learn-2025-12-03-07-46-41-610
Train S3 URI:       s3://sagemaker-us-east-1-423623839320/sagemaker-scikit-learn-2025-12-03-07-46-41-610/output/retail-train
Validation S3 URI:  s3://sagemaker-us-east-1-423623839320/sagemaker-scikit-learn-2025-12-03-07-46-41-610/output/retail-validation
Test S3 URI:        s3://sagemaker-us-east-1-423623839320/sagemaker-scikit-learn-2025-12-03-07-46-41-610/output/retail-test
Stored 'processed_train_data_s3_uri' (str)
Stored 'processed_validation_data_s3_uri' (str)
Stored 'processed_test_data_s3_uri' (str)


In [None]:
sst = f"{processed_train_data_s3_uri}/train.csv"

!mkdir -p data
!aws s3 cp $sst data/train.csv

In [13]:
# Load the locally downloaded training split and preview its first rows as a
# quick sanity check. pandas (`pd`) is imported in the notebook's first cell
# together with the other imports.
df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,record_id,date,store_id,day_of_week,is_weekend,is_holiday,holiday_name,max_temp_c,rainfall_mm,is_hot_day,...,event_time,year,month,day,day_of_year,day_of_week_index,discount_amount,is_promo_or_holiday,high_demand,split_type
0,73,2024-03-13,1,Wednesday,0,0,,30.4,4.0,0,...,2024-03-13T00:00:00Z,2024,3,13,73,2,0.0,0,0,train
1,183,2024-03-23,2,Saturday,1,0,,30.6,0.2,0,...,2024-03-23T00:00:00Z,2024,3,23,83,5,0.0,0,0,train
2,132,2024-02-01,2,Thursday,0,0,,29.9,4.1,0,...,2024-02-01T00:00:00Z,2024,2,1,32,3,0.0,0,0,train
3,411,2024-01-11,5,Thursday,0,0,,26.1,0.0,0,...,2024-01-11T00:00:00Z,2024,1,11,11,3,3.792,1,1,train
4,194,2024-04-03,2,Wednesday,0,0,,39.3,1.8,1,...,2024-04-03T00:00:00Z,2024,4,3,94,2,2.962,1,1,train
