In [1]:
import os
import sys
import json
import subprocess
from time import gmtime, strftime

import boto3
import sagemaker
import pandas as pd

from sagemaker import clarify
from sagemaker.session import Session

# ---------------------------------------------------------
# 1) Install MLflow + the SageMaker MLflow plugin
# ---------------------------------------------------------
# subprocess.check_call([sys.executable, "-m", "pip", "install", "mlflow==2.13.2", "sagemaker-mlflow==0.1.0"])

import mlflow as mlf
import sagemaker_mlflow  # activate SageMaker MLflow plugin

# ---------------------------------------------------------
# 2) SageMaker session / role / region
# ---------------------------------------------------------
# Create the SageMaker session first; the default bucket is looked up through it.
sess: Session = sagemaker.Session()
bucket = sess.default_bucket()

# Execution role and region are resolved separately from the session object.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Echo the resolved environment so the reader can confirm where the notebook runs.
print(f"Region: {region}")
print(f"Bucket: {bucket}")
print(f"Role: {role}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


  import pkg_resources  # noqa: TID251


Region: us-east-1
Bucket: sagemaker-us-east-1-423623839320
Role: arn:aws:iam::423623839320:role/service-role/SageMaker-ExecutionRole-20250705T232334


In [2]:

# โหลด S3 URI ของ train set ที่ preprocess แล้ว
%store -r processed_train_data_s3_uri
print("Processed train S3:", processed_train_data_s3_uri)

# ลองอ่าน header มาดู (เผื่อ debug)
train_csv_s3_path = processed_train_data_s3_uri.rstrip("/") + "/train.csv"
print("Sample train CSV:", train_csv_s3_path)

!mkdir -p data
!aws s3 cp $train_csv_s3_path data/train.csv

sample_df = pd.read_csv("data/train.csv", nrows=5)
print("Train columns:", sample_df.columns.tolist())



Processed train S3: s3://sagemaker-us-east-1-423623839320/sagemaker-scikit-learn-2025-12-03-07-46-41-610/output/retail-train
Sample train CSV: s3://sagemaker-us-east-1-423623839320/sagemaker-scikit-learn-2025-12-03-07-46-41-610/output/retail-train/train.csv
download: s3://sagemaker-us-east-1-423623839320/sagemaker-scikit-learn-2025-12-03-07-46-41-610/output/retail-train/train.csv to data/train.csv
Train columns: ['record_id', 'date', 'store_id', 'day_of_week', 'is_weekend', 'is_holiday', 'holiday_name', 'max_temp_c', 'rainfall_mm', 'is_hot_day', 'is_rainy_day', 'base_price', 'discount_pct', 'is_promo', 'promo_type', 'final_price', 'units_sold', 'event_time', 'year', 'month', 'day', 'day_of_year', 'day_of_week_index', 'discount_amount', 'is_promo_or_holiday', 'high_demand', 'split_type']


In [3]:
# ---------------------------------------------------------
# 3) Clarify DataConfig + BiasConfig
# ---------------------------------------------------------
# S3 location passed to Clarify as the output path for the bias report.
bias_report_prefix = "clarify/pretraining-bias-report"
bias_report_output_path = f"s3://{bucket}/{bias_report_prefix}"
print("Bias report output S3:", bias_report_output_path)

# Analyse the training split, which already carries the high_demand column.
data_config = clarify.DataConfig(
    dataset_type="text/csv",
    label="high_demand",                             # binary label created upstream
    s3_data_input_path=processed_train_data_s3_uri,  # train prefix (contains train.csv)
    s3_output_path=bias_report_output_path,
)

# Facet under study: is_weekend (0 = weekday, 1 = weekend); group of interest = 1.
# The positive outcome is high_demand == 1.
bias_config = clarify.BiasConfig(
    facet_name="is_weekend",
    facet_values_or_threshold=[1],
    label_values_or_threshold=[1],
)

# Single-instance processor that will execute the bias analysis job.
clarify_processor = clarify.SageMakerClarifyProcessor(
    sagemaker_session=sess,
    role=role,
    instance_type="ml.c5.2xlarge",
    instance_count=1,
)



Bias report output S3: s3://sagemaker-us-east-1-423623839320/clarify/pretraining-bias-report


In [4]:
# ---------------------------------------------------------
# 4) Helper parse_s3_uri
# ---------------------------------------------------------
def parse_s3_uri(s3_uri: str):
    """Split an ``s3://bucket/key`` URI into its bucket and key components.

    Args:
        s3_uri: URI of the form ``s3://<bucket>`` or ``s3://<bucket>/<key>``.

    Returns:
        A ``(bucket_name, key_prefix)`` tuple. ``key_prefix`` is everything
        after the first ``/`` that follows the bucket name, and is ``""`` when
        the URI names only a bucket.

    Raises:
        ValueError: If ``s3_uri`` does not start with ``s3://`` or its bucket
            component is empty (e.g. ``"s3://"`` or ``"s3:///key"``).
    """
    scheme = "s3://"
    if not s3_uri.startswith(scheme):
        raise ValueError(f"Not a valid S3 URI: {s3_uri}")
    bucket_name, _, key_prefix = s3_uri[len(scheme):].partition("/")
    # An empty bucket would only fail later, deep inside the S3 client; reject it here.
    if not bucket_name:
        raise ValueError(f"Not a valid S3 URI: {s3_uri}")
    return bucket_name, key_prefix

# ---------------------------------------------------------
# 5) MLflow config
# ---------------------------------------------------------
# Tracking server and experiment that the bias run is recorded under.
MLFLOW_TRACKING_SERVER_ARN = "arn:aws:sagemaker:us-east-1:423623839320:mlflow-tracking-server/tracking-server-demo"
EXPERIMENT_NAME = "forcasting_demand_product"

mlf.set_tracking_uri(MLFLOW_TRACKING_SERVER_ARN)
mlf.set_experiment(EXPERIMENT_NAME)

# UTC day-hour-minute-second stamp, appended to the run name.
suffix = strftime("%d-%H-%M-%S", gmtime())
run_name = "clarify-pre-bias-" + suffix

# Pre-training bias metrics to request (names as listed in the Clarify docs):
#   CI  - Class Imbalance
#   DPL - Difference in Positive Proportions in Labels
#   JS  - Jensen-Shannon Divergence
#   KL  - Kullback-Leibler Divergence
#   KS  - Kolmogorov-Smirnov
#   LP  - L-p Norm
#   TVD - Total Variation Distance
pre_training_methods = ["CI", "DPL", "JS", "KL", "KS", "LP", "TVD"]


In [5]:
# ---------------------------------------------------------
# 6) Run Clarify pre-training bias + log results to MLflow
# ---------------------------------------------------------
def _mlflow_safe_name(raw_name: str) -> str:
    """Replace characters MLflow does not accept in metric names with ``_``.

    Alphanumerics, underscores, dashes, periods, spaces and slashes are kept;
    any other character (e.g. the brackets/commas of a threshold facet value)
    is replaced so that logging does not fail on the name.
    """
    return "".join(
        ch if (ch.isalnum() or ch in "_-. /") else "_"
        for ch in raw_name
    )


def _collect_pre_training_bias_metrics(analysis: dict) -> dict:
    """Flatten the ``pre_training_bias_metrics`` section of an analysis dict.

    Args:
        analysis: Parsed content of the downloaded ``analysis.json``.

    Returns:
        Dict mapping ``pre_bias_<facet>_<facet value>_<metric>`` (sanitized for
        MLflow) to a float. Metrics without a value, or whose value cannot be
        converted to float, are skipped.
    """
    collected = {}
    # `or {}` also covers a section that is present but null.
    pre_metrics = analysis.get("pre_training_bias_metrics") or {}
    facets = pre_metrics.get("facets") or {}

    for facet_name, facet_entries in facets.items():
        for entry in facet_entries:
            facet_value = entry.get("value_or_threshold")
            for metric in entry.get("metrics", []):
                short_name = metric.get("name")  # e.g. CI, DPL, ...
                value = metric.get("value")
                if value is None:
                    continue
                metric_name = _mlflow_safe_name(
                    f"pre_bias_{facet_name}_{facet_value}_{short_name}"
                )
                try:
                    collected[metric_name] = float(value)
                except (TypeError, ValueError):
                    # Keep the run alive; a single odd value should not abort logging.
                    print(f"Skipping non-numeric value for {metric_name}: {value!r}")
    return collected


with mlf.start_run(
    run_name=run_name,
    description="SageMaker Clarify pre-training bias on training dataset using high_demand label",
):
    mlf.log_param("clarify_data_s3", processed_train_data_s3_uri)
    mlf.log_param("bias_facet", "is_weekend")
    mlf.log_param("bias_positive_label_value", 1)

    print("Starting Clarify pre-training bias job...")
    clarify_processor.run_pre_training_bias(
        data_config=data_config,
        data_bias_config=bias_config,
        methods=pre_training_methods,
        wait=True,   # block until the job finishes so the report exists below
        logs=True,
    )
    print("Clarify pre-training job finished.")

    # Fetch analysis.json from the report's S3 output location.
    s3_client = boto3.client("s3", region_name=region)
    bucket_name, key_prefix = parse_s3_uri(bias_report_output_path)
    analysis_s3_key = key_prefix.rstrip("/") + "/analysis.json"

    local_dir = f"./clarify_pre_bias_{suffix}"
    os.makedirs(local_dir, exist_ok=True)
    local_analysis_path = os.path.join(local_dir, "analysis.json")

    print("Downloading analysis.json from S3:", f"s3://{bucket_name}/{analysis_s3_key}")
    s3_client.download_file(bucket_name, analysis_s3_key, local_analysis_path)

    # Attach the whole report to the run as an artifact.
    mlf.log_artifact(local_analysis_path)

    # Parse the report and log each metric to MLflow.
    with open(local_analysis_path, "r", encoding="utf-8") as f:
        analysis = json.load(f)

    bias_metrics = _collect_pre_training_bias_metrics(analysis)
    for metric_name, metric_value in bias_metrics.items():
        print(metric_name, "=", metric_value)

    if bias_metrics:
        # One batched call instead of one request per metric.
        mlf.log_metrics(bias_metrics)
        print("Logged Clarify pre-training bias metrics to MLflow.")
    else:
        print("No pre-training bias metrics found in analysis.json; nothing logged.")


INFO:sagemaker:Creating processing-job with name Clarify-Pretraining-Bias-2025-12-03-08-27-38-512


Starting Clarify pre-training bias job...
..................sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
We are not in a supported iso region, /bin/sh exiting gracefully with no changes.
INFO:sagemaker-clarify-processing:Starting SageMaker Clarify Processing job
INFO:analyzer.data_loading.data_loader_util:Analysis config path: /opt/ml/processing/input/config/analysis_config.json
INFO:analyzer.data_loading.data_loader_util:Analysis result path: /opt/ml/processing/output
INFO:analyzer.data_loading.data_loader_util:This host is algo-1.
INFO:analyzer.data_loading.data_loader_util:This host is the leader.
INFO:analyzer.data_loading.data_loader_util:Number of hosts in the cluster is 1.
INFO:sagemaker-clarify-processing:Running Python / Pandas based analyzer.
INFO:analyzer.data_loading.data_loader_factory:Dataset type: text/csv uri: /opt/ml/pr

In [7]:
# Persist the report's S3 location with IPython's %store so it can be restored later via `%store -r`.
%store bias_report_output_path

Stored 'bias_report_output_path' (str)


In [6]:
# Display the bias report's S3 output location as the cell result.
bias_report_output_path

's3://sagemaker-us-east-1-423623839320/clarify/pretraining-bias-report'