# Load data from S3 and basic exploration

In [1]:

import boto3
import sagemaker
import pandas as pd
import numpy as np
import glob
import os

# SageMaker session, its default bucket, and the execution role of this notebook
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Build a boto3 session once and derive both the region and the SageMaker client from it
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name="sagemaker", region_name=region)

for caption, value in (("Bucket:", bucket), ("Region:", region)):
    print(caption, value)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Bucket: sagemaker-us-east-1-423623839320
Region: us-east-1


In [2]:
# ดึง path CSV จาก Step 2 ที่เรา %store ไว้
%store -r s3_private_path_csv
print("s3_private_path_csv:", s3_private_path_csv)

# ดาวน์โหลดไฟล์จาก S3 ลงมาที่โฟลเดอร์ ./data-clarify
!mkdir -p data-clarify
!aws s3 cp $s3_private_path_csv ./data-clarify --recursive

s3_private_path_csv: s3://sagemaker-us-east-1-423623839320/retail-demand-forecasting/csv/
download: s3://sagemaker-us-east-1-423623839320/retail-demand-forecasting/csv/retail-demand-forecasting.csv to data-clarify/retail-demand-forecasting.csv


In [4]:
# Locate the CSV file(s) downloaded from S3.
# The matches are sorted so the selection is deterministic, and the derived
# Clarify input (written into this same folder by a later cell) is skipped so
# that re-running this cell keeps selecting the raw dataset.
derived_csv_name = "retail-demand-clarify-input.csv"
csv_files = sorted(
    path
    for path in glob.glob("data-clarify/**/*.csv", recursive=True)
    if os.path.basename(path) != derived_csv_name
)
print("Found CSV files:", csv_files)
assert len(csv_files) > 0, "No CSV files found under ./data-clarify"

csv_path = csv_files[0]
print("Using CSV file:", csv_path)

# Load the dataset into `data`; without this the cell (and every cell after it)
# fails with NameError on a fresh kernel because `data` is never defined.
data = pd.read_csv(csv_path)

print("\nHead:")
display(data.head())



Found CSV files: ['data-clarify/retail-demand-forecasting.csv']
Using CSV file: data-clarify/retail-demand-forecasting.csv

Head:


Unnamed: 0,record_id,date,store_id,day_of_week,is_weekend,is_holiday,holiday_name,max_temp_c,rainfall_mm,is_hot_day,is_rainy_day,base_price,discount_pct,is_promo,promo_type,final_price,units_sold
0,1,2024-01-01,1,Monday,0,1,New Year,29.0,2.4,0,0,12.99,0.0,0,none,12.99,102
1,2,2024-01-02,1,Tuesday,0,0,,27.5,9.3,0,1,14.85,0.0,0,none,14.85,82
2,3,2024-01-03,1,Wednesday,0,0,,27.1,5.2,0,1,10.92,0.0,0,none,10.92,78
3,4,2024-01-04,1,Thursday,0,0,,24.2,0.0,0,0,10.7,0.0,0,none,10.7,76
4,5,2024-01-05,1,Friday,0,0,,26.2,0.0,0,0,13.93,0.0,0,none,13.93,81


# เตรียมข้อมูล + Config และรัน Clarify Bias Job

In [5]:
from sagemaker import clarify

# Derive the binary label high_demand: 1 when units_sold reaches the threshold
# (very strong sales), 0 otherwise (normal / low sales)
high_demand_threshold = 100
is_high_demand = data["units_sold"].ge(high_demand_threshold)
data["high_demand"] = is_high_demand.astype(int)

label_counts = data["high_demand"].value_counts()
print(
    "High demand threshold:", high_demand_threshold,
    "\nLabel distribution:\n", label_counts
)

# Persist the prepared dataset (without the index) as the Clarify input file
clarify_input_local = "data-clarify/retail-demand-clarify-input.csv"
data.to_csv(clarify_input_local, index=False)
print("Saved Clarify input to:", clarify_input_local)



High demand threshold: 100 
Label distribution:
 high_demand
1    374
0    126
Name: count, dtype: int64
Saved Clarify input to: data-clarify/retail-demand-clarify-input.csv


In [6]:
# อัปโหลดขึ้น S3 สำหรับ Clarify
clarify_input_key = "clarify/input/retail-demand-clarify-input.csv"
!aws s3 cp $clarify_input_local s3://$bucket/$clarify_input_key

bias_data_s3_uri = f"s3://{bucket}/{clarify_input_key}"
bias_report_output_path = f"s3://{bucket}/clarify/output"

%store bias_data_s3_uri
%store bias_report_output_path

print("Clarify input S3:", bias_data_s3_uri)
print("Clarify output S3:", bias_report_output_path)



upload: data-clarify/retail-demand-clarify-input.csv to s3://sagemaker-us-east-1-423623839320/clarify/input/retail-demand-clarify-input.csv
Stored 'bias_data_s3_uri' (str)
Stored 'bias_report_output_path' (str)
Clarify input S3: s3://sagemaker-us-east-1-423623839320/clarify/input/retail-demand-clarify-input.csv
Clarify output S3: s3://sagemaker-us-east-1-423623839320/clarify/output


# Clarify Processor

In [7]:
# Clarify processor configured with a single ml.c5.xlarge instance
clarify_processor = clarify.SageMakerClarifyProcessor(
    sagemaker_session=sess,
    role=role,
    instance_type="ml.c5.xlarge",
    instance_count=1,
)

# Data configuration for Clarify: CSV input on S3, report output location,
# the label column, and the full list of column names
column_names = data.columns.to_list()
bias_data_config = clarify.DataConfig(
    dataset_type="text/csv",
    s3_data_input_path=bias_data_s3_uri,
    s3_output_path=bias_report_output_path,
    headers=column_names,
    label="high_demand",
)

# Bias configuration: is_weekend is the facet (weekday vs weekend), value 1
# (weekend) is the group of interest, and label value 1 means high demand
bias_config = clarify.BiasConfig(
    facet_name="is_weekend",
    facet_values_or_threshold=[1],
    label_values_or_threshold=[1],
)

# Launch the pre-training bias job without blocking on it
pretraining_bias_methods = ["CI", "DPL", "KL", "JS", "LP", "TVD", "KS"]
clarify_processor.run_pre_training_bias(
    data_config=bias_data_config,
    data_bias_config=bias_config,
    methods=pretraining_bias_methods,
    wait=False,
    logs=False,
)

run_pre_training_bias_processing_job_name = clarify_processor.latest_job.job_name
print("Started Clarify processing job:", run_pre_training_bias_processing_job_name)

# Re-attach to the job by name and wait for it to finish
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    sagemaker_session=sess,
    processing_job_name=run_pre_training_bias_processing_job_name,
)
running_processor.wait(logs=False)

print("Clarify job finished.")


INFO:sagemaker:Creating processing-job with name Clarify-Pretraining-Bias-2025-12-01-07-24-52-949


Started Clarify processing job: Clarify-Pretraining-Bias-2025-12-01-07-24-52-949
...........................................................!Clarify job finished.


In [8]:
# 3.1.3 Download Clarify bias report from S3

%store -r bias_report_output_path
print("Bias report S3 path:", bias_report_output_path)

!mkdir -p generated_bias_report
!aws s3 cp --recursive $bias_report_output_path ./generated_bias_report/

print("Local Clarify report folder:")
!ls -R generated_bias_report


Bias report S3 path: s3://sagemaker-us-east-1-423623839320/clarify/output
download: s3://sagemaker-us-east-1-423623839320/clarify/output/analysis_config.json to generated_bias_report/analysis_config.json
download: s3://sagemaker-us-east-1-423623839320/clarify/output/report.pdf to generated_bias_report/report.pdf
download: s3://sagemaker-us-east-1-423623839320/clarify/output/report.html to generated_bias_report/report.html
download: s3://sagemaker-us-east-1-423623839320/clarify/output/report.ipynb to generated_bias_report/report.ipynb
download: s3://sagemaker-us-east-1-423623839320/clarify/output/analysis.json to generated_bias_report/analysis.json
Local Clarify report folder:
generated_bias_report:
analysis.json  analysis_config.json  report.html  report.ipynb	report.pdf
