In [1]:
# Pull previously persisted variables back into this kernel with IPython's
# %store magic (presumably saved by an earlier notebook in this series —
# TODO confirm which one writes them).
%store -r s3_bucket_name
%store -r prefix
%store -r training_data_path

In [2]:
import sagemaker

# Session object reused by the estimator, the model registration and the
# Clarify processor further down.
session = sagemaker.Session()
# Region of the session; used to resolve the container image URI.
region = session.boto_region_name
# Role handed to the training job, the registered model and the Clarify job.
role = sagemaker.get_execution_role()

In [3]:
# Alias the restored dataset location under the name the later cells use.
s3_training_data_path = training_data_path
# Analysis artifacts are written under <bucket>/<prefix>/output.
s3_output_path = "s3://{}/{}/output".format(s3_bucket_name, prefix)

In [4]:
# Copy the training CSV from S3 into the local tmp/ directory so it can be
# inspected with pandas.
!aws s3 cp {s3_training_data_path} tmp/training_data.csv

download: s3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv to tmp/training_data.csv


In [5]:
import pandas as pd

# Load the local copy; its column names are reused later as the Clarify
# `headers` list.
training_data = pd.read_csv("tmp/training_data.csv")
# Bare last expression so the notebook renders the frame as a table.
training_data

Unnamed: 0,approved,sex,math,science,technology,random1,random2
0,1,1,97,97,98,93,82
1,1,1,85,68,62,92,65
2,1,1,99,100,80,71,60
3,1,1,91,79,84,60,70
4,1,1,73,86,66,70,98
...,...,...,...,...,...,...,...
595,1,1,99,86,85,98,87
596,1,1,71,97,90,86,99
597,1,1,95,86,62,69,73
598,1,1,78,71,68,72,68


In [6]:
from sagemaker.image_uris import retrieve

# Resolve the XGBoost 1.2-1 container image URI for the session's region.
container = retrieve(
    framework='xgboost',
    region=region,
    version='1.2-1',
)

In [7]:
from sagemaker.estimator import Estimator

# Generic estimator bound to the container image resolved earlier; training
# runs on a single ml.m5.large instance using the shared session and role.
estimator = Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
    sagemaker_session=session,
)

In [8]:
# XGBoost hyperparameters for the training job. Descriptions follow the
# public XGBoost parameter documentation; the specific values are the
# notebook author's choices.
estimator.set_hyperparameters(
    objective='binary:logistic',  # binary classification, probability output
    max_depth=8,                  # maximum depth of each tree
    eta=0.1,                      # step-size shrinkage (learning rate)
    min_child_weight=4,           # minimum summed instance weight in a child
    num_round=500                 # number of boosting rounds
)

In [9]:
from sagemaker.inputs import TrainingInput

# Training channel definition: the CSV object in S3, declared as CSV content.
train_input = TrainingInput(
    s3_data=s3_training_data_path,
    content_type='csv',
)

In [10]:
%%time

estimator.fit({'train': train_input}, wait='True')

2021-06-13 17:34:37 Starting - Starting the training job...
2021-06-13 17:34:47 Starting - Launching requested ML instancesProfilerReport-1623605677: InProgress
......
2021-06-13 17:35:56 Starting - Preparing the instances for training.........
2021-06-13 17:37:29 Downloading - Downloading input data...
2021-06-13 17:37:49 Training - Downloading the training image..[34m[2021-06-13 17:38:17.543 ip-10-2-126-138.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV

In [11]:
import random
from string import ascii_uppercase

def generate_model_name(length=5, name_prefix='model-'):
    """Return a random model name such as ``'model-VWXTC'``.

    The name is ``name_prefix`` followed by ``length`` uppercase ASCII
    letters drawn with ``random.choices``. Calling it with no arguments
    produces the same kind of name (and consumes the same random draws) as
    the original hard-coded version.

    Args:
        length: Number of random letters in the suffix (default 5).
        name_prefix: Text placed in front of the suffix (default ``'model-'``).

    Returns:
        The generated name as a ``str``.

    Raises:
        ValueError: If ``length`` is less than 1, because an empty suffix
            would make every generated name identical.
    """
    if length < 1:
        raise ValueError(f"length must be at least 1, got {length}")
    suffix = ''.join(random.choices(ascii_uppercase, k=length))
    return name_prefix + suffix

In [12]:
# Draw one name and reuse it for every later reference to the model
# (model creation, Clarify ModelConfig, and the final %store).
model_name = generate_model_name()
model_name

'model-VWXTC'

In [13]:
# Build a Model object from the fitted estimator under the generated name.
model = estimator.create_model(name=model_name)

In [14]:
# Check which class create_model returned.
type(model)

sagemaker.model.Model

In [15]:
# Dump the model object's attributes (artifact location, image URI, role, ...)
# for inspection.
# NOTE(review): the rendered output contains the AWS account ID and the role
# ARN; clear this cell's output before sharing the notebook.
model.__dict__

{'model_data': 's3://sagemaker-us-east-1-581320662326/sagemaker-xgboost-2021-06-13-17-34-37-316/output/model.tar.gz',
 'image_uri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1',
 'role': 'arn:aws:iam::581320662326:role/SuperAdminRole',
 'predictor_cls': <function sagemaker.estimator.Estimator.create_model.<locals>.predict_wrapper(endpoint, session)>,
 'env': {},
 'name': 'model-VWXTC',
 '_base_name': None,
 'vpc_config': None,
 'sagemaker_session': <sagemaker.session.Session at 0x7ff430352310>,
 'endpoint_name': None,
 '_is_compiled_model': False,
 '_compilation_job_name': None,
 '_is_edge_packaged_model': False,
 '_enable_network_isolation': False,
 'model_kms_key': None,
 'image_config': None}

In [16]:
# Container definition (image, environment, model data URL) that is passed to
# session.create_model in the next cell.
container_def = model.prepare_container_def()
container_def

{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1',
 'Environment': {},
 'ModelDataUrl': 's3://sagemaker-us-east-1-581320662326/sagemaker-xgboost-2021-06-13-17-34-37-316/output/model.tar.gz'}

In [17]:
# Create the SageMaker model under the generated name; the Clarify
# ModelConfig later refers to it by that name.
session.create_model(
    name=model_name,
    role=role,
    container_defs=container_def,
)

'model-VWXTC'

In [18]:
from sagemaker.clarify import SageMakerClarifyProcessor

# Processor that runs the Clarify analysis job on one ml.m5.large instance,
# using the same session and role as the training job.
processor = SageMakerClarifyProcessor(
    sagemaker_session=session,
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
)

In [19]:
from sagemaker.clarify import DataConfig

# Describe the dataset for Clarify: where the CSV lives, where results go,
# the column names (taken from the locally loaded frame) and which column
# holds the label.
data_config = DataConfig(
    s3_data_input_path=s3_training_data_path,
    s3_output_path=s3_output_path,
    dataset_type='text/csv',
    headers=list(training_data.columns),
    label='approved',
)

# Show the resulting configuration for inspection.
data_config.__dict__

{'s3_data_input_path': 's3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv',
 's3_output_path': 's3://sagemaker-cookbook-bucket/chapter07/output',
 's3_data_distribution_type': 'FullyReplicated',
 's3_compression_type': 'None',
 'label': 'approved',
 'headers': ['approved',
  'sex',
  'math',
  'science',
  'technology',
  'random1',
  'random2'],
 'features': None,
 'analysis_config': {'dataset_type': 'text/csv',
  'headers': ['approved',
   'sex',
   'math',
   'science',
   'technology',
   'random1',
   'random2'],
  'label': 'approved'}}

In [20]:
from sagemaker.clarify import ModelConfig

# Tell Clarify which model to query for predictions (by name), on what
# instance type/count, and that responses should be CSV.
model_config = ModelConfig(
    model_name=model_name,
    accept_type='text/csv',
    instance_type='ml.c5.xlarge',
    instance_count=1,
)

In [21]:
from sagemaker.clarify import ModelPredictedLabelConfig

# The model was trained with a binary:logistic objective, so 0.5 is the
# threshold Clarify uses to turn each predicted probability into a
# binary label.
predictions_config = ModelPredictedLabelConfig(
    probability_threshold=0.5
)

In [22]:
from sagemaker.clarify import BiasConfig

# Bias is measured across the 'sex' column (the facet), treating a label
# value of 1 in the 'approved' column as the positive outcome.
bias_config = BiasConfig(
    facet_name='sex',
    label_values_or_threshold=[1],
)

In [23]:
%%time

processor.run_post_training_bias(
    data_config=data_config, 
    data_bias_config=bias_config,
    methods=['DPPL', 'RD'],
    model_config=model_config,
    model_predicted_label_config=predictions_config
)


Job Name:  Clarify-Posttraining-Bias-2021-06-13-17-38-51-184
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
................................[34mINFO:sagemaker-clarify-processing:Starting SageMaker C

In [24]:
# The first output of the most recent Clarify job carries the S3 location
# the analysis results were written to.
output = processor.latest_job.outputs[0]
output_destination = output.destination
output_destination

's3://sagemaker-cookbook-bucket/chapter07/output'

In [25]:
# Download everything under the job's output prefix into the local tmp/
# directory.
!aws s3 cp {output_destination}/ tmp/ --recursive

download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json to tmp/analysis_config.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis.json to tmp/analysis.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/explanations_shap/baseline.csv to tmp/explanations_shap/baseline.csv
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.pdf to tmp/report.pdf
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.ipynb to tmp/report.ipynb
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.html to tmp/report.html
download: s3://sagemaker-cookbook-bucket/chapter07/output/explanations_shap/out.csv to tmp/explanations_shap/out.csv


In [26]:
# List the local tmp/ directory to confirm the report files arrived.
!ls -lahF tmp/

total 936K
drwxr-xr-x 4 root root 6.0K Jun 13 17:51 ./
drwxr-xr-x 6 root root 6.0K Jun 13 17:50 ../
drwxr-xr-x 2 root root 6.0K May 29 23:03 .ipynb_checkpoints/
-rw-r--r-- 1 root root 1.4K Jun 13 17:51 analysis.json
-rw-r--r-- 1 root root  492 Jun 13 17:38 analysis_config.json
-rw-r--r-- 1 root root 139K May 24 17:15 baseline.csv
-rw-r--r-- 1 root root 133K May 24 17:28 baseline_no_label.csv
-rw-r--r-- 1 root root 1.1K May 24 20:29 constraints.json
drwxr-xr-x 2 root root 6.0K Jun 13 17:51 explanations_shap/
-rw-r--r-- 1 root root 307K Jun 13 17:51 report.html
-rw-r--r-- 1 root root  41K Jun 13 17:51 report.ipynb
-rw-r--r-- 1 root root  58K Jun 13 17:51 report.pdf
-rw-r--r-- 1 root root  407 May 24 15:11 sample.jsonl
-rw-r--r-- 1 root root 122K May 24 20:29 statistics.json
-rw-r--r-- 1 root root 3.8K Jun 13 17:23 test_data.csv
-rw-r--r-- 1 root root 3.8K Jun 13 17:23 test_data_no_header.csv
-rw-r--r-- 1 root root  45K May 30 09:03 test_features.csv
-rw-r--r-- 1 root root  12K Jun 13 17:

In [27]:
# Print the raw analysis results (the computed bias metric values per facet).
!cat tmp/analysis.json

{
    "version": "1.0",
    "post_training_bias_metrics": {
        "label": "approved",
        "facets": {
            "sex": [
                {
                    "value_or_threshold": "1",
                    "metrics": [
                        {
                            "name": "DPPL",
                            "description": "Difference in Positive Proportions in Predicted Labels (DPPL)",
                            "value": -0.33541395157418197
                        },
                        {
                            "name": "RD",
                            "description": "Recall Difference (RD)",
                            "value": 0.0
                        }
                    ]
                },
                {
                    "value_or_threshold": "0",
                    "metrics": [
                        {
                            "name": "DPPL",
                            "description": "Difference in Positive Proportions in Predicted Labe

In [28]:
# Persist the model name with %store so another notebook/kernel can restore
# it via `%store -r model_name`.
%store model_name
model_name

Stored 'model_name' (str)


'model-VWXTC'