In [6]:
# Restore values persisted earlier with `%store` (presumably by a preceding
# notebook in this series — TODO confirm which one writes them).
# `s3_bucket_name` and `prefix` are used below to build the Clarify output
# path; `training_data_path` is the S3 URI of the training CSV.
%store -r s3_bucket_name
%store -r prefix
%store -r training_data_path

In [2]:
import sagemaker

# Execution role used for the Clarify job; resolving it does not depend on
# the session object created below.
role = sagemaker.get_execution_role()

# SageMaker session handed to the Clarify processor later in the notebook,
# plus the AWS region that session is bound to.
session = sagemaker.Session()
region = session.boto_region_name

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Scratch directory for local copies of the S3 artifacts; `-p` keeps the
# command from failing when the directory already exists.
!mkdir -p tmp

In [7]:
# Clarify writes its analysis artifacts under <bucket>/<prefix>/output.
s3_output_path = f"s3://{s3_bucket_name}/{prefix}/output"

# The dataset to analyse is the restored training CSV location, used as-is.
s3_training_data_path = training_data_path

In [8]:
# Copy the training CSV from S3 into the scratch directory so it can be
# inspected locally with pandas.
!aws s3 cp {s3_training_data_path} tmp/training_data.csv

download: s3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv to tmp/training_data.csv


In [9]:
# Load the local copy; the frame's column names are reused further down as the
# `headers` argument of clarify.DataConfig.
# NOTE(review): the displayed frame has an `Unnamed: 0` column, which suggests
# the CSV was written together with its index — that column is therefore also
# passed to Clarify as a header. Confirm this is intended.
training_data = pd.read_csv(filepath_or_buffer="tmp/training_data.csv")
training_data

Unnamed: 0,label,a,b,c,d
0,1,-8.837413,-6.551265,23,-75
1,1,-9.216749,-2.483494,2,-51
2,1,-2.017317,-6.326533,91,34
3,1,-10.748736,-4.622519,8,-78
4,0,-3.675848,12.629029,47,32
...,...,...,...,...,...
2995,0,-5.786462,-6.790668,-65,70
2996,1,-2.552410,-1.793217,42,4
2997,0,-10.692197,1.583437,-90,-62
2998,1,-14.109003,-4.745680,37,64


In [10]:
from sagemaker import clarify

# Clarify processing-job runner: a single ml.m5.large instance that reuses the
# notebook's execution role and SageMaker session.
processor = clarify.SageMakerClarifyProcessor(
    sagemaker_session=session,
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
)

In [22]:
# Describe the dataset for Clarify: a CSV read from S3, results written to the
# output prefix, `label` as the target column, and the column names taken from
# the locally loaded frame.
data_config = clarify.DataConfig(
    dataset_type="text/csv",
    s3_data_input_path=s3_training_data_path,
    s3_output_path=s3_output_path,
    headers=training_data.columns.to_list(),
    label="label",
)

In [13]:
# Bias is measured with respect to column `a` (the facet), using 5 as the
# facet threshold and 1 as the label value of interest.
# NOTE(review): exact threshold semantics are defined by the Clarify
# BiasConfig API — confirm 5 is a sensible cut-off for the range of `a`.
bias_config = clarify.BiasConfig(
    facet_name="a",
    facet_values_or_threshold=[5],
    label_values_or_threshold=[1],
)

In [14]:
%%time

# Run the pre-training bias analysis on the configured dataset, restricted to
# the "CI" method (class imbalance in Clarify's metric naming). The recorded
# output shows the job's inputs/outputs followed by its streamed log.
processor.run_pre_training_bias(
    methods=["CI"],
    data_config=data_config,
    data_bias_config=bias_config,
)


Job Name:  Clarify-Pretraining-Bias-2021-05-24-13-32-37-629
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
...............................[34mINFO:sagemaker-clarify-processing:Starting SageMaker Cla

In [17]:
# Take the first output of the most recent job (the recorded run lists exactly
# one, `analysis_result`) and read the S3 URI it was uploaded to.
output = processor.latest_job.outputs[0]
output_destination = output.destination
# Bare expression so the notebook displays the URI.
output_destination

's3://sagemaker-cookbook-bucket/chapter07/output'

In [18]:
# Download everything stored under the job's output prefix into tmp/.
!aws s3 cp {output_destination}/ tmp/ --recursive

download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json to tmp/analysis_config.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis.json to tmp/analysis.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.pdf to tmp/report.pdf
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.ipynb to tmp/report.ipynb
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.html to tmp/report.html


In [19]:
# List the scratch directory to see which artifacts are available locally.
!ls -lahF tmp/

total 792K
drwxr-xr-x 2 root root 6.0K May 24 13:59 ./
drwxr-xr-x 4 root root 6.0K May 24 13:59 ../
-rw-r--r-- 1 root root  574 May 24 13:37 analysis.json
-rw-r--r-- 1 root root  297 May 24 13:32 analysis_config.json
-rw-r--r-- 1 root root 268K May 24 13:37 report.html
-rw-r--r-- 1 root root 1.3K May 24 13:37 report.ipynb
-rw-r--r-- 1 root root  29K May 24 13:37 report.pdf
-rw-r--r-- 1 root root  46K May 24 13:26 test_data.csv
-rw-r--r-- 1 root root  46K May 24 13:26 test_data_no_header.csv
-rw-r--r-- 1 root root 139K May 24 13:26 training_data.csv
-rw-r--r-- 1 root root 139K May 24 13:26 training_data_no_header.csv
-rw-r--r-- 1 root root  47K May 24 13:26 validation_data.csv
-rw-r--r-- 1 root root  47K May 24 13:26 validation_data_no_header.csv


In [21]:
# Print the analysis result file that was downloaded into tmp/.
# NOTE(review): the recorded output shows `kernel_shap` explanations rather
# than bias metrics, so it looks like it came from a different Clarify job
# that wrote to the same output prefix — re-run this cell and confirm.
!cat tmp/analysis.json

{
    "version": "1.0",
    "explanations": {
        "kernel_shap": {
            "label0": {
                "global_shap_values": {
                    "a": -0.23615066885249675,
                    "b": 0.23931414559483516,
                    "c": -0.16817792626097805,
                    "d": -0.0255478889434015
                },
                "expected_value": 0.8622586131095886
            }
        }
    }
}