In [1]:
%store -r s3_bucket_name
%store -r prefix
%store -r training_data_path
%store -r test_data_path
%store -r model_name

In [2]:
import sagemaker

session = sagemaker.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()

In [3]:
s3_training_data_path = training_data_path
s3_test_data_path = test_data_path
s3_output_path = f"s3://{s3_bucket_name}/{prefix}/output"

In [4]:
!aws s3 cp {s3_training_data_path} tmp/training_data.csv
!aws s3 cp {s3_test_data_path} tmp/test_data.csv

download: s3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv to tmp/training_data.csv
download: s3://sagemaker-cookbook-bucket/chapter07/input/test_data.csv to tmp/test_data.csv


In [5]:
import pandas as pd

training_data = pd.read_csv("tmp/training_data.csv")
test_data = pd.read_csv("tmp/test_data.csv")

target = test_data['approved']
features = test_data.drop(columns=['approved'])
features.to_csv('tmp/test_features.csv', index=False, header=False)

In [6]:
features

Unnamed: 0,sex,math,science,technology,random1,random2
0,1,64,65,73,85,66
1,0,90,91,99,72,73
2,1,88,90,68,90,100
3,1,64,96,73,60,70
4,1,64,85,98,85,96
...,...,...,...,...,...,...
195,0,99,62,92,71,75
196,0,85,74,91,69,63
197,1,72,99,86,61,65
198,1,79,89,79,98,80


In [7]:
base = f"s3://{s3_bucket_name}/{prefix}/input"
s3_feature_path = f"{base}/test_features.csv"

!aws s3 cp tmp/test_features.csv {s3_feature_path}

upload: tmp/test_features.csv to s3://sagemaker-cookbook-bucket/chapter07/input/test_features.csv


In [8]:
from sagemaker.clarify import ModelConfig

model_config = ModelConfig(
    model_name=model_name,
    instance_type='ml.c5.xlarge',
    instance_count=1,
    accept_type='text/csv'
)

In [9]:
from sagemaker.clarify import SageMakerClarifyProcessor

processor = SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    sagemaker_session=session
)

In [10]:
baseline = features.iloc[0:200].values.tolist()
baseline

[[1, 64, 65, 73, 85, 66],
 [0, 90, 91, 99, 72, 73],
 [1, 88, 90, 68, 90, 100],
 [1, 64, 96, 73, 60, 70],
 [1, 64, 85, 98, 85, 96],
 [1, 73, 84, 82, 90, 82],
 [1, 62, 74, 85, 86, 72],
 [0, 95, 87, 79, 81, 61],
 [1, 91, 68, 71, 72, 66],
 [0, 62, 82, 63, 91, 94],
 [0, 90, 76, 100, 98, 68],
 [1, 82, 71, 96, 91, 94],
 [1, 72, 79, 65, 96, 89],
 [1, 72, 80, 72, 80, 96],
 [1, 89, 97, 74, 98, 81],
 [1, 71, 75, 65, 63, 76],
 [0, 98, 74, 95, 76, 97],
 [1, 77, 72, 76, 86, 95],
 [1, 91, 63, 76, 83, 100],
 [1, 65, 70, 90, 90, 97],
 [0, 65, 77, 85, 83, 99],
 [1, 70, 94, 91, 85, 71],
 [0, 100, 99, 69, 99, 67],
 [1, 84, 92, 66, 83, 99],
 [1, 80, 96, 77, 79, 71],
 [1, 86, 73, 79, 89, 65],
 [0, 74, 78, 73, 99, 95],
 [1, 79, 60, 80, 89, 81],
 [1, 81, 74, 95, 71, 68],
 [0, 74, 87, 93, 66, 64],
 [1, 89, 89, 98, 76, 79],
 [1, 62, 68, 98, 87, 98],
 [0, 72, 85, 70, 65, 77],
 [0, 97, 71, 77, 94, 87],
 [0, 77, 66, 96, 69, 91],
 [1, 93, 95, 67, 83, 60],
 [1, 85, 82, 61, 70, 60],
 [1, 80, 96, 81, 60, 63],
 [1, 87,

In [11]:
from sagemaker.clarify import SHAPConfig

shap_config = SHAPConfig(
    baseline=baseline,
    num_samples=50,
    agg_method='median'
)

In [12]:
headers = training_data.columns.to_list()

In [13]:
from sagemaker.clarify import DataConfig

data_config = DataConfig(
    s3_data_input_path=s3_training_data_path,
    s3_output_path=s3_output_path,
    label='approved',
    headers=headers,
    dataset_type='text/csv'
)

In [14]:
%%time

processor.run_explainability(
    data_config=data_config,       
    model_config=model_config,                                 
    explainability_config=shap_config
)


Job Name:  Clarify-Explainability-2021-06-13-17-59-54-005
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
................................[34mINFO:sagemaker-clarify-processing:Starting SageMaker Clar

In [15]:
output = processor.latest_job.outputs[0]
output_destination = output.destination
output_destination

's3://sagemaker-cookbook-bucket/chapter07/output'

In [16]:
!aws s3 cp {output_destination}/ tmp/ --recursive

download: s3://sagemaker-cookbook-bucket/chapter07/output/explanations_shap/out.csv to tmp/explanations_shap/out.csv
download: s3://sagemaker-cookbook-bucket/chapter07/output/explanations_shap/baseline.csv to tmp/explanations_shap/baseline.csv
download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json to tmp/analysis_config.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis.json to tmp/analysis.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.pdf to tmp/report.pdf
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.ipynb to tmp/report.ipynb
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.html to tmp/report.html


In [17]:
!ls -lahF tmp/

total 1.8M
drwxr-xr-x 4 root root 6.0K Jun 13 18:16 ./
drwxr-xr-x 6 root root 6.0K Jun 13 18:15 ../
drwxr-xr-x 2 root root 6.0K May 29 23:03 .ipynb_checkpoints/
-rw-r--r-- 1 root root  551 Jun 13 18:15 analysis.json
-rw-r--r-- 1 root root 5.4K Jun 13 17:59 analysis_config.json
-rw-r--r-- 1 root root 139K May 24 17:15 baseline.csv
-rw-r--r-- 1 root root 133K May 24 17:28 baseline_no_label.csv
-rw-r--r-- 1 root root 1.1K May 24 20:29 constraints.json
drwxr-xr-x 2 root root 6.0K Jun 13 18:16 explanations_shap/
-rw-r--r-- 1 root root 642K Jun 13 18:15 report.html
-rw-r--r-- 1 root root 376K Jun 13 18:15 report.ipynb
-rw-r--r-- 1 root root 250K Jun 13 18:15 report.pdf
-rw-r--r-- 1 root root  407 May 24 15:11 sample.jsonl
-rw-r--r-- 1 root root 122K May 24 20:29 statistics.json
-rw-r--r-- 1 root root 3.8K Jun 13 17:23 test_data.csv
-rw-r--r-- 1 root root 3.8K Jun 13 17:23 test_data_no_header.csv
-rw-r--r-- 1 root root 3.4K Jun 13 17:58 test_features.csv
-rw-r--r-- 1 root root  12K Jun 13 17:

In [18]:
!cat tmp/analysis.json

{
    "version": "1.0",
    "explanations": {
        "kernel_shap": {
            "label0": {
                "global_shap_values": {
                    "sex": 0.05238322854763917,
                    "math": -0.0008837455598650137,
                    "science": 0.009781007392571676,
                    "technology": 0.0150738264281427,
                    "random1": 0.0044908737612634544,
                    "random2": 0.008706902679070552
                },
                "expected_value": 0.8304535060777561
            }
        }
    }
}