In [2]:
import os

import boto3
import numpy as np
import pandas as pd
from sagemaker import Session, get_execution_role

# SageMaker session, its default bucket, the key prefix used by this
# notebook, and the region the session is bound to.
session = Session()
bucket = session.default_bucket()
prefix = "sagemaker/bias_explain"
region = session.boto_region_name

# Define IAM role
role = get_execution_role()

s3_client = boto3.client("s3")

In [3]:
# Read the churn CSV and discard every row that contains a missing value.
training_data = (
    pd.read_csv("data/churn.csv")
    .dropna()
)

# Preview the first rows.
training_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# The split is seeded: without `random_state` every fresh run of the notebook
# would shuffle differently, so the files written and uploaded later (and the
# metrics computed from them) would change from run to run.
churn_train, churn_test = train_test_split(
    training_data,
    test_size=0.2,
    random_state=42,
)

In [11]:
from sklearn import preprocessing
def number_encode_features(df):
    """Label-encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, not modified.

    Returns
    -------
    result : pandas.DataFrame
        Copy of ``df`` in which each object-dtype column has been replaced by
        the output of a ``LabelEncoder`` fitted on that column. Missing values
        in those columns are filled with the string ``"None"`` before fitting.
    encoders : dict
        Maps the name of each encoded column to its fitted ``LabelEncoder``.
        Empty when ``df`` has no object-dtype columns.
    """
    result = df.copy()
    encoders = {}
    for column in result.columns:
        # `np.object` was only an alias of the builtin `object`; it was
        # deprecated in NumPy 1.20 and removed in 1.24, where accessing it
        # raises AttributeError. Comparing against `object` is equivalent.
        if result.dtypes[column] == object:
            encoders[column] = preprocessing.LabelEncoder()
            result[column] = encoders[column].fit_transform(result[column].fillna("None"))
    return result, encoders

In [12]:
# Training file: put the "Exited" label in the first column, label-encode the
# object columns, and write the frame without index or header row.
churn_train = pd.concat(
    [churn_train["Exited"], churn_train.drop(columns=["Exited"])],
    axis=1,
)
churn_train, _ = number_encode_features(churn_train)
churn_train.to_csv("data/train_churn.csv", header=False, index=False)

# Test file: encode, then separate the label from the features; only the
# features are written to disk.
# NOTE(review): the test frame is encoded with encoders fitted on the test
# frame itself, not with the ones fitted on the training frame, so the integer
# codes of a given category may differ between the two files - confirm.
churn_test, _ = number_encode_features(churn_test)
churn_target = churn_test["Exited"]
churn_features = churn_test.drop(columns=["Exited"])
churn_features.to_csv("data/test_churn.csv", header=False, index=False)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  # Remove the CWD from sys.path while we load stuff.


In [13]:
from sagemaker.inputs import TrainingInput
from sagemaker.s3 import S3Uploader

# Upload the training CSV under s3://<bucket>/<prefix> and wrap the returned
# URI as a CSV training input.
train_uri = S3Uploader.upload(
    "data/train_churn.csv",
    "s3://{}/{}".format(bucket, prefix),
)
train_input = TrainingInput(train_uri, content_type="csv")

# Upload the test features to the same S3 location.
test_uri = S3Uploader.upload(
    "data/test_churn.csv",
    "s3://{}/{}".format(bucket, prefix),
)

In [14]:
from sagemaker.estimator import Estimator
from sagemaker.image_uris import retrieve

# Image URI for the "xgboost" framework, version 1.2-1, in the session region.
container = retrieve("xgboost", region, version="1.2-1")

# Single ml.m5.xlarge training instance, profiler disabled.
xgb = Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    disable_profiler=True,
    sagemaker_session=session,
)

# Binary classification objective; the remaining values are tuning choices.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective="binary:logistic",
    num_round=800,
)

# Fit on the uploaded training CSV, passed as the "train" channel.
xgb.fit({"train": train_input}, logs=False)


2021-10-16 16:39:04 Starting - Starting the training job
2021-10-16 16:39:06 Starting - Launching requested ML instances.........
2021-10-16 16:39:57 Starting - Preparing the instances for training..............
2021-10-16 16:41:11 Downloading - Downloading input data..
2021-10-16 16:41:28 Training - Downloading the training image.......
2021-10-16 16:42:08 Training - Training image download completed. Training in progress.
2021-10-16 16:42:14 Uploading - Uploading generated training model.
2021-10-16 16:42:21 Completed - Training job completed


In [15]:
# Register the trained estimator as a SageMaker model under a fixed name.
# NOTE(review): the recorded cell output says "Using already existing model",
# i.e. when a model with this name is already present it is reused, so it may
# not correspond to the most recent training job - confirm before relying on
# the reports produced further down.
model_name = "churn-clarify-model"

model = xgb.create_model(name=model_name)
container_def = model.prepare_container_def()

session.create_model(model_name, role, container_def)

Using already existing model: churn-clarify-model


'churn-clarify-model'

In [16]:
from sagemaker import clarify

# Processor used for both Clarify jobs below: one ml.m5.xlarge instance.
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=session,
)

In [17]:
# S3 location that receives the bias report.
bias_report_output_path = "s3://{}/{}/clarify-bias".format(bucket, prefix)

# Dataset description for the bias job: the uploaded training CSV, with the
# column names taken from the training frame and "Exited" as the label.
bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=bias_report_output_path,
    label="Exited",
    headers=churn_train.columns.to_list(),
    dataset_type="text/csv",
)

In [18]:
# Model settings handed to the Clarify jobs: the model registered above, one
# ml.m5.xlarge instance, CSV for both request and response payloads.
model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    content_type="text/csv",
    accept_type="text/csv",
)

In [19]:
# Probability threshold (0.8) passed to Clarify for the model's predicted label.
predictions_config = clarify.ModelPredictedLabelConfig(
    probability_threshold=0.8,
)

In [24]:
# Bias is measured on the "Gender" facet, value 0, for label value 1.
# NOTE(review): both values refer to the label-encoded data; which gender the
# code 0 stands for depends on the fitted encoder - confirm.
bias_config = clarify.BiasConfig(
    facet_name="Gender",
    facet_values_or_threshold=[0],
    label_values_or_threshold=[1],
)

In [25]:
# Launch the bias job, requesting all pre-training and all post-training
# methods.
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    pre_training_methods="all",
    post_training_methods="all",
)


Job Name:  Clarify-Bias-2021-10-16-17-08-27-776
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ca-central-1-300165273893/sagemaker/bias_explain/train_churn.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ca-central-1-300165273893/sagemaker/bias_explain/clarify-bias/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-ca-central-1-300165273893/sagemaker/bias_explain/clarify-bias', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
............................[3

In [26]:
# SHAP settings: the first row of the test features is the single baseline
# record; 15 samples, "mean_abs" aggregation, local SHAP values are saved.
shap_config = clarify.SHAPConfig(
    baseline=[churn_features.iloc[0].values.tolist()],
    num_samples=15,
    agg_method="mean_abs",
    save_local_shap_values=True,
)

In [27]:
# S3 location that receives the explainability report.
explainability_output_path = "s3://{}/{}/clarify-explainability".format(bucket, prefix)

# Dataset description for the explainability job: the uploaded training CSV,
# with the column names taken from the training frame and "Exited" as label.
explainability_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=explainability_output_path,
    label="Exited",
    headers=churn_train.columns.to_list(),
    dataset_type="text/csv",
)

In [None]:
# Launch the explainability job with the SHAP configuration defined above.
clarify_processor.run_explainability(
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config,
)


Job Name:  Clarify-Explainability-2021-10-16-17-18-17-561
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ca-central-1-300165273893/sagemaker/bias_explain/train_churn.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ca-central-1-300165273893/sagemaker/bias_explain/clarify-explainability/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-ca-central-1-300165273893/sagemaker/bias_explain/clarify-explainability', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
.