In [1]:
from sagemaker import Session  

session = Session() 

bucket = session.default_bucket() 

prefix = "sagemaker/bias_explain" 

region = session.boto_region_name 

# Define IAM role 

from sagemaker import get_execution_role 

import pandas as pd 

import numpy as np 

import os 

import boto3  

role = get_execution_role() 

s3_client = boto3.client("s3") 

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
training_data = pd.read_csv("data/churn.csv").dropna() 

training_data.head() 

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
from sklearn.model_selection import train_test_split 

churn_train, churn_test = train_test_split (training_data, test_size=0.2) 

In [7]:
from sklearn import preprocessing
def number_encode_features(df): 

    result = df.copy() 

    encoders = {} 

    for column in result.columns: 

        if result.dtypes[column] == object: 

            encoders[column] = preprocessing.LabelEncoder() 

            result[column] = encoders[column].fit_transform(result[column].fillna("None")) 
    return result, encoders

In [8]:
churn_train = pd.concat([churn_train["Exited"], churn_train.drop(["Exited"], axis=1)], axis=1)
churn_train, _ = number_encode_features(churn_train)
churn_train.to_csv("data/train_churn.csv", index=False, header=False)

churn_test, _ = number_encode_features(churn_test)
churn_features = churn_test.drop(["Exited"], axis=1)
churn_target = churn_test["Exited"]
churn_features.to_csv("data/test_churn.csv", index=False, header=False)

In [9]:
from sagemaker.s3 import S3Uploader 

from sagemaker.inputs import TrainingInput 

train_uri = S3Uploader.upload("data/train_churn.csv", "s3://{}/{}".format(bucket, prefix)) 

train_input = TrainingInput(train_uri, content_type="csv") 

test_uri = S3Uploader.upload("data/test_churn.csv", "s3://{}/{}".format(bucket, prefix)) 

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [10]:
from sagemaker.image_uris import retrieve 

from sagemaker.estimator import Estimator 

container = retrieve("xgboost", region, version="1.2-1") 

xgb = Estimator(container,role, instance_count=1,instance_type="ml.m5.xlarge", disable_profiler=True,sagemaker_session=session,) 

xgb.set_hyperparameters(max_depth=5, eta=0.2,gamma=4,min_child_weight=6,subsample=0.8,objective="binary:logistic",num_round=800,) 

xgb.fit({"train": train_input}, logs=False) 

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-03-12-01-51-01-628



2024-03-12 01:51:02 Starting - Starting the training job.
2024-03-12 01:51:16 Starting - Preparing the instances for training......
2024-03-12 01:51:51 Downloading - Downloading input data...
2024-03-12 01:52:09 Downloading - Downloading the training image.......
2024-03-12 01:52:50 Training - Training image download completed. Training in progress...
2024-03-12 01:53:05 Uploading - Uploading generated training model.
2024-03-12 01:53:16 Completed - Training job completed


In [12]:
model_name = "churn-clarify-model" 

model = xgb.create_model(name=model_name) 

container_def = model.prepare_container_def() 

session.create_model(model_name, role, container_def) 

INFO:sagemaker:Creating model with name: churn-clarify-model


'churn-clarify-model'

In [13]:
from sagemaker import clarify 

clarify_processor = clarify.SageMakerClarifyProcessor( 

    role=role, instance_count=1, instance_type="ml.m5.xlarge", sagemaker_session=session) 

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [14]:
bias_report_output_path = "s3://{}/{}/clarify-bias".format(bucket, prefix) 

bias_data_config = clarify.DataConfig( 

    s3_data_input_path=train_uri, 

    s3_output_path=bias_report_output_path, 

    label="Exited", 

    headers=churn_train.columns.to_list(), 

    dataset_type="text/csv") 

In [15]:
model_config = clarify.ModelConfig( 

    model_name=model_name, instance_type="ml.m5.xlarge", 

    instance_count=1,accept_type="text/csv", 

content_type="text/csv",) 

In [16]:
predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.8) 

In [17]:
bias_config = clarify.BiasConfig( 

    label_values_or_threshold=[1], facet_name="Gender", facet_values_or_threshold=[0]) 

In [18]:
clarify_processor.run_bias( 

    data_config=bias_data_config, 

    bias_config=bias_config, 

    model_config=model_config, 

    model_predicted_label_config=predictions_config, 

    pre_training_methods="all", 

    post_training_methods="all") 

INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['Exited', 'RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'], 'label': 'Exited', 'label_values_or_threshold': [1], 'facet': [{'name_or_index': 'Gender', 'value_or_threshold': [0]}], 'methods': {'report': {'name': 'report', 'title': 'Analysis Report'}, 'pre_training_bias': {'methods': 'all'}, 'post_training_bias': {'methods': 'all'}}, 'predictor': {'model_name': 'churn-clarify-model', 'instance_type': 'ml.m5.xlarge', 'initial_instance_count': 1, 'accept_type': 'text/csv', 'content_type': 'text/csv'}, 'probability_threshold': 0.8}
INFO:sagemaker:Creating processing-job with name Clarify-Bias-2024-03-12-02-00-00-572


.............................[34m2024-03-12 02:04:43,265 logging.conf not found when configuring logging, using default logging configuration.[0m
[34m2024-03-12 02:04:43,266 Starting SageMaker Clarify Processing job[0m
[34m2024-03-12 02:04:43,267 Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34m2024-03-12 02:04:43,267 Analysis result path: /opt/ml/processing/output[0m
[34m2024-03-12 02:04:43,268 This host is algo-1.[0m
[34m2024-03-12 02:04:43,268 This host is the leader.[0m
[34m2024-03-12 02:04:43,268 Number of hosts in the cluster is 1.[0m
[34m2024-03-12 02:04:43,546 Running Python / Pandas based analyzer.[0m
[34m2024-03-12 02:04:43,546 Dataset type: text/csv uri: /opt/ml/processing/input/data[0m
[34m2024-03-12 02:04:43,556 Loading dataset...[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34m2024-03-12 02:04:43,575 Loaded dataset. Dataset info:[0m
[34m<class 'pandas.core.frame.DataFrame'>[0m
[34mRangeIndex: 8000 entries, 0

In [19]:
shap_config = clarify.SHAPConfig( 

    baseline=[churn_features.iloc[0].values.tolist()], 

    num_samples=15, 

    agg_method="mean_abs", 

    save_local_shap_values=True,) 

In [20]:
explainability_output_path = "s3://{}/{}/clarify-explainability".format(bucket, prefix) 

explainability_data_config = clarify.DataConfig( 

    s3_data_input_path=train_uri, 

    s3_output_path=explainability_output_path, 

    label="Exited", 

    headers=churn_train.columns.to_list(), 

    dataset_type="text/csv") 

In [21]:
clarify_processor.run_explainability( 

    data_config=explainability_data_config, 

    model_config=model_config, 

    explainability_config=shap_config,) 

INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['Exited', 'RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'], 'label': 'Exited', 'predictor': {'model_name': 'churn-clarify-model', 'instance_type': 'ml.m5.xlarge', 'initial_instance_count': 1, 'accept_type': 'text/csv', 'content_type': 'text/csv'}, 'methods': {'report': {'name': 'report', 'title': 'Analysis Report'}, 'shap': {'use_logit': False, 'save_local_shap_values': True, 'baseline': [[8741.0, 15570908.0, 447.0, 687.0, 2.0, 0.0, 29.0, 7.0, 93617.07, 1.0, 0.0, 1.0, 113050.92]], 'num_samples': 15, 'agg_method': 'mean_abs'}}}
INFO:sagemaker:Creating processing-job with name Clarify-Explainability-2024-03-12-02-12-04-862


.............................[34m2024-03-12 02:16:53,488 logging.conf not found when configuring logging, using default logging configuration.[0m
[34m2024-03-12 02:16:53,489 Starting SageMaker Clarify Processing job[0m
[34m2024-03-12 02:16:53,491 Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34m2024-03-12 02:16:53,491 Analysis result path: /opt/ml/processing/output[0m
[34m2024-03-12 02:16:53,492 This host is algo-1.[0m
[34m2024-03-12 02:16:53,492 This host is the leader.[0m
[34m2024-03-12 02:16:53,492 Number of hosts in the cluster is 1.[0m
[34m2024-03-12 02:16:53,776 Running Python / Pandas based analyzer.[0m
[34m2024-03-12 02:16:53,776 Dataset type: text/csv uri: /opt/ml/processing/input/data[0m
[34m2024-03-12 02:16:53,786 Loading dataset...[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34m2024-03-12 02:16:53,805 Loaded dataset. Dataset info:[0m
[34m<class 'pandas.core.frame.DataFrame'>[0m
[34mRangeIndex: 8000 entries, 0