This notebook was developed using the `Python 3 (Data Science)` kernel on an `ml.t3.medium` instance.

In [None]:
!pip install -q sagemaker-experiments

In [None]:
import sagemaker
import boto3

# Session first: the region and the default bucket are both read from it.
sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

# Execution role passed to the SageMaker jobs and model created later on.
role = sagemaker.get_execution_role()

# Key prefix under which this notebook stores its artifacts in the bucket.
prefix = 'sagemaker-studio-book/chapter06'

In [None]:
from datetime import datetime, timedelta, timezone
import json, os, re, uuid
from time import sleep, gmtime, strftime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Apply the 'fivethirtyeight' matplotlib style to the plots drawn below.
plt.style.use('fivethirtyeight')

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from botocore.exceptions import ClientError

from sagemaker import image_uris
from sagemaker.s3 import S3Uploader, S3Downloader
from sagemaker.inputs import TrainingInput

from sagemaker import clarify

In [None]:
# Column names supplied to read_csv for both Adult files.
orig_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'target',
]

# Parser options shared by both files: commas with optional surrounding
# whitespace as the separator, and '?' read as a missing value.
read_options = dict(names=orig_columns, sep=r'\s*,\s*', na_values='?', engine='python')

df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    **read_options)
# The first line of adult.test is skipped.
df_valtest = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
    skiprows=1, **read_options)

# 'education' is removed from both frames ('education-num' stays).
df = df.drop(columns='education')
df_valtest = df_valtest.drop(columns='education')

# Map the period-suffixed labels in the test file onto the training labels.
df_valtest['target'] = df_valtest['target'].replace(
    {'<=50K.': '<=50K', '>50K.': '>50K'})

# move the target column to first for XGBoost
new_columns = df.columns.tolist()
new_columns = new_columns[-1:] + new_columns[:-1]
df = df[new_columns]
df_valtest = df_valtest[new_columns]

In [None]:
# Preview the first rows of the training frame (target is now the first column).
df.head()

In [None]:
from sklearn import preprocessing

# Every object-typed column is encoded to integer codes.
categorical_columns = df.select_dtypes(include='object').columns

# Missing values are replaced by the literal string 'NaN' so they get their own code.
train_categoricals = df[categorical_columns].fillna('NaN')
valtest_categoricals = df_valtest[categorical_columns].fillna('NaN')

# Fit on the training frame only, then reuse the fitted mapping for the
# validation/test frame.
encoder = preprocessing.OrdinalEncoder(dtype=int)
df[categorical_columns] = encoder.fit_transform(train_categoricals)
df_valtest[categorical_columns] = encoder.transform(valtest_categoricals)

After ordinal encoding, the main features of interest are mapped as follows:
- sex: Male (1), Female (0)
- target: >50K (1), <=50K (0)

You can see the full mapping in `encoder.categories_`.

In [None]:
from sklearn.model_selection import train_test_split

# Stratified, seeded split of the held-out frame: test_size=0.9 sends 90% of
# the rows to df_test and leaves the remaining 10% in df_val.
df_val, df_test = train_test_split(
    df_valtest,
    test_size=0.9,
    stratify=df_valtest['target'],
    shuffle=True,
    random_state=42,
)

In [None]:
# (rows, columns) of the training, combined val+test, validation and test frames.
df.shape, df_valtest.shape, df_val.shape, df_test.shape

In [None]:
# Training rows per encoded `sex` value.
sex_counts = df['sex'].value_counts(sort=False)
ax = sex_counts.plot.bar(title='Total count by sex', rot=0)
ax.set_xlabel('Sex (0: Female, 1: Male)')

In [None]:
# Training rows per encoded `target` value.
target_counts = df['target'].value_counts(sort=False)
ax = target_counts.plot.bar(title='Target distribution', rot=0)
ax.set_xlabel('target (0: <=50K, 1: >50K)')

In [None]:
# `sex` values of the training rows whose target is 1.
high_earner_sex = df.loc[df['target'] == 1, 'sex']
ax = high_earner_sex.value_counts(sort=False).plot.bar(
    title='Earning >$50K by sex', rot=0)
ax.set_xlabel('Sex (0: Female, 1: Male)')

In [None]:
os.makedirs('adult', exist_ok=True)

# Both files are uploaded under the same S3 prefix.
adult_s3_prefix = f's3://{bucket}/{prefix}/adult-data'

# Written without header or index: the files contain data rows only.
train_local_path = 'adult/adult-data-train.csv'
df.to_csv(train_local_path, header=False, index=False)
train_s3_uri = S3Uploader.upload(train_local_path, adult_s3_prefix)

val_local_path = 'adult/adult-data-val.csv'
df_val.to_csv(val_local_path, header=False, index=False)
val_s3_uri = S3Uploader.upload(val_local_path, adult_s3_prefix)

In [None]:
# `clarify` is already imported with the other SageMaker modules at the top of
# the notebook, so it is not imported again here.
# This single processor is reused for every Clarify job submitted below.
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    sagemaker_session=sess)

In [None]:
experiment_name = 'adult-income-clarify'

# Load the experiment if it exists and create it only when SageMaker reports
# that it is missing. Any other ClientError (permissions, throttling, ...) is
# re-raised rather than being reported as "already exists", and `experiment`
# is bound on every successful path.
try:
    experiment = Experiment.load(experiment_name=experiment_name)
    print(f'{experiment_name} experiment already exists! Reusing the existing experiment.')
except ClientError as e:
    if e.response['Error']['Code'] != 'ResourceNotFound':
        raise
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description='Analyzing ML bias in adult income dataset using SageMaker Clarify.')

## Pre-training bias analysis

In [None]:
# Creating a new trial for the experiment
# The UTC timestamp is used as the trial name and later as a job-name suffix.
exp_datetime_1 = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
exp_trial_1 = Trial.create(trial_name=exp_datetime_1,
                           experiment_name=experiment_name)

# Passed to the Clarify job so that it is recorded under this trial.
experiment_config = {
    'ExperimentName': experiment_name,
    'TrialName': exp_trial_1.trial_name,
    'TrialComponentDisplayName': 'Pretraining-BiasAnalysis',
}

In [None]:
# S3 location for the pre-training bias report of the first trial.
pretraining_bias_report_output_path = f's3://{bucket}/{prefix}/{experiment_name}-{exp_trial_1.trial_name}/clarify-pretraining-bias'

# The uploaded CSV has no header row, so the column names are passed explicitly.
bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_s3_uri,
    s3_output_path=pretraining_bias_report_output_path,
    dataset_type='text/csv',
    headers=list(df.columns),
    label='target',
)

In [None]:
# Facets analysed: `sex` with 0 as the facet value, and `race` with no explicit
# facet value. Label value 1 is the outcome of interest.
bias_config = clarify.BiasConfig(
    facet_name=['sex', 'race'],
    facet_values_or_threshold=[[0], None],
    label_values_or_threshold=[1],
)

In [None]:
jobname = f'adult-clarify-pretraining-bias-{exp_datetime_1}'

# Submitted with wait=False / logs=False: the call does not block on the job.
clarify_processor.run_pre_training_bias(
    job_name=jobname,
    data_config=bias_data_config,
    data_bias_config=bias_config,
    methods='all',
    experiment_config=experiment_config,
    logs=False,
    wait=False,
)

## Mitigating bias

In [None]:
# Row count of the (sex=0, target=1) group, read from the 'age' column of the
# per-group counts. It is used as the per-group sample size in the next cell.
group_counts = df.groupby(['sex', 'target'], group_keys=False).count()
max_female_sample = group_counts.loc[(0, 1)]['age']

In [None]:
# Draw the same number of rows from every (sex, target) group so that the
# groups are balanced. The fixed seed keeps the sampled dataset -- and the
# files, model and reports derived from it -- reproducible across kernel
# restarts.
df_sampled = df.groupby(['sex', 'target'], group_keys=False).apply(
    lambda x: x.sample(max_female_sample, random_state=42))

In [None]:
# (rows, columns) of the balanced sample.
df_sampled.shape

In [None]:
# Rows per encoded `sex` value in the balanced sample, smallest bar first.
sampled_sex_counts = df_sampled['sex'].value_counts().sort_values()
sampled_sex_counts.plot.bar(title='Total count by sex', rot=0)

In [None]:
# Keep `sex` only where target == 1; the other rows become NaN and are
# therefore left out by value_counts().
sampled_high_earner_sex = df_sampled['sex'].where(df_sampled['target'] == 1)
sampled_high_earner_sex.value_counts().sort_values().plot.bar(
    title='Earning >$50K by sex', rot=0)

In [None]:
# Save the balanced sample without header or index, then copy it to S3.
sampled_local_path = 'adult/adult-data-train-sampled.csv'
df_sampled.to_csv(sampled_local_path, header=False, index=False)

train_sampled_s3_uri = S3Uploader.upload(
    sampled_local_path,
    f's3://{bucket}/{prefix}/adult-data',
)

### Rerun the pre-training bias analysis to confirm

In [None]:
exp_datetime_2 = strftime('%Y-%m-%d-%H-%M-%S', gmtime())

# Creating a new trial as we are working on a new dataset
exp_trial_2 = Trial.create(trial_name=exp_datetime_2,
                           experiment_name=experiment_name)

experiment_config = {
    'ExperimentName': experiment_name,
    'TrialName': exp_trial_2.trial_name,
    'TrialComponentDisplayName': 'Pretraining-BiasAnalysis',
}

# Report location for the second trial.
pretraining_bias_report_output_path = f's3://{bucket}/{prefix}/{experiment_name}-{exp_trial_2.trial_name}/clarify-pretraining-bias'

# Same analysis as for the first trial, now reading the balanced sample.
bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_sampled_s3_uri,
    s3_output_path=pretraining_bias_report_output_path,
    dataset_type='text/csv',
    headers=list(df_sampled.columns),
    label='target',
)

bias_config = clarify.BiasConfig(
    facet_name=['sex', 'race'],
    facet_values_or_threshold=[[0], None],
    label_values_or_threshold=[1],
)

jobname = f'adult-sampled-clarify-pretraining-bias-{exp_datetime_2}'

# Submitted with wait=False / logs=False: the call does not block on the job.
clarify_processor.run_pre_training_bias(
    job_name=jobname,
    data_config=bias_data_config,
    data_bias_config=bias_config,
    methods='all',
    experiment_config=experiment_config,
    logs=False,
    wait=False,
)

## Training an ML model using XGBoost

In [None]:
exp_datetime_3 = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'adult-xgb-{exp_datetime_3}'

# The training job is recorded under the second trial (the balanced dataset).
experiment_config = {
    'ExperimentName': experiment_name,
    'TrialName': exp_trial_2.trial_name,
    'TrialComponentDisplayName': 'Training',
}

# XGBoost 1.3-1 container image for the current region.
image = image_uris.retrieve(framework='xgboost', region=region, version='1.3-1')

train_s3_output = f's3://{bucket}/{prefix}/{experiment_name}-{exp_trial_2.trial_name}/training'

xgb = sagemaker.estimator.Estimator(
    image_uri=image,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=train_s3_output,
    enable_sagemaker_metrics=True,
    sagemaker_session=sess,
)

# Binary classifier, evaluated on error rate, trained for 50 rounds.
hyperparameters = {
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'num_round': 50,
}
xgb.set_hyperparameters(**hyperparameters)

# Train on the balanced sample; validate on the validation split.
train_input = TrainingInput(s3_data=train_sampled_s3_uri, content_type='csv')
val_input = TrainingInput(s3_data=val_s3_uri, content_type='csv')
data_channels = {'train': train_input, 'validation': val_input}

# wait=True: the call blocks until the training job has finished.
xgb.fit(
    inputs=data_channels,
    job_name=jobname,
    experiment_config=experiment_config,
    wait=True,
)

In [None]:
model_name = f'adult-xgb-model-{exp_datetime_3}'

# Build a model object from the trained estimator and take its container
# definition.
model = xgb.create_model(name=model_name)
container_def = model.prepare_container_def()

# Create the model in SageMaker under `model_name`; the Clarify ModelConfig
# cells further down refer to it by that name.
sess.create_model(name=model_name, role=role, container_defs=container_def)

## Detecting post-training bias

In [None]:
experiment_config = {
    'ExperimentName': experiment_name,
    'TrialName': exp_trial_2.trial_name,
    'TrialComponentDisplayName': 'Posttraining-BiasAnalysis',
}

# Report location for the post-training bias analysis of the second trial.
posttraining_bias_report_output_path = f's3://{bucket}/{prefix}/{experiment_name}-{exp_trial_2.trial_name}/clarify-posttraining-bias'

# The analysis reads the balanced training sample.
bias_data_config = clarify.DataConfig(
    s3_data_input_path=train_sampled_s3_uri,
    s3_output_path=posttraining_bias_report_output_path,
    dataset_type='text/csv',
    headers=list(df_sampled.columns),
    label='target',
)

# Same facets and label value as in the pre-training analysis.
bias_config = clarify.BiasConfig(
    facet_name=['sex', 'race'],
    facet_values_or_threshold=[[0], None],
    label_values_or_threshold=[1],
)

In [None]:
# Refers to the registered model by name; CSV is used for both the request
# (content_type) and the response (accept_type).
model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    content_type='text/csv',
    accept_type='text/csv',
)

# Probability threshold of 0.5 for turning model outputs into predicted labels.
predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.5)

In [None]:
exp_datetime_4 = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'adult-sampled-clarify-posttraining-bias-{exp_datetime_4}'

# Submitted with wait=False / logs=False: the call does not block on the job.
clarify_processor.run_post_training_bias(
    job_name=jobname,
    data_config=bias_data_config,
    data_bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    methods='all',
    experiment_config=experiment_config,
    logs=False,
    wait=False,
)

## Explaining ML models using SHAP values

In [None]:
experiment_config = {
    'ExperimentName': experiment_name,
    'TrialName': exp_trial_2.trial_name,
    'TrialComponentDisplayName': 'Posttraining-SHAP',
}

# Output location for the explainability results of the second trial.
explainability_output_path = f's3://{bucket}/{prefix}/{experiment_name}-{exp_trial_2.trial_name}/clarify-explainability'

explainability_data_config = clarify.DataConfig(
    s3_data_input_path=train_sampled_s3_uri,
    s3_output_path=explainability_output_path,
    dataset_type='text/csv',
    headers=list(df_sampled.columns),
    label='target',
)

model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    content_type='text/csv',
    accept_type='text/csv',
)

# Baseline record: the most frequent value of every feature among the rows
# with target == 1. Row 0 of mode() is taken and column 0 (the target) is
# dropped before casting to int.
high_income_rows = df_sampled.query('target == 1')
baseline = high_income_rows.mode().iloc[0, 1:].astype(int).tolist()

shap_config = clarify.SHAPConfig(
    baseline=[baseline],
    num_samples=15,
    agg_method='mean_abs',
)

In [None]:
exp_datetime_5 = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'adult-sampled-clarify-posttraining-shap-{exp_datetime_5}'

# No `wait`/`logs` arguments are passed here, unlike the bias jobs above.
clarify_processor.run_explainability(
    job_name=jobname,
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config,
    experiment_config=experiment_config,
)

In [None]:
# Fetch the per-row SHAP output of the explainability job into the working directory.
S3Downloader.download(
    s3_uri=f'{explainability_output_path}/explanations_shap/out.csv',
    local_path='./',
    sagemaker_session=sess,
)
local_explanations_out = pd.read_csv('out.csv')

# Strip the '_label0' marker from the column names.
feature_names = [col.replace('_label0', '')
                 for col in local_explanations_out.columns]
local_explanations_out.columns = feature_names

selected_example = 500
example_shap_values = local_explanations_out.iloc[selected_example]

# NOTE(review): the label is inferred from the sign of the summed SHAP values;
# confirm that this matches the model's thresholded prediction for the row.
if sum(example_shap_values) > 0:
    sample_prediction = '>50K'
else:
    sample_prediction = '<=50K'

print(f'Example number: {selected_example}')
print(f'with model prediction: {sample_prediction}')
print()
print(f'Feature values: \n{df_sampled.iloc[selected_example].to_frame().T}')

example_shap_values.plot.barh(
    title=f'Local explanation for the {selected_example}th example.',
    rot=0)