# Bias Detection with SageMaker Clarify

In [None]:
%%sh
# Upgrade the SageMaker Python SDK, then install scikit-learn and imbalanced-learn.
pip install --quiet --upgrade sagemaker
pip install --quiet scikit-learn imbalanced-learn

## 1 - Load dataset

In [None]:
import boto3
import io
import pandas as pd

# Read 'dataset.csv' (path relative to the working directory) into a DataFrame.
dataset = pd.read_csv('dataset.csv')

In [None]:
# Dimensions of the loaded DataFrame as (rows, columns).
dataset.shape

In [None]:
# Preview the first rows of the dataset.
dataset.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set; the remainder is the training set.
split_frames = train_test_split(dataset, test_size=0.1)
train_data, test_data = split_frames

In [None]:
# Report the (rows, columns) shape of each split, training set first.
for split in (train_data, test_data):
    print(split.shape)

In [None]:
# Write both splits as CSV files without the index column or a header row.
csv_targets = ((train_data, 'train_data.csv'), (test_data, 'test_data.csv'))
for frame, filename in csv_targets:
    frame.to_csv(filename, index=False, header=False)

In [None]:
# Build a label-free sample: remove the 'Label' column and keep the first 100 rows.
test_data_no_labels = test_data.drop(columns=['Label']).iloc[:100]

# Same CSV layout as the other files: no index column, no header row.
test_data_no_labels.to_csv('test_data_no_labels.csv', index=False, header=False)

## 3 - Train a classification model with XGBoost on Amazon SageMaker

In [None]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker import Session

print(sagemaker.__version__)

# Session-derived settings reused by the upload, training and Clarify cells.
session = Session()
region = session.boto_region_name
bucket = session.default_bucket()
role = get_execution_role()

# Key prefix under which this notebook stores its S3 objects.
prefix = 'bias-detection-adult-dataset'

In [None]:
from sagemaker.s3 import S3Uploader
from sagemaker.inputs import TrainingInput

# Every file is uploaded under the same bucket/prefix location.
s3_destination = 's3://{}/{}'.format(bucket, prefix)

# Training and test splits are uploaded and wrapped as CSV training inputs.
train_uri = S3Uploader.upload('train_data.csv', s3_destination)
train_input = TrainingInput(train_uri, content_type='csv')

test_uri = S3Uploader.upload('test_data.csv', s3_destination)
test_input = TrainingInput(test_uri, content_type='csv')

# The label-free sample is only uploaded; no training input is created for it.
test_no_labels_uri = S3Uploader.upload('test_data_no_labels.csv', s3_destination)

In [None]:
from sagemaker.image_uris import retrieve
from sagemaker.estimator import Estimator

# Look up the XGBoost container image for the session's region.
# NOTE(review): version='latest' may not resolve to the newest XGBoost
# release -- confirm against the image_uris documentation and consider
# pinning an explicit version.
container = retrieve('xgboost', region, version='latest')

xgb = Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.large',
    disable_profiler=True,
)

# Hyperparameters handed to the training container.
hyperparameters = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'num_round': 100,
    'early_stopping_rounds': 20,
}
xgb.set_hyperparameters(**hyperparameters)

In [None]:
xgb.fit({'train': train_input, 'validation': test_input})

In [None]:
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium')

In [None]:
# Delete the endpoint through the predictor returned by deploy().
# Estimator.delete_endpoint() is no longer available in SageMaker SDK v2;
# Predictor.delete_endpoint() is the supported call and matches the cleanup
# used after the second deployment. Per the SDK docs it removes the endpoint
# and its configuration, not the model, so the Clarify cells that reference
# the model by name can still use it.
xgb_predictor.delete_endpoint()

## 4 - Analyze bias with Amazon SageMaker Clarify

### Define a SageMaker Processing processor

In [None]:
from sagemaker import clarify

# Processor for the Clarify jobs: one ml.m5.large instance, using the
# notebook's role and session.
processor_settings = dict(
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    sagemaker_session=session,
)
clarify_processor = clarify.SageMakerClarifyProcessor(**processor_settings)

### Configuring bias detection

In [None]:
# S3 location that receives the bias report.
bias_report_output_path = 's3://{}/{}/clarify-bias'.format(bucket, prefix)

# The uploaded training CSV is the input; column names are supplied from the
# DataFrame and 'Label' is declared as the label column.
training_columns = train_data.columns.to_list()
data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=bias_report_output_path,
    label='Label',
    headers=training_columns,
    dataset_type='text/csv',
)

In [None]:
# Model settings for Clarify; the model name is read from the predictor's
# endpoint name.
model_settings = dict(
    model_name=xgb_predictor.endpoint_name,
    instance_type='ml.t2.medium',
    instance_count=1,
    accept_type='text/csv',
)
model_config = clarify.ModelConfig(**model_settings)

In [None]:
# The facet under analysis is the 'Sex_' column (Male: Sex_=0, Female: Sex_=1).
positive_label_values = [1]   # Label for positive outcome
facet_group_values = [1]      # Sex_=1, i.e. female
bias_config = clarify.BiasConfig(
    label_values_or_threshold=positive_label_values,
    facet_name='Sex_',
    facet_values_or_threshold=facet_group_values,
)

### Compute pre-training and post-training bias metrics

In [None]:
# Launch the bias analysis with the data, model and bias configurations.
bias_job_args = dict(
    data_config=data_config,
    model_config=model_config,
    bias_config=bias_config,
)
clarify_processor.run_bias(**bias_job_args)

In [None]:
# Display the S3 location configured as the bias report output path.
bias_report_output_path

In [None]:
%%sh -s $bias_report_output_path
# $1 is the report location given on the magic line; recursively copy
# everything under it into the current directory.
aws s3 cp --recursive $1/ .

## 5 - Run explainability analysis

In [None]:
# SHAP settings; the uploaded label-free sample is passed as the baseline.
shap_settings = dict(
    baseline=test_no_labels_uri,
    num_samples=10,
    agg_method='mean_abs',
    save_local_shap_values=True,
)
shap_config = clarify.SHAPConfig(**shap_settings)

# S3 location that receives the explainability output.
explainability_output_path = "s3://{}/{}/clarify-explainability".format(bucket, prefix)

# Same input data and column names as the bias analysis, different output path.
explainability_columns = train_data.columns.to_list()
explainability_data_config = clarify.DataConfig(
    s3_data_input_path=train_uri,
    s3_output_path=explainability_output_path,
    label='Label',
    headers=explainability_columns,
    dataset_type="text/csv",
)

In [None]:
# Rebind clarify_processor to a processor on an ml.c5.4xlarge instance;
# later cells that use clarify_processor pick up this one.
explainability_processor_settings = dict(
    role=role,
    instance_count=1,
    instance_type='ml.c5.4xlarge',
    sagemaker_session=session,
)
clarify_processor = clarify.SageMakerClarifyProcessor(**explainability_processor_settings)

# Launch the explainability job with the SHAP configuration.
explainability_job_args = dict(
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config,
)
clarify_processor.run_explainability(**explainability_job_args)

## 6 - Inspect data

In [None]:
# Count male (Sex_=0) and female (Sex_=1) instances in the training split.
# The encoding matches the rest of the notebook, where Sex_==0 rows are
# treated as male and Sex_==1 rows as female.

female_male_count = train_data['Sex_'].value_counts()
print(female_male_count)

In [None]:
# Bar chart of the per-sex counts, smallest bar first.

sorted_sex_counts = female_male_count.sort_values()
sorted_sex_counts.plot(kind='bar', title='Counts of Sex', rot=0, figsize=(6, 3))

In [None]:
# Per-sex counts of not-50k rows (Label=0) and 50k rows (Label=1).

sex_column = train_data['Sex_']
label_column = train_data['Label']

female_male_not_50k_count = sex_column.where(label_column == 0).value_counts()
female_male_50k_count = sex_column.where(label_column == 1).value_counts()

for counts in (female_male_not_50k_count, female_male_50k_count):
    print(counts)

In [None]:
# Bar chart of the per-sex 50k counts, smallest bar first.

sorted_50k_counts = female_male_50k_count.sort_values()
sorted_50k_counts.plot(kind='bar', title='Counts of Sex earning >$50K', rot=0, figsize=(6, 3))

In [None]:
# Per-sex ratio of 50k counts to not-50k counts.

ratios = female_male_50k_count.div(female_male_not_50k_count)
print(ratios)

## 7 - Rebalance the data set
We'll do this in two steps:
1. Use SMOTE to generate new female 50k instances, in order to get the same 50k/not50k ratio as males.
2. Use under-sampling to have the same number of male and female instances.

In [None]:
from collections import Counter
import imblearn

# Show which imbalanced-learn release is installed.
print(imblearn.__version__)

### Generate new female 50k instances

In [None]:
# Split the training data by sex: Sex_==0 rows go to male_instances and
# Sex_==1 rows to female_instances.

sex_values = train_data['Sex_']
male_instances = train_data[sex_values == 0]
female_instances = train_data[sex_values == 1]

# Separate the female labels from the remaining female columns.
female_Y = female_instances['Label']
female_X = female_instances.drop(columns=['Label'])

# Label distribution of the female rows before resampling.
Counter(female_Y)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the female rows, passing the ratio stored under key 0 (male) as
# the SMOTE sampling strategy.
male_ratio = ratios[0]
oversample = SMOTE(sampling_strategy=male_ratio)
resampled = oversample.fit_resample(female_X, female_Y)
balanced_female_X, balanced_female_Y = resampled

In [None]:
Counter(balanced_female_Y)

In [None]:
balanced_female=pd.concat([balanced_female_X, balanced_female_Y], axis=1)

In [None]:
balanced_female

### Rebuild dataset with original male instances plus balanced female instances

In [None]:
balanced_train_data=pd.concat([male_instances, balanced_female], axis=0)

In [None]:
balanced_train_data['Sex_'].value_counts().sort_values().plot(kind='bar', title='Counts of Sex', rot=0, figsize=(6, 3))

In [None]:
balanced_train_data['Sex_'].where(balanced_train_data['Label']==1).value_counts().sort_values().plot(kind='bar', title='Counts of Sex earning >$50K', rot=0, figsize=(6, 3))

### Undersample males to balance male and female instances

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Use 'Sex_' as the resampling target and every other column (including
# 'Label') as the data to resample.
Y = balanced_train_data['Sex_']
X = balanced_train_data.drop(columns=['Sex_'])

undersample = RandomUnderSampler(sampling_strategy='not minority')
X, Y = undersample.fit_resample(X, Y)

In [None]:
# Per-value counts of the resampled 'Sex_' target.
Counter(Y)

In [None]:
# Re-attach 'Sex_' and restore the original column order. A plain
# concat([X, Y]) would append 'Sex_' as the last column; because the CSV files
# are written without headers, the balanced training file would then no longer
# line up column-for-column with the test file used as the validation channel.
balanced_train_data = pd.concat([X, Y], axis=1)[train_data.columns.to_list()]

In [None]:
balanced_train_data['Sex_'].value_counts().sort_values().plot(kind='bar', title='Counts of Sex', rot=0, figsize=(6, 3))

In [None]:
balanced_train_data['Sex_'].where(balanced_train_data['Label']==1).value_counts().sort_values().plot(kind='bar', title='Counts of Sex earning >$50K', rot=0, figsize=(6, 3))

In [None]:
# Check the rebalanced dataset: per-sex totals, per-sex 50k counts, and the
# per-sex 50k/not50k ratio.
female_male_count = balanced_train_data['Sex_'].value_counts()
print(female_male_count)
female_male_50k_count = balanced_train_data['Sex_'].where(balanced_train_data['Label']==1).value_counts()
print(female_male_50k_count)
# Divide by the not-50k counts (not the totals) so that `ratios` keeps the
# meaning it has where it is first computed: 50k count over not-50k count.
female_male_not_50k_count = balanced_train_data['Sex_'].where(balanced_train_data['Label']==0).value_counts()
ratios = female_male_50k_count/female_male_not_50k_count
print(ratios)

Now we have the same number of male and female instances, and both classes have the same 50k/not50k ratio.

## 8 - Train again on the balanced dataset

In [None]:
balanced_train_data.to_csv('balanced_train_data.csv', index=False, header=False)

In [None]:
balanced_train_uri = S3Uploader.upload('balanced_train_data.csv', 's3://{}/{}'.format(bucket, prefix))
balanced_train_input = TrainingInput(balanced_train_uri, content_type='csv')

In [None]:
xgb.fit({'train': balanced_train_input, 'validation': test_input})

In [None]:
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium')

In [None]:
xgb_predictor.delete_endpoint()

## 9 - Run SageMaker Clarify again

In [None]:
data_config = clarify.DataConfig(
    s3_data_input_path=balanced_train_uri,
    s3_output_path=bias_report_output_path,
    label='Label',
    headers=balanced_train_data.columns.to_list(),
    dataset_type='text/csv')

In [None]:
model_config = clarify.ModelConfig(
    model_name=xgb_predictor.endpoint_name,
    instance_type='ml.t2.medium',
    instance_count=1,
    accept_type='text/csv')

In [None]:
bias_config = clarify.BiasConfig(
    label_values_or_threshold=[1],
    facet_name='Sex_',
    facet_values_or_threshold=[1])

In [None]:
# Launch the bias analysis again with the updated configurations.
second_bias_job_args = dict(
    data_config=data_config,
    model_config=model_config,
    bias_config=bias_config,
)
clarify_processor.run_bias(**second_bias_job_args)