# Predict Crime in the UK Using SageMaker (with Clarify)

In [2]:
# Imports used by this cell (kept together, ahead of the statements that need them)
import re

import boto3
import sagemaker
from sagemaker import get_execution_role

# S3 location for every artifact this notebook writes: the session's default bucket
# plus a notebook-specific key prefix
prefix = 'sagemaker/sagemaker-clarify-test'
bucket = sagemaker.Session().default_bucket()

# Define IAM role
role = get_execution_role()

In [3]:
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
import zipfile                                    # Standard-library zip archive support

# Get the data from SageMaker Feature Store

In [4]:
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

# Resolve the region once and pin every client below to it
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# Control-plane client and Feature Store runtime client
sagemaker_client = boto_session.client(
    service_name='sagemaker',
    region_name=region,
)
featurestore_runtime = boto_session.client(
    service_name='sagemaker-featurestore-runtime',
    region_name=region,
)

# SageMaker session wired to the two clients above
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# Handle to the existing feature group that holds the prepared data
feature_group_name = "FG-ndcCrime-7dee9362" # replace with your feature group name 
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session,
)

In [5]:
# Build the Athena SQL query that selects every row of the feature group's table
fs_query = feature_group.athena_query()
fs_table = fs_query.table_name
query_string = f'SELECT * FROM "{fs_table}"'
print(f'Running {query_string}')

Running SELECT * FROM "fg-ndccrime-7dee9362-1673879553"


In [6]:
# Run the Athena query, wait for it to finish, then load the result into a pandas DataFrame.
fs_query.run(
    query_string=query_string,
    output_location=f's3://{bucket}/{prefix}/fs_query_results/',
)
fs_query.wait()
model_data = fs_query.as_dataframe()

In [7]:
# Show where the Athena query results were written
print(f's3://{bucket}/{prefix}/fs_query_results/')

s3://sagemaker-us-east-1-241215432415/sagemaker/sagemaker-clarify-test/fs_query_results/


In [8]:
# Preview the first rows of the data returned by the feature-group query
model_data.head()

Unnamed: 0,outcome,gender_male,gender_female,gender_other,age_range_18-24,age_range_25-34,age_range_over_34,age_range_10-17,age_range_under_10,date_month_1,...,date_quarter_2,ethnicity_white,ethnicity_black,ethnicity_asian,ethnicity_other,ethnicity_mixed,coords,write_time,api_invocation_time,is_deleted
0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.866025,...,1.192488e-08,0.0,0.0,0.0,0.0,0.0,-0.036181,2023-01-16 14:52:15.097,2023-01-16 14:52:15.097,False
1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.866025,...,1.192488e-08,0.0,0.0,0.0,0.0,0.0,-0.028591,2023-01-16 14:52:15.097,2023-01-16 14:52:15.097,False
2,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.866025,...,1.192488e-08,0.0,0.0,0.0,0.0,0.0,-0.035673,2023-01-16 14:52:15.097,2023-01-16 14:52:15.097,False
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.866025,...,1.192488e-08,0.0,0.0,0.0,0.0,0.0,-0.037587,2023-01-16 14:52:15.097,2023-01-16 14:52:15.097,False
4,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.866025,...,1.192488e-08,0.0,0.0,1.0,0.0,0.0,-0.037587,2023-01-16 14:52:15.097,2023-01-16 14:52:15.097,False


In [9]:
# Remove the three Feature Store bookkeeping columns; they are not model features
model_data = model_data.drop(columns=['write_time', 'api_invocation_time', 'is_deleted'])

In [10]:
# Preview again to confirm write_time, api_invocation_time and is_deleted are gone
model_data.head()

Unnamed: 0,outcome,gender_male,gender_female,gender_other,age_range_18-24,age_range_25-34,age_range_over_34,age_range_10-17,age_range_under_10,date_month_1,...,date_day_of_year_1,date_day_of_year_2,date_quarter_1,date_quarter_2,ethnicity_white,ethnicity_black,ethnicity_asian,ethnicity_other,ethnicity_mixed,coords
0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0.0,0.0,0.0,0.0,-0.036181
1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0.0,0.0,0.0,0.0,-0.028591
2,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0.0,0.0,0.0,0.0,-0.035673
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0.0,0.0,0.0,0.0,-0.037587
4,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0.0,1.0,0.0,0.0,-0.037587


In [11]:
# Store ethnicity_black as integers (0/1) instead of floats; it is the facet column
# used by the Clarify bias analysis later in the notebook
model_data = model_data.astype({'ethnicity_black': int})

In [12]:
# Preview again to check that ethnicity_black is now shown as an integer column
model_data.head()

Unnamed: 0,outcome,gender_male,gender_female,gender_other,age_range_18-24,age_range_25-34,age_range_over_34,age_range_10-17,age_range_under_10,date_month_1,...,date_day_of_year_1,date_day_of_year_2,date_quarter_1,date_quarter_2,ethnicity_white,ethnicity_black,ethnicity_asian,ethnicity_other,ethnicity_mixed,coords
0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,0.0,0.0,0.0,-0.036181
1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,0.0,0.0,0.0,-0.028591
2,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,0.0,0.0,0.0,-0.035673
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,0.0,0.0,0.0,-0.037587
4,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,1.0,0.0,0.0,-0.037587


In [13]:
# needed for SageMaker Clarify: the column names, in order, as a plain Python list
columns_as_list = model_data.columns.tolist()

print(columns_as_list)

['outcome', 'gender_male', 'gender_female', 'gender_other', 'age_range_18-24', 'age_range_25-34', 'age_range_over_34', 'age_range_10-17', 'age_range_under_10', 'date_month_1', 'date_month_2', 'date_day_1', 'date_day_2', 'date_hour_1', 'date_hour_2', 'date_week_of_year_1', 'date_week_of_year_2', 'date_day_of_year_1', 'date_day_of_year_2', 'date_quarter_1', 'date_quarter_2', 'ethnicity_white', 'ethnicity_black', 'ethnicity_asian', 'ethnicity_other', 'ethnicity_mixed', 'coords']


In [14]:
# needed for SageMaker Clarify

# Write the DataFrame to a local CSV, keeping the header row and omitting the index
model_data.to_csv('output.csv', header=True, index=False)

In [15]:
# Prepare the data for SageMaker's built-in XGBoost algorithm (the container trained below).
# The training inputs are supplied in CSV format.
# Note that the first column must be the target variable and the CSV should not include headers. 

In [16]:
# Replace the column names with positional integers 0..n-1, so column 0 is the target
# and the frame no longer carries named headers
model_data.columns = range(len(model_data.columns))

model_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,0.0,0.0,0.0,-0.036181
1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,0.0,0.0,0.0,-0.028591
2,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,0.0,0.0,0.0,-0.035673
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,0.0,0.0,0.0,-0.037587
4,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.866025,...,-0.551102,0.834438,-1.0,1.192488e-08,0.0,0,1.0,0.0,0.0,-0.037587


In [17]:
# Separate the data into train / validation / test splits.
#
# The model is trained on 70% of the data and evaluated on a further 20% (validation) to
# estimate the accuracy we can expect on "new" data. The remaining 10% is held back as a
# final test set that is used later on.

# Shuffle the rows reproducibly (fixed random_state), then take the first 70%, the next 20%
# and the last 10% by position. Plain .iloc slicing selects the same rows as
# np.split(shuffled, [70%, 90%]) without routing a DataFrame through numpy's splitter,
# which relies on the deprecated DataFrame.swapaxes in recent pandas releases.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(shuffled_data))
validation_end = int(0.9 * len(shuffled_data))

# .copy() makes each split an independent frame, so later edits to one split
# (e.g. dropping the target from the test set) never touch the others.
train_data = shuffled_data.iloc[:train_end].copy()
validation_data = shuffled_data.iloc[train_end:validation_end].copy()
test_data = shuffled_data.iloc[validation_end:].copy()

# The shuffled intermediate is no longer needed; release its memory.
del shuffled_data

In [18]:
# Write each split to a local CSV without header row or index column
for _frame, _filename in (
    (train_data, 'train.csv'),
    (validation_data, 'validation.csv'),
    (test_data, 'test.csv'),
):
    _frame.to_csv(_filename, index=False, header=False)

In [19]:
# Copy the three CSV splits to S3 so SageMaker can access them.
# S3 object keys always use '/' as the separator, so the keys are built explicitly rather
# than with os.path.join (which uses the host operating system's separator).
# A single S3 bucket handle is reused for all three uploads.
_s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
for _split in ('train', 'validation', 'test'):
    # e.g. <prefix>/train/train.csv  <-  ./train.csv
    _s3_bucket.Object(f'{prefix}/{_split}/{_split}.csv').upload_file(f'{_split}.csv')

# Training

In [20]:
# Look up the ECR image URI of Amazon SageMaker's built-in XGBoost container
# for the region of the current boto3 session
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='latest',
)

In [21]:
# Because we're training with the CSV file format, create TrainingInput objects that the
# training job can use as pointers to the files in S3, and declare the content type as CSV.
# Every channel prefix ends with '/' so that it only matches keys inside that folder
# (a bare '.../train' prefix would also match sibling keys such as '.../train_old/...').
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')
s3_input_test = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/test/'.format(bucket, prefix), content_type='csv')

In [22]:
# Session used to launch the training job
sess = sagemaker.Session()

# Estimator around the XGBoost container; model artifacts are written under <prefix>/output
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sess,
)

# Binary classifier with a logistic objective, scored by AUC, trained for 100 rounds
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    eval_metric='auc',
    num_round=100,
)

# Fit on the 'train' channel, with the 'validation' channel supplied alongside it
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

INFO:sagemaker:Creating training-job with name: xgboost-2023-03-12-17-31-26-139


2023-03-12 17:31:26 Starting - Starting the training job...
2023-03-12 17:31:51 Starting - Preparing the instances for training......
2023-03-12 17:32:55 Downloading - Downloading input data......
2023-03-12 17:33:41 Training - Downloading the training image...
2023-03-12 17:34:21 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2023-03-12:17:34:33:INFO] Running standalone xgboost training.[0m
[34m[2023-03-12:17:34:33:INFO] File size need to be processed in the node: 597.54mb. Available memory size in the node: 8597.08mb[0m
[34m[2023-03-12:17:34:33:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:34:33] S3DistributionType set as FullyReplicated[0m
[34m[17:34:35] 1762710x26 matrix with 45830460 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-03-12:17:34:35:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:34:35] S3DistributionType set as FullyReplicated[0m
[

# Model Hosting w/ Endpoint Creation & Model Evaluation
Now that we have successfully trained our model, let’s deploy it and see how it does on the test data!

In [23]:
# Host the trained model on a single ml.m4.xlarge instance and keep the returned predictor
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

INFO:sagemaker:Creating model with name: xgboost-2023-03-12-17-38-19-679
INFO:sagemaker:Creating endpoint-config with name xgboost-2023-03-12-17-38-19-679
INFO:sagemaker:Creating endpoint with name xgboost-2023-03-12-17-38-19-679


-------!

In [24]:
#need to make sure data is in correct format for deployed model
# NOTE(review): csv_serializer / json_deserializer are imported here but never referenced in
# the visible cells; the serializer classes assigned below are what is actually used.
from sagemaker.predictor import csv_serializer, json_deserializer

# Send request payloads as CSV and parse the endpoint's responses as JSON
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()
xgb_predictor.deserializer = sagemaker.deserializers.JSONDeserializer()

In [25]:
# Preview the held-out test split (at this point column 0 is still the target)
test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
880546,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.866025,...,-0.96015,-0.279485,-8.742278e-08,-1.0,1.0,0,0.0,0.0,0.0,-0.009349
2085095,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,...,0.17083,-0.985301,1.0,-4.371139e-08,1.0,0,0.0,0.0,0.0,-0.001748
1734339,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,...,0.522133,-0.852864,1.0,-4.371139e-08,1.0,0,0.0,0.0,0.0,-0.057999
1494162,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5,...,0.08573,-0.996318,1.0,-4.371139e-08,1.0,0,0.0,0.0,0.0,-0.026637
220295,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.5,...,-0.507415,0.861702,-1.0,1.192488e-08,1.0,0,0.0,0.0,0.0,-0.057631


# Inference

In [26]:
# run the prediction on a single observation
# The array holds the 26 feature values of one row, in column order, without the target column.
prediction = xgb_predictor.predict(np.array([1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.8660252094268799,0.5000002980232239,0.2993628978729248,
                                             -0.9541393518447876,0.7071067690849304,0.7071067690849304,-0.8124867677688599,0.582979679107666,
                                             -0.7221164703369141,0.6917715072631836,-1.0,1.1924880638503056e-08,1.0,0.0,0.0,0.0,0.0,
                                             -0.0260128165460627]))
       
# The response is the model's predicted probability that 'outcome' is 1
# (the model was trained with objective='binary:logistic').
# The actual target value for this observation is 1.
prediction

0.7085631489753723

In [27]:
# run the prediction on a single observation
# The array holds the 26 feature values of one row, in column order, without the target column.
prediction = xgb_predictor.predict(np.array([1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.866025447845459,0.4999999701976776,0.7247928380966187,
                                             0.6889668703079224,-0.4999997615814209,0.8660255670547485,0.8124868869781494,0.5829794406890869,
                                             0.8826788663864136,0.4699766635894775,0.0,1.0,1.0,0.0,0.0,0.0,0.0,-0.0005222935888651238]))
       
# The response is the model's predicted probability that 'outcome' is 1
# (the model was trained with objective='binary:logistic').
# The actual target value for this observation is 0.
prediction

0.6435620188713074

In [28]:
# Drop the target (column label 0, the first column) from the test data: the rows sent to
# the endpoint for inference must contain features only.
# The column is dropped by label and the result is rebound (no inplace=True), so re-running
# this cell is a no-op instead of silently stripping the next feature column.
test_data = test_data.drop(columns=[0], errors='ignore')

In [29]:
# Preview again to confirm the target column has been removed from the test data
test_data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,17,18,19,20,21,22,23,24,25,26
880546,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.866025,-0.5,...,-0.96015,-0.279485,-8.742278e-08,-1.0,1.0,0,0.0,0.0,0.0,-0.009349
2085095,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,-0.866026,...,0.17083,-0.985301,1.0,-4.371139e-08,1.0,0,0.0,0.0,0.0,-0.001748
1734339,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,-0.866026,...,0.522133,-0.852864,1.0,-4.371139e-08,1.0,0,0.0,0.0,0.0,-0.057999
1494162,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5,-0.866026,...,0.08573,-0.996318,1.0,-4.371139e-08,1.0,0,0.0,0.0,0.0,-0.026637
220295,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.5,0.866026,...,-0.507415,0.861702,-1.0,1.192488e-08,1.0,0,0.0,0.0,0.0,-0.057631


In [30]:
#run predictions on the entire test data set
#It takes about 30 minutes for this code to execute

def predict(data, predictor, rows=500):
    """Score every observation in ``data`` with ``predictor``, one request per row.

    Args:
        data: Array-like of observations, one row per observation (anything
            ``np.asarray`` accepts, e.g. a 2-D ``numpy.ndarray`` or a DataFrame).
        predictor: Object exposing ``predict(observation)``; its result for a single
            observation must be convertible with ``float()``.
        rows: Unused. Kept so existing callers that pass a chunk size keep working;
            every observation is sent individually, so chunking never affected the result.

    Returns:
        1-D float ``numpy.ndarray`` holding one prediction per observation, in input
        order (empty when ``data`` has no rows).
    """
    # Collect the scores in a list rather than growing a comma-joined string and
    # re-parsing it with np.fromstring: same values, linear instead of quadratic work.
    scores = [float(predictor.predict(observation)) for observation in np.asarray(data)]
    return np.array(scores, dtype=float)

#numpy array of predictions
# Convert the test DataFrame to a numpy array and score every row against the deployed endpoint
predictions = predict(test_data.to_numpy(), xgb_predictor)
print(predictions)

[0.60838455 0.64816582 0.59278286 ... 0.58237094 0.41627416 0.50758809]


In [31]:
# Predicted score for the first test observation
print(predictions[0])

0.6083845496177673


In [32]:
# Predicted score for the second test observation
print(predictions[1])

0.6481658220291138


In [33]:
# Predicted score for the third test observation
print(predictions[2])

0.5927828550338745


# SageMaker Clarify

In [46]:
from sagemaker import clarify

# Processor that runs the Clarify analysis job on a single ml.m5.xlarge instance
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=feature_store_session,
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [47]:
# S3 prefix that receives the Clarify bias report
bias_report_output_path = f"s3://{bucket}/{prefix}/clarify-bias"

# Dataset description for Clarify: the CSV file 'output.csv', its column names,
# and 'outcome' as the label column.
# NOTE(review): the input is a local file path rather than an s3:// URI; confirm that
# the SDK version in use accepts (and uploads) local inputs.
bias_data_config = clarify.DataConfig(
    s3_data_input_path='output.csv',
    s3_output_path=bias_report_output_path,
    label="outcome",
    headers=columns_as_list,
    dataset_type="text/csv",
)

In [48]:
model_config = clarify.ModelConfig(
    # Pass the endpoint *name* (a string), not the predictor object, or you will see a pickle error.
    # The name is read from the deployed predictor instead of being hard-coded: endpoint names
    # are timestamped, so a literal name stops matching as soon as the model is re-deployed.
    endpoint_name=xgb_predictor.endpoint_name,
    accept_type="text/csv",
    content_type="text/csv",
)

In [49]:
# Probability threshold (0.8) passed to Clarify for turning the endpoint's probability
# output into a predicted label.
# NOTE(review): confirm 0.8 is intended; the single-observation scores shown earlier are below it.
predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.8)

In [53]:
# Bias analysis setup: which label value counts as the positive outcome, and which
# column/value identifies the group (facet) whose treatment is being examined.
bias_config = clarify.BiasConfig(
    label_values_or_threshold=[0], #the positive outcome is 0, b/c no crime is positive
    facet_name='ethnicity_black', 
    facet_values_or_threshold=[1])

In [None]:
# Launch the Clarify bias job, requesting every pre-training metric (computed on the dataset)
# and every post-training metric (computed with the deployed endpoint's predictions).
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    pre_training_methods="all",
    post_training_methods="all",
)

INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['outcome', 'gender_male', 'gender_female', 'gender_other', 'age_range_18-24', 'age_range_25-34', 'age_range_over_34', 'age_range_10-17', 'age_range_under_10', 'date_month_1', 'date_month_2', 'date_day_1', 'date_day_2', 'date_hour_1', 'date_hour_2', 'date_week_of_year_1', 'date_week_of_year_2', 'date_day_of_year_1', 'date_day_of_year_2', 'date_quarter_1', 'date_quarter_2', 'ethnicity_white', 'ethnicity_black', 'ethnicity_asian', 'ethnicity_other', 'ethnicity_mixed', 'coords'], 'label': 'outcome', 'label_values_or_threshold': [0], 'facet': [{'name_or_index': 'ethnicity_black', 'value_or_threshold': [1]}], 'methods': {'report': {'name': 'report', 'title': 'Analysis Report'}, 'pre_training_bias': {'methods': 'all'}, 'post_training_bias': {'methods': 'all'}}, 'predictor': {'endpoint_name': 'xgboost-2023-03-12-17-38-19-679', 'accept_type': 'text/csv', 'content_type': 'text/csv'}, 'probability_threshold': 0.8}
IN

............................[34m2023-03-12 23:04:49,420 Calculated global analysis with predictor[0m
[34m2023-03-12 23:04:49,420 Stop using endpoint: xgboost-2023-03-12-17-38-19-679[0m
[34m2023-03-12 23:04:49,420 Model endpoint delivered 0.00084 requests per second and a total of 2 requests over 2372 seconds[0m
[34m2023-03-12 23:04:49,420 Calculating pre-training bias metrics[0m
[34m2023-03-12 23:04:49,435 Column outcome with data uniqueness fraction 7.942313389390181e-07 is classifed as a CATEGORICAL column[0m
[34m2023-03-12 23:04:49,474 Column ethnicity_black with data uniqueness fraction 7.942313389390181e-07 is classifed as a CATEGORICAL column[0m
  df = df.drop(facet_column.name, 1)[0m
[34m2023-03-12 23:04:49,676 Column outcome with data uniqueness fraction 7.942313389390181e-07 is classifed as a CATEGORICAL column[0m
[34m2023-03-12 23:04:49,702 CDDL metrics failed[0m
[34mTraceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/smclarify/

# Clean-up
If you’re ready to be done with this notebook, please run the cell below. This will remove the hosted endpoint you created and avoid any charges from a stray instance being left on.

In [None]:
# Delete the hosted endpoint created by xgb.deploy() so it stops incurring charges
xgb_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: xgboost-2023-03-12-17-38-19-679
INFO:sagemaker:Deleting endpoint with name: xgboost-2023-03-12-17-38-19-679
