In [None]:
%%sh
# Environment setup: refresh the apt package index and install system build tools,
# then upgrade pip and the Python packages used later in this notebook.
apt-get update
# NOTE(review): build-essential is presumably needed to compile a Python dependency
# from source (e.g. shap) — confirm it is still required.
apt-get install -y build-essential
pip -q install --upgrade pip
# smdebug is imported below to read the Debugger output; shap draws the SHAP summary plot.
pip -q install smdebug shap --upgrade

In [None]:
import pandas as pd

# Load the housing data from the local file 'housing.csv' into a DataFrame.
dataset = pd.read_csv('housing.csv')

In [None]:
# Show the (rows, columns) size, then preview the first five rows.
print(dataset.shape)
dataset.head(5)

In [None]:
# Reorder the columns so that the target 'medv' comes first, followed by every
# other column in its original order.
# NOTE(review): presumably the training container expects the label in the
# first CSV column — confirm.
target_column = 'medv'
feature_columns = [name for name in dataset.columns if name != target_column]
dataset = dataset[[target_column] + feature_columns]

In [None]:
# 90/10 split: sample the training rows with a fixed seed and keep every row
# that was not sampled for validation.
training_dataset = dataset.sample(frac=0.90, random_state=59)
in_training = dataset.index.isin(training_dataset.index)
validation_dataset = dataset[~in_training]
print(training_dataset.shape, validation_dataset.shape, sep='\n')

In [None]:
# Write both splits as bare CSV files: no index column and no header row.
csv_options = dict(index=False, header=False)
training_dataset.to_csv('training_dataset.csv', **csv_options)
validation_dataset.to_csv('validation_dataset.csv', **csv_options)

In [None]:
import sagemaker

# Open a SageMaker session and look up its default bucket (used for the output paths later).
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Upload each CSV file under <prefix>/input/<channel>; the returned locations
# are used as the S3 inputs of the training channels.
prefix = 'boston-housing'
input_prefix = prefix + '/input'
training_data_path = sess.upload_data(path='training_dataset.csv', key_prefix=input_prefix + '/training')
validation_data_path = sess.upload_data(path='validation_dataset.csv', key_prefix=input_prefix + '/validation')

print(training_data_path, validation_data_path, sep='\n')

In [None]:
# Both channels read CSV data from S3; the dictionary keys name the channels handed to fit().
csv_content_type = 'text/csv'
training_data_channel = sagemaker.TrainingInput(s3_data=training_data_path, content_type=csv_content_type)
validation_data_channel = sagemaker.TrainingInput(s3_data=validation_data_path, content_type=csv_content_type)

xgb_data = dict(train=training_data_channel, validation=validation_data_channel)

In [None]:
import boto3
from sagemaker import image_uris

# Look up the XGBoost 1.3-1 container image for the region of the current boto3 session.
region = boto3.Session().region_name
container = image_uris.retrieve(
    framework='xgboost',
    region=region,
    version='1.3-1',
    instance_type='ml.m5.large',
)
print(container)

In [None]:
from sagemaker.estimator import Estimator
from sagemaker.debugger import rule_configs, Rule

role = sagemaker.get_execution_role()

# Built-in Debugger rules attached to this training job.
debugger_rules = [
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
]

# Single ml.m5.large training instance; model artifacts go under <prefix>/output in the bucket.
xgb_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    rules=debugger_rules,
)

In [None]:
# Squared-error regression, 50 boosting rounds.
# 'reg:squarederror' is the current name of this objective; the former
# 'reg:linear' spelling is a deprecated alias in XGBoost.
xgb_estimator.set_hyperparameters(objective='reg:squarederror', num_round=50)

In [None]:
# Train using the 'train' and 'validation' channels defined in xgb_data.
xgb_estimator.fit(xgb_data)

In [None]:
# Fetch the per-rule summaries of the latest training job.
description = xgb_estimator.latest_training_job.rule_job_summary()

# Print each summary without the noisy fields. Filtering into a new dict
# (rather than pop()) tolerates a summary that lacks one of these fields and
# leaves the entries of `description` unmodified.
hidden_fields = ('LastModifiedTime', 'RuleEvaluationJobArn')
for rule in description:
    print({key: value for key, value in rule.items() if key not in hidden_fields})

In [None]:
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig

# Save interval shared by every collection configured below.
save_interval = '1'

# Debugger collections to save during training.
saved_collections = ['metrics', 'average_shap', 'full_shap', 'feature_importance']

# Debugger output goes under <prefix>/debug in the bucket.
hook_config = DebuggerHookConfig(
    s3_output_path='s3://{}/{}/debug'.format(bucket, prefix),
    collection_configs=[
        CollectionConfig(name=collection, parameters={"save_interval": save_interval})
        for collection in saved_collections
    ],
)

# Same container, role and instance settings as before, now with the Debugger hook attached.
xgb_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    debugger_hook_config=hook_config,
)

In [None]:
# Squared-error regression, 50 boosting rounds.
# 'reg:squarederror' is the current name of this objective; the former
# 'reg:linear' spelling is a deprecated alias in XGBoost.
xgb_estimator.set_hyperparameters(objective='reg:squarederror', num_round=50)

In [None]:
# Train the Debugger-enabled estimator on the 'train' and 'validation' channels in xgb_data.
xgb_estimator.fit(xgb_data)

In [None]:
from smdebug.trials import create_trial

# Ask the estimator where the latest job's Debugger artifacts are stored;
# this path is what create_trial() is given in the next cell.
s3_output_path = xgb_estimator.latest_job_debugger_artifacts_path()
print(s3_output_path)

In [None]:
# Open the saved Debugger output as a trial object, used below to query tensors.
trial = create_trial(s3_output_path)

In [None]:
# List the names of the tensors available in the trial.
trial.tensor_names()

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt

# Read both RMSE series at the steps recorded for the training metric.
train_tensor = trial.tensor('train-rmse')
val_tensor = trial.tensor('validation-rmse')
steps = train_tensor.steps()
train_rmse = [train_tensor.value(step) for step in steps]
val_rmse = [val_tensor.value(step) for step in steps]

# Training curve in black, validation curve in grey, on one set of axes.
plt.title('RMSE over steps')
plt.autoscale()
plt.plot(steps, train_rmse, color='black', label='train')
plt.plot(steps, val_rmse, color='grey', label='val')
plt.legend()

In [None]:
# List only the tensor names that belong to the 'feature_importance' collection.
trial.tensor_names(collection="feature_importance")

In [None]:
def plot_features(tensor_prefix):
    """Plot, for every feature, the tensor ``<tensor_prefix>/f<i>`` over its saved steps.

    Feature ``i`` is labelled with ``dataset.columns[i + 1]`` because the first
    column of ``dataset`` is the target ('medv' was moved to the front). The
    number of features is derived from ``dataset`` rather than hard-coded, and a
    feature whose tensor is not present in the trial is skipped instead of
    raising.

    Args:
        tensor_prefix: Prefix of the tensor names to plot, e.g. 'average_shap'
            or 'feature_importance/weight'. It is also used as the plot title.

    Relies on the notebook globals ``trial``, ``dataset`` and ``plt``.
    """
    feature_names = list(dataset.columns[1:])
    # Only tensors that were actually saved can be read back from the trial.
    recorded = set(trial.tensor_names())
    for i, feature_name in enumerate(feature_names):
        f_name = tensor_prefix + '/f' + str(i)
        if f_name not in recorded:
            continue
        # Look the tensor up once and reuse it for both the steps and the values.
        tensor = trial.tensor(f_name)
        steps = tensor.steps()
        values = [tensor.value(s) for s in steps]
        plt.plot(steps, values, label=feature_name)
    plt.autoscale()
    plt.title(tensor_prefix)
    plt.legend(loc='upper left')
    plt.show()

In [None]:
# Plot the 'average_shap/f<i>' tensor of each feature over the saved steps.
plot_features('average_shap')

In [None]:
# Plot the 'feature_importance/weight/f<i>' tensor of each feature over the saved steps.
plot_features('feature_importance/weight')

In [None]:
import shap

# Read the 'full_shap/f0' tensor as saved at the last completed step.
shap_values = trial.tensor("full_shap/f0").value(trial.last_complete_step)
# Drop the last column before plotting.
# NOTE(review): the variable names suggest the last column holds the SHAP base
# value and the remaining columns hold per-feature contributions — confirm
# against the smdebug version in use.
shap_no_base = shap_values[:, :-1]
# Last-column entry of the first row; not used again in the visible cells.
shap_base_value = shap_values[0, -1]
# Bar summary plot, labelled with every column after the leading 'medv' target.
shap.summary_plot(shap_no_base, plot_type='bar', feature_names=dataset.columns[1:])

In [None]:
from time import strftime, gmtime
# UTC day-hour-minute-second stamp, appended to the fixed 'xgb-demo' base name.
utc_now = gmtime()
timestamp = strftime('%d-%H-%M-%S', utc_now)

endpoint_name = '{}-{}'.format('xgb-demo', timestamp)
print(endpoint_name)

In [None]:
# Deploy the trained model to an endpoint named `endpoint_name`,
# backed by a single ml.t2.medium instance.
xgb_predictor = xgb_estimator.deploy(endpoint_name=endpoint_name, 
                        initial_instance_count=1, 
                        instance_type='ml.t2.medium')

In [None]:
# One sample to score: 12 comma-separated values in CSV form.
test_sample = '0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98'

In [None]:
# Configure the predictor to send requests as CSV and to parse CSV responses.
xgb_predictor.serializer = sagemaker.serializers.CSVSerializer()
xgb_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

# Score the single CSV sample.
response = xgb_predictor.predict(test_sample)
print(response)

In [None]:
# Two CSV rows, passed as a list in a single predict() call.
test_samples = ['0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98',
                '0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,9.14']

response = xgb_predictor.predict(test_samples)
print(response)

In [None]:
# Call the same endpoint through the low-level boto3 runtime client instead of
# the SDK predictor. 'sagemaker-runtime' is the documented boto3 service name;
# 'runtime.sagemaker' is a legacy alias for it.
runtime = boto3.Session().client(service_name='sagemaker-runtime')

# Send the single CSV sample, declaring its content type explicitly.
response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                  ContentType='text/csv',
                                  Body=test_sample)

# The response body is read in full and printed as returned.
print(response['Body'].read())

In [None]:
# Clean up: delete the endpoint behind this predictor.
xgb_predictor.delete_endpoint()