This notebook was developed using the `Python 3 (Data Science)` kernel on an `ml.t3.medium` instance.

In [None]:
!pip install -q sagemaker-experiments

In [None]:
import sagemaker
import boto3

# Local working folder; its name is also the last segment of the S3 key prefix.
local_prefix = 'abalone'
prefix = f'sagemaker-studio-book/chapter10/{local_prefix}'

# SageMaker session and the values derived from it.
sess = sagemaker.Session()
bucket = sess.default_bucket()
region = sess.boto_region_name

# Execution role handed to the training job created later in this notebook.
role = sagemaker.get_execution_role()

In [None]:
from datetime import datetime, timedelta, timezone
import json, os, re, uuid
from time import sleep, gmtime, strftime
from threading import Thread

import pandas as pd
import numpy as np

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from botocore.exceptions import ClientError

from sagemaker import image_uris
from sagemaker.s3 import S3Downloader, S3Uploader
from sagemaker.predictor import Predictor
from sagemaker.processing import ProcessingJob
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

## Getting data

In [None]:
# column names taken from https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names
columns = [
    'Sex', 'Length', 'Diameter', 'Height', 'WholeWeight',
    'ShuckedWeight', 'VisceraWeight', 'ShellWeight', 'Rings',
]

# Passing `names=` makes pandas use these labels and treat every line of the
# file as data.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',
    names=columns,
)

In [None]:
df_processed = df.copy()
# Convert Rings to float so that model prediction (regression) and
# the ground truth are both of float type for model monitor to work with
df_processed['Rings'] = df_processed['Rings'].astype(float)

# Encode Sex with an explicit mapping and cast. A list-to-list `replace` on an
# object column only yields a float column through implicit downcasting, which
# newer pandas versions deprecate.
sex_encoding = {'M': 2., 'F': 1., 'I': 0.}
df_processed['Sex'] = df_processed['Sex'].map(sex_encoding).astype(float)
# `map` turns unknown categories into NaN; fail loudly instead of training on them.
assert df_processed['Sex'].notna().all(), 'unexpected value in the Sex column'

# moving the target Rings to the first so that we can train with XGBoost.
columns = ['Rings', 'Sex', 'Length', 'Diameter', 'Height', 'WholeWeight',
           'ShuckedWeight', 'VisceraWeight', 'ShellWeight']
df_processed = df_processed[columns]

In [None]:
from sklearn.model_selection import train_test_split

# Options shared by both splits so they stay reproducible and consistent.
split_options = dict(random_state=42, shuffle=True)

# First hold out 10% as the test set, stratified on Sex ...
df_build, df_test = train_test_split(
    df_processed,
    test_size=0.1,
    stratify=df_processed['Sex'],
    **split_options,
)
# ... then take 1/9 of the remaining 90% for validation, which gives an
# 80/10/10 train/validation/test split overall.
df_train, df_val = train_test_split(
    df_build,
    test_size=1/9.,
    stratify=df_build['Sex'],
    **split_options,
)

In [None]:
# Feature columns (everything except the target `Rings`), in the order in
# which they are sent to the endpoint.
columns_no_target = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'WholeWeight',
    'ShuckedWeight',
    'VisceraWeight',
    'ShellWeight',
]

In [None]:
os.makedirs(local_prefix, exist_ok=True)
desired_s3_uri = f's3://{bucket}/{prefix}/data'

# Write each split to a local CSV and upload it to the shared S3 data folder.
# NOTE(review): the CSVs keep the pandas default header row. SageMaker's
# built-in XGBoost documents CSV training input as header-less -- confirm the
# header is intended (e.g. for later Model Monitor baselining).
uploaded_split_uris = {}
for split_name, split_df in (('train', df_train), ('val', df_val), ('test', df_test)):
    local_csv_path = f'./{local_prefix}/abalone_{split_name}.csv'
    split_df.to_csv(local_csv_path, index=False)
    uploaded_split_uris[split_name] = sagemaker.s3.S3Uploader.upload(
        local_path=local_csv_path,
        desired_s3_uri=desired_s3_uri,
        sagemaker_session=sess,
    )

train_data_s3 = uploaded_split_uris['train']
val_data_s3 = uploaded_split_uris['val']
test_data_s3 = uploaded_split_uris['test']

## Train an ML model to predict `Rings`

In [None]:
# UTC timestamp that makes the names derived from it unique per run.
exp_datetime = strftime('%Y-%m-%d-%H-%M-%S', gmtime())
jobname = f'abalone-xgb-{exp_datetime}'

# Container image of the built-in XGBoost algorithm for the current region.
image = image_uris.retrieve(framework='xgboost', region=region, version='1.3-1')

experiment_name = 'abalone-age-prediction'

try:
    experiment = Experiment.create(
        experiment_name=experiment_name,
        description='Predicting age for abalone based on physical measurements.')
except ClientError as e:
    # Only a name collision is expected here. Any other failure (permissions,
    # throttling, ...) must surface instead of being reported as "already exists".
    if 'already exists' not in e.response.get('Error', {}).get('Message', ''):
        raise
    print(f'{experiment_name} experiment already exists! Reusing the existing experiment.')
    # Bind `experiment` on this path too, so later cells can rely on it.
    experiment = Experiment.load(experiment_name=experiment_name)

# Creating a new trial for the experiment
exp_trial = Trial.create(experiment_name=experiment_name,
                         trial_name=jobname)

# Passed to `fit()` so the training job is recorded under this experiment/trial.
experiment_config = {'ExperimentName': experiment_name,
                     'TrialName': exp_trial.trial_name,
                     'TrialComponentDisplayName': 'Training'}

# S3 location that receives the model artifact of the training job.
train_s3_output = f's3://{bucket}/{prefix}/abalone_data/training'

xgb = sagemaker.estimator.Estimator(
    image_uri=image,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=train_s3_output,
    sagemaker_session=sess,
    enable_sagemaker_metrics=True,
)

# Regression on Rings with squared-error loss, 20 boosting rounds.
xgb.set_hyperparameters(num_round=20, objective='reg:squarederror')

# Both channels point at the CSV files uploaded earlier.
train_input = sagemaker.inputs.TrainingInput(content_type='csv',
                                             s3_data=train_data_s3)
val_input = sagemaker.inputs.TrainingInput(content_type='csv',
                                           s3_data=val_data_s3)
data_channels = dict(train=train_input, validation=val_input)

# Blocks until the training job finishes (wait=True).
xgb.fit(
    inputs=data_channels,
    job_name=jobname,
    experiment_config=experiment_config,
    wait=True,
)

## Deploy the model with data capture

In [None]:
# S3 prefixes and URIs used for data capture, ground truth and reports
data_capture_prefix = f'{prefix}/datacapture'
reports_prefix = f'{prefix}/reports'

s3_capture_upload_path = f's3://{bucket}/{data_capture_prefix}'
ground_truth_upload_path = f's3://{bucket}/{prefix}/ground-truth-data/{exp_datetime}'
s3_report_path = f's3://{bucket}/{reports_prefix}'

for path_label, path_uri in (('Capture path', s3_capture_upload_path),
                             ('Ground truth path', ground_truth_upload_path),
                             ('Report path', s3_report_path)):
    print(f'{path_label}: {path_uri}')

In [None]:
from sagemaker.model_monitor import DataCaptureConfig

# Capture 100% of the endpoint traffic into the data-capture S3 location.
data_capture_config = DataCaptureConfig(
    destination_s3_uri=s3_capture_upload_path,
    sampling_percentage=100,
    enable_capture=True,
)

In [None]:
endpoint_name = f'abalone-xgb-{exp_datetime}'
print(f'EndpointName: {endpoint_name}')

# Host the trained model on a single instance with data capture enabled;
# requests are serialized as CSV.
deploy_settings = dict(
    endpoint_name=endpoint_name,
    instance_type='ml.m5.large',
    initial_instance_count=1,
    serializer=CSVSerializer(),
    data_capture_config=data_capture_config,
)
predictor = xgb.deploy(**deploy_settings)

## Create predictions for the validation set as the model quality baseline dataset

In [None]:
# Separate predictor with a CSV deserializer so the response is parsed into rows.
predictor_np = Predictor(endpoint_name=endpoint_name,
                         sagemaker_session=sess,
                         serializer=CSVSerializer(),
                         deserializer=CSVDeserializer())

pred = predictor_np.predict(df_val[columns_no_target].values)
# The deserialized response is a list of rows. Flatten every row instead of
# reading only pred[0], so this works whether the predictions come back on a
# single comma-separated line or one per line.
pred_f = [float(value) for row in pred for value in row if value.strip()]
assert len(pred_f) == len(df_val), (
    f'expected {len(df_val)} predictions, got {len(pred_f)}')

# Rebind instead of assigning into the split in place: avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
df_val = df_val.assign(Prediction=pred_f)

# Ground truth and prediction side by side (with header) form the baseline dataset.
model_quality_baseline_suffix = f'{local_prefix}/abalone_val_model_quality_baseline.csv'
df_val[['Rings', 'Prediction']].to_csv(model_quality_baseline_suffix, index=False)
model_quality_baseline_s3 = sagemaker.s3.S3Uploader.upload(local_path=model_quality_baseline_suffix,
                                                           desired_s3_uri=desired_s3_uri,
                                                           sagemaker_session=sess)

## Establish a persistent load with randomness and ground truth

In [None]:
def add_randomness(series, probability = 0.1):
    """Return a perturbed copy of one feature vector.

    Each element is selected independently with the given probability; selected
    elements are multiplied by a draw from N(1, 0.5), unselected ones by
    exactly 1 (unchanged).

    The first element (Sex) is categorical and is never scaled: if its slot was
    not selected it is replaced by a random category from {0, 1, 2}, otherwise
    the original value is kept.

    Args:
        series: 1-D numpy array of feature values, with Sex at index 0.
        probability: per-element chance of being selected for scaling.

    Returns:
        A new numpy array; the input is not modified.
    """
    random_rate=(np.random.rand(series.shape[0])<probability).astype(float)
    sigma_scale=0.5

    # scale is 0 where random_rate is 0, so those elements are multiplied by exactly 1.
    new_series = series * np.random.normal(loc=1, scale=sigma_scale*random_rate,
                                           size=series.shape)

    # NOTE(review): with this condition Sex is randomized whenever its slot was
    # NOT selected (about 1 - probability of the time) -- confirm this is intended.
    if random_rate[0] != 1.:
        # randint's upper bound is exclusive: 3 is required to draw from
        # {0, 1, 2}, the three encoded Sex categories.
        new_series[0] = float(np.random.randint(0, 3))
    else:
        new_series[0] = series[0]

    return new_series


def drop_randomly(series, probability = 0.05):
    """Return a copy of `series` in which each element is independently
    replaced by NaN with the given probability. The input is left untouched."""
    drop_mask = np.random.rand(series.shape[0]) < probability

    with_missing = series.copy()
    with_missing[drop_mask] = np.nan
    return with_missing

def convert_nparray_to_string(series):
    """Render the values as one comma-separated line, with every occurrence of
    'nan' removed so that missing values become empty fields."""
    # 'nan' contains no comma, so stripping it per field is equivalent to
    # stripping it from the joined line.
    fields = (str(value).replace('nan', '') for value in series)
    return ','.join(fields)
    
def upload_ground_truth(records, ground_truth_upload_path, upload_time):
    """Upload the ground-truth records as one JSON Lines object to S3.

    Args:
        records: iterable of JSON-serializable records, written one per line.
        ground_truth_upload_path: S3 URI prefix under which the file is stored.
        upload_time: datetime used to build the `YYYY/MM/DD/HH/MMSS.jsonl` key suffix.
    """
    jsonl_body = '\n'.join(json.dumps(record) for record in records)
    target_s3_uri = f'{ground_truth_upload_path}/{upload_time:%Y/%m/%d/%H/%M%S}.jsonl'
    sagemaker.s3.S3Uploader.upload_string_as_file_body(jsonl_body, target_s3_uri)

In [None]:
def generate_load_and_ground_truth():
    """Send every row of `df_test` to the endpoint once and upload the matching
    ground truth.

    For each test row the features are perturbed (`add_randomness`), randomly
    blanked (`drop_randomly`) and sent as a CSV line via `predictor`, tagged
    with a unique inference id. The true `Rings` value is recorded under the
    same id (`eventId`), and all records are uploaded as a single file once the
    pass is complete.
    """
    gt_records=[]
    for i, row in df_test.iterrows():
        # Row index plus a UUID keeps the id unique across repeated passes.
        suffix = uuid.uuid1().hex
        inference_id = f'{i}-{suffix}'

        gt = row['Rings']
        data = row[columns_no_target].values
        new_data = drop_randomly(add_randomness(data))
        new_data = convert_nparray_to_string(new_data)
        # The prediction itself is not needed here; the call only generates
        # endpoint traffic.
        predictor.predict(data = new_data, inference_id = inference_id)

        gt_data =  {'groundTruthData': {
                            'data': str(gt),
                            'encoding': 'CSV',
                        },
                    'eventMetadata': {
                            'eventId': inference_id,
                        },
                    'eventVersion': '0',
                    }
        gt_records.append(gt_data)

    # Timezone-aware UTC "now"; datetime.utcnow() is deprecated and the
    # formatted S3 key is identical.
    upload_ground_truth(gt_records, ground_truth_upload_path, datetime.now(timezone.utc))
    
def generate_load_and_ground_truth_forever(stop_event=None):
    """Run `generate_load_and_ground_truth` in a loop to keep traffic flowing.

    Args:
        stop_event: optional `threading.Event`. When given, the loop ends once
            the event is set (checked between passes). When omitted the loop
            never returns, as before.
    """
    while stop_event is None or not stop_event.is_set():
        generate_load_and_ground_truth()

In [None]:
# Run one full pass over the test set synchronously: every row is sent to the
# endpoint and the matching ground truth is uploaded.
generate_load_and_ground_truth()

In [None]:
# Keep generating load in the background. The thread is a daemon so the
# endless loop does not block interpreter/kernel shutdown.
thread = Thread(target=generate_load_and_ground_truth_forever, daemon=True)
thread.start()

## (Optional) Test out the endpoint

In [None]:
def get_obj_body(obj_key):
    """Return the UTF-8 decoded content of the S3 object `obj_key` in `bucket`.

    Uses the notebook-level `s3_client` and `bucket`, so both must be defined
    before this function is called.
    """
    response = s3_client.get_object(Bucket=bucket, Key=obj_key)
    raw_bytes = response.get('Body').read()
    return raw_bytes.decode('utf-8')

In [None]:
s3_client = boto3.Session().client('s3')
current_endpoint_capture_prefix = '{}/{}'.format(data_capture_prefix, endpoint_name)

# Paginate: a single list call returns at most 1000 keys, and the continuous
# load keeps adding capture files. `Contents` is absent while nothing has been
# captured yet, hence the empty default.
capture_paginator = s3_client.get_paginator('list_objects_v2')
capture_files = [
    capture_object['Key']
    for page in capture_paginator.paginate(Bucket=bucket,
                                           Prefix=current_endpoint_capture_prefix)
    for capture_object in page.get('Contents', [])
]
print('Found Capture Files:')
print('\n '.join(capture_files))
if not capture_files:
    print('No capture files yet - captured data can take a few minutes to appear. Re-run this cell.')

In [None]:
# Pretty-print the last record of the most recent capture file (JSON Lines).
if capture_files:
    capture_file = get_obj_body(capture_files[-1])
    # Take the last non-empty line, so it does not matter whether the file
    # ends with a newline.
    capture_records = [line for line in capture_file.splitlines() if line.strip()]
    if capture_records:
        print(json.dumps(json.loads(capture_records[-1]), indent=2))
    else:
        print('The latest capture file is empty.')
else:
    print('No capture files found yet - re-run the previous cell first.')

Uncomment and run the next two cells to delete monitoring schedules and the endpoint to stop incurring cost.

In [None]:
# ## uncomment the lines below and run them to delete
# client=sess.sagemaker_client
# response=client.list_monitoring_schedules(EndpointName = endpoint_name)
# for schedule in response['MonitoringScheduleSummaries']:
#     schedule_name = schedule['MonitoringScheduleName']
#     print(schedule_name)
#     r = client.delete_monitoring_schedule(MonitoringScheduleName = schedule_name)

In [None]:
# predictor.delete_endpoint()