In [None]:
import pandas as pd

# Load the raw housing data from 'housing.csv' (path is relative to the working directory).
dataset = pd.read_csv(filepath_or_buffer='housing.csv')

In [None]:
# Show (rows, columns), then render the first five rows as the cell output.
print(dataset.shape)
dataset.head(5)

In [None]:
# Reorder columns so that 'medv' comes first and every other column keeps its relative order.
# NOTE(review): presumably needed because the training algorithm reads the label from the
# first CSV column -- confirm against the Linear Learner input documentation.
dataset = dataset[['medv'] + [column for column in dataset.columns if column != 'medv']]

In [None]:
# Draw a reproducible 90% random sample for training (fixed random_state).
training_dataset = dataset.sample(frac=0.90, random_state=59)
# The validation set is every row whose index label was not sampled for training.
validation_dataset = dataset.drop(index=training_dataset.index)
print(training_dataset.shape, validation_dataset.shape, sep='\n')

In [None]:
# Write both splits as bare CSV: no index column and no header row.
training_dataset.to_csv(path_or_buf='training_dataset.csv', header=False, index=False)
validation_dataset.to_csv(path_or_buf='validation_dataset.csv', header=False, index=False)

In [None]:
import sagemaker

# SageMaker session and the default bucket it exposes.
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Common key prefix for the S3 objects this notebook creates.
prefix = 'll-boston-housing'

# Upload each split under its own input key prefix; upload_data returns the resulting location.
training_data_path = sess.upload_data(
    path='training_dataset.csv',
    key_prefix='{}/input/training'.format(prefix),
)
validation_data_path = sess.upload_data(
    path='validation_dataset.csv',
    key_prefix='{}/input/validation'.format(prefix),
)

print(training_data_path, validation_data_path, sep='\n')

In [None]:
import boto3
from sagemaker import image_uris
from sagemaker.estimator import Estimator

# Look up the 'linear-learner' container image for the region boto3 is configured with.
region = boto3.Session().region_name
container = image_uris.retrieve(framework='linear-learner', region=region)

# Execution role handed to the estimator.
role = sagemaker.get_execution_role()

ll_estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    # Training output goes under <bucket>/<prefix>/output.
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

# Configure the algorithm as a regressor with a mini-batch size of 32.
ll_estimator.set_hyperparameters(**{'predictor_type': 'regressor', 'mini_batch_size': 32})

In [None]:
# Both channels point at the uploaded CSV files and declare their content type.
training_data_channel = sagemaker.TrainingInput(
    s3_data=training_data_path,
    content_type='text/csv',
)
validation_data_channel = sagemaker.TrainingInput(
    s3_data=validation_data_path,
    content_type='text/csv',
)

# Channel name -> input definition, as passed to fit().
ll_data = dict(train=training_data_channel, validation=validation_data_channel)

In [None]:
# Launch the training job on the 'train' and 'validation' channels.
ll_estimator.fit(inputs=ll_data)

In [None]:
from time import strftime, gmtime

# Day-hour-minute-second stamp (UTC, via gmtime) appended to the endpoint name.
timestamp = strftime('%d-%H-%M-%S', gmtime())

endpoint_name = '{}-ep-{}'.format(prefix, timestamp)
print(endpoint_name)

In [None]:
from sagemaker.model_monitor.data_capture_config import DataCaptureConfig

# S3 location where the captured endpoint traffic is saved.
capture_path = 's3://{}/{}/capture'.format(bucket, prefix)

# Capture both requests and responses for 100% of the traffic.
capture_config = DataCaptureConfig(
    enable_capture=True,
    sampling_percentage=100,
    capture_options=['REQUEST', 'RESPONSE'],  # Default value
    destination_s3_uri=capture_path,
)

# Deploy the trained model behind a single-instance endpoint with data capture enabled.
ll_predictor = ll_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=endpoint_name,
    data_capture_config=capture_config,
)

In [None]:
# One sample encoded as a single CSV line of 12 comma-separated values.
test_sample = ','.join([
    '0.00632', '18.00', '2.310', '0', '0.5380', '6.5750',
    '65.20', '4.0900', '1', '296.0', '15.30', '4.98',
])

In [None]:
# Send requests as CSV and parse CSV responses.
ll_predictor.serializer = sagemaker.serializers.CSVSerializer()
ll_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

# Invoke the endpoint with the sample and show the deserialized reply.
response = ll_predictor.predict(data=test_sample)
print(response)

In [None]:
# Show the S3 URI that was configured as the data capture destination for the endpoint.
print(capture_path)

In [None]:
%%bash -s "$capture_path"
# Wait for a minute or two for data to show up
# $1 is the capture S3 URI passed in from the notebook via -s. It is quoted everywhere
# so the shell never word-splits or glob-expands it.
echo "$1"
# List everything captured so far, then copy it into the current directory.
aws s3 ls --recursive "$1"
aws s3 cp --recursive "$1" .

In [None]:
%%sh -s "$endpoint_name"
# Show the beginning of one capture file copied locally by the previous cell.
# The capture data lands in a directory named after the endpoint, so the file is looked up
# under $1 (the current endpoint name) instead of a path hard-coded from an earlier run.
capture_file=$(find "$1" -type f -name '*.jsonl' | sort | head -n 1)
if [ -n "$capture_file" ]; then
    head "$capture_file"
else
    echo "No .jsonl capture file found under $1 yet; wait a minute and re-run the previous cell." >&2
fi

In [None]:
# Upload the original CSV (as read at the top of the notebook) to serve as the baseline dataset.
# NOTE(review): this file still contains the 'medv' column -- confirm that is the intended
# baseline layout for the monitor.
baseline_data = sess.upload_data(
    path='housing.csv',
    key_prefix='{}/baseline'.format(prefix),
)
print(baseline_data)

In [None]:
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

# Monitor configured with the same role as training, on a single ml.m5.large instance.
ll_monitor = DefaultModelMonitor(role=role, instance_count=1, instance_type='ml.m5.large')

# Baselining results are written under <bucket>/<prefix>/baseline.
baseline_path = 's3://{}/{}/baseline'.format(bucket, prefix)

# Run the baselining job on the uploaded CSV, which is declared as having a header row.
ll_monitor.suggest_baseline(
    output_s3_uri=baseline_path,
    baseline_dataset=baseline_data,
    dataset_format=DatasetFormat.csv(header=True),
)

In [None]:
%%sh -s "$baseline_path"
# List the objects under the baseline S3 URI ($1, passed in from the notebook via -s).
# The argument is quoted so the shell never word-splits or glob-expands it.
aws s3 ls --recursive "$1"

In [None]:
# Handle on the most recent baselining job started by the monitor.
baseline_results = ll_monitor.latest_baselining_job

# Flatten the per-feature statistics into a DataFrame.
# pd.json_normalize is the public entry point; the pd.io.json.json_normalize alias
# has been deprecated since pandas 1.0.
schema = pd.json_normalize(baseline_results.baseline_statistics().body_dict["features"])
schema

In [None]:
# Flatten the per-feature suggested constraints into a DataFrame.
# pd.json_normalize is the public entry point; the pd.io.json.json_normalize alias
# has been deprecated since pandas 1.0.
constraints = pd.json_normalize(baseline_results.suggested_constraints().body_dict["features"])
constraints

In [None]:
from sagemaker.model_monitor import CronExpressionGenerator

# Timestamped schedule name; strftime and gmtime are imported in the endpoint-name cell.
ll_monitor_name = '{}-mon-{}'.format(prefix, strftime("%Y-%m-%d-%H-%M-%S", gmtime()))
# Monitoring reports are written under <bucket>/<prefix>/report.
report_path = 's3://{}/{}/report'.format(bucket, prefix)

# Schedule hourly monitoring of the endpoint against the baseline statistics and constraints.
ll_monitor.create_monitoring_schedule(
    endpoint_input=ll_predictor.endpoint_name,
    monitor_schedule_name=ll_monitor_name,
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    statistics=ll_monitor.baseline_statistics(),
    constraints=ll_monitor.suggested_constraints(),
    output_s3_uri=report_path,
)

In [None]:
# Display the description of the monitoring schedule attached to ll_monitor.
ll_monitor.describe_schedule()

In [None]:
# Fetch the executions of the monitoring schedule and display them as the cell output.
# With an hourly schedule this is presumably empty until the first run has started -- confirm.
ll_executions = ll_monitor.list_executions()
ll_executions

In [None]:
# Reference sample, repeated here so the altered samples below can be compared against it.
test_sample = '0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98'

# First feature multiplied by 100000 (0.00632 -> 632.0)
bad_sample_1 = '632.0,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98'
# Last feature negated (4.98 -> -4.98)
bad_sample_2 = '0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,-4.98'

# Send each altered sample to the endpoint and print what comes back.
for bad_sample in (bad_sample_1, bad_sample_2):
    response = ll_predictor.predict(bad_sample)
    print(response)

In [None]:
# Fetch the constraint violations reported by the latest monitoring execution.
violations_report = ll_monitor.latest_monitoring_constraint_violations()

# Flatten the list of violations into a DataFrame.
# pd.json_normalize is the public entry point; the pd.io.json.json_normalize alias
# has been deprecated since pandas 1.0.
violations = pd.json_normalize(violations_report.body_dict["violations"])
violations

In [None]:
%%bash -s "$report_path"
# $1 is the report S3 URI passed in from the notebook via -s. It is quoted everywhere
# so the shell never word-splits or glob-expands it.
echo "$1"
# List the monitoring reports written so far, then copy them into the current directory.
aws s3 ls --recursive "$1"
aws s3 cp --recursive "$1" .

In [None]:
%%sh
# Pretty-print the violations report copied locally by the previous cell.
# The original cell held an unfilled <VIOLATIONS_REPORT> placeholder, which the shell
# parses as redirections and rejects. The report is located by name instead.
# NOTE(review): assumes the report file is named constraint_violations.json, as described in
# the Model Monitor documentation -- confirm against the listing printed by the previous cell.
report=$(find . -type f -name 'constraint_violations.json' | sort | tail -n 1)
if [ -n "$report" ]; then
    pygmentize "$report"
else
    echo "No constraint_violations.json found; wait for a monitoring execution to finish and re-run the previous cell." >&2
fi

In [None]:
# Clean-up, step 1: delete the monitoring schedule. This runs before the endpoint it
# monitors is deleted in the next cell.
response = ll_monitor.delete_monitoring_schedule()

In [None]:
# Clean-up, step 2: delete the endpoint that was deployed for ll_predictor.
ll_predictor.delete_endpoint()