In [1]:
import pandas as pd

dataset = pd.read_csv('housing.csv')

In [2]:
print(dataset.shape)
dataset[:5]

(506, 13)


Unnamed: 0,crim,zn,indus,chas,nox,age,rm,dis,rad,tax,ptratio,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,5.33,36.2


In [3]:
# Move 'medv' column to front
dataset = pd.concat([dataset['medv'], dataset.drop(['medv'], axis=1)], axis=1)

In [4]:
training_dataset = dataset.sample(frac=0.90, random_state=59)
validation_dataset = dataset.loc[~dataset.index.isin(training_dataset.index), :]
print(training_dataset.shape)
print(validation_dataset.shape)

(455, 13)
(51, 13)


In [5]:
training_dataset.to_csv('training_dataset.csv', index=False, header=False)
validation_dataset.to_csv('validation_dataset.csv', index=False, header=False)

In [6]:
import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()

prefix = 'll-boston-housing'
training_data_path = sess.upload_data(path='training_dataset.csv', key_prefix=prefix + '/input/training')
validation_data_path = sess.upload_data(path='validation_dataset.csv', key_prefix=prefix + '/input/validation')

print(training_data_path)
print(validation_data_path)

s3://sagemaker-eu-west-1-613904931467/ll-boston-housing/input/training/training_dataset.csv
s3://sagemaker-eu-west-1-613904931467/ll-boston-housing/input/validation/validation_dataset.csv


In [7]:
import boto3
from sagemaker import image_uris

region = boto3.Session().region_name    
container = image_uris.retrieve('linear-learner', region)

from sagemaker.estimator import Estimator

role = sagemaker.get_execution_role() 

ll_estimator = Estimator(container,
    role=role, 
    instance_count=1,
    instance_type='ml.m5.large',
    output_path='s3://{}/{}/output'.format(bucket, prefix)
)

ll_estimator.set_hyperparameters(predictor_type='regressor', mini_batch_size=32)

In [8]:
training_data_channel   = sagemaker.TrainingInput(s3_data=training_data_path, content_type='text/csv')
validation_data_channel = sagemaker.TrainingInput(s3_data=validation_data_path, content_type='text/csv')

ll_data = {'train': training_data_channel, 'validation': validation_data_channel}

In [9]:
ll_estimator.fit(ll_data)

2021-07-07 16:31:43 Starting - Starting the training job...
2021-07-07 16:32:06 Starting - Launching requested ML instancesProfilerReport-1625675503: InProgress
...
2021-07-07 16:32:37 Starting - Preparing the instances for training.........
2021-07-07 16:34:07 Downloading - Downloading input data...
2021-07-07 16:34:37 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[07/07/2021 16:34:41 INFO 139816769492800] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sig

In [10]:
from time import strftime, gmtime
timestamp = strftime('%d-%H-%M-%S', gmtime())

endpoint_name = prefix+'-ep-'+timestamp
print(endpoint_name)

ll-boston-housing-ep-07-16-35-25


In [11]:
from sagemaker.model_monitor.data_capture_config import DataCaptureConfig

capture_path = 's3://{}/{}/capture'.format(bucket, prefix)

ll_predictor = ll_estimator.deploy(
    endpoint_name=endpoint_name, 
    initial_instance_count=1, 
    instance_type='ml.t2.medium',
    data_capture_config=DataCaptureConfig(       
        enable_capture=True,                     # Capture data
        sampling_percentage=100,                 
        capture_options=['REQUEST', 'RESPONSE'], # Default value
        destination_s3_uri=capture_path          # Save data here
    )
)

-----------------!

In [12]:
test_sample = '0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98'

In [13]:
ll_predictor.serializer = sagemaker.serializers.CSVSerializer()
ll_predictor.deserializer = sagemaker.deserializers.CSVDeserializer()

response = ll_predictor.predict(test_sample)
print(response)

[['30.413358688354492']]


In [14]:
print(capture_path)

s3://sagemaker-eu-west-1-613904931467/ll-boston-housing/capture


In [15]:
%%bash -s "$capture_path"
# Wait for a minute or two for data to show up
echo $1
aws s3 ls --recursive $1
aws s3 cp --recursive $1 .

s3://sagemaker-eu-west-1-613904931467/ll-boston-housing/capture
2021-07-07 16:46:55        431 ll-boston-housing/capture/ll-boston-housing-ep-07-16-35-25/AllTraffic/2021/07/07/16/45-47-410-519762dc-1394-44c0-985a-957129a9c5e1.jsonl
2020-07-30 13:37:40        426 ll-boston-housing/capture/ll-boston-housing-ep-30-13-25-24/AllTraffic/2020/07/30/13/36-30-025-36c0ea66-0d7e-4e31-9f87-5af53005d475.jsonl
2020-07-30 14:07:00       1704 ll-boston-housing/capture/ll-boston-housing-ep-30-13-25-24/AllTraffic/2020/07/30/14/05-51-792-cb79b91a-cb86-4825-89b7-f6798238b8f8.jsonl
download: s3://sagemaker-eu-west-1-613904931467/ll-boston-housing/capture/ll-boston-housing-ep-07-16-35-25/AllTraffic/2021/07/07/16/45-47-410-519762dc-1394-44c0-985a-957129a9c5e1.jsonl to ll-boston-housing-ep-07-16-35-25/AllTraffic/2021/07/07/16/45-47-410-519762dc-1394-44c0-985a-957129a9c5e1.jsonl
download: s3://sagemaker-eu-west-1-613904931467/ll-boston-housing/capture/ll-boston-housing-ep-30-13-25-24/AllTraffic/2020/07/30/14/0

In [17]:
%%sh
head ll-boston-housing-ep-07-16-35-25/AllTraffic/2021/07/07/16/45-47-410-519762dc-1394-44c0-985a-957129a9c5e1.jsonl

{"captureData":{"endpointInput":{"observedContentType":"text/csv","mode":"INPUT","data":"0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98","encoding":"CSV"},"endpointOutput":{"observedContentType":"text/csv; charset=utf-8","mode":"OUTPUT","data":"30.413358688354492","encoding":"CSV"}},"eventMetadata":{"eventId":"44de16a8-6203-4a34-bf62-e52d6466a9f8","inferenceTime":"2021-07-07T16:45:47Z"},"eventVersion":"0"}


In [18]:
baseline_data = sess.upload_data(path='housing.csv', key_prefix=prefix + "/baseline")
print(baseline_data)

s3://sagemaker-eu-west-1-613904931467/ll-boston-housing/baseline/housing.csv


In [19]:
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

ll_monitor = DefaultModelMonitor(
    role=role,
    instance_count=1, 
    instance_type='ml.m5.large'
)

baseline_path = 's3://{}/{}/baseline'.format(bucket, prefix)

ll_monitor.suggest_baseline(
    baseline_dataset=baseline_data,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=baseline_path
)


Job Name:  baseline-suggestion-job-2021-07-07-18-51-33-819
Inputs:  [{'InputName': 'baseline_dataset_input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-eu-west-1-613904931467/ll-boston-housing/baseline/housing.csv', 'LocalPath': '/opt/ml/processing/input/baseline_dataset_input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'monitoring_output', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-eu-west-1-613904931467/ll-boston-housing/baseline', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
........................[34m2021-07-07 18:55:22,530 - __main__ - INFO - All params:{'ProcessingJobArn': 'arn:aws:sagemaker:eu-west-1:613904931467:processing-job/baseline-suggestion-job-2021-07-07-18-51-33-819', 'ProcessingJobName': 'baseline-suggestion-job-2021-07-07-18-51-33-819', 'Environment': {'dataset_format': '{"csv": {"header": true, "outp

<sagemaker.processing.ProcessingJob at 0x7fe66c46c190>

In [20]:
%%sh -s "$baseline_path"
aws s3 ls --recursive $1

2021-07-07 18:56:43       2354 ll-boston-housing/baseline/constraints.json
2021-07-07 18:51:17      35101 ll-boston-housing/baseline/housing.csv
2021-07-07 18:56:43      66341 ll-boston-housing/baseline/statistics.json


In [21]:
baseline_results = ll_monitor.latest_baselining_job

schema = pd.io.json.json_normalize(baseline_results.baseline_statistics().body_dict["features"])
schema

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,inferred_type,numerical_statistics.common.num_present,numerical_statistics.common.num_missing,numerical_statistics.mean,numerical_statistics.sum,numerical_statistics.std_dev,numerical_statistics.min,numerical_statistics.max,numerical_statistics.distribution.kll.buckets,numerical_statistics.distribution.kll.sketch.parameters.c,numerical_statistics.distribution.kll.sketch.parameters.k,numerical_statistics.distribution.kll.sketch.data
0,crim,Fractional,506,0,3.613524,1828.44292,8.593041,0.00632,88.9762,"[{'lower_bound': 0.00632, 'upper_bound': 8.903...",0.64,2048.0,"[[0.00632, 0.02731, 0.02729, 0.032369999999999..."
1,zn,Fractional,506,0,11.363636,5750.0,23.299396,0.0,100.0,"[{'lower_bound': 0.0, 'upper_bound': 10.0, 'co...",0.64,2048.0,"[[18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 1..."
2,indus,Fractional,506,0,11.136779,5635.21,6.853571,0.46,27.74,"[{'lower_bound': 0.46, 'upper_bound': 3.187999...",0.64,2048.0,"[[2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7...."
3,chas,Integral,506,0,0.06917,35.0,0.253743,0.0,1.0,"[{'lower_bound': 0.0, 'upper_bound': 0.1, 'cou...",0.64,2048.0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,nox,Fractional,506,0,0.554695,280.6757,0.115763,0.385,0.871,"[{'lower_bound': 0.385, 'upper_bound': 0.4336,...",0.64,2048.0,"[[0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0...."
5,age,Fractional,506,0,6.284634,3180.025,0.701923,3.561,8.78,"[{'lower_bound': 3.561, 'upper_bound': 4.08289...",0.64,2048.0,"[[6.575, 6.421, 7.185, 6.998, 7.147, 6.43, 6.0..."
6,rm,Fractional,506,0,68.574901,34698.9,28.121033,2.9,100.0,"[{'lower_bound': 2.9, 'upper_bound': 12.61, 'c...",0.64,2048.0,"[[65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96..."
7,dis,Fractional,506,0,3.795043,1920.2916,2.103628,1.1296,12.1265,"[{'lower_bound': 1.1296, 'upper_bound': 2.2292...",0.64,2048.0,"[[4.09, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622..."
8,rad,Integral,506,0,9.549407,4832.0,8.698651,1.0,24.0,"[{'lower_bound': 1.0, 'upper_bound': 3.3, 'cou...",0.64,2048.0,"[[1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 5.0, 5.0, 5.0,..."
9,tax,Fractional,506,0,408.237154,206568.0,168.370495,187.0,711.0,"[{'lower_bound': 187.0, 'upper_bound': 239.4, ...",0.64,2048.0,"[[296.0, 242.0, 242.0, 222.0, 222.0, 222.0, 31..."


In [22]:
constraints = pd.io.json.json_normalize(baseline_results.suggested_constraints().body_dict["features"])
constraints

  """Entry point for launching an IPython kernel.


Unnamed: 0,name,inferred_type,completeness,num_constraints.is_non_negative
0,crim,Fractional,1.0,True
1,zn,Fractional,1.0,True
2,indus,Fractional,1.0,True
3,chas,Integral,1.0,True
4,nox,Fractional,1.0,True
5,age,Fractional,1.0,True
6,rm,Fractional,1.0,True
7,dis,Fractional,1.0,True
8,rad,Integral,1.0,True
9,tax,Fractional,1.0,True


In [23]:
from sagemaker.model_monitor import CronExpressionGenerator

ll_monitor_name = prefix+'-mon-'+strftime("%Y-%m-%d-%H-%M-%S", gmtime())
report_path = 's3://{}/{}/report'.format(bucket, prefix)

ll_monitor.create_monitoring_schedule(
    monitor_schedule_name=ll_monitor_name,
    endpoint_input=ll_predictor.endpoint_name,
    output_s3_uri=report_path,
    statistics=ll_monitor.baseline_statistics(),
    constraints=ll_monitor.suggested_constraints(),
    schedule_cron_expression=CronExpressionGenerator.hourly()
)

In [24]:
ll_monitor.describe_schedule()

{'MonitoringScheduleArn': 'arn:aws:sagemaker:eu-west-1:613904931467:monitoring-schedule/ll-boston-housing-mon-2021-07-07-19-15-44',
 'MonitoringScheduleName': 'll-boston-housing-mon-2021-07-07-19-15-44',
 'MonitoringScheduleStatus': 'Pending',
 'MonitoringType': 'DataQuality',
 'CreationTime': datetime.datetime(2021, 7, 7, 19, 15, 44, 647000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 7, 7, 19, 15, 44, 673000, tzinfo=tzlocal()),
 'MonitoringScheduleConfig': {'ScheduleConfig': {'ScheduleExpression': 'cron(0 * ? * * *)'},
  'MonitoringJobDefinitionName': 'data-quality-job-definition-2021-07-07-19-15-44-400',
  'MonitoringType': 'DataQuality'},
 'EndpointName': 'll-boston-housing-ep-07-16-35-25',
 'ResponseMetadata': {'RequestId': '3e4b057c-f57e-4bea-8142-39c42a40b229',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '3e4b057c-f57e-4bea-8142-39c42a40b229',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '589',
   'date': 'Wed, 07 J

In [25]:
ll_executions = ll_monitor.list_executions()
ll_executions

No executions found for schedule. monitoring_schedule_name: ll-boston-housing-mon-2021-07-07-19-15-44


[]

In [26]:
test_sample = '0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98'

# Multiply 1st feature by 10000
bad_sample_1 = '632.0,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,4.98'
# Negate last feature
bad_sample_2 = '0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,-4.98'

response = ll_predictor.predict(bad_sample_1)
print(response)

response = ll_predictor.predict(bad_sample_2)
print(response)

[['-35.72457504272461']]
[['34.424537658691406']]


In [28]:
violations = ll_monitor.latest_monitoring_constraint_violations()
violations = pd.io.json.json_normalize(violations.body_dict["violations"])
violations

  


Unnamed: 0,feature_name,constraint_check_type,description
0,tax,data_type_check,Data type match requirement is not met. Expect...
1,nox,data_type_check,Data type match requirement is not met. Expect...
2,rad,data_type_check,Data type match requirement is not met. Expect...
3,chas,data_type_check,Data type match requirement is not met. Expect...


In [29]:
%%bash -s "$report_path"
echo $1
aws s3 ls --recursive $1
aws s3 cp --recursive $1 .

s3://sagemaker-eu-west-1-613904931467/ll-boston-housing/report
2021-07-07 20:08:00        996 ll-boston-housing/report/ll-boston-housing-ep-07-16-35-25/ll-boston-housing-mon-2021-07-07-19-15-44/2021/07/07/20/constraint_violations.json
2021-07-07 20:08:00       2356 ll-boston-housing/report/ll-boston-housing-ep-07-16-35-25/ll-boston-housing-mon-2021-07-07-19-15-44/2021/07/07/20/constraints.json
2021-07-07 20:08:00      22162 ll-boston-housing/report/ll-boston-housing-ep-07-16-35-25/ll-boston-housing-mon-2021-07-07-19-15-44/2021/07/07/20/statistics.json
2020-07-30 15:07:20        996 ll-boston-housing/report/ll-boston-housing-ep-30-13-25-24/ll-boston-housing-mon-2020-07-30-14-03-22/2020/07/30/15/constraint_violations.json
2020-07-30 15:07:20       2356 ll-boston-housing/report/ll-boston-housing-ep-30-13-25-24/ll-boston-housing-mon-2020-07-30-14-03-22/2020/07/30/15/constraints.json
2020-07-30 15:07:20      22272 ll-boston-housing/report/ll-boston-housing-ep-30-13-25-24/ll-boston-housing-m

In [30]:
%%sh
pygmentize <VIOLATIONS_REPORT>

{
  [94m"violations"[39;49;00m : [ {
    [94m"feature_name"[39;49;00m : [33m"tax"[39;49;00m,
    [94m"constraint_check_type"[39;49;00m : [33m"data_type_check"[39;49;00m,
    [94m"description"[39;49;00m : [33m"Data type match requirement is not met. Expected data type: Fractional, Expected match: 100.0%. Observed: Only 0.0% of data is Fractional."[39;49;00m
  }, {
    [94m"feature_name"[39;49;00m : [33m"nox"[39;49;00m,
    [94m"constraint_check_type"[39;49;00m : [33m"data_type_check"[39;49;00m,
    [94m"description"[39;49;00m : [33m"Data type match requirement is not met. Expected data type: Fractional, Expected match: 100.0%. Observed: Only 0.0% of data is Fractional."[39;49;00m
  }, {
    [94m"feature_name"[39;49;00m : [33m"rad"[39;49;00m,
    [94m"constraint_check_type"[39;49;00m : [33m"data_type_check"[39;49;00m,
    [94m"description"[39;49;00m : [33m"Data type match requirement is not met. Expected data type: Integral, Expected match: 100.0%. Ob

In [None]:
response = ll_monitor.delete_monitoring_schedule()

In [None]:
ll_predictor.delete_endpoint()