In [283]:
%store -r s3_bucket_name
%store -r prefix

In [284]:
# Root S3 location for this notebook, built from the bucket name and prefix restored via %store.
base = f's3://{s3_bucket_name}/{prefix}'
# CSV file that is handed to the baselining job as its dataset.
baseline_data_uri = f'{base}/input/training_data.csv'
# S3 destination for the baselining job's output.
baseline_results_uri = f"{base}/model-monitor/baseline-results"

In [285]:
# Copy the baseline CSV from S3 to a local path so it can be inspected with pandas.
local_file = "tmp/baseline.csv"
!aws s3 cp {baseline_data_uri} {local_file}

import pandas as pd
baseline_df = pd.read_csv(local_file)
# Last expression of the cell: the notebook renders the frame.
baseline_df

download: s3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv to tmp/baseline.csv


Unnamed: 0,label,a,b,c,d
0,1,-8.837413,-6.551265,23,-75
1,1,-9.216749,-2.483494,2,-51
2,1,-2.017317,-6.326533,91,34
3,1,-10.748736,-4.622519,8,-78
4,0,-3.675848,12.629029,47,32
...,...,...,...,...,...
2995,0,-5.786462,-6.790668,-65,70
2996,1,-2.552410,-1.793217,42,4
2997,0,-10.692197,1.583437,-90,-62
2998,1,-14.109003,-4.745680,37,64


In [286]:
import sagemaker
# Execution role that the model monitor below is configured with.
role = sagemaker.get_execution_role()

In [287]:
from sagemaker.model_monitor import DefaultModelMonitor

# Monitor object reused throughout the notebook: it runs the baselining job
# and later owns the monitoring schedule.
default_monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,  # one hour
)

In [288]:
%%time

from sagemaker.model_monitor import dataset_format

# The baseline dataset is a CSV file with a header row.
dsf = dataset_format.DatasetFormat.csv(header=True)

# Run the baselining job over the training CSV and wait for it (wait=True);
# its output goes to baseline_results_uri.
default_monitor.suggest_baseline(
    baseline_dataset=baseline_data_uri,
    dataset_format=dsf,
    output_s3_uri=baseline_results_uri,
    wait=True
)


Job Name:  baseline-suggestion-job-2021-05-24-20-24-18-822
Inputs:  [{'InputName': 'baseline_dataset_input', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv', 'LocalPath': '/opt/ml/processing/input/baseline_dataset_input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'monitoring_output', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/model-monitor/baseline-results', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
........................[34m2021-05-24 20:28:05,336 - __main__ - INFO - All params:{'ProcessingJobArn': 'arn:aws:sagemaker:us-east-1:581320662326:processing-job/baseline-suggestion-job-2021-05-24-20-24-18-822', 'ProcessingJobName': 'baseline-suggestion-job-2021-05-24-20-24-18-822', 'Environment': {'dataset_format': '{"csv": {"header": true, "output_co

<sagemaker.processing.ProcessingJob at 0x7f791e64a4d0>

In [289]:
# Handle to the most recent baselining job run by this monitor.
baseline_job = default_monitor.latest_baselining_job
# Display the job object's attributes.
baseline_job.__dict__

{'inputs': [<sagemaker.processing.ProcessingInput at 0x7f791689c410>],
 'outputs': [<sagemaker.processing.ProcessingOutput at 0x7f791e586090>],
 'output_kms_key': None,
 'sagemaker_session': <sagemaker.session.Session at 0x7f791e586210>,
 'job_name': 'baseline-suggestion-job-2021-05-24-20-24-18-822'}

In [290]:
# Statistics produced by the baselining job.
stats = baseline_job.baseline_statistics()
# NOTE: despite the name, this is the list stored under "features"
# (one entry per column), not a single dict.
schema_dict = stats.body_dict["features"]

In [291]:
import pandas as pd
# Flatten the nested per-feature statistics into a table, one row per feature.
schema_df = pd.json_normalize(schema_dict)
schema_df.head(5)

Unnamed: 0,name,inferred_type,numerical_statistics.common.num_present,numerical_statistics.common.num_missing,numerical_statistics.mean,numerical_statistics.sum,numerical_statistics.std_dev,numerical_statistics.min,numerical_statistics.max,numerical_statistics.distribution.kll.buckets,numerical_statistics.distribution.kll.sketch.parameters.c,numerical_statistics.distribution.kll.sketch.parameters.k,numerical_statistics.distribution.kll.sketch.data
0,label,Integral,3000,0,0.500667,1502.0,0.5,0.0,1.0,"[{'lower_bound': 0.0, 'upper_bound': 0.1, 'cou...",0.64,2048.0,"[[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0,..."
1,a,Fractional,3000,0,-7.392525,-22177.574392,5.375835,-28.929055,13.99153,"[{'lower_bound': -28.929055280547928, 'upper_b...",0.64,2048.0,"[[-13.963251881128011, -8.799759531272912, -9...."
2,b,Fractional,3000,0,-1.425141,-4275.423435,6.497591,-19.092297,23.3785,"[{'lower_bound': -19.0922971731506, 'upper_bou...",0.64,2048.0,"[[-2.325207883948827, -4.766406758392868, -9.4..."
3,c,Integral,3000,0,-0.356333,-1069.0,57.16836,-100.0,99.0,"[{'lower_bound': -100.0, 'upper_bound': -80.1,...",0.64,2048.0,"[[47.0, -40.0, -13.0, 93.0, 53.0, -49.0, 5.0, ..."
4,d,Integral,3000,0,-1.480667,-4442.0,58.179724,-100.0,99.0,"[{'lower_bound': -100.0, 'upper_bound': -80.1,...",0.64,2048.0,"[[-2.0, -57.0, -54.0, 86.0, -94.0, 38.0, 13.0,..."


In [292]:
# Constraints suggested by the baselining job.
constraints = baseline_job.suggested_constraints()
# NOTE: despite the name, this is the list stored under "features".
constraints_dict = constraints.body_dict["features"]

In [293]:
# Flatten the per-feature constraints into a table, one row per feature.
constraints_df = pd.json_normalize(constraints_dict)
constraints_df.head(5)

Unnamed: 0,name,inferred_type,completeness,num_constraints.is_non_negative
0,label,Integral,1.0,True
1,a,Fractional,1.0,False
2,b,Fractional,1.0,False
3,c,Integral,1.0,False
4,d,Integral,1.0,False


In [294]:
!aws s3 cp {baseline_results_uri}/ tmp/ --recursive

download: s3://sagemaker-cookbook-bucket/chapter07/model-monitor/baseline-results/constraints.json to tmp/constraints.json
download: s3://sagemaker-cookbook-bucket/chapter07/model-monitor/baseline-results/statistics.json to tmp/statistics.json


In [295]:
!cat tmp/constraints.json

{
  "version" : 0.0,
  "features" : [ {
    "name" : "label",
    "inferred_type" : "Integral",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : true
    }
  }, {
    "name" : "a",
    "inferred_type" : "Fractional",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : false
    }
  }, {
    "name" : "b",
    "inferred_type" : "Fractional",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : false
    }
  }, {
    "name" : "c",
    "inferred_type" : "Integral",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : false
    }
  }, {
    "name" : "d",
    "inferred_type" : "Integral",
    "completeness" : 1.0,
    "num_constraints" : {
      "is_non_negative" : false
    }
  } ],
  "monitoring_config" : {
    "evaluate_constraints" : "Enabled",
    "emit_metrics" : "Enabled",
    "datatype_check_threshold" : 1.0,
    "domain_content_threshold" : 1.0,
    "distribution_constraints" : {

In [296]:
!cat tmp/statistics.json

{
  "version" : 0.0,
  "dataset" : {
    "item_count" : 3000
  },
  "features" : [ {
    "name" : "label",
    "inferred_type" : "Integral",
    "numerical_statistics" : {
      "common" : {
        "num_present" : 3000,
        "num_missing" : 0
      },
      "mean" : 0.5006666666666667,
      "sum" : 1502.0,
      "std_dev" : 0.499999555555358,
      "min" : 0.0,
      "max" : 1.0,
      "distribution" : {
        "kll" : {
          "buckets" : [ {
            "lower_bound" : 0.0,
            "upper_bound" : 0.1,
            "count" : 1498.0
          }, {
            "lower_bound" : 0.1,
            "upper_bound" : 0.2,
            "count" : 0.0
          }, {
            "lower_bound" : 0.2,
            "upper_bound" : 0.3,
            "count" : 0.0
          }, {
            "lower_bound" : 0.3,
            "upper_bound" : 0.4,
            "count" : 0.0
          }, {
            "lower_bound" : 0.4,
            "upper_bound" : 0.5,
            "count" : 0.0
          }, {
     

In [297]:
from sagemaker.model_monitor import CronExpressionGenerator
from time import gmtime, strftime

In [298]:
import random
from string import ascii_uppercase

def generate_schedule_name():
    """Return 'schedule-' followed by five randomly chosen uppercase ASCII letters."""
    suffix = ''.join(random.choices(ascii_uppercase, k=5))
    return f'schedule-{suffix}'

In [299]:
schedule_name = generate_schedule_name()
schedule_name

'schedule-KYTXY'

In [300]:
# S3 location passed to the monitoring schedule as its output URI.
s3_report_path = f'{base}/report-path'

In [301]:
# Baseline statistics object; passed to create_monitoring_schedule further down.
baseline_statistics = default_monitor.baseline_statistics()

In [302]:
# Rebinds `constraints` (previously obtained from baseline_job) to the monitor's
# suggested constraints; this is the object that is edited and saved later on.
constraints = default_monitor.suggested_constraints()

In [303]:
# Hourly schedule expression for the monitoring schedule.
cron_expression = CronExpressionGenerator.hourly()
cron_expression

'cron(0 * ? * * *)'

In [304]:
%store -r endpoint_name
# endpoint_name is restored from the IPython store by the line above.

from sagemaker import Predictor
# Client object for the existing endpoint; used below to send requests.
predictor = Predictor(endpoint_name=endpoint_name)

In [305]:
# Best-effort cleanup: remove any schedule still attached to this monitor before a
# new one is created. A failure is reported rather than silently discarded, and
# only Exception is caught so that a kernel interrupt is not swallowed.
try:
    default_monitor.delete_monitoring_schedule()
except Exception as err:
    print(f"Skipping schedule cleanup: {err}")


Deleting Monitoring Schedule with name: None


In [306]:
%store -r csv_input
csv_input

'-3.585282409963491,-4.820820583530444,88,51'

In [307]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

# Requests are serialized as CSV; responses are deserialized as JSON.
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

In [308]:
predictor.predict(csv_input)

0.9837374687194824

In [309]:
constraints.__dict__

{'body_dict': {'version': 0.0,
  'features': [{'name': 'label',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': True}},
   {'name': 'a',
    'inferred_type': 'Fractional',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': False}},
   {'name': 'b',
    'inferred_type': 'Fractional',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': False}},
   {'name': 'c',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': False}},
   {'name': 'd',
    'inferred_type': 'Integral',
    'completeness': 1.0,
    'num_constraints': {'is_non_negative': False}}],
  'monitoring_config': {'evaluate_constraints': 'Enabled',
   'emit_metrics': 'Enabled',
   'datatype_check_threshold': 1.0,
   'domain_content_threshold': 1.0,
   'distribution_constraints': {'perform_comparison': 'Enabled',
    'comparison_threshold': 0.1,
    'comparison_method': 'Robust'}}},
 'file_s3_uri': '

In [310]:
# The endpoint returns a float score (e.g. 0.98...) rather than a 0/1 integer, so the
# 'label' constraint is switched from Integral to Fractional before the schedule is
# created. The feature is looked up by name instead of by position so that a
# different feature order cannot cause the wrong entry to be edited; `next` raises
# StopIteration if no 'label' feature exists.
label_constraint = next(
    feature
    for feature in constraints.body_dict['features']
    if feature['name'] == 'label'
)
label_constraint['inferred_type'] = 'Fractional'
constraints.body_dict

{'version': 0.0,
 'features': [{'name': 'label',
   'inferred_type': 'Fractional',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': True}},
  {'name': 'a',
   'inferred_type': 'Fractional',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': False}},
  {'name': 'b',
   'inferred_type': 'Fractional',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': False}},
  {'name': 'c',
   'inferred_type': 'Integral',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': False}},
  {'name': 'd',
   'inferred_type': 'Integral',
   'completeness': 1.0,
   'num_constraints': {'is_non_negative': False}}],
 'monitoring_config': {'evaluate_constraints': 'Enabled',
  'emit_metrics': 'Enabled',
  'datatype_check_threshold': 1.0,
  'domain_content_threshold': 1.0,
  'distribution_constraints': {'perform_comparison': 'Enabled',
   'comparison_threshold': 0.1,
   'comparison_method': 'Robust'}}}

In [311]:
# Persist the edited constraints; the call returns the S3 URI it wrote to.
constraints.save()

's3://sagemaker-cookbook-bucket/chapter07/model-monitor/baseline-results/constraints.json'

In [312]:
# Create the hourly monitoring schedule for the endpoint, using the baseline
# statistics and the (edited) constraints as the reference. Reports are written
# under s3_report_path.
# `predictor.endpoint_name` replaces the deprecated `predictor.endpoint`
# attribute, which was renamed in sagemaker>=2 and emits a warning when accessed.
default_monitor.create_monitoring_schedule(
    monitor_schedule_name=schedule_name,
    endpoint_input=predictor.endpoint_name,
    output_s3_uri=s3_report_path,
    statistics=baseline_statistics,
    constraints=constraints,
    schedule_cron_expression=cron_expression,
    enable_cloudwatch_metrics=True,
)

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [313]:
default_monitor.describe_schedule()

{'MonitoringScheduleArn': 'arn:aws:sagemaker:us-east-1:581320662326:monitoring-schedule/schedule-kytxy',
 'MonitoringScheduleName': 'schedule-KYTXY',
 'MonitoringScheduleStatus': 'Pending',
 'MonitoringType': 'DataQuality',
 'CreationTime': datetime.datetime(2021, 5, 24, 20, 30, 6, 651000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 5, 24, 20, 30, 6, 687000, tzinfo=tzlocal()),
 'MonitoringScheduleConfig': {'ScheduleConfig': {'ScheduleExpression': 'cron(0 * ? * * *)'},
  'MonitoringJobDefinitionName': 'data-quality-job-definition-2021-05-24-20-30-06-310',
  'MonitoringType': 'DataQuality'},
 'EndpointName': 'model-ACBOE-2021-05-24-14-51-44-733',
 'ResponseMetadata': {'RequestId': 'efbc00db-f2be-420c-80c8-19743a568823',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'efbc00db-f2be-420c-80c8-19743a568823',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '538',
   'date': 'Mon, 24 May 2021 20:30:06 GMT'},
  'RetryAttempts': 0}}

In [314]:
# `sleep` must be imported here: without it this cell raises NameError on a fresh
# kernel, because the only other import of `sleep` is in a later cell.
from time import sleep

# Pause for five minutes before sending traffic
# (presumably to give the newly created schedule time to become active).
sleep(300)

In [315]:
def perform_good_input():
    """Send the stored `csv_input` record to the endpoint, then print "good input"."""
    payload = csv_input
    predictor.predict(payload)
    print("good input")

In [316]:
def perform_bad_input():
    """Send a record whose last two fields are fractional, then print "bad input".

    The baseline constraints infer features c and d as Integral, so this
    record is intended to surface as a data-type violation.
    """
    fractional_payload = '-3.585,-4.8208,88.123,51.456'
    predictor.predict(fractional_payload)
    print("bad input")

In [317]:
perform_good_input()
perform_bad_input()

good input
bad input


In [318]:
# Short alias for the monitor.
dm = default_monitor
# First attempt at fetching monitoring results; these may be empty until the
# schedule has produced an execution (the polling cell further down waits for that).
monitoring_violations = dm.latest_monitoring_constraint_violations()
monitoring_statistics = dm.latest_monitoring_statistics()

No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY


In [None]:
%%time

from time import sleep

# Start from the result fetched earlier; it stays falsy until a monitoring
# execution has produced a violations report.
violations = monitoring_violations

# Poll every five minutes. Each round sends one good and one bad record
# (presumably so the monitoring job has captured traffic to analyse).
while not violations:
    print("No executions yet. Sleeping for 5 minutes...")
    sleep(300)

    perform_good_input()
    perform_bad_input()

    try:
        violations = dm.latest_monitoring_constraint_violations()
    except Exception as err:
        # Keep polling when the lookup fails, but report why. Catching Exception
        # rather than using a bare `except:` lets a kernel interrupt stop the loop.
        print(f"Could not fetch constraint violations yet: {err}")

print("Executions found!")

No executions yet. Sleeping for 5 minutes...
good input
bad input
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions yet. Sleeping for 5 minutes...
good input
bad input
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions yet. Sleeping for 5 minutes...
good input
bad input
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions yet. Sleeping for 5 minutes...
good input
bad input
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions found for schedule. monitoring_schedule_name: schedule-KYTXY
No executions yet. Sleeping for 5 minutes...
good input
bad input
No executions found for schedule. monitoring_schedule_name: schedule-K

In [None]:
# Re-fetch the latest constraint violations and show the object's attributes.
violations = dm.latest_monitoring_constraint_violations()
violations.__dict__

{'body_dict': {'violations': [{'feature_name': 'd',
    'constraint_check_type': 'data_type_check',
    'description': 'Data type match requirement is not met. Expected data type: Integral, Expected match: 100.0%. Observed: Only 66.66666666666666% of data is Integral.'},
   {'feature_name': 'c',
    'constraint_check_type': 'data_type_check',
    'description': 'Data type match requirement is not met. Expected data type: Integral, Expected match: 100.0%. Observed: Only 66.66666666666666% of data is Integral.'}]},
 'file_s3_uri': 's3://sagemaker-cookbook-bucket/chapter07/report-path/model-ACBOE-2021-05-24-14-51-44-733/schedule-KYTXY/2021/05/24/21/constraint_violations.json',
 'kms_key': None,
 'session': None}

In [None]:
!aws s3 cp {violations.file_s3_uri} tmp/violations.json

download: s3://sagemaker-cookbook-bucket/chapter07/report-path/model-ACBOE-2021-05-24-14-51-44-733/schedule-KYTXY/2021/05/24/21/constraint_violations.json to tmp/violations.json


In [None]:
!cat tmp/violations.json

{
  "violations" : [ {
    "feature_name" : "d",
    "constraint_check_type" : "data_type_check",
    "description" : "Data type match requirement is not met. Expected data type: Integral, Expected match: 100.0%. Observed: Only 66.66666666666666% of data is Integral."
  }, {
    "feature_name" : "c",
    "constraint_check_type" : "data_type_check",
    "description" : "Data type match requirement is not met. Expected data type: Integral, Expected match: 100.0%. Observed: Only 66.66666666666666% of data is Integral."
  } ]
}

In [None]:
# Re-fetch the latest monitoring statistics and show the object's attributes.
monitoring_statistics = dm.latest_monitoring_statistics()
monitoring_statistics.__dict__

{'body_dict': {'version': 0.0,
  'dataset': {'item_count': 18},
  'features': [{'name': 'label',
    'inferred_type': 'Fractional',
    'numerical_statistics': {'common': {'num_present': 18, 'num_missing': 0},
     'mean': 0.9837374687194824,
     'sum': 17.707274436950684,
     'std_dev': 4.972724545902011e-17,
     'min': 0.9837374687194824,
     'max': 0.9837374687194824,
     'distribution': {'kll': {'buckets': [{'lower_bound': 0.9837374687194824,
         'upper_bound': 0.9837374687194824,
         'count': 0.0},
        {'lower_bound': 0.9837374687194824,
         'upper_bound': 0.9837374687194824,
         'count': 0.0},
        {'lower_bound': 0.9837374687194824,
         'upper_bound': 0.9837374687194824,
         'count': 0.0},
        {'lower_bound': 0.9837374687194824,
         'upper_bound': 0.9837374687194824,
         'count': 0.0},
        {'lower_bound': 0.9837374687194824,
         'upper_bound': 0.9837374687194824,
         'count': 0.0},
        {'lower_bound': 0.98

In [None]:
# Clean up: delete the monitoring schedule created by this notebook.
default_monitor.delete_monitoring_schedule()


Deleting Monitoring Schedule with name: schedule-KYTXY


In [None]:
# predictor.delete_endpoint()