In [1]:
import sagemaker 
import boto3
from sagemaker import get_execution_role

In [2]:
# SageMaker session object; it is handed to the Estimator further down.
session = sagemaker.Session()

# Execution role of the current environment; the Estimator receives it as `role`.
role_arn = get_execution_role()

In [3]:
s3_bucket = "sagemaker-cookbook-bucket"
prefix = "chapter05"

# The three CSV splits sit side by side under "<bucket>/<prefix>/input".
input_root = f"s3://{s3_bucket}/{prefix}/input"

training_s3_input_location = f"{input_root}/training_data.csv"
validation_s3_input_location = f"{input_root}/validation_data.csv"
test_s3_input_location = f"{input_root}/test_data.csv"

In [4]:
from sagemaker.image_uris import retrieve
# Look up the XGBoost image URI for the region of the default boto3 session.
container = retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="0.90-2",
)
# Bare name so the resolved URI is shown as the cell output.
container

'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:0.90-2-cpu-py3'

In [5]:
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig

# Interval used by the Debugger collection configured in the next cell.
save_interval = 2

# NOTE: this rebinds `prefix` (it was "chapter05" when the input paths were built);
# from here on it names the folder used for Debugger output.
prefix = "debugger"
bucket_path = f"s3://{s3_bucket}/{prefix}"

In [6]:
# "metrics" collection, configured with the save interval passed as a string.
metrics_collection_config = CollectionConfig(
    name="metrics",
    parameters={"save_interval": f"{save_interval}"},
)

# Hook configuration: where Debugger output is written and which collections to save.
debugger_hook_config = DebuggerHookConfig(
    collection_configs=[metrics_collection_config],
    s3_output_path=bucket_path,
)

In [7]:
# Built-in LossNotDecreasing rule evaluated against the "metrics" collection.
# NOTE(review): exact meaning of diff_percent / num_steps is defined by the
# SageMaker Debugger built-in rule documentation — confirm there before tuning.
loss_not_decreasing_rule = Rule.sagemaker(
    base_config=rule_configs.loss_not_decreasing(),
    rule_parameters=dict(
        collection_names="metrics",
        diff_percent="5",
        num_steps="2",
    ),
)

# Rules attached to the training job (passed to the Estimator as `rules`).
rules = [loss_not_decreasing_rule]

In [8]:
# Generic Estimator running the retrieved container image, with the Debugger
# hook configuration and rule list attached.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role_arn,
    instance_type="ml.m5.2xlarge",
    instance_count=1,
    sagemaker_session=session,
    debugger_hook_config=debugger_hook_config,
    rules=rules,
)

In [9]:
# NOTE(review): very deep trees combined with a very large round count look
# chosen to make the LossNotDecreasing rule fire — confirm this is intentional.
estimator.set_hyperparameters(
    objective="binary:logistic",
    max_depth=16,
    num_round=10000,
)

In [10]:
from sagemaker.inputs import TrainingInput
    
# Both channels point at CSV files, so the content type is declared explicitly.
s3_input_training = TrainingInput(
    s3_data=training_s3_input_location,
    content_type="text/csv",
)
s3_input_validation = TrainingInput(
    s3_data=validation_s3_input_location,
    content_type="text/csv",
)

In [11]:
# Start training without blocking (wait=False); progress is polled in a later cell.
estimator.fit(
    inputs={
        "train": s3_input_training,
        "validation": s3_input_validation,
    },
    wait=False,
)

In [12]:
import time

# Low-level SageMaker client and the name of the job that was just launched.
client = estimator.sagemaker_session.sagemaker_client
job_name = estimator.latest_training_job.name

print(f"Job Name: {job_name}")

# Rule evaluation statuses that the polling helper treats as "stop polling".
EVALUATION_STOP_STATES = [
    "Stopped",
    "IssuesFound",
    "NoIssuesFound",
    "Error",
]

Job Name: sagemaker-xgboost-2021-04-23-12-46-34-431


In [13]:
def display_rule_job_summary(rule_job_summary, stop_states=None):
    """Print the status of each rule job and report whether polling can stop.

    Args:
        rule_job_summary: Iterable of rule-evaluation dicts. Each entry is read
            for "RuleConfigurationName" and "RuleEvaluationStatus", and for
            "StatusDetails" when its status is "IssuesFound".
        stop_states: Optional collection of statuses that count as terminal.
            When omitted, the notebook-level ``EVALUATION_STOP_STATES`` is used.

    Returns:
        bool: True when at least one rule whose name does not contain
        "ProfilerReport" has a status in the stop states, otherwise False.
    """
    if stop_states is None:
        # Looked up at call time so the notebook constant stays the default.
        stop_states = EVALUATION_STOP_STATES

    break_after_this = False

    for rule_job in rule_job_summary:
        rule_name = rule_job["RuleConfigurationName"]
        evaluation_status = rule_job["RuleEvaluationStatus"]
        print("Rule [{}]: {}".format(rule_name, evaluation_status))

        if evaluation_status == "IssuesFound":
            # Use the details of *this* rule job. Indexing the first entry of
            # the summary would print another rule's details, or fail when
            # that first entry carries no "StatusDetails" key.
            status_details = rule_job.get("StatusDetails")
            if status_details is not None:
                print("{}".format(status_details))

        # ProfilerReport rules are excluded from the stop decision.
        if evaluation_status in stop_states and "ProfilerReport" not in rule_name:
            break_after_this = True

    return break_after_this

In [14]:
# Polling budget: at most MAX_POLLS checks, POLL_INTERVAL_SECONDS apart.
MAX_POLLS = 200
POLL_INTERVAL_SECONDS = 10

# The job handle does not change between polls, so look it up once.
latest_job = estimator.latest_training_job

for _ in range(MAX_POLLS):
    description = client.describe_training_job(TrainingJobName=job_name)
    training_job_status = description["TrainingJobStatus"]
    print("\nTraining job Status: {}".format(training_job_status))

    rule_job_summary = latest_job.rule_job_summary()

    # Stop as soon as the helper reports a rule in a stop state.
    break_after_this = display_rule_job_summary(rule_job_summary)
    if break_after_this:
        break

    time.sleep(POLL_INTERVAL_SECONDS)
else:
    # Reached only when the loop ran out of polls without hitting `break`;
    # say so explicitly instead of ending silently.
    print(
        "\nStopped polling after {} checks (~{} seconds) without a rule "
        "reaching a stop state.".format(MAX_POLLS, MAX_POLLS * POLL_INTERVAL_SECONDS)
    )


Training job Status: InProgress
Rule [LossNotDecreasing]: InProgress
Rule [ProfilerReport-1619181994]: InProgress

Training job Status: InProgress
Rule [LossNotDecreasing]: InProgress
Rule [ProfilerReport-1619181994]: InProgress

Training job Status: InProgress
Rule [LossNotDecreasing]: InProgress
Rule [ProfilerReport-1619181994]: InProgress

Training job Status: InProgress
Rule [LossNotDecreasing]: InProgress
Rule [ProfilerReport-1619181994]: InProgress

Training job Status: InProgress
Rule [LossNotDecreasing]: InProgress
Rule [ProfilerReport-1619181994]: InProgress

Training job Status: InProgress
Rule [LossNotDecreasing]: InProgress
Rule [ProfilerReport-1619181994]: InProgress

Training job Status: InProgress
Rule [LossNotDecreasing]: InProgress
Rule [ProfilerReport-1619181994]: InProgress

Training job Status: InProgress
Rule [LossNotDecreasing]: InProgress
Rule [ProfilerReport-1619181994]: InProgress

Training job Status: InProgress
Rule [LossNotDecreasing]: InProgress
Rule [Prof

In [16]:
# Ask the estimator for the S3 path of the latest job's Debugger artifacts.
# The bare name on the last line displays the value as the cell output.
artifacts_path = estimator.latest_job_debugger_artifacts_path()
artifacts_path

's3://sagemaker-cookbook-bucket/debugger/sagemaker-xgboost-2021-04-23-12-46-34-431/debug-output'

In [17]:
# Persist `artifacts_path` with IPython's %store magic so that another kernel
# session can restore it with `%store -r artifacts_path`.
%store artifacts_path

Stored 'artifacts_path' (str)


In [20]:
# Fetch the rule evaluation summary of the latest training job again and
# display it as the cell output (one dict per configured rule).
rule_job_summary = estimator.latest_training_job.rule_job_summary()
rule_job_summary

[{'RuleConfigurationName': 'LossNotDecreasing',
  'RuleEvaluationJobArn': 'arn:aws:sagemaker:us-east-1:581320662326:processing-job/sagemaker-xgboost-2021-04--lossnotdecreasing-ee3df9a4',
  'RuleEvaluationStatus': 'IssuesFound',
  'StatusDetails': 'RuleEvaluationConditionMet: Evaluation of the rule LossNotDecreasing at step 8 resulted in the condition being met\n',
  'LastModifiedTime': datetime.datetime(2021, 4, 23, 12, 57, 6, 274000, tzinfo=tzlocal())},
 {'RuleConfigurationName': 'ProfilerReport-1619181994',
  'RuleEvaluationJobArn': 'arn:aws:sagemaker:us-east-1:581320662326:processing-job/sagemaker-xgboost-2021-04--profilerreport-1619181994-2774f38b',
  'RuleEvaluationStatus': 'NoIssuesFound',
  'LastModifiedTime': datetime.datetime(2021, 4, 23, 12, 51, 19, 442000, tzinfo=tzlocal())}]

In [21]:
# Persist the summary list with IPython's %store magic so that another kernel
# session can restore it with `%store -r rule_job_summary`.
%store rule_job_summary

Stored 'rule_job_summary' (list)
