# Identifying Issues with SageMaker Debugger

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains the code to help readers work through one of the recipes in the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
import sagemaker 
import boto3
from sagemaker import get_execution_role

In [None]:
# Open the SageMaker session shared by the rest of the notebook and look up
# the execution role that is later handed to the estimator.
session = sagemaker.Session()
role_arn = get_execution_role()

In [None]:
# Bucket and key prefix under which this chapter's input data is stored.
s3_bucket = '<insert S3 bucket name here>'
prefix = "chapter05"
path = f"s3://{s3_bucket}/{prefix}/input"

# Both CSV files sit side by side in the same "input" folder.
training_path = path + "/training_data.csv"
validation_path = path + "/validation_data.csv"

In [None]:
from sagemaker.image_uris import retrieve
# Resolve the URI of the built-in XGBoost training image.
# The region is read from the SageMaker session created in an earlier cell
# rather than from a fresh boto3.Session(): that guarantees the image is
# looked up in the same region the training job is launched in, and avoids
# passing None when the default boto3 session has no region configured.
container = retrieve(
    'xgboost',
    session.boto_region_name,
    version="0.90-2",
)
container

In [None]:
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig

# Value passed as the "save_interval" parameter of the metrics collection.
save_interval = 2

# NOTE: `prefix` is rebound here (it was "chapter05" for the input data);
# from this point on it names the folder that receives debugger output.
prefix = "debugger"
bucket_path = f"s3://{s3_bucket}/{prefix}"

In [None]:
# Configure the "metrics" collection; the interval is passed as a string.
metrics_collection_config = CollectionConfig(
    name="metrics",
    parameters={"save_interval": str(save_interval)},
)

# Hook configuration: which collections to save and where in S3 to put them.
debugger_hook_config = DebuggerHookConfig(
    collection_configs=[metrics_collection_config],
    s3_output_path=bucket_path,
)

In [None]:
# Built-in "loss not decreasing" rule, evaluated against the "metrics"
# collection configured for the debugger hook. All rule parameters are
# passed as strings.
loss_not_decreasing_rule = Rule.sagemaker(
    rule_configs.loss_not_decreasing(),
    rule_parameters={
        "collection_names": "metrics",
        "diff_percent": "5",
        "num_steps": "2",
    },
)

# List of rules handed to the estimator.
rules = [loss_not_decreasing_rule]

In [None]:
# Estimator that runs the retrieved container image on a single
# ml.m5.2xlarge instance, with the debugger hook configuration and the
# rule list attached.
estimator = sagemaker.estimator.Estimator( 
    role=role_arn,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    image_uri=container,
    debugger_hook_config=debugger_hook_config,
    rules=rules,
    sagemaker_session=session)

In [None]:
# Hyperparameters for the training container: binary logistic objective,
# tree depth 16 and 10000 boosting rounds.
# NOTE(review): the deep trees and very large round count look chosen so the
# loss-not-decreasing rule has something to flag - confirm this is intended.
estimator.set_hyperparameters(max_depth=16,
                              objective='binary:logistic',
                              num_round=10000)

In [None]:
from sagemaker.inputs import TrainingInput
    
# Wrap the two CSV locations as training-job input channels.
s3_input_training = TrainingInput(
    s3_data=training_path,
    content_type="text/csv",
)
s3_input_validation = TrainingInput(
    s3_data=validation_path,
    content_type="text/csv",
)

In [None]:
# Launch the training job with the "train" and "validation" channels.
# wait=False is passed so this call does not block; the cells below poll
# the job and its rule evaluations instead.
estimator.fit({'train': s3_input_training, 
               'validation': s3_input_validation}, 
              wait=False)

In [None]:
import time

# Rule evaluation statuses that display_rule_job_summary treats as "finished".
EVALUATION_STOP_STATES = ["Stopped", "IssuesFound", "NoIssuesFound", "Error"]

# Low-level SageMaker client and the name of the job that was just launched.
client = estimator.sagemaker_session.sagemaker_client
job_name = estimator.latest_training_job.name

print("Job Name:", job_name)

In [None]:
def display_rule_job_summary(rule_job_summary):
    """Print each rule job's status and report whether polling can stop.

    Args:
        rule_job_summary: Iterable of rule-evaluation summaries (dicts).
            Each entry must provide "RuleConfigurationName" and
            "RuleEvaluationStatus"; "StatusDetails" is printed, when
            present, for rules whose status is "IssuesFound".

    Returns:
        True if at least one rule whose name does not contain
        "ProfilerReport" has a status listed in EVALUATION_STOP_STATES,
        otherwise False (also for an empty summary).
    """
    break_after_this = False

    for rule_job in rule_job_summary:
        rule_name = rule_job["RuleConfigurationName"]
        evaluation_status = rule_job["RuleEvaluationStatus"]
        print("Rule [{}]: {}".format(rule_name, evaluation_status))

        if evaluation_status == 'IssuesFound':
            # Report the details of *this* rule. Indexing the summary with
            # [0] would show another rule's details (or fail with KeyError
            # when that entry has none) whenever the rule that found
            # issues is not the first entry.
            status_details = rule_job.get('StatusDetails')
            if status_details is not None:
                print("{}".format(status_details))

        stopped = evaluation_status in EVALUATION_STOP_STATES
        not_profiler = 'ProfilerReport' not in rule_name

        # Entries named "ProfilerReport..." never end the polling on
        # their own; only the other rules do.
        if stopped and not_profiler:
            break_after_this = True

    return break_after_this

In [None]:
# Poll the training job and its rule evaluations: at most 200 rounds with a
# 10 second pause between rounds.
for _ in range(200):
    description = client.describe_training_job(TrainingJobName=job_name)
    training_job_status = description["TrainingJobStatus"]
    print("\nTraining job Status: {}".format(training_job_status))

    # Fetch the current rule evaluation summary and print it.
    latest_job = estimator.latest_training_job
    rule_job_summary = latest_job.rule_job_summary()
    break_after_this = display_rule_job_summary(rule_job_summary)

    # Leave the loop as soon as the helper reports that polling can stop.
    if break_after_this:
        break

    time.sleep(10)

In [None]:
# Ask the estimator for the debugger artifacts path of its latest job and
# display it as the cell output.
artifacts_path = estimator.latest_job_debugger_artifacts_path()
artifacts_path

In [None]:
# Persist artifacts_path with IPython's %store magic so that another
# notebook can restore it with `%store -r artifacts_path`.
%store artifacts_path

In [None]:
# Re-read the rule evaluation summary of the latest training job after the
# polling loop and display it as the cell output.
rule_job_summary = estimator.latest_training_job.rule_job_summary()
rule_job_summary

In [None]:
# Persist rule_job_summary with IPython's %store magic so that another
# notebook can restore it with `%store -r rule_job_summary`.
%store rule_job_summary