# Debugger and Profiler

In [1]:
# install dependencies
!pip install smdebug

Collecting smdebug
  Downloading smdebug-1.0.34-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting protobuf<=3.20.3,>=3.20.0 (from smdebug)
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting pyinstrument==3.4.2 (from smdebug)
  Downloading pyinstrument-3.4.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting pyinstrument-cext>=0.2.2 (from pyinstrument==3.4.2->smdebug)
  Downloading pyinstrument_cext-0.2.4.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading smdebug-1.0.34-py2.py3-none-any.whl (280 kB)
Downloading pyinstrument-3.4.2-py2.py3-none-any.whl (83 kB)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
Building wheels for collected packages: pyinstrument-cext
  Building wheel for pyinstrument-cext (setup.py) ... [done
[?25h  Created wheel for pyinstrument-cext: filename=pyinstrument_cext-0.2.4-cp311-cp311-linux_x86_64.whl size=6444 sha256=7f3255866870c91aacc4b90513e8f69cf7e0d3f1d3d9ba5f85d1ddd39c23e38b
  Stored in dire

In [2]:
# Hyperparameters handed to the PyTorch estimator that is created further down.
hyperparameters = dict(
    batch_size=2048,
    gpu=True,
    epoch=2,
    model="resnet50",
)

In [3]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

# Rules evaluated against the training job.
# - Rule.sagemaker(...) entries are built-in Debugger rules.
# - ProfilerRule.sagemaker(...) entries are built-in Profiler rules.
rules = [
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.poor_weight_initialization()),
    # The profiler-report cell near the end of this notebook looks up a rule
    # whose name contains "Profiler"; register the report rule explicitly so
    # that lookup does not depend on SDK defaults.
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]
# TODO: add any further rules you want to track.



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [4]:
from sagemaker.debugger import DebuggerHookConfig, ProfilerConfig, FrameworkProfile

# Debugger hook configuration: hook parameters set the save interval for the
# "train" and "eval" modes.
debugger_config = DebuggerHookConfig(
    hook_parameters={
        "train.save_interval": "100",
        "eval.save_interval": "10",
    }
)

# Profiler configuration: system monitor interval of 500 ms, plus framework
# profiling parameters with num_steps=10.
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=FrameworkProfile(num_steps=10),
)

In [6]:
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

# Create the estimator that trains the model.
# The debugger rules, debugger hook configuration and profiler configuration
# built in the earlier cells have to be passed in explicitly; without these
# arguments the training job is launched without them.
estimator = PyTorch(
    entry_point="scripts/pytorch_cifar_profiling.py",
    base_job_name="sagemaker-script-mode",
    role=get_execution_role(),
    instance_count=1,
    # NOTE(review): the hyperparameters request "gpu": True, but the recorded
    # training log for this instance type reports that no GPUs were detected.
    # Confirm whether a GPU instance type is intended.
    instance_type="ml.m5.large",
    hyperparameters=hyperparameters,
    framework_version="1.8",
    py_version="py36",
    rules=rules,
    debugger_hook_config=debugger_config,
    profiler_config=profiler_config,
)

In [7]:
# Start the training job for the estimator configured above; wait=True is
# passed so the call waits on the job instead of returning immediately.
estimator.fit(wait=True)

2025-04-21 10:57:50 Starting - Starting the training job...
..25-04-21 10:58:05 Starting - Preparing the instances for training.
..25-04-21 10:58:27 Downloading - Downloading input data.
.....04-21 10:59:07 Downloading - Downloading the training image.
2025-04-21 11:00:14 Training - Training image download completed. Training in progress.
2025-04-21 11:00:14 Uploading - Uploading generated training model[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2025-04-21 11:00:07,818 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2025-04-21 11:00:07,822 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-04-21 11:00:07,834 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2025-04-21 11:00:07,879 sagemaker_pytorch_container.training INFO     Invoking user training scr

In [None]:
import boto3

# Name of the most recent training job launched through the estimator.
training_job_name = estimator.latest_training_job.name

# Region configured for a fresh boto3 session.
session = boto3.session.Session()
region = session.region_name

# Both values are reused by the profiler analysis cells below.
print(f"Training jobname: {training_job_name}")
print(f"Region: {region}")

In [None]:
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys

# Location of the debugger artifacts written by the latest training job.
debugger_artifacts_path = estimator.latest_job_debugger_artifacts_path()

# Open an smdebug trial on those artifacts.
trial = create_trial(debugger_artifacts_path)

In [None]:
# Print the names of all the tensors that were tracked for this trial.
tensor_names = trial.tensor_names()
print(tensor_names)

# Print the number of datapoints (saved steps) for one of those tensors in
# both train and eval mode.
# NOTE(review): the first tracked tensor is used because the tensor names
# depend on the training script; swap in a specific name (e.g. the loss
# tensor) once the names printed above are known.
if tensor_names:
    inspected_name = tensor_names[0]
    inspected_tensor = trial.tensor(inspected_name)
    train_steps = inspected_tensor.steps(mode=ModeKeys.TRAIN)
    eval_steps = inspected_tensor.steps(mode=ModeKeys.EVAL)
    print(f"{inspected_name}: {len(train_steps)} datapoints in train mode")
    print(f"{inspected_name}: {len(eval_steps)} datapoints in eval mode")
else:
    # Nothing was saved by the debugger hook, so there is nothing to count.
    print("No tensors were tracked for this training job.")

In [None]:
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

# Build the smdebug profiler handle for the training job, identified by the
# job name and region collected in the earlier cell.
tj = TrainingJob(training_job_name, region)
# Presumably blocks until the system profiling data can be read (per the
# method name) — confirm against the smdebug documentation.
tj.wait_for_sys_profiling_data_to_be_available()

In [None]:
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts

# Get the system metrics reader from the training job handle and refresh its
# list of event files before charting.
system_metrics_reader = tj.get_systems_metrics_reader()
system_metrics_reader.refresh_event_file_list()

# Timeline charts over the system metrics only (no framework metrics reader is
# supplied), restricted to the "CPU" and "GPU" dimensions and "total" events.
view_timeline_charts = TimelineCharts(
    system_metrics_reader,
    framework_metrics_reader=None,
    select_dimensions=["CPU", "GPU"],
    select_events=["total"],
)

In [None]:
# The rule output location is the estimator's output path, followed by the
# name of the latest training job and the "/rule-output" suffix.
rule_output_path = "".join(
    [
        estimator.output_path,
        estimator.latest_training_job.job_name,
        "/rule-output",
    ]
)
print(f"You will find the profiler report in {rule_output_path}")

In [None]:
# List everything under the rule output location (recursively) with the AWS CLI.
! aws s3 ls {rule_output_path} --recursive

In [None]:
# Copy the rule output (recursively) into the current working directory so the
# profiler report can be opened locally.
! aws s3 cp {rule_output_path} ./ --recursive

In [None]:
import os

# Collect the configuration name of every rule attached to the latest job.
rule_configuration_names = [
    summary["RuleConfigurationName"]
    for summary in estimator.latest_training_job.rule_job_summary()
]

# The profiler report folder is autogenerated and named after its rule, so keep
# only the names containing "Profiler" and take the first match.
profiler_rule_names = [
    configuration_name
    for configuration_name in rule_configuration_names
    if "Profiler" in configuration_name
]
profiler_report_name = profiler_rule_names[0]

In [None]:
import IPython

# Render the profiler report HTML file from the profiler report folder
# (relative to the current working directory) inline in the notebook.
IPython.display.HTML(filename=profiler_report_name + "/profiler-output/profiler-report.html")