In [37]:
# Optional one-time setup (uncomment on a fresh environment): %pip install ipywidgets==7.6.0

In [2]:
# Enable the autoreload extension in mode 2 so edits to imported local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
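# Optional one-time setup (uncomment if smdebug is not installed; needed by the Debugger/Profiler cells below): %pip install smdebug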

In [3]:
from datetime import datetime


In [4]:
import sagemaker
import boto3

from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)

In [5]:
# IAM role the training jobs will run under.
role = get_execution_role()

# SageMaker session and the S3 location (default bucket + project prefix)
# that the estimator below writes its outputs and checkpoints to.
prefix = "project-3-cola"
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [6]:
# Per-run timestamp used to give this run its own checkpoint prefix in S3.
# Formatted explicitly: str(datetime.now()) renders as
# "YYYY-MM-DD HH:MM:SS.ffffff", which would put a space, colons and a dot into
# the S3 key and make the path awkward to use from the CLI or in URLs.
now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# PyTorch estimator that the hyperparameter tuner uses as the template for
# each training job: entry script from ../src, one ml.g4dn.xlarge instance,
# outputs under s3://<bucket>/<prefix>.
# NOTE(review): every training job started from this estimator is given the
# same checkpoint_s3_uri, and the tuner runs jobs in parallel — confirm that
# model_hpo.py tolerates a shared checkpoint location.
estimator = PyTorch(
    entry_point="model_hpo.py",
    source_dir="../src",
    output_path=f"s3://{bucket}/{prefix}",
    checkpoint_s3_uri=f"s3://{bucket}/{prefix}/checkpoints/{now}",
    base_job_name="hyperparametere-tuning-cola",
    role=role,
    instance_count=1,
    instance_type="ml.g4dn.xlarge",
    framework_version="1.10",
    py_version="py38",
)

In [7]:
# Search space handed to the tuner. The keys are the hyperparameter names
# passed to the training job.
hyperparameter_ranges = {}
# Learning rate: continuous range.
hyperparameter_ranges["lr"] = ContinuousParameter(min_value=2e-5, max_value=2e-4)
# Batch size and maximum sequence length: discrete choices.
hyperparameter_ranges["batch-size"] = CategoricalParameter(values=[16, 32, 64])
hyperparameter_ranges["max-length"] = CategoricalParameter(values=[128, 256])

In [8]:
# Objective the tuner optimises: minimise the validation loss.
objective_metric_name = "Average Validation Loss"
objective_type = "Minimize"

# The regex's single capture group is the numeric value taken from the
# "Validation set: Average loss: <value>" log line.
metric_definitions = [
    {
        "Name": objective_metric_name,
        "Regex": r"Validation set: Average loss: ([0-9\.]+)",
    }
]

In [9]:
# Tuner wrapping the estimator: at most 2 training jobs in total, both allowed
# to run at the same time, optimising the objective metric defined above.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=2,
    max_parallel_jobs=2,
)

In [10]:
# Start the hyperparameter tuning job and block until it completes.
# No `inputs` are passed, so this call supplies no data channels.
# NOTE(review): presumably model_hpo.py obtains its own dataset — confirm.
tuner.fit(wait=True)

................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................!


In [11]:
# Display the name of the training job the tuner reports as best.
tuner.best_training_job()

'pytorch-training-230930-1009-001-673f1072'

In [14]:
# Display the hyperparameters of the best training job.
# In the saved output the categorical values are JSON-quoted strings
# (e.g. 'batch-size': '"64"'), while 'lr' is a plain numeric string.
tuner.best_estimator().hyperparameters()


2023-09-30 10:54:49 Starting - Preparing the instances for training
2023-09-30 10:54:49 Downloading - Downloading input data
2023-09-30 10:54:49 Training - Training image download completed. Training in progress.
2023-09-30 10:54:49 Uploading - Uploading generated training model
2023-09-30 10:54:49 Completed - Resource released due to keep alive period expiry


{'_tuning_objective_metric': '"Average Validation Loss"',
 'batch-size': '"64"',
 'lr': '0.00015571726483079338',
 'max-length': '"256"',
 'sagemaker_container_log_level': '20',
 'sagemaker_estimator_class_name': '"PyTorch"',
 'sagemaker_estimator_module': '"sagemaker.pytorch.estimator"',
 'sagemaker_job_name': '"hyperparametere-tuning-cola-2023-09-30-10-09-01-165"',
 'sagemaker_program': '"model_hpo.py"',
 'sagemaker_region': '"us-east-1"',
 'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-046017406246/hyperparametere-tuning-cola-2023-09-30-10-09-01-165/source/sourcedir.tar.gz"'}

In [53]:
# Debugger helpers are imported in this cell because nothing earlier in the
# notebook brings them into scope; without these imports the cell raises
# NameError on a fresh kernel. They live here rather than in the top import
# cell so that a missing smdebug install cannot break the tuning workflow.
from smdebug.core.modes import ModeKeys
from smdebug.trials import create_trial

session = boto3.session.Session()
region = session.region_name

# NOTE(review): in this notebook `estimator` is only used through tuner.fit(),
# and the saved output of this cell names a "script-mode-cola-..." job rather
# than a "hyperparametere-tuning-cola" one — the cell appears to rely on an
# estimator from another session. Fail with a clear message instead of an
# AttributeError on None when no training job is associated.
if estimator.latest_training_job is None:
    raise RuntimeError(
        "estimator has no associated training job; fit or attach an estimator "
        "for the job you want to inspect before running the Debugger cells."
    )

training_job_name = estimator.latest_training_job.name
print(f"Training jobname: {training_job_name}")
print(f"Region: {region}")

# Load the Debugger output saved for that training job.
trial = create_trial(estimator.latest_job_debugger_artifacts_path())

# Show which tensors were captured, then how many steps of the loss tensor
# were recorded in training mode and in evaluation mode.
print(trial.tensor_names())
print(len(trial.tensor("nll_loss_output_0").steps(mode=ModeKeys.TRAIN)))
print(len(trial.tensor("nll_loss_output_0").steps(mode=ModeKeys.EVAL)))

Training jobname: script-mode-cola-2023-09-24-17-48-04-920
Region: us-east-1
[2023-09-24 17:57:58.857 pytorch-1-10-cpu-py38-ml-t3-medium-9c8fa3a7688fa232cbbe37185a11:27 INFO s3_trial.py:42] Loading trial debug-output at path s3://sagemaker-us-east-1-046017406246/project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/debug-output
['gradient/Net_W1.bias', 'gradient/Net_W1.weight', 'gradient/Net_bert.embeddings.LayerNorm.bias', 'gradient/Net_bert.embeddings.LayerNorm.weight', 'gradient/Net_bert.embeddings.position_embeddings.weight', 'gradient/Net_bert.embeddings.token_type_embeddings.weight', 'gradient/Net_bert.embeddings.word_embeddings.weight', 'gradient/Net_bert.encoder.layer.0.attention.output.LayerNorm.bias', 'gradient/Net_bert.encoder.layer.0.attention.output.LayerNorm.weight', 'gradient/Net_bert.encoder.layer.0.attention.output.dense.bias', 'gradient/Net_bert.encoder.layer.0.attention.output.dense.weight', 'gradient/Net_bert.encoder.layer.0.attention.self.key.bias', 'gradient/Net

In [54]:
# Profiler helpers are imported in this cell because nothing earlier in the
# notebook brings them into scope; without these imports the cell raises
# NameError on a fresh kernel.
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

# Handle on the profiled training job; block until system profiling data exists.
tj = TrainingJob(training_job_name, region)
tj.wait_for_sys_profiling_data_to_be_available()

# Reader over the system metrics, refreshed to pick up the event files.
system_metrics_reader = tj.get_systems_metrics_reader()
system_metrics_reader.refresh_event_file_list()

# Timeline charts of total CPU and GPU utilisation (system metrics only).
view_timeline_charts = TimelineCharts(
    system_metrics_reader,
    framework_metrics_reader=None,
    select_dimensions=["CPU", "GPU"],
    select_events=["total"],
)

# S3 URIs always use "/" as the separator, so build the path with a plain
# string join rather than os.path.join (which is OS-dependent and was never
# imported in this notebook). The job name is the one resolved in the
# previous cell and already used for TrainingJob above.
rule_output_path = "/".join(
    [estimator.output_path.rstrip("/"), training_job_name, "rule-output"]
)
print(f"You will find the profiler report in {rule_output_path}")

ProfilerConfig:{'S3OutputPath': 's3://sagemaker-us-east-1-046017406246/project-3-cola', 'ProfilingIntervalInMilliseconds': 500, 'ProfilingParameters': {'DataloaderProfilingConfig': '{"StartStep": 0, "NumSteps": 10, "MetricsRegex": ".*", }', 'DetailedProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }', 'FileOpenFailThreshold': '50', 'HorovodProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }', 'LocalPath': '/opt/ml/output/profiler', 'PythonProfilingConfig': '{"StartStep": 0, "NumSteps": 10, "ProfilerName": "cprofile", "cProfileTimer": "total_time", }', 'RotateFileCloseIntervalInSeconds': '60', 'RotateMaxFileSizeInBytes': '10485760', 'SMDataParallelProfilingConfig': '{"StartStep": 0, "NumSteps": 10, }'}}
s3 path:s3://sagemaker-us-east-1-046017406246/project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/profiler-output


Profiler data from system is available
[2023-09-24 17:58:15.533 pytorch-1-10-cpu-py38-ml-t3-medium-9c8fa3a7688fa232cbbe37185a11:27 INFO metrics_reader_base.py:134]

You will find the profiler report in s3://sagemaker-us-east-1-046017406246/project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output


In [55]:
# List everything stored under the rule-output S3 prefix; {rule_output_path}
# is interpolated from the Python variable set in the previous cell.
! aws s3 ls {rule_output_path} --recursive

2023-09-24 17:55:30     382157 project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-report.html
2023-09-24 17:55:29     231108 project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-report.ipynb
2023-09-24 17:55:24        191 project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json
2023-09-24 17:55:24       5138 project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
2023-09-24 17:55:24       2114 project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-reports/Dataloader.json
2023-09-24 17:55:24        129 project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json
2023-09-24 17:55:24       1411 proje

In [56]:
# Download the rule output (profiler report and per-rule JSON files) into the
# current working directory, keeping the S3 folder structure.
! aws s3 cp {rule_output_path} ./ --recursive

download: s3://sagemaker-us-east-1-046017406246/project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json to ProfilerReport/profiler-output/profiler-reports/CPUBottleneck.json
download: s3://sagemaker-us-east-1-046017406246/project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json to ProfilerReport/profiler-output/profiler-reports/GPUMemoryIncrease.json
download: s3://sagemaker-us-east-1-046017406246/project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-reports/BatchSize.json to ProfilerReport/profiler-output/profiler-reports/BatchSize.json
download: s3://sagemaker-us-east-1-046017406246/project-3-cola/script-mode-cola-2023-09-24-17-48-04-920/rule-output/ProfilerReport/profiler-output/profiler-reports/LowGPUUtilization.json to ProfilerReport/profiler-output/profiler-reports/Low

In [57]:
# Pick the first rule configuration whose name contains "Profiler"; that name
# is the auto-generated folder the profiler report is stored under.
profiler_report_name = [
    rule_name
    for rule_name in (
        summary["RuleConfigurationName"]
        for summary in estimator.latest_training_job.rule_job_summary()
    )
    if "Profiler" in rule_name
][0]

# IPython.display.HTML(filename=profiler_report_name + "/profiler-output/profiler-report.html")