## U-Net 2D training with AdaScale on SageMaker

In [29]:
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.session import TrainingInput

In [30]:
# --- SageMaker job configuration ---

# Cluster shape: how many instances to launch and of which type.
num_nodes = 1
node_type = "ml.p3.16xlarge"

# Identifier of the model being trained.
model_type = "unet_2d"

# Docker image the training job runs in.
image_uri = "427566855058.dkr.ecr.us-east-1.amazonaws.com/unet:aws-unet"

# S3 locations: input dataset and destination for job output.
training_s3 = "s3://mzanur-autoscaler/benchmarking_datasets/MSD/preprocessed/01_2d"
output_dir_s3_addr = "s3://yuliu-dev-east-gryffindor/unet_res"

# IAM execution role and SageMaker session used to submit the job.
role = sagemaker.get_execution_role()
sess = sagemaker.Session()

In [31]:
# --- Model hyperparameters ---
optimizer = "adam"  # supported choices: "adam", "radam", "sgd"
learning_rate = 1e-3
batch_size = 64  # per-GPU batch size

# --- AdaScale hyperparameters ---
enable_adascale = True
lr_scale = 1.0

# "gns" options; forwarded to the job only when enable_gns is True.
enable_gns = False
gns_smoothing = 0.0

In [None]:
# Model hyperparameters forwarded to the training entry point.
hyperparameters = {
    "num_nodes": num_nodes,
    "learning_rate": learning_rate,
    "optimizer": optimizer,
}

# batch_size is per GPU, so the global batch size scales with the number of
# nodes and the number of GPUs on each node.
gpus_per_node = 8  # GPU count per instance; must match node_type
global_batch_size = batch_size * num_nodes * gpus_per_node

# Job-name prefix that encodes the run configuration.
label = f"unet2d-{num_nodes}nodes-bs{global_batch_size}-{optimizer}"

if enable_adascale:
    # The flag value must be a plain empty string, exactly like "enable_gns"
    # below. A trailing comma after the literal would store the tuple ("",)
    # instead, which is not a valid flag value.
    # NOTE(review): presumably the empty value makes the entry point receive
    # a bare `--enable_adascale` switch; confirm against dist_train.py.
    hyperparameters["enable_adascale"] = ""
    hyperparameters["lr_scale"] = lr_scale
    label += "-adascale"
if enable_gns:
    hyperparameters["enable_gns"] = ""
    hyperparameters["gns_smoothing"] = gns_smoothing
    label += "-gns"

# Free-form suffix appended to the job name to tell runs apart.
custome_label = "-sanity-test"
label += custome_label

In [36]:
# Configure the SageMaker PyTorch training job.
estimator = PyTorch(
    # Naming and permissions.
    base_job_name=label,
    role=role,
    sagemaker_session=sess,
    # Code to run: entry point path is relative to source_dir.
    source_dir="..",
    entry_point="sagemaker/dist_train.py",
    image_uri=image_uri,
    hyperparameters=hyperparameters,
    # Cluster resources.
    instance_count=num_nodes,
    instance_type=node_type,
    volume_size=200,
    # Upper bound on job runtime, in seconds (3 days).
    max_run=3 * 24 * 60 * 60,
    # Logging / debugging behaviour.
    container_log_level=0,
    debugger_hook_config=False,
    # Where the job writes its output in S3.
    output_path=output_dir_s3_addr,
)

# Training data channel backed by the preprocessed dataset in S3.
train_input = TrainingInput(training_s3)

In [None]:
# Submit the training job with the dataset attached as the "train" channel.
# wait=True keeps this cell blocked until the job finishes.
estimator.fit(inputs={"train": train_input}, wait=True)

2021-07-29 23:55:19 Starting - Starting the training job...
2021-07-29 23:55:42 Starting - Launching requested ML instancesProfilerReport-1627602918: InProgress
.........
2021-07-29 23:57:16 Starting - Preparing the instances for training............
2021-07-29 23:59:03 Downloading - Downloading input data.............