# Initial setup

In [2]:
import sagemaker
from sagemaker import get_execution_role

# IAM role handed to the training job.
# NOTE(review): this ARN is hard-coded and account-specific, while the imported
# `get_execution_role` is not used in the visible cells - confirm this is intentional.
role = "arn:aws:iam::941656036254:role/service-role/AmazonSageMaker-ExecutionRole-20210904T193230"

# The session is reused below both to resolve the default bucket and by the estimator.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Key prefix reserved for this example's S3 objects.
prefix = 'sagemaker/sm-modelparallel-distribution-options'

print(f'Bucket:\n{bucket}')

Bucket:
sagemaker-us-east-1-941656036254


In [2]:
# Display the version string of the SageMaker Python SDK imported above.
sagemaker.__version__

'2.84.0'

In [3]:
# Root S3 location of the dataset; the "train" and "val" channels are read from
# sub-folders of this URL. Built from the session's default bucket (resolved in the
# setup cell) rather than a hard-coded, account-specific bucket name, so the notebook
# is not tied to a single AWS account/region.
data_url = f"s3://{bucket}/hymenoptera_data"

In [3]:
# Debugger configuration

from sagemaker.debugger import ProfilerConfig, FrameworkProfile, Rule, ProfilerRule, rule_configs

# Profiler settings: system metrics sampled every 1000 ms, no framework profile
# parameters supplied.
# NOTE(review): neither `profiler_config` nor `rules` is passed to the estimator in the
# visible cells (it is created with disable_profiler=True) - confirm whether they
# are still needed.
profiler_config = ProfilerConfig(
    framework_profile_params=None,
    system_monitor_interval_millis=1000,
)

# One Debugger rule followed by two profiler rules, all using built-in rule configs.
rules = [
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ProfilerRule.sagemaker(rule_configs.LowGPUUtilization()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

# Local Mode

# Remote Mode

In [12]:

from sagemaker.pytorch import PyTorch

# Extra flags handed to the MPI launcher through the "mpi" distribution settings.
mpioptions = "-verbose -x orte_base_help_aggregate=0 "
instance_type = 'ml.p2.xlarge'
instance_count = 2

# Values passed as the estimator's hyperparameters.
# An optional "sync-s3-path": f"s3://{bucket}/distributed-training/output" entry is
# deliberately not set.
train_hyperparameters = {
    "batch-size": 64,
    "epochs": 30,
    "model-name": "squeezenet",
    "num-classes": 2,
    "feature-extract": True,
}

# Settings for the SageMaker model-parallel library.
smp_parameters = {
    # Number of microbatches to pipeline over; 1 means no pipelining. The batch size
    # must be divisible by this value. A microbatch is a smaller subset of a training
    # mini-batch; the pipeline schedule decides which device runs which microbatch in
    # each time slot.
    "microbatches": 8,
    # Advanced topic, see:
    # https://sagemaker.readthedocs.io/en/stable/api/training/smd_model_parallel_general.html#placement-strategy-with-tensor-parallelism
    "placement_strategy": "cluster",
    "pipeline": "interleaved",
    "optimize": "speed",
    # Number of partitions to split the model into. Newer library versions use
    # "pipeline_parallel_degree" for this instead; it is not set here.
    "partitions": 2,
    "auto_partition": True,
    # Hybrid (model + data) parallelism; only makes sense on multi-GPU nodes.
    "ddp": False,
}

distribution_settings = {
    "smdistributed": {
        "modelparallel": {
            "enabled": True,
            "parameters": smp_parameters,
        },
    },
    "mpi": {
        "enabled": True,  # must be enabled
        "processes_per_host": 1,  # pick your processes_per_host
        "custom_mpi_options": mpioptions,
    },
}

# Estimator for the model-parallel training job; profiler and debugger hook are off.
smd_mp_estimator = PyTorch(
    entry_point="train_sm_mp.py",  # pick your train script
    source_dir='4_sources',
    role=role,
    instance_type=instance_type,
    sagemaker_session=sagemaker_session,
    framework_version='1.10',
    py_version='py38',
    instance_count=instance_count,
    hyperparameters=train_hyperparameters,
    disable_profiler=True,
    debugger_hook_config=False,
    distribution=distribution_settings,
    base_job_name="SMD-MP-demo",
)

In [13]:
# Start the training job with two input channels, "train" and "val", each read from
# the matching sub-folder of `data_url`.
smd_mp_estimator.fit(
    inputs={
        "train": f"{data_url}/train",
        "val": f"{data_url}/val",
    }
)

2022-04-09 20:35:32 Starting - Starting the training job......
2022-04-09 20:36:08 Starting - Preparing the instances for training.........
2022-04-09 20:37:38 Downloading - Downloading input data...
2022-04-09 20:38:14 Training - Downloading the training image..............................
2022-04-09 20:43:37 Training - Training image download completed. Training in progress..bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2022-04-09 20:43:40,910 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2022-04-09 20:43:40,934 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2022-04-09 20:43:40,944 sagemaker_pytorch_container.training INFO     Invoking user training script.
2022-04-09 20:43:41,759 sagemaker-training-toolkit INFO     Starting MPI run as worker node.
2022-04-09 20:43:41,759 sagemaker-training-toolkit INFO     Creating SSH daemon.
2022-