In [None]:
%%sh
pip -q install sagemaker --upgrade

In [None]:
import sagemaker

# Show which SDK version the kernel actually imported.
print(sagemaker.__version__)

# `session` and `bucket` are reused by later cells: `bucket` to build the
# S3 paths, `session` to look up the region.
session = sagemaker.Session()
bucket = session.default_bucket()

### Define channels

In [None]:
# All inputs and outputs of this experiment live under s3://<bucket>/<prefix>/.
prefix = 'imagenet-split'
s3_train_path = f's3://{bucket}/{prefix}/input/training/'
s3_val_path = f's3://{bucket}/{prefix}/input/validation/'
s3_output = f's3://{bucket}/{prefix}/output/'

from sagemaker.inputs import ShuffleConfig

# Training channel: RecordIO data read in Pipe mode, with a shuffle
# configuration seeded with 59.
train_data = sagemaker.TrainingInput(
    s3_data=s3_train_path,
    content_type='application/x-recordio',
    input_mode='Pipe',
    shuffle_config=ShuffleConfig(seed=59),
)

# Validation channel: same format and input mode, no shuffle configuration.
validation_data = sagemaker.TrainingInput(
    s3_data=s3_val_path,
    content_type='application/x-recordio',
    input_mode='Pipe',
)

In [None]:
# Display the three S3 locations, one per line.
print(s3_train_path, s3_val_path, s3_output, sep='\n')

In [None]:
# Channel name -> input definition; this mapping is what gets passed to fit().
s3_channels = dict(train=train_data, validation=validation_data)

### Get the name of the image classification algorithm in our region

In [None]:
# Region of the current session, used to look up the matching algorithm image.
region = session.boto_session.region_name

# Image URI of the built-in image classification algorithm for that region.
container = sagemaker.image_uris.retrieve(
    framework='image-classification',
    region=region,
)

print(container)

### Configure the training job

In [None]:
# IAM role the training jobs will run under.
role = sagemaker.get_execution_role()

# Baseline estimator: a single ml.p3dn.24xlarge instance.
ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',  # 8 GPUs
    instance_count=1,
    volume_size=1,
    output_path=s3_output,
)

### Set algorithm parameters

In [None]:
# Hyperparameters for the baseline run.
ic.set_hyperparameters(
    num_layers=50,                 # Train a Resnet-50 model
    use_pretrained_model=0,        # Train from scratch
    num_classes=1000,              # ImageNet has 1000 classes
    num_training_samples=1281167,  # Number of training samples
    # Baseline batch size. The scaling estimate made after this run
    # (1024/0.375 ~= 2730, rounded to 2736) starts from this value, so it
    # has to be 1024 for that arithmetic to describe this job.
    mini_batch_size=1024,
    epochs=2,
    top_k=3,
    kv_store='dist_sync',          # gradient updates are synchronized after each batch
)

In [None]:
# Launch the baseline training job on the train/validation channels.
ic.fit(s3_channels)

CloudWatch shows that total GPU memory utilization is only 300%, i.e. 300/8 = 37.5% on each of the 8 GPUs. Let's bump the batch size to 1024/0.375 ≈ 2730, rounded up to 2736 so that it is divisible by 8.

In [None]:
# New estimator for the larger-batch run; still a single ml.p3dn.24xlarge.
ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',
    instance_count=1,
    volume_size=1,
    output_path=s3_output,
)

In [None]:
# Same model and dataset settings as the first run, with a larger batch,
# an explicit learning rate and 'crop' augmentation.
ic.set_hyperparameters(
    # Model / dataset
    num_layers=50,
    use_pretrained_model=0,
    num_classes=1000,
    num_training_samples=1281167,
    # Optimisation
    mini_batch_size=2736,          # <----- increased batch size
    learning_rate=0.4,
    epochs=2,
    kv_store='dist_sync',
    # Augmentation / reporting
    augmentation_type='crop',
    top_k=3,
)

In [None]:
# Launch the larger-batch training job on the same channels.
ic.fit(s3_channels)

In [None]:
# Distributed training: identical estimator settings, but on two instances.
ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',
    instance_count=2,                 # <-------- two instances
    volume_size=1,
    output_path=s3_output,
)

In [None]:
# Hyperparameters are unchanged from the single-instance run that used
# mini_batch_size=2736.
ic.set_hyperparameters(
    # Model / dataset
    num_layers=50,
    use_pretrained_model=0,
    num_classes=1000,
    num_training_samples=1281167,
    # Optimisation
    mini_batch_size=2736,
    learning_rate=0.4,
    epochs=2,
    kv_store='dist_sync',
    # Augmentation / reporting
    augmentation_type='crop',
    top_k=3,
)

In [None]:
# Launch the two-instance training job on the same channels.
ic.fit(s3_channels)

In [None]:
# Same job, scaled out to 4 instances.
ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',
    instance_count=4,                 # <-------- four instances
    volume_size=1,
    output_path=s3_output,
)

# Hyperparameters are unchanged from the previous runs that used
# mini_batch_size=2736.
ic.set_hyperparameters(
    # Model / dataset
    num_layers=50,
    use_pretrained_model=0,
    num_classes=1000,
    num_training_samples=1281167,
    # Optimisation
    mini_batch_size=2736,
    learning_rate=0.4,
    epochs=2,
    kv_store='dist_sync',
    # Augmentation / reporting
    augmentation_type='crop',
    top_k=3,
)

# Launch the job on the train/validation channels.
ic.fit(s3_channels)

In [None]:
# Same job, scaled out to 8 instances:
#   - 64 GPUs
#   - 327K CUDA cores, 2TB of GPU RAM
#   - 8 Petaflops (!) for Fused Multiply Add matrix operations (A*B + C)
ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',
    instance_count=8,                 # <-------- eight instances
    volume_size=1,
    output_path=s3_output,
)

# Same hyperparameters as the 4-instance job, except mini_batch_size, which is
# 2000 here rather than 2736.
# NOTE(review): the reason for the smaller batch is not stated — confirm it is intentional.
ic.set_hyperparameters(
    # Model / dataset
    num_layers=50,
    use_pretrained_model=0,
    num_classes=1000,
    num_training_samples=1281167,
    # Optimisation
    mini_batch_size=2000,
    learning_rate=0.4,
    epochs=2,
    kv_store='dist_sync',
    # Augmentation / reporting
    augmentation_type='crop',
    top_k=3,
)

# Launch the job on the train/validation channels.
ic.fit(s3_channels)

Now let's train this for a little while.

In [None]:
# Estimator for the long training run: 8 x ml.p3dn.24xlarge.
ic = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_type='ml.p3dn.24xlarge',
    instance_count=8,
    volume_size=1,
    output_path=s3_output,
)

In [None]:
# Long run: 250 epochs with a stepped learning-rate schedule and weight decay.
# Early stopping is NOT enabled here. To turn it on, add
#     early_stopping=True, early_stopping_patience=30
# to the call below.
ic.set_hyperparameters(
    # Model / dataset
    num_layers=50,
    use_pretrained_model=0,
    num_classes=1000,
    num_training_samples=1281167,
    augmentation_type='crop',
    # Optimisation
    mini_batch_size=2000,
    epochs=250,
    learning_rate=0.5,
    lr_scheduler_factor=0.5,
    lr_scheduler_step='10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200',
    weight_decay='0.0005',
    kv_store='dist_sync',
    # Reporting
    top_k=3,
)

In [None]:
# Launch the long training job on the train/validation channels.
ic.fit(s3_channels)