In [None]:
!pip install --upgrade azureml-core azureml-widgets azureml-train-core azureml-sdk

In [None]:
import os
import requests
import sys

# AzureML libraries
import azureml.core
from azureml.core import Experiment, Workspace, Datastore, Run, Environment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

# Show which azureml-core release this kernel picked up.
print(f"SDK version: {azureml.core.VERSION}")

In [None]:
# Look up an existing workspace; fill in the three identifiers before running.
ws = Workspace.get(
    name='',
    subscription_id='',
    resource_group='',
)

# Summarise the workspace the remaining cells operate on, one attribute per line.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Workspace region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

In [None]:
# Register the blob container holding the training data as a datastore.
# README.md describes how to download the data and upload it to the container.
# The empty strings are placeholders to fill in before running.
ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name='',
    container_name='',
    account_name='',
    account_key='',
)

In [None]:
# Summarise the registered datastore, one attribute per line.
print('\n'.join([
    'Datastore name: ' + ds.name,
    'Container name: ' + ds.container_name,
    'Datastore type: ' + ds.datastore_type,
    'Workspace name: ' + ds.workspace.name,
]))

In [None]:
# Name of the GPU cluster that every run in this notebook targets.
gpu_cluster_name = "ndv2-cluster"

# Reuse the cluster if the lookup succeeds; a ComputeTargetException from the
# lookup is treated as "not there yet" and triggers provisioning.
try:
    gpu_compute_target = ComputeTarget(name=gpu_cluster_name, workspace=ws)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_ND40rs_v2',
        min_nodes=0,
        max_nodes=4,
    )

    gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)
    # Wait for provisioning to finish, echoing progress into the cell output.
    gpu_compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

In [None]:
# Experiment that the runs submitted from this notebook are recorded under.
experiment_name = 'hf-t5-ortmodule'
experiment = Experiment(workspace=ws, name=experiment_name)

In [None]:
# Environment shared by the training runs: Docker is enabled and the Python
# dependencies are marked as user-managed.
hf_t5_large_fp16_env = Environment(name="hf-t5-large-fp16-env")
hf_t5_large_fp16_env.python.user_managed_dependencies = True
hf_t5_large_fp16_env.docker.enabled = True
# Optional: restrict the run to a single GPU.
# hf_t5_large_fp16_env.environment_variables = {"CUDA_VISIBLE_DEVICES":"0"}

In [None]:
# Docker image the environment runs in, plus the address and credentials of the
# registry hosting it. The empty strings are placeholders to fill in.
hf_t5_large_fp16_env.docker.base_image = ''

image_registry = hf_t5_large_fp16_env.docker.base_image_registry
image_registry.address = ''
image_registry.username = ''
image_registry.password = ''

In [None]:
# Mount-mode reference to the 'tst-translation' path on the datastore.
# NOTE(review): no cell visible here reads output_dir_ref; the argument lists
# pass the literal '/tmp/tst-translation' as --output_dir instead. The trailing
# comment suggests str(output_dir_ref) was once meant for that slot — confirm.
output_dir_ref = ds.path('tst-translation').as_mount() #str(output_dir_ref)
# Command-line arguments for run_seq2seq.py, laid out one option per line.
# Numeric values are kept as Python numbers, exactly as they are passed on.
args = [
    '--source_prefix', 'translate English to Romanian:',
    '--dataset_name', 'wmt16',
    '--dataset_config', 'ro-en',
    '--model_name_or_path', 't5-large',
    '--output_dir', '/tmp/tst-translation',
    '--adam_eps', '1e-06',
    '--do_train',
    '--label_smoothing', 0.1,
    '--learning_rate', '3e-5',
    '--logging_first_step',
    '--logging_steps', 1000,
    '--max_source_length', 128,
    '--max_target_length', 128,
    '--num_train_epochs', 1,
    '--overwrite_output_dir',
    '--per_device_train_batch_size', 32,
    '--predict_with_generate',
    '--sortish_sampler',
    '--task', 'translation_en_to_ro',
    '--warmup_steps', 5,
    '--max_train_samples', 1024,
    '--fp16',
]

In [None]:
from azureml.core import ScriptRunConfig

# Job configuration for the fp16 run: run_seq2seq.py from the current
# directory, invoked with `args`, on the GPU cluster in the custom environment.
hf_t5_large_fp16_src = ScriptRunConfig(
    source_directory='.',
    script='run_seq2seq.py',
    arguments=args,
    environment=hf_t5_large_fp16_env,
    compute_target=gpu_compute_target,
)

In [None]:
# Submit the fp16 job; the tags make the run easy to filter in the studio.
run = experiment.submit(
    config=hf_t5_large_fp16_src,
    tags=dict(model='large', config='fp16', gpus='1'),
)
# Uncomment to block until the run finishes while streaming its logs:
# run.wait_for_completion(show_output=True)

In [None]:
# The run_seq2seq.py options as one shell-style string; the two values that
# carry single quotes keep them as literal characters in the string.
dist_args = (
    "--source_prefix 'translate English to Romanian:' "
    "--dataset_name wmt16 --dataset_config 'ro-en' "
    "--model_name_or_path t5-large "
    "--output_dir /tmp/tst-translation "
    "--adam_eps 1e-06 --do_train --label_smoothing 0.1 --learning_rate 3e-5 "
    "--logging_first_step --logging_steps 1000 "
    "--max_source_length 128 --max_target_length 128 "
    "--num_train_epochs 1 --overwrite_output_dir "
    "--per_device_train_batch_size 32 "
    "--predict_with_generate --sortish_sampler "
    "--task translation_en_to_ro "
    "--warmup_steps 5 --max_train_samples 1024 --fp16"
)


In [None]:
# PyTorchConfiguration must be imported before it is used: on a fresh kernel
# run top to bottom, this cell is the first one that references it.
from azureml.core.runconfig import PyTorchConfiguration

# Distributed config: two processes on a single node.
distr_config = PyTorchConfiguration(process_count=2, node_count=1)

# Expose GPUs 0 and 1 to the job. This replaces the environment_variables of
# the shared environment object, so every job configured with
# hf_t5_large_fp16_env after this point inherits the setting as well.
hf_t5_large_fp16_env.environment_variables = {"CUDA_VISIBLE_DEVICES":"0,1"}

# Job config: run_seq2seq.py with the string-form arguments in `dist_args`.
# NOTE: the following cell rebinds both distr_config and
# hf_t5_large_fp16_multigpu_src, so its configuration is the one submitted.
hf_t5_large_fp16_multigpu_src = ScriptRunConfig(source_directory='.',
                                                script='run_seq2seq.py',
                                                arguments=dist_args,
                                                compute_target=gpu_compute_target,
                                                environment=hf_t5_large_fp16_env,
                                                distributed_job_config=distr_config)

In [None]:
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import PyTorchConfiguration

# Per-node launch: the command below starts torch.distributed.launch, which
# spawns the two worker processes itself (--nproc_per_node 2). The distributed
# config therefore requests one node only and leaves process_count unset;
# asking for process_count=2 as well would start the launcher once per process.
distr_config = PyTorchConfiguration(node_count=1)

# Launcher command. $NODE_RANK, $MASTER_ADDR and $MASTER_PORT are left for the
# job's shell to expand at run time.
multigpu_cmd = (
    "python -m torch.distributed.launch --nproc_per_node 2 "
    "--node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT "
    "--use_env run_seq2seq.py "
    "--source_prefix 'translate English to Romanian:' "
    "--dataset_name wmt16 --dataset_config 'ro-en' "
    "--model_name_or_path t5-large "
    "--output_dir /tmp/tst-translation "
    "--adam_eps 1e-06 --do_train --label_smoothing 0.1 --learning_rate 3e-5 "
    "--logging_first_step --logging_steps 1000 "
    "--max_source_length 128 --max_target_length 128 "
    "--num_train_epochs 1 --overwrite_output_dir "
    "--per_device_train_batch_size 32 "
    "--predict_with_generate --sortish_sampler "
    "--task translation_en_to_ro "
    "--warmup_steps 5 --max_train_samples 1024 --fp16"
)

# Job config: command-based (no script/arguments), same cluster and environment.
hf_t5_large_fp16_multigpu_src = ScriptRunConfig(source_directory='.',
                                command=multigpu_cmd,
                                compute_target=gpu_compute_target,
                                environment=hf_t5_large_fp16_env,
                                distributed_job_config=distr_config)

In [None]:
# Submit the two-GPU fp16 job, tagged for filtering.
multigpu_run = experiment.submit(
    config=hf_t5_large_fp16_multigpu_src,
    tags=dict(model='large', config='fp16', gpus='2'),
)
# Uncomment to block until the run finishes while streaming its logs:
# multigpu_run.wait_for_completion(show_output=True)

In [None]:
# run_seq2seq.py arguments for the DeepSpeed + ORT run: the fp16 option set
# followed by --ort and the DeepSpeed config file.
ds_ort_args = [
    '--source_prefix', 'translate English to Romanian:',
    '--dataset_name', 'wmt16',
    '--dataset_config', 'ro-en',
    '--model_name_or_path', 't5-large',
    '--output_dir', '/tmp/tst-translation',
    '--adam_eps', '1e-06',
    '--do_train',
    '--label_smoothing', 0.1,
    '--learning_rate', '3e-5',
    '--logging_first_step',
    '--logging_steps', 1000,
    '--max_source_length', 128,
    '--max_target_length', 128,
    '--num_train_epochs', 1,
    '--overwrite_output_dir',
    '--per_device_train_batch_size', 32,
    '--predict_with_generate',
    '--sortish_sampler',
    '--task', 'translation_en_to_ro',
    '--warmup_steps', 5,
    '--max_train_samples', 1024,
    '--fp16',
    '--ort',
    '--deepspeed', 'ds_config_zero_0.json',
]

In [None]:
import shlex

from azureml.core import ScriptRunConfig
from azureml.core.runconfig import PyTorchConfiguration

# Build the command as one shell-ready string. Each argument is converted to
# str (the list holds ints and a float) and shell-quoted, so the source prefix
# with its spaces stays a single argument. Passing str(ds_ort_args) instead
# would put the Python list repr, brackets and commas included, on the
# command line.
ds_ort_cmd = "python run_seq2seq.py " + " ".join(
    shlex.quote(str(arg)) for arg in ds_ort_args
)

# Job config: command-based run on the GPU cluster in the custom environment.
hf_t5_large_fp16_ds_ort_src = ScriptRunConfig(source_directory='.',
                                              command=ds_ort_cmd,
                                              compute_target=gpu_compute_target,
                                              environment=hf_t5_large_fp16_env)

In [None]:
# Submit the DeepSpeed + ORT job, tagged for filtering.
ds_ort_run = experiment.submit(
    config=hf_t5_large_fp16_ds_ort_src,
    tags=dict(model='large', config='ds_ort', gpus='1'),
)
# Uncomment to block until the run finishes while streaming its logs:
# ds_ort_run.wait_for_completion(show_output=True)

In [None]:
import shlex

from azureml.core import ScriptRunConfig
from azureml.core.runconfig import PyTorchConfiguration

# Distributed config: two nodes, with the launcher command run on each node.
multinode_distr_config = PyTorchConfiguration(node_count=2)

# Launcher command as one shell-ready string.
# - Every fragment ends with a space so that adjacent literals do not fuse
#   (e.g. "--nnodes 2" running straight into "run_seq2seq.py").
# - The rendezvous flags mirror the multi-GPU command in this notebook; with
#   two nodes, each launcher needs its own rank and the master's address.
#   $NODE_RANK, $MASTER_ADDR and $MASTER_PORT are expanded by the job's shell.
# - `args` is flattened into quoted tokens instead of being nested as a list
#   inside the command; str() covers its int and float entries.
launch_cmd = (
    "CUDA_VISIBLE_DEVICES=0 python -m torch.distributed.launch "
    "--nproc_per_node 1 --nnodes 2 "
    "--node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT "
    "run_seq2seq.py "
    + " ".join(shlex.quote(str(arg)) for arg in args)
)

# Job config: command-based run across both nodes of the GPU cluster.
hf_t5_large_fp16_multinode_src = ScriptRunConfig(source_directory='.',
                                command=launch_cmd,
                                compute_target=gpu_compute_target,
                                environment=hf_t5_large_fp16_env,
                                distributed_job_config=multinode_distr_config)

In [None]:
# Submit the two-node fp16 job, tagged for filtering.
multinode_run = experiment.submit(
    config=hf_t5_large_fp16_multinode_src,
    tags=dict(model='large', config='fp16', nodes='2'),
)
# Uncomment to block until the run finishes while streaming its logs:
# multinode_run.wait_for_completion(show_output=True)