In [None]:
# Report which azureml-core SDK version this kernel is running.
import azureml.core

print(f"SDK version: {azureml.core.VERSION}")

In [None]:
from azureml.core.workspace import Workspace

# Build the workspace handle via Workspace.from_config(), then print a
# one-line-per-field summary of it.
ws = Workspace.from_config()
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name under which the cluster is looked up, and created if the lookup fails.
cluster_name = "gpu-cluster"

try:
    # Attach to the compute target with this name; a failed lookup raises
    # ComputeTargetException and falls through to provisioning.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')

    # Provisioning settings for the new cluster.
    provisioning_settings = dict(
        vm_size='Standard_NC6',
        vm_priority="dedicated",
        min_nodes=0,
        max_nodes=4,
        idle_seconds_before_scaledown=300,
    )
    compute_config = AmlCompute.provisioning_configuration(**provisioning_settings)

    # Kick off creation of the cluster in the workspace.
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing compute target')

# Block until provisioning settles (20 minute timeout). A minimum node count
# could be requested here; with None the cluster's own scale settings apply.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=20,
)

# Print the detailed, serialized status of the cluster.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

In [None]:
import os
import shutil
import glob

# Stage everything the remote run needs under ./train:
#   * every *.py file from the notebook's working directory
#   * every ./data/*.txt file, placed in ./train/data
project_folder = './train'
data_folder = project_folder + '/data'

staging_plan = (
    ('*.py', project_folder),
    ('./data/*.txt', data_folder),
)
for pattern, destination in staging_plan:
    # Create the destination first; exist_ok keeps re-runs of the cell harmless.
    os.makedirs(destination, exist_ok=True)
    for source_path in glob.glob(pattern):
        shutil.copy(source_path, destination)

In [None]:
from azureml.core import Experiment

# Experiment handle in the workspace; this notebook submits its runs through it.
experiment_name = 'tv-script-generation'
experiment = Experiment(workspace=ws, name=experiment_name)

In [None]:
%%writefile conda_dependencies.yml

# Conda specification written to conda_dependencies.yml by this cell.
channels:
- conda-forge
dependencies:
# Interpreter version is pinned exactly.
- python=3.6.2
# Everything below is installed with pip rather than conda.
- pip:
  - azureml-defaults
  # torch, torchvision and future are version-pinned; pillow is left unpinned.
  - torch==1.6.0
  - torchvision==0.7.0
  - future==0.17.1
  - pillow

In [None]:
from azureml.core import Environment

# Build the run environment from the conda specification file written above.
pytorch_env = Environment.from_conda_specification(
    name='pytorch-1.6-gpu',
    file_path='./conda_dependencies.yml',
)

# Use a CUDA-enabled base image for the container.
# `pytorch_env.docker.enabled = True` is left commented out; the run
# configuration below passes DockerConfiguration(use_docker=True) instead.
#pytorch_env.docker.enabled = True
pytorch_env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04'

In [None]:
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import DockerConfiguration

# Command-line flags and their values for train.py.
script_options = [
    ('--num_epochs', 10),
    ('--batch_size', 256),
    ('--learning_rate', 0.001),
    ('--sequence_length', 10),
    ('--embedding_dim', 300),
    ('--hidden_dim', 400),
    ('--num_layers', 2),
    ('--output_dir', './outputs'),
]

# Flatten to the alternating [flag, value, flag, value, ...] list.
args = [token for option in script_options for token in option]

# Request Docker for the run, then bundle the staged source folder, script
# arguments, compute target and environment into one run configuration.
docker_config = DockerConfiguration(use_docker=True)
src = ScriptRunConfig(
    source_directory=project_folder,
    script='train.py',
    arguments=args,
    compute_target=compute_target,
    environment=pytorch_env,
    docker_runtime_config=docker_config,
)

In [None]:
# Submit the single training run and print the returned run object.
run = experiment.submit(config=src)
print(run)

In [None]:
from azureml.widgets import RunDetails

# Show the RunDetails widget for the submitted run.
run_details = RunDetails(run)
run_details.show()

In [None]:
# Optional: uncomment to block this notebook until the run completes,
# with show_output=True.
#run.wait_for_completion(show_output=True)

In [None]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, uniform, quniform

# Hyperparameter search space, sampled at random.
# NOTE(review): quniform values may reach the script as floats (e.g. 250.0);
# confirm train.py's parsing of embedding_dim / hidden_dim accepts that.
search_space = {
    'learning_rate': uniform(0.0001, 0.001),
    'sequence_length': choice(range(5, 51, 5)),
    'embedding_dim': quniform(200, 1000, 50),
    'hidden_dim': quniform(200, 1000, 50),
    'num_layers': choice(range(1, 4)),
}
param_sampling = RandomParameterSampling(search_space)

# Bandit early-termination policy with a 15% slack factor, evaluated at every
# interval once the first 10 intervals have passed.
# NOTE(review): with num_epochs=10 in the base run, a delay of 10 intervals may
# leave nothing to terminate if train.py logs train_loss once per epoch — confirm.
early_termination_policy = BanditPolicy(
    slack_factor=0.15,
    evaluation_interval=1,
    delay_evaluation=10,
)

# Sweep definition: reuse the single-run configuration, minimise train_loss,
# and cap the sweep at 30 runs in total with at most 4 running at once.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='train_loss',
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=30,
    max_concurrent_runs=4,
)

In [None]:
# Launch the hyperparameter sweep under the same experiment.
hyperdrive_run = experiment.submit(config=hyperdrive_config)

In [None]:
# Show the RunDetails widget for the HyperDrive run.
hyperdrive_details = RunDetails(hyperdrive_run)
hyperdrive_details.show()