# Using Wandb with AWS Batch
**Use conda_tensorflow2_p36 kernel!**

In [76]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before each cell executes (edits on disk are picked up without a
# kernel restart).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Install Wandb

In [2]:
!pip install wandb -q

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p36/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
# Root directory of the model code. Later cells derive the sessions/,
# containers/training, dataset/ and definitions/ paths from it.
home = '/home/ec2-user/SageMaker/kolmogorov/model'

In [4]:
cd $home

/home/ec2-user/SageMaker/kolmogorov/model


## Notebook Configuration
TODO (Ryan): Explain these variables

In [5]:
# AWS region used for the Secrets Manager lookup and the ECR logins/URIs.
region = 'us-east-2'
# True: build the entrypoint arguments from a session YAML and attach the job
# to a Wandb sweep. False: run a single training job with fixed arguments.
sweep = False
# Fully qualified id (entity/project/id) of an existing sweep to reuse.
# When left as None and `sweep` is True, a new sweep is created below.
sweep_id = None # e.g. 'rosenblatt/satellite-model-and-orientation/dqcsh4ar'
# True: run the freshly built image locally with docker before pushing it.
test_locally = False
# True: request 8 GPUs for the Batch job; False: request 1.
multi_gpu = True

## Import Packages

In [6]:
import yaml
import json
import wandb
import boto3
import base64
import tensorflow as tf
from botocore.exceptions import ClientError

## Obtain Wandb API Key from AWS Secret Manager
We suggest you put your Wandb API key in an AWS Secret named `wandb_api_key`

In [7]:
def get_secret(secret_name, region_name):
    """Read a secret from AWS Secrets Manager.

    Args:
        secret_name: Identifier passed to Secrets Manager as ``SecretId``.
        region_name: AWS region of the Secrets Manager endpoint.

    Returns:
        The JSON-decoded value when the secret is stored as a string,
        otherwise the base64-decoded bytes of the ``SecretBinary`` payload.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)

    # Responses without a 'SecretString' entry are treated as binary secrets.
    if 'SecretString' not in response:
        return base64.b64decode(response['SecretBinary'])
    return json.loads(response['SecretString'])

In [8]:
assert wandb.login(key=get_secret("wandb_api_key", region)["wandb_api_key"])
!rm -r wandb

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ec2-user/.netrc


## Initialize the Sweep ID

In [9]:
def get_sweep_config(session):
    """Load a sweep configuration from ``{home}/sessions/{session}``.

    Args:
        session: File name of the session YAML inside the sessions directory.

    Returns:
        The parsed YAML document.
    """
    # Context manager so the file handle is closed once parsing is done.
    # NOTE: FullLoader is kept as in the original; only load session files
    # you trust.
    with open(f'{home}/sessions/{session}') as session_file:
        return yaml.load(session_file, Loader=yaml.FullLoader)

In [10]:
def get_sweep_id(entity, project, session):
    """Register a new Wandb sweep from a session file.

    Args:
        entity: Wandb entity that will own the sweep.
        project: Wandb project the sweep is created in.
        session: Session YAML file name, resolved by ``get_sweep_config``.

    Returns:
        The value returned by ``wandb.sweep`` for the new sweep.
    """
    return wandb.sweep(get_sweep_config(session), project=project, entity=entity)

In [11]:
# When running a sweep, list the files under sessions/ so one can be picked
# in the next cell. `sessions` is only defined when `sweep` is True.
if sweep:
    sessions = !ls $home/sessions
    print(f'Pick a session: {sessions}') # TODO: make this a widget

TODO: Explain the difference in the sessions.

In [12]:
if sweep:
    # Session file that defines the sweep; must be one of the listed files.
    session = 'S1P2.yaml' # CHANGE ME
    assert session in sessions, f"Session does not exist. Please use one of the following: {sessions}"

TODO: Explain the relevant variables.

In [13]:
# Create a new sweep only when a sweep is requested and no existing id was
# supplied in the configuration cell.
if sweep and sweep_id is None:
    entity = 'rosenblatt' # REPLACE ME
    project = 'satellite-model-and-orientation' # REPLACE ME
    # Store the id fully qualified as <entity>/<project>/<id>.
    sweep_id = f'{entity}/{project}/{get_sweep_id(entity, project, session)}'

In [14]:
# Build the arguments passed to the container entrypoint, both as a list
# (used as the Batch job command) and as one string (used for the local
# docker run).
if sweep:
    config = get_sweep_config(session)
    params = config['parameters']
    entrypoint_args_lis = [
        "--dataset_size",
        # YAML scalars may be parsed as non-strings; the join below and the
        # Batch command both need strings.
        str(params['dataset_size']['value']),
        "--sample_distortion",
        str(params['sample_distortion']['value']),
        "--sweep_id",
        sweep_id
    ]
else:
    entrypoint_args_lis = [
        "--dataset_size",
        'large',
        "--sample_distortion",
        'generic',
    ]
entrypoint_args_str = ' '.join(entrypoint_args_lis)

## Build Model Image

In [15]:
cd $home/containers/training

/home/ec2-user/SageMaker/kolmogorov/model/containers/training


In [16]:
# Local repository:tag for the training image.
image_name = 'kolmogorov-model:training'
# Wandb API key, passed to containers as the WANDB_API_KEY variable.
api_key = get_secret('wandb_api_key', region)['wandb_api_key']

### Authenticate with ECR
We set the base image using an [Amazon Deep Learning Container](https://github.com/aws/deep-learning-containers/blob/master/available_images.md). To access the container, you must authenticate docker appropriately.

In [17]:
# TODO: can we get this from or move this to DockerHub?
aws_dl_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com'
!aws ecr get-login-password --region $region | docker login --username AWS --password-stdin $aws_dl_uri

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


If this is your first time running the cell below in this session, docker will have to pull the base image from ECR.

In [242]:
# Build the training image from the Dockerfile in the current directory.
# TODO: use docker compose 
!docker build --tag $image_name .

Sending build context to Docker daemon  39.94kB
Step 1/7 : FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/tensorflow-training:2.3.0-gpu-py37-cu102-ubuntu18.04
 ---> d2706d36cacc
Step 2/7 : WORKDIR /opt/training
 ---> Using cache
 ---> cdbb358ab5c7
Step 3/7 : COPY model.py .
 ---> a7c979890f11
Step 4/7 : COPY entrypoint.sh .
 ---> 73a965ac5017
Step 5/7 : RUN pip install --upgrade pip -q
 ---> Running in a9fd62841433
Removing intermediate container a9fd62841433
 ---> f718606536cf
Step 6/7 : RUN pip install awscli pillow wandb boto3 -q
 ---> Running in 5e3e485326fa
Removing intermediate container 5e3e485326fa
 ---> db01c034506c
Step 7/7 : ENTRYPOINT ["./entrypoint.sh"]
 ---> Running in 7969f411d884
Removing intermediate container 7969f411d884
 ---> ed3087852907
Successfully built ed3087852907
Successfully tagged kolmogorov-model:training


## Test Image Locally

In [243]:
# MODEL_CODE_VOLUME needs to be kept up to date witht the model code location
if test_locally:
    WORKDIR = '/opt/training'
    MODEL_CODE_VOLUME = f'-v `pwd`/model.py:{WORKDIR}/model.py'
    ENTRYPOINT_VOLUME = f'-v `pwd`/entrypoint.sh:{WORKDIR}/entrypoint.sh'
    DATASET_VOLUME = f'-v $home/dataset:{WORKDIR}/dataset'

    if len(tf.config.experimental.list_physical_devices('GPU')):
        cmd  = 'nvidia-docker'
        print(f'Attempting to train on GPU, using {cmd} command.')
    else:
        cmd = 'docker'
        print(f'Attempting to train on CPU, using {cmd} command.')
    if sweep:
        !$cmd run --env-file $home/svc_account_env.list \
            -e WANDB_API_KEY=$api_key \
            $MODEL_CODE_VOLUME \
            $DATASET_VOLUME \
            $ENTRYPOINT_VOLUME \
            -it $image_name \
            $entrypoint_args_str
    else:
        !$cmd run --env-file $home/svc_account_env.list \
            -e WANDB_API_KEY=$api_key \
            $MODEL_CODE_VOLUME \
            $DATASET_VOLUME \
            $ENTRYPOINT_VOLUME \
            -it $image_name \
            $entrypoint_args_str

## Push Image to ECR
The repository for the kolmogorov-model image can be found [here](https://us-east-2.console.aws.amazon.com/ecr/repositories/kolmogorov-model/?region=us-east-2). You will not have authority to push to our repository but you are welcome to replace the `ecr_uri` with your own as you modify our image for your use case.

### Authenticate with ECR
In order to push to your own repository, you will have to reauthenticate docker.

In [244]:
# Registry the image is pushed to; point this at your own account's ECR.
ecr_uri = f'751398683966.dkr.ecr.{region}.amazonaws.com' # REPLACE ME
# Re-authenticate docker, this time against the push registry.
!aws ecr get-login-password --region $region | docker login --username AWS --password-stdin $ecr_uri

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [262]:
# Full remote reference: <registry>/<repository>:<tag>.
image_uri = f'{ecr_uri}/{image_name}'
# Rebuild quietly so the pushed image reflects the current sources, then tag
# it with the remote reference and push.
!docker build --quiet --tag $image_name .
!docker tag $image_name $image_uri
!docker push $image_uri

sha256:db19bb14986795b3d21dccf9aea786f16a9becb235471dcff58998147f55970f
The push refers to repository [751398683966.dkr.ecr.us-east-2.amazonaws.com/kolmogorov-model]

[1B3e0777e7: Preparing 
[1Bb41bffcf: Preparing 
[1B140a06ab: Preparing 
[1Bc473aaf4: Preparing 
[1Bbef5debb: Preparing 
[1B9324184c: Preparing 
[1Becb1ba98: Preparing 
[1B79ce13df: Preparing 
[1Be91fd1cb: Preparing 
[1Bccebf161: Preparing 
[1B40cf1731: Preparing 
[1B6b92800c: Preparing 
[1Bf587501b: Preparing 
[1Bdf01dd6e: Preparing 
[1Bb32e31cb: Preparing 
[1B97806199: Preparing 
[1B76a08085: Preparing 
[1B78bc6dda: Preparing 
[1B4bdf7df0: Preparing 
[1B9929bff4: Preparing 
[1B7307b30b: Preparing 
[1B106bae06: Preparing 
[1B7a81b415: Preparing 
[1B56a4b5b9: Preparing 
[1B0eb25594: Preparing 
[1B63138511: Preparing 
[1B1fb7adcd: Preparing 
[1Bf9a74649: Preparing 
[1Bda143c91: Preparing 
[1B287e1f04: Preparing 
[31Be0777e7: Pushed   17.02MB/15.53MB8A[2K[25A[2K[31A[2K[24A[2K[31A[2K[3

## Run the image in AWS Batch
Running the following cells will set up a job using the same infrastructure used in the paper.

### Register the Job Definition

In [263]:
# Create the Batch client in the notebook's configured region, consistent
# with the Secrets Manager and ECR calls above, rather than relying on
# whatever default region the environment provides.
batch = boto3.client('batch', region_name=region)

In [264]:
batch_job_definitons = batch.describe_job_definitions()['jobDefinitions']
if not any([job_def['jobDefinitionName'] == 'training' for job_def in batch_job_definitons]):
    # TODO (Justin): rewrite using boto3 but stil use the json file
    !aws batch register-job-definition --cli-input-json file://$home/definitions/training/job.json
else:
    print('Job definition already exists.')

Job definition already exists.


### Create and Enable the Compute Environment

In [265]:
# TODO (Justin): make this programmatic please. Use boto3 or awscli and the json files in definitions/training.

### Create the Job Queue

In [266]:
# TODO (Justin): make this programmatic please. Use boto3 or awscli and the json files in definitions/training.

### Start the Job
After running the cell below, you can monitor the job on the [AWS Batch Dashboard](https://console.aws.amazon.com/batch/home).

In [267]:
# Number of identical Batch jobs submitted by the next cell.
num_jobs = 1

In [268]:
def _read_env_file(path):
    """Parse a ``NAME=value`` env file into AWS Batch environment entries.

    Blank lines and ``#`` comment lines are skipped. Each remaining line is
    split on the first ``=`` only, so values that contain ``=`` are kept whole.

    Args:
        path: Path of the env file to read.

    Returns:
        A list of ``{'name': ..., 'value': ...}`` dicts, one per variable.
    """
    entries = []
    with open(path, "r") as env_file:
        for raw_line in env_file.read().split('\n'):
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            name, _, value = line.partition('=')
            entries.append({'name': name, 'value': value})
    return entries


# Read the env file once; the result is the same for every submitted job.
# NOTE(review): the Wandb key is passed as a plain environment override;
# confirm this is acceptable for your account.
job_environment = _read_env_file(f'{home}/svc_account_env.list') + [
    {'name': 'WANDB_API_KEY', 'value': api_key},
]

for _ in range(num_jobs):
    response = batch.submit_job(
        jobName = config['name'].replace(' ', '_') if sweep else 'training_job',
        jobQueue = 'training',
        jobDefinition = 'training',
        containerOverrides={
            'command': entrypoint_args_lis,
            'resourceRequirements': [
                {
                    'value': '8' if multi_gpu else '1',
                    'type': 'GPU'
                }
            ],
            'environment': job_environment,
        }
    )
    print(f'Job ID is {response["jobId"]}.')

Job ID is 31abf7c1-4a9c-41e6-9bc5-9ecc470fb3f7.


## Shutting down the Run
As of now, you must end the sweep manually using Wandb's sweep dashboard. Once you have stopped all runs or killed the sweep, run the cell below to disable the compute environment.

In [49]:
# TODO (Justin): make this programmatic please. Use boto3 or awscli and the json files in definitions/training.