In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Efficient PyTorch Training on Vertex AI

<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/community-content/pytorch_efficient_training/efficient_pytorch_training_on_vertex.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/community-content/pytorch_efficient_training/efficient_pytorch_training_on_vertex.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/community-content/pytorch_efficient_training/efficient_pytorch_training_on_vertex.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>
<br/><br/><br/>

## Overview


This tutorial demonstrates how to use Vertex AI for E2E MLOps on Google Cloud in production. This tutorial covers stage 2 : experimentation: get started with Vertex AI Training for PyTorch.

### Objective

In this tutorial, you learn how to use `Vertex AI Training` for training a PyTorch custom model.

This tutorial uses the following Google Cloud ML services:

* `Vertex AI Training`
* `Vertex AI Model` resource


The steps performed include:

- Single node training using a Python package.
- Report accuracy when hyperparameter tuning.
- Save the model artifacts to Cloud Storage using GCSFuse.
- Create a `Vertex AI Model` resource.

### Dataset

The dataset used for this tutorial is the [CIFAR10 dataset](https://pytorch.org/vision/stable/datasets.html#cifar) from [PyTorch Datasets](https://pytorch.org/vision/stable/datasets.html). This version of the dataset is built into torchvision. The trained model predicts which of ten classes an image belongs to: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, or truck.

### Costs 

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and [Cloud Storage
pricing](https://cloud.google.com/storage/pricing), and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Set up your local development environment

**If you are using Colab or Vertex AI Workbench Notebooks**, your environment already meets
all the requirements to run this notebook. You can skip this step.

**Otherwise**, make sure your environment meets this notebook's requirements.
You need the following:

* The Google Cloud SDK
* Git
* Python 3
* virtualenv
* Jupyter notebook running in a virtual environment with Python 3

The Google Cloud guide to [Setting up a Python development
environment](https://cloud.google.com/python/setup) and the [Jupyter
installation guide](https://jupyter.org/install) provide detailed instructions
for meeting these requirements. The following steps provide a condensed set of
instructions:

1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)

1. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)

1. [Install
   virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)
   and create a virtual environment that uses Python 3. Activate the virtual environment.

1. To install Jupyter, run `pip3 install jupyter` on the
command-line in a terminal shell.

1. To launch Jupyter, run `jupyter notebook` on the command-line in a terminal shell.

1. Open this notebook in the Jupyter Notebook Dashboard.

## Installations

Install the following packages to execute this notebook.

In [None]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME") and not os.getenv("VIRTUAL_ENV")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install --upgrade google-cloud-aiplatform $USER_FLAG -q
! pip3 install --upgrade cloudml-hypertune $USER_FLAG -q
! pip3 install --upgrade torchvision $USER_FLAG -q

### Restart the kernel

Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [None]:
import os

# Restart only when the IS_TESTING environment variable is unset or empty;
# otherwise the kernel is left running.
if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## Before you begin

### Set up your Google Cloud project

**The following steps are required, regardless of your notebook environment.**

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Vertex AI, BigQuery, Compute Engine and Cloud Storage APIs](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com,bigquery,compute_component,storage_component).

1. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk).

1. Enter your project ID in the cell below. Then run the cell to make sure the
Cloud SDK uses the right project for all the commands in this notebook.

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands.

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [None]:
# Google Cloud project ID. Leave the placeholder unchanged to have the next
# cell look the ID up with gcloud.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}

In [None]:
# Fall back to gcloud's configured project when PROJECT_ID is empty, None, or
# still the placeholder value.
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    # The first line of the captured command output is used as the project ID.
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Point the gcloud CLI at PROJECT_ID for the shell commands that follow.
! gcloud config set project $PROJECT_ID

#### Region

You can also change the `REGION` variable, which is used for operations
throughout the rest of this notebook.  Below are regions supported for Vertex AI. We recommend that you choose the region closest to you.

- Americas: `us-central1`
- Europe: `europe-west4`
- Asia Pacific: `asia-east1`

You may not use a multi-regional bucket for training with Vertex AI. Not all regions provide support for all Vertex AI services.

Learn more about [Vertex AI regions](https://cloud.google.com/vertex-ai/docs/general/locations).

In [None]:
# Region used for operations in this notebook; edit the placeholder to choose one.
REGION = "[your-region]"  # @param {type: "string"}

# Default to us-central1 when the placeholder was left unchanged.
if REGION == "[your-region]":
    REGION = "us-central1"

#### Timestamp

If you are in a live tutorial session, you might be using a shared test account or project. To avoid name collisions between users on resources created, you create a timestamp for each instance session, and append the timestamp onto the name of resources you create in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp formatted as YYYYMMDDHHMMSS, taken once from datetime.now();
# appended to resource names to avoid collisions between users.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

### Authenticate your Google Cloud account

**If you are using Vertex AI Workbench Notebooks**, your environment is already authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions when prompted to authenticate your account via oAuth.

**Otherwise**, follow these steps:

In the Cloud Console, go to the [Create service account key](https://console.cloud.google.com/apis/credentials/serviceaccountkey) page.

1. **Click Create service account**.

2. In the **Service account name** field, enter a name, and click **Create**.

3. In the **Grant this service account access to project** section, click the Role drop-down list. Type "Vertex AI" into the filter box, and select **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

4. Click Create. A JSON file that contains your key downloads to your local environment.

5. Enter the path to your service account key as the GOOGLE_APPLICATION_CREDENTIALS variable in the cell below and run the cell.

In [None]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.

import os
import sys

# If on Vertex AI Workbench, then don't execute this code
# IS_COLAB stays False unless the google.colab module is detected below.
IS_COLAB = False
if not os.path.exists("/opt/deeplearning/metadata/env_version") and not os.getenv(
    "DL_ANACONDA_HOME"
):
    if "google.colab" in sys.modules:
        IS_COLAB = True
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    # This branch is skipped when the IS_TESTING environment variable is set.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

### Create a Cloud Storage bucket

**The following steps are required, regardless of your notebook environment.**

When you initialize the Vertex SDK for Python, you specify a Cloud Storage staging bucket. The staging bucket is where all the data associated with your dataset and model resources are retained across sessions.

Set the name of your Cloud Storage bucket below. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.

In [None]:
# Cloud Storage bucket URI. Leave the placeholder unchanged to have the next
# cell generate a name from the project ID and timestamp.
BUCKET_URI = "gs://[your-bucket-name]"  # @param {type:"string"}

In [None]:
# When BUCKET_URI is empty, None, or still the placeholder, build a bucket URI
# from PROJECT_ID, the literal "aip-", and the session TIMESTAMP.
if BUCKET_URI == "" or BUCKET_URI is None or BUCKET_URI == "gs://[your-bucket-name]":
    BUCKET_URI = "gs://" + PROJECT_ID + "aip-" + TIMESTAMP

**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket.

In [None]:
# Create the bucket BUCKET_URI in REGION (run only if the bucket does not exist yet).
! gsutil mb -l $REGION $BUCKET_URI

Finally, validate access to your Cloud Storage bucket by examining its contents:

In [None]:
# List the bucket contents to confirm that it is accessible.
! gsutil ls -al $BUCKET_URI

### Set up variables

Next, set up some variables used throughout the tutorial.
### Import libraries and define constants

In [9]:
# NOTE(review): these assignments overwrite the PROJECT_ID and REGION values set
# in the setup cells above with author-specific values; replace all three with
# your own before running.
PROJECT_ID = 'rthallam-demo-project'  # Change to your project ID
REGION = 'us-central1'   # Change to your region (example: us-central1)
BUCKET_NAME = 'cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77'  # Change to your bucket name

In [10]:
# Cloud Storage URI built from BUCKET_NAME; replaces any BUCKET_URI set earlier.
BUCKET_URI = f"gs://{BUCKET_NAME}"

In [11]:
from datetime import datetime

def get_timestamp():
    """Return the current local date/time as a ``YYYYMMDDHHMMSS`` string."""
    now = datetime.now()
    return f"{now:%Y%m%d%H%M%S}"

In [12]:
import google.cloud.aiplatform as aiplatform

### Initialize Vertex AI SDK for Python

Initialize the Vertex AI SDK for Python for your project and corresponding bucket.

In [14]:
EXPERIMENT_NAME = "pytorch-efficient-training-test"

# Set the SDK defaults: project, region, staging bucket and experiment name.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
    experiment=EXPERIMENT_NAME,
)

### Create a TensorBoard instance to be used by the custom training job.

In [35]:
TENSORBOARD_NAME = EXPERIMENT_NAME  # @param {type:"string"}

# Look up TensorBoard instances that already carry this display name.
tb_list = aiplatform.Tensorboard.list(
    project=PROJECT_ID,
    location=REGION,
    filter=f'displayName="{TENSORBOARD_NAME}"',
)

if tb_list:
    # Reuse the first match instead of creating another instance.
    TENSORBOARD_RESOURCE_NAME = tb_list[0].to_dict()['name']
    print("There is an existing TensorBoard resource with TENSORBOARD_RESOURCE_NAME:", TENSORBOARD_RESOURCE_NAME)
else:
    # Nothing found under that name: create a new instance.
    tensorboard = aiplatform.Tensorboard.create(
        project=PROJECT_ID,
        location=REGION,
        display_name=TENSORBOARD_NAME,
    )
    TENSORBOARD_RESOURCE_NAME = tensorboard.gca_resource.name
    print("TensorBoard resource name:", TENSORBOARD_RESOURCE_NAME)

There is an existing TensorBoard resource with TENSORBOARD_RESOURCE_NAME: projects/560224572293/locations/us-central1/tensorboards/3120792231628242944


#### Set hardware accelerators

You can set hardware accelerators for training.

Set the variables `TRAIN_GPU/TRAIN_NGPU` to use a container image supporting a GPU and the number of GPUs allocated to the virtual machine (VM) instance. For example, to use a GPU container image with 4 Nvidia Tesla K80 GPUs allocated to each VM, you would specify:

    (aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_K80, 4)

Otherwise specify `(None, None)` to use a container image to run on a CPU.

Learn more about hardware accelerator support for your region [here](https://cloud.google.com/vertex-ai/docs/general/locations#accelerators).

In [15]:
# Accelerator type and number of accelerators for training: NVIDIA_TESLA_T4 x 4.
TRAIN_GPU, TRAIN_NGPU = (aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_T4, 4)

#### Set pre-built containers

Set the pre-built Docker container image for training.

- Set the variable `TRAIN_VERSION` to the PyTorch version of the container image. For example, `pytorch-gpu.1-11` selects the GPU image for PyTorch 1.11.


For the latest list, see [Pre-built containers for training](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers).

In [16]:
TRAIN_VERSION = "pytorch-gpu.1-11"
# The image host prefix is the part of REGION before the first "-".
TRAIN_IMAGE = f"{REGION.split('-')[0]}-docker.pkg.dev/vertex-ai/training/{TRAIN_VERSION}:latest"

#### Set machine type

Next, set the machine type to use for training.

- Set the variable `TRAIN_COMPUTE` to configure the compute resources for the VMs you will use for training.
 - `machine type`
     - `n1-standard`: 3.75GB of memory per vCPU.
     - `n1-highmem`: 6.5GB of memory per vCPU
     - `n1-highcpu`: 0.9 GB of memory per vCPU
 - `vCPUs`: number of \[2, 4, 8, 16, 32, 64, 96 \]

*Note: The following is not supported for training:*

 - `standard`: 2 vCPUs
 - `highcpu`: 2, 4 and 8 vCPUs

*Note: You may also use n2 and e2 machine types for training and deployment, but they do not support GPUs*.

In [17]:
MACHINE_TYPE = "n1-standard"
VCPU = "8"
# Machine type string is "<family>-<vCPU count>".
TRAIN_COMPUTE = f"{MACHINE_TYPE}-{VCPU}"
print("Train machine type", TRAIN_COMPUTE)

Train machine type n1-standard-8


## Introduction to PyTorch training

The PyTorch package supports both single node and distributed model training.

Once you have trained a PyTorch model, you will want to save it at a Cloud Storage location, so it can subsequently be uploaded to a `Vertex AI Model` resource.
The PyTorch package does not have support to save the model to a Cloud Storage location. Instead, you will do the following steps to save to a Cloud Storage location.

1. Save the in-memory model to the local filesystem (e.g., model.pth).
2. Use gsutil to copy the local copy to the specified Cloud Storage location.

*Note*: You can do hyperparameter tuning with a PyTorch model.

### Examine the training package

#### Package layout

Before you start the training, you will look at how a Python package is assembled for a custom training job. When unarchived, the package contains the following directory/file layout.

- PKG-INFO
- README.md
- setup.cfg
- setup.py
- trainer
  - \_\_init\_\_.py
  - task.py

The files `setup.cfg` and `setup.py` are the instructions for installing the package into the operating environment of the Docker image.

The file `trainer/task.py` is the Python script for executing the custom training job. *Note*, when we referred to it in the worker pool specification, we replace the directory slash with a dot (`trainer.task`) and dropped the file suffix (`.py`).

#### Package Assembly

In the following cells, you will assemble the training package.

In [None]:
# Make folder for Python training script
! tree

In [None]:
# Print the package's setup script.
! cat custom/setup.py

### Create the task script for the Python training package

Next, you create the `task.py` script for driving the training package. Some notable steps include:

- Command-line arguments:
    - `model-dir`: The location to save the trained model. When using Vertex AI custom training, the location will be specified in the environment variable: `AIP_MODEL_DIR`,
    - `batch_size`/`lr` : Hyperparameter tuning variables
    - `distribute`: single node or distributed training.
- Data preprocessing (`get_data()`):
    - Download the dataset and split into training and test.
- Model architecture (`getmodel()`):
    - Get or build the model architecture.
- Training (`train_model()`):
    - Trains the model
- Evaluation (`evaluate_model()`):
    - Evaluates the model.
    - If hyperparameter tuning, reports the metric for accuracy.
- Model artifact saving
    - Saves the model artifacts and evaluation metrics to the Cloud Storage location specified by `model-dir`.

In [None]:
! cat trainer/main.py

### Test training package locally

Next, test your completed training package locally with just a few epochs.

In [None]:
# Run trainer.main locally: export the train/eval shard patterns, add custom/ to
# PYTHONPATH so the `trainer` package is importable, then start the module with
# the ddp strategy and the --webdataset flag.
# NOTE(review): the shard paths are absolute paths under /home/jupyter; adjust
# them to where your data lives.
! export WDS_TRAIN_PATH="/home/jupyter/data/imagenet/imagenet-shards/train-{000020..000039}.tar" && \
  export WDS_EVAL_PATH="/home/jupyter/data/imagenet/imagenet-shards/validation-{000000..000021}.tar" && \
  export PYTHONPATH="${PYTHONPATH}:${PWD}/custom/" && \
  python -m trainer.main \
    --train_data_path ${WDS_TRAIN_PATH} \
    --val_data_path ${WDS_EVAL_PATH} \
    --distributed-strategy=ddp \
    --webdataset

#### Store training script on your Cloud Storage bucket

Next, you package the training folder into a compressed tar ball, and then store it in your Cloud Storage bucket.

In [18]:
# Cloud Storage destination for the packaged training code.
python_package_gcs_uri = f"{BUCKET_URI}/experiments/pytorch_eff_training/scripts/trainer_imagenet.tar.gz"    
# Dotted module path that is run as the training entry point.
python_module_name = "trainer.main"

In [19]:
# Remove any stale archive, re-create it from the custom/ directory, and copy
# it to python_package_gcs_uri.
! rm -f custom.tar.gz
! tar -czf custom.tar.gz custom
! gsutil cp custom.tar.gz $python_package_gcs_uri

Copying file://custom.tar.gz [Content-Type=application/x-tar]...
/ [1 files][ 19.6 KiB/ 19.6 KiB]                                                
Operation completed over 1 objects/19.6 KiB.                                     


### Create and run custom training job


To train a custom model, you perform two steps: 1) create a custom training job, and 2) run the job.

#### Create custom training job

A custom training job is created with the `CustomPythonPackageTrainingJob` class, with the following parameters:

- `display_name`: The human readable name for the custom training job.
- `container_uri`: The training container image.

- `python_package_gcs_uri`: The location of the Python training package as a tarball.
- `python_module_name`: The relative path to the training script in the Python package.
- `model_serving_container_uri`: The container image for deploying the model.

*Note:* There is no requirements parameter. You specify any requirements in the `setup.py` script in your Python package.

In [None]:
%%writefile custom/copy_to_local.sh
#!/bin/bash -xv
# Copy a Cloud Storage location (first argument) recursively into ~/data.
# Usage: copy_to_local.sh <gcs-path>
DEST="${HOME}/data"
echo "Copying data from $1 to ${DEST}"
# Create the directory that is actually used as the copy destination
# (previously ./data was created while the copy targeted ~/data/).
mkdir -p "${DEST}"
# Quote the arguments so paths containing spaces or glob characters are passed intact.
gcloud storage cp --recursive "$1" "${DEST}/"

In [38]:
# Timestamp-suffixed display name for this training job.
DISPLAY_NAME = f"pytorch_imagenet_eff_{get_timestamp()}"

# Define the job from the trainer package uploaded to Cloud Storage.
job = aiplatform.CustomPythonPackageTrainingJob(
    container_uri=TRAIN_IMAGE,
    display_name=DISPLAY_NAME,
    python_module_name=python_module_name,
    python_package_gcs_uri=python_package_gcs_uri,
    staging_bucket=BUCKET_URI,
)

### Prepare your command-line arguments

Now define the command-line arguments for your custom training container:

- `args`: The command-line arguments to pass to the executable that is set as the entry point into the container.
  - `--model-dir` : For our demonstrations, we use this command-line argument to specify where to store the model artifacts.
      - direct: You pass the Cloud Storage location as a command line argument to your training script (set variable `DIRECT = True`), or
      - indirect: The service passes the Cloud Storage location as the environment variable `AIP_MODEL_DIR` to your training script (set variable `DIRECT = False`). In this case, you tell the service the model artifact location in the job specification.
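  - `--train_data_path` / `--val_data_path`: The locations of the training and validation data shards.
  - `--distributed-strategy`: The distributed training strategy (`ddp` in this tutorial).
  - `--gpus`: The number of GPUs to use.
  - `--batch_size`: The training batch size.
  - `--epoch`: The number of training epochs.
  - `--webdataset`: Flag that is passed when the `WEBDATASET` option is enabled.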

In [49]:
# Experiment switches. They are defined first because exp_params, CMDARGS and
# EXPERIMENT_RUN_NAME below all read them; defining them later raises
# NameError on a fresh kernel.
DIST_STRATEGY = 'ddp'
WEBDATASET = True
REDUCTION_SERVER = False

# Training configuration.
REPLICA_COUNT = 1
BATCH_SIZE = 32
EPOCHS = 2
WDS_TRAIN_PATH = '/gcs/rt-image-datasets/data/images/imagenet/imagenet_tar/train-{000020..000039}.tar'
WDS_EVAL_PATH = '/gcs/rt-image-datasets/data/images/imagenet/imagenet_tar/validation-{000000..000021}.tar'
BASE_OUTPUT_DIR = f'{BUCKET_URI}/experiments/pytorch_eff_training/runs/{DISPLAY_NAME}/'

# Parameters describing this experiment run.
exp_params = {
    'Replica Count': REPLICA_COUNT,
    'Number of GPU': TRAIN_NGPU,
    'Batch Size': BATCH_SIZE,
    'Epochs': EPOCHS,
    'Strategy': DIST_STRATEGY,
    'WebDataset': 'Y' if WEBDATASET else 'N',
    'Reduction Server': 'Y' if REDUCTION_SERVER else 'N'
}

# Command-line arguments passed to the training module.
CMDARGS = [
    f'--train_data_path={WDS_TRAIN_PATH}',
    f'--val_data_path={WDS_EVAL_PATH}',
    f'--distributed-strategy={exp_params["Strategy"]}',
    f'--gpus={TRAIN_NGPU}',
    f'--batch_size={BATCH_SIZE}',
    f'--epoch={EPOCHS}',
]
# Append the flag only when enabled, so no empty-string argument is ever passed.
if WEBDATASET:
    CMDARGS.append('--webdataset')

# Run name: strategy, plus "-wds" for WebDataset and "-gcsfuse" when the data
# path starts with /gcs.
EXPERIMENT_RUN_NAME = f"{DIST_STRATEGY}{'-wds' if WEBDATASET else ''}{'-gcsfuse' if WDS_TRAIN_PATH.startswith('/gcs') else ''}"

print(f"CMD_ARGS \n{chr(10).join(CMDARGS)}")
print(f"BASE_OUTPUT_DIR = {BASE_OUTPUT_DIR}")
print(f"EXPERIMENT_RUN_NAME = {EXPERIMENT_RUN_NAME}")

CMD_ARGS 
--train_data_path=/gcs/rt-image-datasets/data/images/imagenet/imagenet_tar/train-{000020..000039}.tar
--val_data_path=/gcs/rt-image-datasets/data/images/imagenet/imagenet_tar/validation-{000000..000021}.tar
--distributed-strategy=ddp
--gpus=4
--webdataset
BASE_OUTPUT_DIR = gs://cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77/experiments/pytorch_eff_training/runs/pytorch_imagenet_eff_20221210232854/
EXPERIMENT_RUN_NAME = ddp-wds-gcsfuse


#### Run the custom training job

Next, you run the custom job to start the training job by invoking the method `run`, with the following parameters:

- `model_display_name`: The human readable name for the `Model` resource.
- `args`: The command-line arguments to pass to the training script.
- `replica_count`: The number of compute instances for training (replica_count = 1 is single node training).
- `machine_type`: The machine type for the compute instances.
- `accelerator_type`: The hardware accelerator type.
- `accelerator_count`: The number of accelerators to attach to a worker replica.
- `base_output_dir`: The Cloud Storage location to write the model artifacts to.
- `sync`: Whether to block until completion of the job.

In [None]:
# Start an experiment run and record its parameters. The SDK is imported in
# this notebook as `aiplatform` (there is no `aip` alias) and the parameter
# dict is named `exp_params` (there is no `hyperparams`).
aiplatform.start_run(EXPERIMENT_RUN_NAME)

aiplatform.log_params(exp_params)


In [20]:
# Submit the training job. With sync=False the call does not block until the
# job completes.
model = job.run(
    args=CMDARGS,
    # Use the REPLICA_COUNT constant so the job matches the value recorded in exp_params.
    replica_count=REPLICA_COUNT,
    machine_type=TRAIN_COMPUTE,
    boot_disk_type='pd-ssd',
    boot_disk_size_gb=100,
    accelerator_type=TRAIN_GPU.name,
    accelerator_count=TRAIN_NGPU,
    base_output_dir=BASE_OUTPUT_DIR,
    tensorboard=TENSORBOARD_RESOURCE_NAME,
    sync=False,
)

Training Output directory:
gs://cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77/experiments/pytorch_eff_training/runs/pytorch_imagenet_eff_20221210212159/ 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/6741435780821942272?project=560224572293
CustomPythonPackageTrainingJob projects/560224572293/locations/us-central1/trainingPipelines/6741435780821942272 current state:
PipelineState.PIPELINE_STATE_RUNNING
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/5878644608459603968?project=560224572293


In [21]:
# Display the current state of the training job.
job.state

<PipelineState.PIPELINE_STATE_RUNNING: 3>

CustomPythonPackageTrainingJob projects/560224572293/locations/us-central1/trainingPipelines/6741435780821942272 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPythonPackageTrainingJob projects/560224572293/locations/us-central1/trainingPipelines/6741435780821942272 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPythonPackageTrainingJob projects/560224572293/locations/us-central1/trainingPipelines/6741435780821942272 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPythonPackageTrainingJob projects/560224572293/locations/us-central1/trainingPipelines/6741435780821942272 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPythonPackageTrainingJob projects/560224572293/locations/us-central1/trainingPipelines/6741435780821942272 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPythonPackageTrainingJob projects/560224572293/locations/us-central1/trainingPipelines/6741435780821942272 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomPyth

In [1]:
! gsutil cat gs://cloud-ai-platform-2f444b6a-a742-444b-b91a-c7519f51bd77/experiments/pytorch_eff_training/runs/pytorch_imagenet_eff_20221210212159/checkpoints/metrics/summary_metrics.json | jq

[1;39m{
  [0m[34;1m"num_gpus"[0m[1;39m: [0m[0;39m4[0m[1;39m,
  [0m[34;1m"num_epochs"[0m[1;39m: [0m[0;39m1[0m[1;39m,
  [0m[34;1m"train_time"[0m[1;39m: [0m[0;39m114.701[0m[1;39m,
  [0m[34;1m"eval_time"[0m[1;39m: [0m[0;39m143.709[0m[1;39m,
  [0m[34;1m"data_load_time"[0m[1;39m: [0m[0;39m64.185[0m[1;39m,
  [0m[34;1m"data_througput"[0m[1;39m: [0m[0;39m3179[0m[1;39m,
  [0m[34;1m"forward_time"[0m[1;39m: [0m[0;39m36.112[0m[1;39m,
  [0m[34;1m"backward_time"[0m[1;39m: [0m[0;39m61.167[0m[1;39m
[1;39m}[0m


### List a custom training job

In [None]:
# List training jobs whose display name matches DISPLAY_NAME and print the result.
_job = job.list(filter=f"display_name={DISPLAY_NAME}")
print(_job)

### Wait for completion of custom training job

Next, wait for the custom training job to complete. Alternatively, one can set the parameter `sync` to `True` in the `run()` method to block until the custom training job is completed.

In [None]:
# Block until the training job finishes.
# NOTE(review): the job was created without a model serving container, so
# job.run() is not expected to return a Model object to wait on; wait on the
# training job itself instead.
job.wait()

### Delete a custom training job

After a training job is completed, you can delete the training job with the method `delete()`.  Prior to completion, a training job can be cancelled with the method `cancel()`.

In [None]:
# Delete the training job resource.
job.delete()

# Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

- Model
- Cloud Storage Bucket

In [None]:
# Delete the model using the Vertex model object
model.delete()

delete_bucket = False
if delete_bucket or os.getenv("IS_TESTING"):
    ! gsutil rm -r $BUCKET_URI

---

In [6]:
import glob
import json
metrics_dir = 'tmp/checkpoints/metrics'

def display_metrics(metrics_dir):
    """Print the metrics files found in ``metrics_dir`` and a combined summary.

    Every ``metrics_*.json`` file in ``metrics_dir`` is loaded; the raw records
    are printed as one JSON list, followed by a summary in which the time
    metrics are averaged over the files and ``data_throughput`` is summed.

    Args:
        metrics_dir: Directory holding the ``metrics_*.json`` files. Each file
            must contain a dict with ``gpu``, ``total`` and ``epoch`` keys.

    Returns:
        None. When no metrics file is found, a message is printed and the
        function returns without computing a summary.
    """
    # collect all metrics (sorted for a stable order; files are closed promptly)
    records = []
    for path in sorted(glob.glob(f'{metrics_dir}/metrics_*.json')):
        with open(path) as fh:
            records.append(json.load(fh))

    # Nothing to summarize: avoid IndexError / ZeroDivisionError further down.
    if not records:
        print(f'No metrics files found in {metrics_dir}')
        return

    print(json.dumps(records))
    totals = [rec['total'] for rec in records]
    # The epoch count is read from the gpu 0 record; fall back to the first
    # record if gpu 0 did not write a file.
    reference = next((rec for rec in records if rec['gpu'] == 0), records[0])

    def mean_of(key):
        """Average of ``key`` over all per-file totals, rounded to 3 places."""
        values = [total[key] for total in totals]
        return round(sum(values) / len(values), 3)

    # calculate summary
    summary = {
        'num_gpus': len(records),
        'num_epochs': len(reference['epoch']),
        'train_time': mean_of('train_time'),
        'eval_time': mean_of('eval_time'),
        'data_load_time': mean_of('data_load_time'),
        # Throughput is summed, not averaged. The output key spelling is kept as-is.
        'data_througput': round(sum(total['data_throughput'] for total in totals), 3),
        'forward_time': mean_of('forward_time'),
        'backward_time': mean_of('backward_time'),
    }

    # display summary
    metrics_fmt = '\n=> '.join([f'{k} = {v}' for k, v in summary.items()])
    print('-'*80)
    print(f'=> {metrics_fmt}')
    print('-'*80)

display_metrics(metrics_dir)

[{"gpu": 2, "total": {"train_time": 144.823, "eval_time": 116.842, "data_load_time": 4.329, "data_throughput": 3344.0, "forward_time": 77.065, "backward_time": 62.917}, "epoch": [{"epoch": 1, "dataset_size": 390, "data_time": 5.922, "data_throughput": 2113.0, "forward_time": 81.489, "backward_time": 64.126}, {"epoch": 2, "dataset_size": 390, "data_time": 2.735, "data_throughput": 4575.0, "forward_time": 72.641, "backward_time": 61.708}]}, {"gpu": 0, "total": {"train_time": 150.025, "eval_time": 113.454, "data_load_time": 2.572, "data_throughput": 4984.5, "forward_time": 80.29, "backward_time": 65.403}, "epoch": [{"epoch": 1, "dataset_size": 390, "data_time": 2.974, "data_throughput": 4207.0, "forward_time": 86.019, "backward_time": 66.013}, {"epoch": 2, "dataset_size": 390, "data_time": 2.171, "data_throughput": 5762.0, "forward_time": 74.56, "backward_time": 64.793}]}, {"gpu": 1, "total": {"train_time": 147.732, "eval_time": 126.185, "data_load_time": 5.088, "data_throughput": 2568.0,