In [1]:
# Sleeps are used because these notebooks are tested automatically as part of the CI/CD pipeline.
# In normal interactive use, these sleeps are not necessary.
from time import sleep

import os
import shutil
from uuid import uuid4
from time import sleep

In [2]:
def run_tensorboard_hparam_tuning(tensorboard_events_dir, epochs=2):
    """Grid-search a small Fashion-MNIST classifier, logging each trial to TensorBoard.

    Adapted from:
    https://www.tensorflow.org/tensorboard/hyperparameter_tuning_with_hparams

    One trial is run for every combination of hidden-unit count, dropout rate
    (only the minimum and maximum of its interval), optimizer and batch size.
    Each trial records its hyperparameters and its test accuracy in the
    sub-directory ``wf_id_<workflow id>_<trial index>`` of
    ``tensorboard_events_dir``.

    Parameters
    ----------
    tensorboard_events_dir : str
        Directory that receives the TensorBoard event files.
    epochs : int, optional
        Number of epochs every trial trains for (default 2).

    Returns
    -------
    str
        The UUID generated for this call; it is stored as ``workflow_id`` in
        the hyperparameters of every trial.
    """
    wf_id = str(uuid4())
    import tensorflow as tf
    from tensorboard.plugins.hparams import api as hp

    accuracy_tag = "accuracy"

    # Load Fashion-MNIST and divide the pixel values by 255.
    (train_images, train_labels), (test_images, test_labels) = (
        tf.keras.datasets.fashion_mnist.load_data()
    )
    train_images = train_images / 255.0
    test_images = test_images / 255.0

    # Hyperparameters explored by the grid search.
    units_hp = hp.HParam("num_units", hp.Discrete([16, 32]))
    dropout_hp = hp.HParam("dropout", hp.RealInterval(0.1, 0.2))
    optimizer_hp = hp.HParam("optimizer", hp.Discrete(["adam", "sgd"]))
    batch_size_hp = hp.HParam("batch_size", hp.Discrete([32, 64]))

    # Declared without a domain; no trial below assigns them a value.
    model_config_hp = hp.HParam("model_config")
    optimizer_config_hp = hp.HParam("optimizer_config")

    declared_hparams = [
        units_hp,
        dropout_hp,
        optimizer_hp,
        batch_size_hp,
        model_config_hp,
        optimizer_config_hp,
    ]

    # Describe the experiment (hyperparameters and metric) in the top-level
    # events directory.
    with tf.summary.create_file_writer(tensorboard_events_dir).as_default():
        hp.hparams_config(
            hparams=declared_hparams,
            metrics=[hp.Metric(accuracy_tag, display_name="Accuracy")],
        )

    def fit_and_score(trial_hparams, callback_log_dir):
        """Train one model configured by ``trial_hparams``; return its test accuracy."""
        layers = [
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(
                trial_hparams[units_hp], activation=tf.nn.relu
            ),
            tf.keras.layers.Dropout(trial_hparams[dropout_hp]),
            tf.keras.layers.Dense(10, activation=tf.nn.softmax),
        ]
        classifier = tf.keras.models.Sequential(layers)
        classifier.compile(
            optimizer=trial_hparams[optimizer_hp],
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )

        keras_callbacks = [
            # log metrics
            tf.keras.callbacks.TensorBoard(callback_log_dir),
            # log hparams
            hp.KerasCallback(callback_log_dir, trial_hparams),
        ]
        # The epoch count comes from the enclosing function's `epochs` argument.
        classifier.fit(
            train_images,
            train_labels,
            epochs=epochs,
            callbacks=keras_callbacks,
            batch_size=trial_hparams[batch_size_hp],
        )
        _loss, test_accuracy = classifier.evaluate(test_images, test_labels)
        return test_accuracy

    # Flatten the search space into an ordered list of trials. Only the two
    # end points of the dropout interval are tried.
    search_grid = [
        (num_units, dropout_rate, optimizer, batch_size)
        for num_units in units_hp.domain.values
        for dropout_rate in (
            dropout_hp.domain.min_value,
            dropout_hp.domain.max_value,
        )
        for optimizer in optimizer_hp.domain.values
        for batch_size in batch_size_hp.domain.values
    ]

    for session_num, trial in enumerate(search_grid):
        num_units, dropout_rate, optimizer, batch_size = trial
        # The workflow and activity ids are optional; they only add context
        # to the recorded run.
        hparams = {
            "workflow_id": wf_id,
            "activity_id": "hyperparam_evaluation",
            "epochs": epochs,
            units_hp: num_units,
            dropout_hp: dropout_rate,
            optimizer_hp: optimizer,
            batch_size_hp: batch_size,
        }
        run_name = f"wf_id_{wf_id}_{session_num}"
        print("--- Starting trial: %s" % run_name)
        print(f"{hparams}")

        run_dir = f"{tensorboard_events_dir}/" + run_name
        with tf.summary.create_file_writer(run_dir).as_default():
            hp.hparams(hparams)  # record the values used in this trial
            # The Keras callbacks log into the top-level events directory,
            # not into the per-trial directory.
            accuracy = fit_and_score(hparams, tensorboard_events_dir)
            tf.summary.scalar(accuracy_tag, accuracy, step=1)

    return wf_id
    

In [3]:
# Set log levels through environment variables: "error" for the stream level
# and "debug" for the file level.
# NOTE(review): presumably read by flowcept's logger when it is imported in a
# later cell -- confirm.
os.environ.update(LOG_STREAM_LEVEL="error", LOG_FILE_LEVEL="debug")

## Set up TensorBoard events directory

In [4]:
# Optional: delete TensorBoard directories left over from earlier runs.
# Only real directories are removed: shutil.rmtree() raises on regular files
# and on symbolic links, and either could also be named "tb_*" and would
# otherwise abort this cell.
for entry in os.listdir("."):
    is_real_dir = os.path.isdir(entry) and not os.path.islink(entry)
    if entry.startswith("tb_") and is_real_dir:
        shutil.rmtree(entry)

In [5]:
# Give this run its own uniquely named events directory ("tb_" + UUID).
tensorboard_events_dir = "tb_" + str(uuid4())
print(f"Creating new directory: {tensorboard_events_dir} ...")
os.mkdir(tensorboard_events_dir)
# Pause kept for the automated notebook tests; not needed interactively.
sleep(5)
print("Created.")

Creating new directory: tb_fc667c6d-d92a-48ec-afa4-2faf8c19b695 ...
Created.


## Get training parameters from previous Dask workflow run

In [6]:
from flowcept import TaskQueryAPI
from flowcept.commons.utils import get_utc_minutes_ago
# Query client reused by the cells below to look up stored task records.
query_api = TaskQueryAPI()

In [7]:
# Select the tasks of campaign "super_campaign" whose timestamp is at or after
# get_utc_minutes_ago(60) (presumably "within the last hour" -- confirm) and
# that generated an `epochs` value >= 0.
_filter = {
    "utc_timestamp": {"$gte": get_utc_minutes_ago(60)},
    "campaign_id": "super_campaign",
    "generated.epochs": {"$gte": 0},
}
docs = query_api.query(filter=_filter)

# Collect the distinct epoch counts; the training cell iterates over them.
epochs_params = set()
for doc in docs:
    generated_epochs = doc["generated"]["epochs"]
    print(f"task={doc['task_id']}, generated epochs={generated_epochs}")
    epochs_params.add(generated_epochs)
epochs_params

task=calculate_batch_and_epochs-bf4ebf3fbbc76e7517ea3b4eab46c56e, generated epochs=2
task=calculate_batch_and_epochs-5af210f426b0a8c42a9302411639c68e, generated epochs=2
task=calculate_batch_and_epochs-2153e7442c9cb4ec0537bd7f6504f448, generated epochs=2
task=calculate_batch_and_epochs-4acef3484d0b37571f7656a9d5efd819, generated epochs=2
task=calculate_batch_and_epochs-cea197d7e360873a91a6bf0e015023a1, generated epochs=2
task=calculate_batch_and_epochs-fba26a979e3a9ad1d59824712b3b3e32, generated epochs=2
task=calculate_batch_and_epochs-6b979ef78f9983cedd596e25a1ac8746, generated epochs=2


{2}

## Initialize TensorBoard's interceptor

In [8]:
from flowcept import TensorboardInterceptor
# Create the interceptor and point its `file_path` setting at the events
# directory created earlier in this notebook.
# NOTE(review): presumably this is the directory the interceptor watches for
# new event files -- confirm against the flowcept documentation.
interceptor = TensorboardInterceptor()
interceptor.settings.file_path = tensorboard_events_dir

## Initialize consumer API

In [9]:
from flowcept import FlowceptConsumerAPI
# Wrap the interceptor in a consumer and start it before training begins.
# `start()` is the cell's last expression, so its return value is displayed.
consumer = FlowceptConsumerAPI(interceptor)
consumer.start()

<flowcept.flowcept_api.consumer_api.FlowceptConsumerAPI at 0x2828024f0>

## Start training

In [10]:
# Train once per distinct epoch count obtained from the previous workflow.
# The epoch count must be passed explicitly; otherwise every run silently
# falls back to the function's default of 2 epochs.
# NOTE: `workflow_id` is overwritten on each iteration, so after the loop it
# holds only the id of the last training run.
for epochs in epochs_params:
    workflow_id = run_tensorboard_hparam_tuning(
        tensorboard_events_dir, epochs=epochs
    )
    print(f"{epochs}, {workflow_id}")


--- Starting trial: wf_id_46010403-1191-4136-b072-ff84f6683856_0
{'workflow_id': '46010403-1191-4136-b072-ff84f6683856', 'activity_id': 'hyperparam_evaluation', 'epochs': 2, HParam(name='num_units', domain=Discrete([16, 32]), display_name=None, description=None): 16, HParam(name='dropout', domain=RealInterval(0.1, 0.2), display_name=None, description=None): 0.1, HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam', HParam(name='batch_size', domain=Discrete([32, 64]), display_name=None, description=None): 32}
Epoch 1/2


2023-06-27 12:29:24.897901: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/2
--- Starting trial: wf_id_46010403-1191-4136-b072-ff84f6683856_1
{'workflow_id': '46010403-1191-4136-b072-ff84f6683856', 'activity_id': 'hyperparam_evaluation', 'epochs': 2, HParam(name='num_units', domain=Discrete([16, 32]), display_name=None, description=None): 16, HParam(name='dropout', domain=RealInterval(0.1, 0.2), display_name=None, description=None): 0.1, HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam', HParam(name='batch_size', domain=Discrete([32, 64]), display_name=None, description=None): 64}
Epoch 1/2
Epoch 2/2
--- Starting trial: wf_id_46010403-1191-4136-b072-ff84f6683856_2
{'workflow_id': '46010403-1191-4136-b072-ff84f6683856', 'activity_id': 'hyperparam_evaluation', 'epochs': 2, HParam(name='num_units', domain=Discrete([16, 32]), display_name=None, description=None): 16, HParam(name='dropout', domain=RealInterval(0.1, 0.2), display_name=None, description=None): 0.1, HParam(name='optimizer', domain=Discrete

In [None]:
# Pause before stopping the consumer. Sleeps exist for the automated notebook
# tests (see the note in the first cell).
# NOTE(review): presumably this gives the consumer time to process the last
# events before it is stopped -- confirm.
sleep(10)
consumer.stop()

## Get the training metadata stored from this workflow

In [11]:
# Fetch every task record stored for the workflow id returned by the last
# training run and display the result.
_filter = {"workflow_id": workflow_id}
docs = query_api.query(filter=_filter)
docs

[{'task_id': 'events.out.tfevents.1687883364.MAC132633.54853.1.v2',
  'workflow_id': '46010403-1191-4136-b072-ff84f6683856',
  'activity_id': 'hyperparam_evaluation',
  'used': {'batch_size': 32.0,
   'dropout': 0.1,
   'epochs': 2.0,
   'num_units': 16.0,
   'optimizer': 'adam'},
  'generated': {'accuracy': 0.8389999866485596},
  'utc_timestamp': 1687897769.883982,
  'custom_metadata': {'event_file': 'wf_id_46010403-1191-4136-b072-ff84f6683856_0',
   'log_path': 'tb_fc667c6d-d92a-48ec-afa4-2faf8c19b695/wf_id_46010403-1191-4136-b072-ff84f6683856_0'},
  'plugin_id': 'tensorboard',
  'user': 'root',
  'campaign_id': 'super_campaign',
  'sys_name': 'Darwin',
  'node_name': 'MAC132633',
  'login_name': 'login_name',
  'hostname': 'mac132633.ornl.gov',
  'extra_metadata': {'place_holder': ''},
  'debug': True,
  'finished': True},
 {'task_id': 'events.out.tfevents.1687883367.MAC132633.54853.4.v2',
  'workflow_id': '46010403-1191-4136-b072-ff84f6683856',
  'activity_id': 'hyperparam_evaluati