In [1]:
# The sleep() calls exist because these notebooks are executed automatically as part of the CI/CD pipeline.
# During normal interactive use they are not necessary.
from time import sleep

import os
import shutil
from uuid import uuid4
from time import sleep

In [2]:
def run_tensorboard_hparam_tuning(tensorboard_events_dir):
    """Run a small Fashion-MNIST hyperparameter sweep that logs to TensorBoard.

    Code based on:
    https://www.tensorflow.org/tensorboard/hyperparameter_tuning_with_hparams

    Every combination of the discrete ``num_units``, ``optimizer`` and
    ``batch_size`` values with the two end points of the ``dropout`` interval
    (16 trials in total) is trained for one epoch and scored on the test split.

    Args:
        tensorboard_events_dir: Directory that receives the HParams experiment
            configuration and the Keras callback logs. Each trial additionally
            writes its own summaries to a ``wf_id_<workflow id>_<n>``
            sub-directory of it.

    Returns:
        The workflow id, a freshly generated UUID4 string that tags every
        trial of this sweep.
    """
    wf_id = str(uuid4())

    # TensorFlow and the HParams plugin are only imported once the sweep runs.
    import tensorflow as tf
    from tensorboard.plugins.hparams import api as hp

    dataset = tf.keras.datasets.fashion_mnist
    (x_train, y_train), (x_test, y_test) = dataset.load_data()
    # Rescale the image arrays by 1/255.
    x_train = x_train / 255.0
    x_test = x_test / 255.0

    # Hyperparameters that are swept.
    HP_NUM_UNITS = hp.HParam("num_units", hp.Discrete([16, 32]))
    HP_DROPOUT = hp.HParam("dropout", hp.RealInterval(0.1, 0.2))
    HP_OPTIMIZER = hp.HParam("optimizer", hp.Discrete(["adam", "sgd"]))
    HP_BATCHSIZES = hp.HParam("batch_size", hp.Discrete([32, 64]))

    # Declared in the experiment configuration only; no trial sets them.
    HP_MODEL_CONFIG = hp.HParam("model_config")
    HP_OPTIMIZER_CONFIG = hp.HParam("optimizer_config")

    METRIC_ACCURACY = "accuracy"

    declared_hparams = [
        HP_NUM_UNITS,
        HP_DROPOUT,
        HP_OPTIMIZER,
        HP_BATCHSIZES,
        HP_MODEL_CONFIG,
        HP_OPTIMIZER_CONFIG,
    ]
    root_writer = tf.summary.create_file_writer(tensorboard_events_dir)
    with root_writer.as_default():
        hp.hparams_config(
            hparams=declared_hparams,
            metrics=[hp.Metric(METRIC_ACCURACY, display_name="Accuracy")],
        )

    def fit_and_score(trial_hparams, log_dir):
        """Train one model for a single epoch and return its test accuracy."""
        layers = tf.keras.layers
        model = tf.keras.models.Sequential(
            [
                layers.Flatten(),
                layers.Dense(
                    trial_hparams[HP_NUM_UNITS], activation=tf.nn.relu
                ),
                layers.Dropout(trial_hparams[HP_DROPOUT]),
                layers.Dense(10, activation=tf.nn.softmax),
            ]
        )
        model.compile(
            optimizer=trial_hparams[HP_OPTIMIZER],
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )

        keras_callbacks = [
            tf.keras.callbacks.TensorBoard(log_dir),  # log metrics
            hp.KerasCallback(log_dir, trial_hparams),  # log hparams
        ]
        # Run with 1 epoch to speed things up for tests
        model.fit(
            x_train,
            y_train,
            epochs=1,
            callbacks=keras_callbacks,
            batch_size=trial_hparams[HP_BATCHSIZES],
        )
        _loss, test_accuracy = model.evaluate(x_test, y_test)
        return test_accuracy

    def log_trial(run_dir, trial_hparams):
        """Record one trial's hyperparameters and accuracy under ``run_dir``."""
        trial_writer = tf.summary.create_file_writer(run_dir)
        with trial_writer.as_default():
            hp.hparams(trial_hparams)  # record the values used in this trial
            # The Keras callbacks log to the root events directory, not run_dir.
            score = fit_and_score(trial_hparams, tensorboard_events_dir)
            tf.summary.scalar(METRIC_ACCURACY, score, step=1)

    # Only the two end points of the dropout interval are tried.
    dropout_bounds = (
        HP_DROPOUT.domain.min_value,
        HP_DROPOUT.domain.max_value,
    )
    # Same iteration order as four nested loops: units, dropout, optimizer,
    # batch size (innermost).
    trial_grid = [
        (num_units, dropout_rate, optimizer, batch_size)
        for num_units in HP_NUM_UNITS.domain.values
        for dropout_rate in dropout_bounds
        for optimizer in HP_OPTIMIZER.domain.values
        for batch_size in HP_BATCHSIZES.domain.values
    ]

    for session_num, trial in enumerate(trial_grid):
        num_units, dropout_rate, optimizer, batch_size = trial
        # The two ids at the top are optional and useful
        # just to contextualize this run.
        hparams = {
            "workflow_id": wf_id,
            "activity_id": "hyperparam_evaluation",
            HP_NUM_UNITS: num_units,
            HP_DROPOUT: dropout_rate,
            HP_OPTIMIZER: optimizer,
            HP_BATCHSIZES: batch_size,
        }
        run_name = f"wf_id_{wf_id}_{session_num}"
        print(f"--- Starting trial: {run_name}")
        print(hparams)
        log_trial(f"{tensorboard_events_dir}/{run_name}", hparams)

    return wf_id
    

In [3]:
# Set log levels
# NOTE(review): presumably read by flowcept when it is imported in the cells
# below (stream vs. file log verbosity) — confirm against its documentation.
os.environ.update(
    {
        "LOG_STREAM_LEVEL": "error",
        "LOG_FILE_LEVEL": "debug",
    }
)

## Set up the TensorBoard events directory

In [10]:
# Optional: Delete old tensorboard directories
# Only real directories are removed: shutil.rmtree() raises on regular files
# and on symbolic links, so a stray "tb_*" file or link would otherwise abort
# the cleanup part-way through.
for entry in os.listdir("."):
    is_old_tb_dir = (
        entry.startswith("tb_")
        and os.path.isdir(entry)
        and not os.path.islink(entry)
    )
    if is_old_tb_dir:
        shutil.rmtree(entry)

In [11]:
# A fresh, uniquely named events directory for this notebook run.
tensorboard_events_dir = "tb_" + str(uuid4())
print("Creating new directory: " + tensorboard_events_dir + " ...")
os.mkdir(tensorboard_events_dir)
# Pause kept for the automated CI/CD runs (see the note in the first cell).
sleep(5)
print("Created.")

Creating new directory: tb_c3066ae2-33a2-4263-b61e-2f2986bf3a71 ...
Created.


## Initialize the TensorBoard interceptor

In [12]:
# Create the interceptor and set its settings.file_path to the events
# directory created above.
# NOTE(review): presumably this is the location the interceptor watches for
# new TensorBoard event files — confirm against the flowcept documentation.
from flowcept import TensorboardInterceptor
interceptor = TensorboardInterceptor()
interceptor.settings.file_path = tensorboard_events_dir

## Initialize consumer API

In [13]:
# Wrap the interceptor in a consumer and start it before the training cell
# below is run.
# NOTE(review): presumably the consumer is what persists the intercepted
# tasks that are queried later in this notebook — confirm.
from flowcept import FlowceptConsumerAPI
consumer = FlowceptConsumerAPI(interceptor)
consumer.start()

<flowcept.flowcept_api.consumer_api.FlowceptConsumerAPI at 0x17ac2c9d0>

## Start training

In [14]:
# Run the hyperparameter sweep; the returned workflow id is used further down
# to filter the task query.
workflow_id = run_tensorboard_hparam_tuning(tensorboard_events_dir)

--- Starting trial: wf_id_3377b78b-abe3-4e6f-8a95-014b8a381a26_0
{'workflow_id': '3377b78b-abe3-4e6f-8a95-014b8a381a26', 'activity_id': 'hyperparam_evaluation', HParam(name='num_units', domain=Discrete([16, 32]), display_name=None, description=None): 16, HParam(name='dropout', domain=RealInterval(0.1, 0.2), display_name=None, description=None): 0.1, HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam', HParam(name='batch_size', domain=Discrete([32, 64]), display_name=None, description=None): 32}


2023-02-10 11:27:39.660407: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


--- Starting trial: wf_id_3377b78b-abe3-4e6f-8a95-014b8a381a26_1
{'workflow_id': '3377b78b-abe3-4e6f-8a95-014b8a381a26', 'activity_id': 'hyperparam_evaluation', HParam(name='num_units', domain=Discrete([16, 32]), display_name=None, description=None): 16, HParam(name='dropout', domain=RealInterval(0.1, 0.2), display_name=None, description=None): 0.1, HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None): 'adam', HParam(name='batch_size', domain=Discrete([32, 64]), display_name=None, description=None): 64}
--- Starting trial: wf_id_3377b78b-abe3-4e6f-8a95-014b8a381a26_2
{'workflow_id': '3377b78b-abe3-4e6f-8a95-014b8a381a26', 'activity_id': 'hyperparam_evaluation', HParam(name='num_units', domain=Discrete([16, 32]), display_name=None, description=None): 16, HParam(name='dropout', domain=RealInterval(0.1, 0.2), display_name=None, description=None): 0.1, HParam(name='optimizer', domain=Discrete(['adam', 'sgd']), display_name=None, description=None):

In [None]:
# Pause before querying; per the note in the first cell, sleeps are only
# needed for the automated CI/CD runs.
sleep(10)

## Retrieve the training metadata stored for this workflow

In [16]:
# Query API object used in the next cell to fetch the recorded task documents.
from flowcept import TaskQueryAPI
query_api = TaskQueryAPI()

In [17]:
# Select only the tasks tagged with the workflow id returned by the training
# run above.
_filter = {
    "workflow_id": workflow_id
}
docs = query_api.query(filter=_filter)
# Bare last expression so the notebook displays the returned documents.
docs

[{'task_id': 'events.out.tfevents.1676046459.MAC132633.66236.1.v2',
  'workflow_id': '3377b78b-abe3-4e6f-8a95-014b8a381a26',
  'activity_id': 'hyperparam_evaluation',
  'used': {'batch_size': 32.0,
   'dropout': 0.1,
   'num_units': 16.0,
   'optimizer': 'adam'},
  'generated': {'accuracy': 0.8203999996185303},
  'utc_timestamp': 1676064464.598542,
  'status': 'FINISHED',
  'custom_metadata': {'event_file': 'wf_id_3377b78b-abe3-4e6f-8a95-014b8a381a26_0',
   'log_path': 'tb_c3066ae2-33a2-4263-b61e-2f2986bf3a71/wf_id_3377b78b-abe3-4e6f-8a95-014b8a381a26_0'},
  'plugin_id': 'tensorboard',
  'user': 'root',
  'experiment_id': 'super-experiment',
  'sys_name': 'Darwin',
  'node_name': 'MAC132633',
  'login_name': 'rsr',
  'public_ip': '2620:0:2b30:e0::4b9',
  'private_ip': '10.158.26.233',
  'debug': True},
 {'task_id': 'events.out.tfevents.1676046461.MAC132633.66236.4.v2',
  'workflow_id': '3377b78b-abe3-4e6f-8a95-014b8a381a26',
  'activity_id': 'hyperparam_evaluation',
  'used': {'batch_s

In [18]:
# Stop the consumer that was started before training.
consumer.stop()