# TinyVGG network

This notebook trains the TinyVGG network.

After training, the model is serialized and uploaded to the W&B project.

In [1]:
import wandb
import tensorflow as tf
import numpy as np
import pathlib
import shutil
from typing import List

# Dataset subset names; load_data builds the artifact name as "<option>_splits_tfds".
POSSIBLE_DATASET_OPTIONS = [
    "uppercase_no_diacritics",
    "uppercase",
    "lowercase_no_diacritics",
    "lowercase",
    "numbers",
    "phcd_paper",
]

def load_data(run, dataset_option = 'letters') -> List[tf.data.Dataset]:
    """
    Downloads the dataset splits from a wandb artifact and loads them into a list of tf.data.Datasets.

    Args:
        run: wandb run on which the artifact is declared as an input (via use_artifact).
        dataset_option: prefix of the artifact name; the artifact requested is
            "master-thesis/<dataset_option>_splits_tfds:latest".

    Returns:
        The loaded datasets in the order [train, test, val].
    """
    # NOTE(review): the default 'letters' is not listed in POSSIBLE_DATASET_OPTIONS —
    # confirm that a "letters_splits_tfds" artifact exists or always pass dataset_option.
    artifact_name = f"{dataset_option}_splits_tfds"
    artifact = run.use_artifact(f"master-thesis/{artifact_name}:latest")

    # Reuse a previously downloaded copy when the expected directory is present.
    artifact_dir = pathlib.Path(
        f"./artifacts/{artifact.name.replace(':', '-')}"
    ).resolve()
    if not artifact_dir.exists():
        artifact_dir = pathlib.Path(artifact.download()).resolve()

    # tf.data.Dataset.load is only available in newer TensorFlow releases; older
    # ones expose the loader as tf.data.experimental.load. Detect the attribute
    # instead of parsing only the minor part of tf.__version__, which would pick
    # the wrong branch as soon as the major version changes.
    load_function = getattr(tf.data.Dataset, "load", None)
    if load_function is None:
        load_function = tf.data.experimental.load

    return [
        load_function(str(artifact_dir / split), compression="GZIP")
        for split in ("train", "test", "val")
    ]

def get_number_of_classes(ds: tf.data.Dataset) -> int:
    """
    Counts the distinct label values present in a dataset.

    The dataset is iterated once, keeping only the second component of each
    (x, y) element. The per-element label arrays are joined with
    np.concatenate, so each element's labels must be at least 1-D
    (e.g. a batched dataset).
    """
    def _keep_labels(_features, labels):
        return labels

    label_chunks = list(ds.map(_keep_labels).as_numpy_iterator())
    all_labels = np.concatenate(label_chunks)
    distinct_labels = np.unique(all_labels)
    return len(distinct_labels)

def preprocess_dataset(ds: tf.data.Dataset, batch_size: int, cache: bool = True) -> tf.data.Dataset:
    """
    Normalizes the inputs and re-batches a dataset of (x, y) elements.

    Args:
        ds: dataset whose elements are (x, y) pairs; it is unbatched before
            being batched again, so it must already be batched.
        batch_size: size of the batches in the returned dataset.
        cache: when True, the result is cached and prefetched
            (buffer size chosen by tf.data.AUTOTUNE).

    Returns:
        The dataset with x cast to float32 and divided by 255.0, batched to batch_size.
    """
    def _scale_inputs(x, y):
        # cast to float32 and divide by 255.0; labels pass through untouched
        return tf.cast(x, tf.float32) / 255.0, y

    rebatched = ds.map(_scale_inputs).unbatch().batch(batch_size)
    if not cache:
        return rebatched
    return rebatched.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

def calculate_model_size_on_disk(path: str) -> int:
    """Returns the size, in bytes, of the file located at `path`."""
    file_stats = pathlib.Path(path).stat()
    return file_stats.st_size

def calculate_model_num_parameters(model: tf.keras.Model) -> int:
    """Returns the parameter count reported by the model's count_params()."""
    num_parameters = model.count_params()
    return num_parameters

def calculate_model_flops(summary) -> float:
    """
    Looks up the FLOPs value stored in a run summary.

    The value is read from the "GFLOPs" key when present, otherwise from
    "GFLOPS". Returns None when the summary contains neither key.
    """
    # "GFLOPs" takes precedence over "GFLOPS" when both are present
    for key in ("GFLOPs", "GFLOPS"):
        if key in summary:
            return summary.get(key)
    return None

class Terminate_slow_convergence(tf.keras.callbacks.Callback):
    """
    Keras callback that stops training early in two situations:

    * slow convergence: training accuracy is still below ACCURACY_THRESHOLD
      after epoch index EPOCH;
    * convergence reached: validation accuracy is at least
      MAX_VAL_ACCURACY_THRESHOLD after epoch index CONVERGED_MIN_EPOCH.

    A check is skipped when its metric is missing from the epoch logs
    (for example when no validation data is supplied).
    """
    ACCURACY_THRESHOLD =  0.9
    EPOCH = 6
    MAX_VAL_ACCURACY_THRESHOLD = 0.9995
    # epoch index that must be exceeded before the "converged" check applies
    CONVERGED_MIN_EPOCH = 2

    def on_epoch_end(self, epoch, logs=None):
        """
        Inspects the metrics of the finished epoch and requests a stop when needed.

        Args:
            epoch: index of the epoch that just ended, as passed by Keras.
            logs: metric dictionary for the epoch; may be None.
        """
        # avoid a shared mutable default and tolerate Keras passing None
        logs = logs or {}
        accuracy = logs.get('accuracy')
        val_accuracy = logs.get('val_accuracy')

        # if model is not converging fast, stop training
        if accuracy is not None and accuracy < self.ACCURACY_THRESHOLD and epoch > self.EPOCH:
            print(f"Model accuracy is {accuracy} and is below {self.ACCURACY_THRESHOLD} at epoch {epoch}. Terminating training.")
            self.model.stop_training = True

        # if model converged, stop training
        if (
            val_accuracy is not None
            and val_accuracy >= self.MAX_VAL_ACCURACY_THRESHOLD
            and epoch > self.CONVERGED_MIN_EPOCH
        ):
            print("Model reached max val_accuracy. Terminating training.")
            self.model.stop_training = True

2023-01-23 18:32:35.525962: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Baseline hyper-parameters handed to train(), which forwards them to wandb.init as the run config.
defaults = {
    "batch_size": 128,  # 32 * 4
    "epochs": 100,
    "optimizer": "adam",
}

In [3]:
# Report which devices TensorFlow can see before training starts.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
all_devices = tf.config.list_physical_devices()
print("Available devices: ", all_devices)

Num GPUs Available:  1
Available devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2023-01-23 18:32:39.392205: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-23 18:32:39.411064: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-23 18:32:39.411458: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.


In [4]:

def _build_tiny_vgg(filters, num_classes):
    """Builds the TinyVGG classifier: two (conv, conv, max-pool) stages and a softmax output layer."""
    return tf.keras.Sequential(
        [
            tf.keras.layers.InputLayer(input_shape=(32, 32, 1)),

            tf.keras.layers.Conv2D(filters, kernel_size=(3, 3), activation="relu"),
            tf.keras.layers.Conv2D(filters, kernel_size=(3, 3), activation="relu"),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),

            tf.keras.layers.Conv2D(filters, kernel_size=(3, 3), activation="relu"),
            tf.keras.layers.Conv2D(filters, kernel_size=(3, 3), activation="relu"),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),

            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(num_classes, activation="softmax"),
        ]
    )


def train(dataset_subset, defaults, MODEL_NAME = "TinyVGG", job_type = "training"):
    """
    Trains TinyVGG on one dataset subset inside a W&B run and uploads the best checkpoint.

    Args:
        dataset_subset: dataset option forwarded to load_data; also used in the
            checkpoint file name and the artifact name.
        defaults: run configuration passed to wandb.init (may be None, e.g. when
            the configuration is supplied by a sweep). The run config must
            provide `filters`, `batch_size` and `epochs`; `optimizer` is
            optional and falls back to "adam".
        MODEL_NAME: name of the W&B run, also used in the checkpoint/artifact name.
        job_type: W&B job type of the run.

    Side effects:
        Writes "./artifacts/<dataset_subset>-<MODEL_NAME>.h5", logs test metrics,
        parameter count, file size and FLOPs to W&B, and logs the checkpoint
        as a "model" artifact.
    """
    model_path = f"./artifacts/{dataset_subset}-{MODEL_NAME}.h5"

    # The context manager finishes the run on exit, so no explicit finish() is needed.
    with wandb.init(project="master-thesis", job_type=job_type, name=MODEL_NAME, config=defaults) as run:
        config = run.config
        filters = config.filters
        batch_size = config.batch_size

        ds_train, ds_test, ds_val = load_data(run, dataset_option=dataset_subset)

        # count classes on the raw validation split, before it is re-batched
        num_classes = get_number_of_classes(ds_val)
        ds_train = preprocess_dataset(ds_train, batch_size=batch_size)
        ds_val = preprocess_dataset(ds_val, batch_size=batch_size)
        ds_test = preprocess_dataset(ds_test, batch_size=batch_size, cache=False)

        model = _build_tiny_vgg(filters, num_classes)

        model.compile(
            # honour the configured optimizer instead of always using "adam"
            optimizer=config.get("optimizer", "adam"),
            # the output layer already applies softmax, so the loss receives
            # probabilities, not logits
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            metrics=["accuracy"],
        )

        # make sure the checkpoint directory exists before Keras writes to it
        pathlib.Path(model_path).parent.mkdir(parents=True, exist_ok=True)

        # save the best model
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=model_path,
            save_weights_only=False,
            monitor="val_accuracy",
            mode="max",
            save_best_only=True,
        )

        wandb_callback = wandb.keras.WandbCallback(
            save_model=False,
            compute_flops=True,
        )

        model.fit(
            ds_train,
            epochs=config.epochs,
            validation_data=ds_val,
            callbacks=[wandb_callback, checkpoint_callback],
        )

        # calculate model size on disk, flops and number of parameters
        flops = calculate_model_flops(run.summary)
        disk_size = calculate_model_size_on_disk(model_path)
        num_parameters = calculate_model_num_parameters(model)

        # evaluate model on ds_test and log to wandb
        # NOTE(review): this evaluates the weights from the last epoch, while the
        # uploaded artifact is the best checkpoint — confirm this is intended.
        test_loss, test_acc = model.evaluate(ds_test)

        wandb.log({
            "dataset subset": dataset_subset,
            "test loss": test_loss,
            "test accuracy": test_acc,
            "number of parameters": num_parameters,
            "disk size": disk_size,
            "model flops": flops
            })

        # save artifact to wandb
        artifact = wandb.Artifact(
            name=f"{dataset_subset}-{MODEL_NAME}",
            type="model"
        )

        # save best model to artifact
        artifact.add_file(model_path)
        run.log_artifact(artifact)

In [6]:
# Build the run config without mutating the shared `defaults` dict, so this cell
# gives the same result however often (or in whatever order) it is re-run.
run_config = {**defaults, "filters": 40}
train("phcd_paper", run_config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgratkadlafana[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact phcd_paper_splits_tfds:latest, 163.87MB. 27 files... 
[34m[1mwandb[0m:   27 of 27 files downloaded.  
Done. 0:0:0.0
2023-01-23 18:33:16.847514: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


2023-01-23 18:33:16.848295: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-23 18:33:16.848533: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-23 18:33:16.848706: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-23 18:33:17.929945: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-23 18:33:17.930638: I tensorflow/compile

Instructions for updating:
This API was designed for TensorFlow v1. See https://www.tensorflow.org/guide/migrate for instructions on how to migrate your code to TensorFlow v2.
Epoch 1/100


2023-01-23 18:33:18.962742: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-23 18:33:18.962793: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1
2023-01-23 18:33:18.962971: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2023-01-23 18:33:18.963784: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-23 18:33:18.963991: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-23 18:33:18.964230: I tensor

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


0,1
accuracy,▁▅▆▇▇▇██
epoch,▁▂▃▄▅▆▇█
loss,█▃▃▂▂▁▁▁
val_accuracy,▁▄▅▆▇▇██
val_loss,█▅▄▃▂▂▁▁

0,1
GFLOPs,0.01527
accuracy,0.84131
best_epoch,7.0
best_val_loss,0.41763
epoch,7.0
loss,0.42181
val_accuracy,0.84064
val_loss,0.41763


KeyError: 0

# W&B Sweep

In [None]:
# Sweep definition: Bayesian search maximizing val_accuracy.
# Only `filters` is searched; batch size, epochs and optimizer are pinned.
# NOTE(review): the last filters value is 89 while the others step by 10 —
# confirm it is not a typo for 80.
sweep_config = dict(
    method="bayes",
    metric=dict(goal="maximize", name="val_accuracy"),
    parameters=dict(
        filters=dict(values=[10, 20, 30, 40, 50, 60, 70, 89]),
        batch_size=dict(value=128),  # 32 * 4
        epochs=dict(value=100),
        optimizer=dict(value="adam"),
    ),
)


# launch sweep controller
#sweep_id = wandb.sweep(sweep_config, project="master-thesis")

In [None]:
def sweep_fn():
    """Entry point for one sweep trial: trains TinyVGG on "phcd_paper" with no local config."""
    train(
        dataset_subset="phcd_paper",
        defaults=None,  # no config passed from here
        MODEL_NAME="TinyVGG",
        job_type='sweep',
    )
#wandb.agent(sweep_id, sweep_fn, count=15)