# Residual network

This notebook trains the ResNet-20 based on


After training, the model is serialized and uploaded to the W&B project.

In [1]:
import wandb
import pathlib
import shutil
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from typing import List


def load_data(run: wandb.sdk.wandb_run.Run) -> List[tf.data.Dataset]:
    """
    Fetch the dataset splits stored in a wandb artifact and load them as tf.data.Datasets.

    The artifact is downloaded only when its expected local directory does not exist yet.

    Args:
        run: wandb run on which ``use_artifact`` is called to declare the dataset artifact.

    Returns:
        The loaded datasets in the order ``[train, test, val]``.
    """
    artifact = run.use_artifact("master-thesis/letters_splits_tfds:latest")

    # Expected local location of the artifact, with ":" in its name swapped for "-".
    local_dir = pathlib.Path(f"./artifacts/{artifact.name.replace(':', '-')}").resolve()
    if not local_dir.exists():
        local_dir = pathlib.Path(artifact.download()).resolve()

    return [
        tf.data.Dataset.load(str(local_dir / split_name), compression="GZIP")
        for split_name in ("train", "test", "val")
    ]

def get_number_of_classes(ds: tf.data.Dataset) -> int:
    """
    Returns the number of distinct labels present in a dataset.

    The dataset is iterated once and only the set of unique labels is kept in
    memory. Both batched label tensors and unbatched scalar labels are accepted,
    and an empty dataset yields 0.

    Note that only labels that actually occur in ``ds`` are counted.

    Args:
        ds: Dataset whose elements are ``(features, labels)`` pairs.

    Returns:
        The number of unique label values found.
    """
    seen_labels = set()
    for labels in ds.map(lambda x, y: y).as_numpy_iterator():
        # np.unique flattens its input, so it handles scalars and batches alike.
        seen_labels.update(np.unique(labels).tolist())
    return len(seen_labels)


def preprocess_dataset(ds: tf.data.Dataset, batch_size: int, cache: bool = True) -> tf.data.Dataset:
    """
    Normalize the inputs of a dataset and re-batch it to ``batch_size``.

    Args:
        ds: Batched dataset of ``(x, y)`` pairs; it is unbatched before re-batching.
        batch_size: Number of examples per batch in the returned dataset.
        cache: When True, the re-batched dataset is cached after its first pass.

    Returns:
        The dataset with ``x`` cast to float32 and divided by 255, batched to
        ``batch_size`` and prefetched.
    """
    ds = ds.map(
        lambda x, y: (tf.cast(x, tf.float32) / 255.0, y),  # normalize
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    ds = ds.unbatch().batch(batch_size)
    if cache:
        ds = ds.cache()
    # Prefetching is independent of caching, so it is applied to every pipeline
    # (previously the uncached pipeline did not get any prefetching).
    return ds.prefetch(buffer_size=tf.data.AUTOTUNE)


def calculate_model_size_on_disk(path: str) -> int:
    """
    Return the size in bytes of a saved model.

    Args:
        path: Location of the saved model. A single file (such as an ``.h5``
            file) reports its own size; a directory reports the summed size of
            all regular files below it.

    Returns:
        Size in bytes.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    target = pathlib.Path(path)
    if target.is_dir():
        # stat() on a directory only reports the directory entry itself,
        # so the contained files are summed instead.
        return sum(
            entry.stat().st_size for entry in target.rglob("*") if entry.is_file()
        )
    return target.stat().st_size


def calculate_model_num_parameters(model: tf.keras.Model) -> int:
    """Return the parameter count reported by ``model.count_params()``."""
    total_params = model.count_params()
    return total_params


def calculate_model_flops() -> str:
    """
    Placeholder for a FLOPs calculation; not implemented.

    Performs no work and currently returns ``None``.
    """
    return None


def plot_history(history):
    """
    Plot training and validation accuracy per epoch and show the figure.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch values; the keys "loss", "accuracy" and "val_accuracy"
            are read.
    """
    _, axis = plt.subplots(1, 1, figsize=(15, 10))

    recorded = history.history
    # Epochs are numbered from 1; their count is taken from the "loss" series.
    epoch_numbers = range(1, len(recorded["loss"]) + 1)

    for metric_name in ("accuracy", "val_accuracy"):
        axis.plot(epoch_numbers, recorded[metric_name], label=metric_name)

    axis.set_xlabel("Epoch")
    axis.set_ylabel("Accuracy")
    axis.legend(loc="lower right")

    plt.show()

def evaluate_model(model, ds_test, model_name):
    """
    Log the test metrics and size characteristics of a model to wandb.

    Args:
        model: Model to evaluate; ``evaluate`` must return a (loss, accuracy) pair.
        ds_test: Dataset passed to ``model.evaluate``.
        model_name: Name used to locate the saved model at
            ``./artifacts/<model_name>.h5`` for the disk-size measurement.
    """
    # Model characteristics: FLOPs come from the "GFLOPs" entry of the current
    # run's summary, the rest is measured here.
    gflops = wandb.run.summary["GFLOPs"]
    size_on_disk = calculate_model_size_on_disk(f"./artifacts/{model_name}.h5")
    parameter_count = calculate_model_num_parameters(model)

    # Test-set performance.
    test_loss, test_acc = model.evaluate(ds_test)

    metrics = {
        "test loss": test_loss,
        "test accuracy": test_acc,
        "number of parameters": parameter_count,
        "disk size": size_on_disk,
        "model flops": gflops,
    }
    wandb.log(metrics)


def evaluate_diacritics_performance(model, ds_test):
    """
    Evaluate model test loss and accuracy on letters with diacritics, then log to wandb.

    Each diacritic class is evaluated separately on the test examples carrying
    that label. The results are logged in a single ``wandb.log`` call, keyed by
    the class label converted to a string.

    Args:
        model: Model to evaluate; ``evaluate`` must return a (loss, accuracy) pair.
        ds_test: Batched dataset of ``(x, y)`` pairs with integer labels.
    """
    diacritics = {
        62: "ą",
        63: "ć",
        64: "ę",
        65: "ł",
        66: "ń",
        67: "ó",
        68: "ś",
        69: "ź",
        70: "ż",
        71: "Ą",
        72: "Ć",
        73: "Ę",
        74: "Ł",
        75: "Ń",
        76: "Ó",
        77: "Ś",
        78: "Ź",
        79: "Ż",
    }

    # Dataset elements are (x, y) tuples, so the batch size has to be read from
    # the input tensor rather than from the element itself.
    bs = next(iter(ds_test))[0].shape[0]

    # Unbatching is lazy, so one unbatched view can feed every per-class filter.
    ds_unbatched = ds_test.unbatch()

    # log test accuracy on these classes separately to wandb
    diacritics_acc = {}
    for diacritic_label, letter in diacritics.items():
        ds_test_diacritic = ds_unbatched.filter(
            lambda x, y: tf.equal(y, diacritic_label)
        ).batch(bs)
        test_loss, test_acc = model.evaluate(ds_test_diacritic)
        # wandb.log only accepts string keys, so the integer label is converted.
        diacritics_acc[str(diacritic_label)] = {
            "loss": test_loss,
            "accuracy": test_acc,
            "label": letter,
        }

    wandb.log(diacritics_acc)


2023-01-14 12:08:58.552405: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-14 12:08:59.175134: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda-11.2/lib64:
2023-01-14 12:08:59.175208: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda-11.2/lib64:


In [2]:
# Default hyperparameters; they are handed to wandb as the run config.
defaults = {
    "batch_size": 32 * 4,
    "epochs": 100,
    "optimizer": "sgd",
    "learning_rate": 0.001,
    "momentum": 0.9,
}

# Number of residual blocks in each of the four stages.
RESNET_DEPTHS = [3, 4, 6, 3]
# NOTE(review): this evaluates to "resnet-18", while ResidualBlock stacks two
# convolutions per block, which by the usual naming (2 * sum + 2) would be 34.
# Left unchanged because files and artifacts are keyed by this name — confirm.
MODEL_NAME = f"resnet-{sum(RESNET_DEPTHS) + 2}"
run = wandb.init(
    project="master-thesis",
    job_type="training",
    name=MODEL_NAME,
    config=defaults,
)

# Hyperparameters are read back from wandb.config rather than from `defaults`.
opt_name = wandb.config.optimizer
lr = wandb.config.learning_rate
momentum = wandb.config.momentum
bs = wandb.config.batch_size
epochs = wandb.config.epochs

[34m[1mwandb[0m: Currently logged in as: [33mgratkadlafana[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
# load_data returns the splits in the order train, test, val.
ds_train, ds_test, ds_val = load_data(run)

# The class count is derived from the labels found in the validation split.
num_classes = get_number_of_classes(ds_val)

print(f"There are {num_classes} classes")
print(f"Training set has {len(ds_train)} batches")
print(f"Test set has {len(ds_test)} batches")
print(f"Validation set has {len(ds_val)} batches")

# batch_size is passed exactly once per call; repeating the keyword argument
# is a SyntaxError. The test split is left uncached.
ds_train = preprocess_dataset(ds_train, batch_size=bs)
ds_val = preprocess_dataset(ds_val, batch_size=bs)
ds_test = preprocess_dataset(ds_test, batch_size=bs, cache=False)

[34m[1mwandb[0m:   9 of 9 files downloaded.  


There are 89 classes
Training set has 13953 batches
Test set has 1743 batches
Validation set has 1743 batches


In [14]:
# Peek at a single training batch to check the input and label shapes.
for inputs, labels in ds_train.take(1):
    print(inputs.shape)
    print(labels.shape)

(32, 32, 32, 1)
(32,)


In [16]:
# Same check after re-batching the training data into batches of 128.
rebatched = ds_train.unbatch().batch(128)
for inputs, labels in rebatched.take(1):
    print(inputs.shape)
    print(labels.shape)

(128, 32, 32, 1)
(128,)


In [10]:
class ResidualBlock(tf.keras.layers.Layer):
    """
    Residual block: two 3x3 convolutions with batch normalization plus a skip connection.

    The main path is Conv -> BatchNorm -> activation -> Conv -> BatchNorm. The
    skip path is the identity when ``strides == 1``; otherwise it is a strided
    1x1 convolution followed by batch normalization so that both paths produce
    tensors of the same shape. The two paths are added and passed through the
    activation.

    Args:
        filters: Number of filters of every convolution in the block.
        strides: Stride of the first main-path convolution and of the skip
            convolution.
        activation: Activation identifier understood by
            ``tf.keras.activations.get``.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, filters, strides=1, activation="relu", **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept as plain values so that get_config()
        # can return them; int() also turns NumPy integers into Python ints.
        self.filters = int(filters)
        self.strides = strides
        self.activation = tf.keras.activations.get(activation)
        self.main_layers = [
            tf.keras.layers.Conv2D(
                self.filters,
                strides=strides,
                kernel_size=3,
                padding="same",
                use_bias=False,
            ),
            tf.keras.layers.BatchNormalization(),
            self.activation,
            tf.keras.layers.Conv2D(
                self.filters, strides=1, kernel_size=3, padding="same", use_bias=False
            ),
            tf.keras.layers.BatchNormalization(),
        ]
        self.skip_layers = []
        if strides > 1:
            self.skip_layers = [
                tf.keras.layers.Conv2D(
                    self.filters,
                    strides=strides,
                    kernel_size=1,
                    padding="same",
                    use_bias=False,
                ),
                tf.keras.layers.BatchNormalization(),
            ]

    def get_config(self):
        """
        Return the constructor arguments needed to rebuild this block.

        Only serializable values are included. The sub-layers themselves are
        recreated by ``__init__`` and therefore must not be part of the config.
        """
        config = super().get_config()
        config.update({
            "filters": self.filters,
            "strides": self.strides,
            "activation": tf.keras.activations.serialize(self.activation),
        })
        return config

    def call(self, inputs):
        """Apply the main path and the skip path, add them and activate the sum."""
        x = inputs
        for layer in self.main_layers:
            x = layer(x)
        skip_x = inputs
        for layer in self.skip_layers:
            skip_x = layer(skip_x)
        return self.activation(x + skip_x)


class ResNet(tf.keras.Model):
    """
    Residual network built from a convolutional stem, stacked ResidualBlocks and a classifier.

    The stem is a 7x7 stride-2 convolution with batch normalization, ReLU and a
    3x3 stride-2 max pooling. It is followed by four stages of ResidualBlocks
    with 64, 128, 256 and 512 filters, global average pooling and a softmax
    Dense layer.

    Args:
        block_design: Number of residual blocks in each of the four stages.
        input_shape: Shape of a single input example, without the batch axis.
        num_classes: Number of units of the softmax output layer.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(
        self,
        block_design=(3, 4, 6, 3),
        input_shape=(32, 32, 1),
        num_classes=88,
        **kwargs
    ):
        # Defaults are tuples rather than lists so they cannot be mutated
        # between instantiations.
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.classifier = tf.keras.layers.Dense(num_classes, activation="softmax")
        self.input_conv = tf.keras.models.Sequential(
            [
                tf.keras.layers.Conv2D(
                    64,
                    kernel_size=7,
                    strides=2,
                    input_shape=input_shape,
                    padding="same",
                ),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Activation("relu"),
                tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding="same"),
            ]
        )
        self.resnet_blocks = tf.keras.models.Sequential()
        prev_filters = 64
        for filters in np.repeat(np.array([64, 128, 256, 512]), block_design):
            # The first block of every stage that changes the filter count
            # downsamples with stride 2.
            strides = 1 if filters == prev_filters else 2
            self.resnet_blocks.add(ResidualBlock(filters, strides=strides))
            prev_filters = filters
        self.avg_pool = tf.keras.layers.GlobalAveragePooling2D()
        self.flatten = tf.keras.layers.Flatten()

    def call(self, inputs):
        """Run the stem, the residual blocks, pooling and the classifier."""
        x = self.input_conv(inputs)
        x = self.resnet_blocks(x)
        x = self.avg_pool(x)
        x = self.flatten(x)
        return self.classifier(x)


def get_resnet_model(input_shape, block_design, num_classes) -> tf.keras.Sequential:
    """
    Assemble a residual network as a Sequential model.

    Args:
        input_shape: Shape of a single input example, passed to the first convolution.
        block_design: Number of residual blocks for each of the filter sizes
            64, 128, 256 and 512.
        num_classes: Number of units of the final softmax Dense layer.

    Returns:
        The assembled, uncompiled model.
    """
    # Stem: strided 7x7 convolution, batch norm, ReLU and strided max pooling.
    stem_layers = [
        tf.keras.layers.Conv2D(
            64, kernel_size=7, strides=2, input_shape=input_shape, padding="same"
        ),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation("relu"),
        tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding="same"),
    ]
    model = tf.keras.models.Sequential(stem_layers)

    # One filter count per residual block, e.g. 64 repeated block_design[0] times.
    filters_per_block = np.repeat(np.array([64, 128, 256, 512]), block_design)
    residual_stack = tf.keras.models.Sequential()
    previous = 64
    for current in filters_per_block:
        # A change in filter count is paired with a stride of 2.
        block_strides = 2 if current != previous else 1
        residual_stack.add(ResidualBlock(current, strides=block_strides))
        previous = current

    # Head: residual stack, global pooling, flatten and softmax classifier.
    head_layers = (
        residual_stack,
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(num_classes, activation="softmax"),
    )
    for layer in head_layers:
        model.add(layer)
    return model

In [11]:
model = get_resnet_model(input_shape=[32, 32, 1], block_design=RESNET_DEPTHS, num_classes=num_classes)

# Build the optimizer from the run config.
opt = tf.keras.optimizers.get({
    'class_name': wandb.config.optimizer,
    'config': {
        'learning_rate': lr,
        'momentum': momentum
    }
})

model.compile(
    optimizer=opt,
    # The model's last layer already applies softmax, so the loss receives
    # probabilities and must not treat them as logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)

# Model saving is handled by the checkpoint callback below, not by wandb.
wandb_callback = wandb.keras.WandbCallback(
    save_model=False,
    compute_flops=True,
)

# save the best model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=f"./artifacts/{MODEL_NAME}.h5",
    save_weights_only=False,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
)

history = model.fit(
    ds_train,
    epochs=epochs,
    validation_data=ds_val,
    callbacks=[wandb_callback, checkpoint_callback],
)

2023-01-14 12:11:32.621894: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2023-01-14 12:11:32.621994: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session


Instructions for updating:
This API was designed for TensorFlow v1. See https://www.tensorflow.org/guide/migrate for instructions on how to migrate your code to TensorFlow v2.
Epoch 1/100


ValueError: in user code:

    File "/home/wiktor/.pyenv/versions/3.10.9/envs/master-thesis/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/home/wiktor/.pyenv/versions/3.10.9/envs/master-thesis/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/wiktor/.pyenv/versions/3.10.9/envs/master-thesis/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/home/wiktor/.pyenv/versions/3.10.9/envs/master-thesis/lib/python3.10/site-packages/keras/engine/training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "/home/wiktor/.pyenv/versions/3.10.9/envs/master-thesis/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/wiktor/.pyenv/versions/3.10.9/envs/master-thesis/lib/python3.10/site-packages/keras/engine/input_spec.py", line 277, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential' (type Sequential).
    
    Input 0 of layer "conv2d" is incompatible with the layer: expected axis -1 of input shape to have value 1, but received input with shape (None, None, 32, 32)
    
    Call arguments received by layer 'sequential' (type Sequential):
      • inputs=tf.Tensor(shape=(None, None, 32, 32, 1), dtype=float32)
      • training=True
      • mask=None


In [None]:
# Show the accuracy curves recorded during training.
plot_history(history=history)

In [None]:
# Log the overall test metrics, then the per-letter diacritics breakdown.

evaluate_model(model=model, ds_test=ds_test, model_name=MODEL_NAME)
evaluate_diacritics_performance(model=model, ds_test=ds_test)

In [None]:
# Register the checkpoint file saved during training as a W&B model artifact.
artifact = wandb.Artifact(name=MODEL_NAME, type="model")
artifact.add_file(f"./artifacts/{MODEL_NAME}.h5")

# Upload the artifact, then close the run.
run.log_artifact(artifact)
run.finish()