# Better CNN model

Here I explore architectures that are very similar to the baseline (CNN → Flatten → Dense layers).

Hyperparameters experimented with:

- Number of CNN and dense layers
- Dropout
- Activation function
- Other max-pooling strategies
- Learning-rate scheduling


After training, the model is serialized and uploaded to the W&B project.

In [None]:
import wandb
import tensorflow as tf
import numpy as np
import pathlib
import shutil
from typing import List

def load_data(run) -> List[tf.data.Dataset]:
    """
    Fetch the dataset-splits artifact through ``run`` and load every split.

    The artifact ``master-thesis/letters_splits_tfds:latest`` is requested via
    ``run.use_artifact``. It is downloaded unless the expected local directory
    ``./artifacts/<artifact name with ':' replaced by '-'>`` already exists.

    Args:
        run: Object exposing ``use_artifact`` (``train`` passes the run
            returned by ``wandb.init``).

    Returns:
        The loaded datasets in the order ``[train, test, val]``.
    """
    artifact = run.use_artifact("master-thesis/letters_splits_tfds:latest")

    artifact_dir = pathlib.Path(
        f"./artifacts/{artifact.name.replace(':', '-')}"
    ).resolve()
    if not artifact_dir.exists():
        artifact_dir = pathlib.Path(artifact.download()).resolve()

    # Older TensorFlow releases only provide tf.data.experimental.load, newer
    # ones provide tf.data.Dataset.load. Detect the feature instead of parsing
    # tf.__version__: a minor-version-only comparison ignores the major
    # version and fails on non-numeric version components.
    load_function = getattr(tf.data.Dataset, "load", None)
    if load_function is None:
        load_function = tf.data.experimental.load

    return [
        load_function(str(artifact_dir / split), compression="GZIP")
        for split in ("train", "test", "val")
    ]

def get_number_of_classes(ds: tf.data.Dataset) -> int:
    """
    Count the distinct label values in a dataset of ``(x, y)`` pairs.

    Labels are consumed one dataset element at a time, so the dataset may be
    batched or unbatched, and an empty dataset yields ``0`` instead of raising.

    Args:
        ds: Dataset whose elements are ``(x, y)`` pairs; only ``y`` is read.

    Returns:
        Number of unique label values found across the whole dataset.
    """
    seen_labels = set()
    for labels in ds.map(lambda x, y: y).as_numpy_iterator():
        # np.unique flattens its input, so scalar labels and label batches of
        # any shape are handled the same way.
        seen_labels.update(np.unique(labels).tolist())
    return len(seen_labels)

def preprocess_dataset(ds: tf.data.Dataset, batch_size: int, cache: bool = True) -> tf.data.Dataset:
    """
    Normalize the inputs of ``ds`` and re-batch it to ``batch_size``.

    Args:
        ds: Dataset of ``(x, y)`` pairs; it is unbatched before re-batching,
            so it is expected to arrive batched.
        batch_size: Size of the batches in the returned dataset.
        cache: When true, the re-batched dataset is cached and prefetched
            with an auto-tuned buffer; otherwise neither is applied.

    Returns:
        The transformed dataset.
    """

    def _scale_inputs(x, y):
        # Cast to float32 and divide by 255; labels pass through untouched.
        return tf.cast(x, tf.float32) / 255.0, y

    normalized = ds.map(_scale_inputs)
    rebatched = normalized.unbatch().batch(batch_size)
    if not cache:
        return rebatched
    return rebatched.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

def calculate_model_size_on_disk(path: str) -> int:
    """
    Return the size in bytes of the file located at ``path``.
    """
    model_file = pathlib.Path(path)
    file_info = model_file.stat()
    return file_info.st_size

def calculate_model_num_parameters(model: tf.keras.Model) -> int:
    """
    Return the parameter count reported by ``model.count_params()``.
    """
    parameter_count = model.count_params()
    return parameter_count

def calculate_model_flops(model: tf.keras.Model) -> str:
    """
    Placeholder for a FLOPs computation; not implemented.

    The function ignores ``model`` and always returns ``None`` despite the
    ``str`` return annotation.
    """
    return None


class Terminate_slow_convergence(tf.keras.callbacks.Callback):
    """
    Keras callback that aborts training when training accuracy stays too low.

    At the end of every epoch whose index is greater than ``EPOCH``, training
    is stopped if the logged ``accuracy`` is below ``ACCURACY_THRESHOLD``.
    """

    # Training accuracy the model must have reached once the check applies.
    ACCURACY_THRESHOLD = 0.9
    # Epoch index (as passed to on_epoch_end) after which the check applies.
    # NOTE(review): Keras epoch indices are presumably 0-based, so the check
    # would first fire at the end of the 8th epoch -- confirm this is intended.
    EPOCH = 6
    # Only referenced by the disabled "converged" rule kept at the bottom.
    MAX_VAL_ACCURACY_THRESHOLD = 0.995

    def on_epoch_end(self, epoch, logs=None):
        """
        Stop training when accuracy is still below the threshold.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metric dictionary for the epoch; ``None`` is treated as
                empty.
        """
        # Avoid a shared mutable default argument.
        logs = logs or {}
        accuracy = logs.get('accuracy')

        # The metric is absent when the model was not compiled with
        # "accuracy"; skip the check instead of comparing None with a float.
        if accuracy is None:
            return

        # if model is not converging fast, stop training
        if accuracy < self.ACCURACY_THRESHOLD and epoch > self.EPOCH:
            print(f"Model accuracy is {accuracy} and is below {self.ACCURACY_THRESHOLD} at epoch {epoch}. Terminating training.")
            self.model.stop_training = True

        # if model converged, stop training
        #if(logs.get('val_accuracy') >= self.MAX_VAL_ACCURACY_THRESHOLD and epoch > 2):
        #    self.model.stop_training = True

In [None]:
# Report which devices TensorFlow can see. Both lines use the stable
# tf.config.list_physical_devices API rather than mixing it with its
# tf.config.experimental alias.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
print("Available devices: ", tf.config.list_physical_devices())


In [5]:
# Two Conv2D + MaxPooling2D stages, then Flatten, three hidden dense layers
# (5376, 256, 128 units) and an 89-way softmax output.
baseline_additional_dense_layer = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(32, 32, 1)),
    # Feature extractor
    tf.keras.layers.Conv2D(32, kernel_size=3, activation="relu"),
    tf.keras.layers.MaxPooling2D(pool_size=2),
    tf.keras.layers.Conv2D(64, kernel_size=3, activation="relu"),
    tf.keras.layers.MaxPooling2D(pool_size=2),
    tf.keras.layers.Flatten(),
    # Classifier head
    tf.keras.layers.Dense(5376, activation="relu"),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(89, activation="softmax"),
])

# test acc 0.999 ?
# `num_classes` is not assigned at module level in the visible cells, so on a
# fresh kernel this definition raised NameError. Use the session value when it
# exists; otherwise fall back to 89, the class count the other definitions in
# this cell default to.
# NOTE(review): train() derives the real class count from the data -- confirm
# which value this reference model should use.
_architecture_2_num_classes = globals().get("num_classes", 89)
architecture_2 = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(32, 32, 1)),
            tf.keras.layers.Conv2D(32,kernel_size=(3, 3), activation="relu", padding="same"),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(256*2, activation="relu"),
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(_architecture_2_num_classes, activation="softmax"),
            ]
        )

def get_model(input_shape=(32, 32, 1), num_classes=89, conv_filters=[32, 64, 128], dense_units=[5376, 256], activation_fn=tf.keras.layers.ReLU(), dropout_rate=0.0):
    """
    Build a ``tf.keras.Sequential`` CNN classifier from the given parameters.

    The model consists of one ``Conv2D(3x3) -> activation -> MaxPooling2D(2x2)``
    stage per entry of ``conv_filters``, a ``Flatten`` layer, one
    ``Dense -> activation [-> Dropout]`` stage per entry of ``dense_units``,
    and a final softmax ``Dense`` layer with ``num_classes`` units.

    Args:
        input_shape: Shape of a single input sample.
        num_classes: Number of units in the softmax output layer.
        conv_filters: Filter count for each convolutional stage (not mutated).
        dense_units: Unit count for each hidden dense stage (not mutated).
        activation_fn: Activation to apply after every conv/dense layer.
            Either a Keras layer instance, which is used as a template and
            re-created for every position, or anything accepted by
            ``tf.keras.layers.Activation`` (name or callable).
        dropout_rate: Dropout rate after each hidden dense stage; no Dropout
            layer is added when it is not greater than 0.

    Returns:
        The assembled, uncompiled model.
    """

    def _fresh_activation():
        # A Sequential model needs a distinct layer object at every position,
        # so the same instance cannot be added more than once. Rebuild the
        # layer from its config, dropping the name so Keras assigns a new one.
        if isinstance(activation_fn, tf.keras.layers.Layer):
            config = activation_fn.get_config()
            config.pop("name", None)
            return type(activation_fn).from_config(config)
        return tf.keras.layers.Activation(activation_fn)

    print("Input shape: ", input_shape)
    print("Number of classes: ", num_classes)
    print(f"Building model with {conv_filters} conv filters, {dense_units} dense units, {activation_fn} activation function and {dropout_rate} dropout rate.")
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.InputLayer(input_shape=input_shape))

    for num_conv_filters in conv_filters:
        model.add(tf.keras.layers.Conv2D(num_conv_filters, kernel_size=(3, 3)))
        model.add(_fresh_activation())
        model.add(tf.keras.layers.MaxPooling2D(pool_size=(2, 2)))

    model.add(tf.keras.layers.Flatten())

    for num_dense_units in dense_units:
        model.add(tf.keras.layers.Dense(num_dense_units))
        model.add(_fresh_activation())
        if dropout_rate > 0.0:
            model.add(tf.keras.layers.Dropout(dropout_rate))

    model.add(tf.keras.layers.Dense(num_classes, activation="softmax"))

    return model
    
def train(model_name, config_defaults, num_classes=89):
    """
    Train the CNN inside a W&B run, evaluate it and upload the best checkpoint.

    Steps: start a ``master-thesis`` training run, load and preprocess the
    train/test/val splits, build and fit the model with checkpointing, W&B
    logging and the slow-convergence callback, evaluate on the test split, log
    summary metrics, and upload ``./artifacts/<model_name>.h5`` as a ``model``
    artifact.

    Args:
        model_name: Used as the run name, the checkpoint file stem and the
            artifact name.
        config_defaults: Configuration passed to ``wandb.init``; the resulting
            run config must expose ``batch_size`` and ``epochs``.
        num_classes: Kept for backward compatibility. The value is always
            replaced by the class count derived from the validation split.
    """
    print("Initializing wandb")
    with wandb.init(
        project="master-thesis",
        job_type="training",
        name=model_name,
        config=config_defaults,
    ) as run:
        ds_train, ds_test, ds_val = load_data(run)

        # The argument is intentionally overridden by the data-derived count.
        num_classes = get_number_of_classes(ds_val)

        batch_size = wandb.config.batch_size
        ds_train = preprocess_dataset(ds_train, batch_size=batch_size)
        ds_val = preprocess_dataset(ds_val, batch_size=batch_size)
        ds_test = preprocess_dataset(ds_test, batch_size=batch_size, cache=False)

        print(f"Model name: {model_name}")
        model = tf.keras.Sequential([
            tf.keras.layers.InputLayer(input_shape=(32, 32, 1)),
            tf.keras.layers.Conv2D(32,kernel_size=(3, 3), activation="relu", padding="same"),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(256*2, activation="relu"),
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(num_classes, activation="softmax"),
            ]
        )

        model.compile(
            optimizer='adam',
            # The output layer already applies softmax, so the loss receives
            # probabilities rather than logits.
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
            metrics=["accuracy"],
        )

        checkpoint_path = f"./artifacts/{model_name}.h5"
        # Make sure the target directory exists before the first checkpoint
        # is written.
        pathlib.Path(checkpoint_path).parent.mkdir(parents=True, exist_ok=True)

        # save the best model
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path,
            save_weights_only=False,
            monitor="val_accuracy",
            mode="max",
            save_best_only=True,
        )

        wandb_callback = wandb.keras.WandbCallback(
            save_model=False,
            compute_flops=True,
        )

        slow_convergence_callback = Terminate_slow_convergence()
        model.fit(
            ds_train,
            epochs=wandb.config.epochs,
            validation_data=ds_val,
            callbacks=[wandb_callback, checkpoint_callback, slow_convergence_callback],
        )
        print("Training finished")

        # calculate model size on disk, flops and number of parameters
        # The FLOPs value is read from the run summary under either spelling
        # of the key.
        flops = wandb.run.summary.get("GFLOPS", None)
        if flops is None:
            flops = wandb.run.summary.get("GFLOPs", None)

        disk_size = calculate_model_size_on_disk(checkpoint_path)
        num_parameters = calculate_model_num_parameters(model)

        # evaluate model on ds_test and log to wandb
        # NOTE(review): this evaluates the in-memory model (weights after the
        # last epoch), while the uploaded artifact is the best checkpoint by
        # val_accuracy -- confirm whether the test metrics should describe
        # the checkpoint instead.
        test_loss, test_acc = model.evaluate(ds_test)

        wandb.log({
            "test loss": test_loss,
            "test accuracy": test_acc,
            "number of parameters": num_parameters,
            "disk size": disk_size,
            "model flops": flops
            })
        # save artifact to wandb
        artifact = wandb.Artifact(name=model_name, type="model")

        # save best model to artifact
        artifact.add_file(checkpoint_path)
        run.log_artifact(artifact)
        run.finish()
        print("Evaluation finished")


In [6]:
# Run configuration handed to train(), which forwards it to wandb.init.
defaults = {
    "batch_size": 128,
    "epochs": 100,
}

train(
    model_name="better_cnn-smaller-dense-layer-padding-same",
    config_defaults=defaults,
)

Initializing wandb


[34m[1mwandb[0m: Downloading large artifact letters_splits_tfds:latest, 54.88MB. 18 files... 
[34m[1mwandb[0m:   18 of 18 files downloaded.  
Done. 0:0:0.0


Model name: better_cnn-smaller-dense-layer-padding-same
Epoch 1/100


2023-01-22 16:11:17.929204: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-22 16:11:17.929259: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1
2023-01-22 16:11:17.929423: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2023-01-22 16:11:17.930064: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-22 16:11:17.930297: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-22 16:11:17.930540: I tensor

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

0,1
accuracy,▁▆▇▇▇███████████████████████████████████
disk size,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
model flops,▁
number of parameters,▁
test accuracy,▁
test loss,▁
val_accuracy,▁▃▅▅▆▆▇▆▇▇▇▇▇▇▇██▇████▇█████████████████
val_loss,█▆▄▃▃▂▂▃▂▂▂▂▂▂▂▁▁▂▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
GFLOPs,0.0057
accuracy,0.99878
best_epoch,98.0
best_val_loss,0.00232
disk size,21235872.0
epoch,99.0
loss,0.00532
model flops,0.0057
number of parameters,1765283.0
test accuracy,0.99896


Evaluation finished
