# Baseline network + hyperparameter sweep

This notebook trains the baseline network with the exact same architecture as the one in the paper [Recognition of handwritten Latin characters with diacritics using CNN](https://journals.pan.pl/dlibra/publication/136210/edition/119099/content/bulletin-of-the-polish-academy-of-sciences-technical-sciences-recognition-of-handwritten-latin-characters-with-diacritics-using-cnn-lukasik-edyta-charytanowicz-malgorzata-milosz-marek-tokovarov-michail-kaczorowska-monika-czerwinski-dariusz-zientarski-tomasz-2021-69-no-1?language=en).

Model architecture description:

- Input layer (32x32 grayscale image) 

- feature extraction
    - Conv2d (padding = 1, kernel = (3,3), stride = 1, activation = relu) 
    - MaxPool layer ( kernel=(2,2), stride=2) 
    - Conv2d layer (padding = 1, kernel = (3,3), stride = 1, activation = relu) 
    - MaxPool layer ( kernel=(2,2), stride=2) 

- dense layers
    - Dense(5376)
    - Dense(256)
    - Dense(number of classes; 89 classes in the paper)




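After training, the model is serialized and uploaded to the W&B project.

Note: the code cells below build and train a ResNet (`MODEL_NAME = "ResNetv1"`) rather than the baseline network described above.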

In [None]:
import wandb
import tensorflow as tf
import numpy as np
import pathlib
import shutil

def load_data(run=None) -> "list[pathlib.Path]":
    """
    Downloads the ``letters_splits`` artifact and unpacks every split archive.

    Each ``<split>.tar.gz`` file found in the downloaded artifact directory is
    extracted into a sibling folder named ``<split>``.

    :run     : W&B run used to fetch the artifact. When None, a new run with
               job_type "preprocessing" is started in the "master-thesis" project.
    :return  : paths of the "train", "test" and "val" folders, in that order.
    """
    # Create the run lazily: a ``wandb.init(...)`` default in the signature is
    # evaluated once, when the function is defined, and would start a run as a
    # side effect even when the caller supplies its own run.
    if run is None:
        run = wandb.init(project="master-thesis", job_type="preprocessing")

    artifact_name = "letters_splits"
    artifact = run.use_artifact(f"master-thesis/{artifact_name}:latest")
    artifact_dir = pathlib.Path(artifact.download()).resolve()

    # Snapshot the listing first: unpacking adds new folders to this directory.
    archive_suffix = ".tar.gz"
    archives = [
        entry for entry in artifact_dir.iterdir() if entry.name.endswith(archive_suffix)
    ]
    for archive in archives:
        # Strip only the trailing suffix to obtain the split name.
        split = archive.name[: -len(archive_suffix)]
        shutil.unpack_archive(archive, artifact_dir / split, format="gztar")

    return [artifact_dir / split for split in ("train", "test", "val")]

def get_number_of_classes(ds: tf.data.Dataset) -> int:
    """
    Counts the classes of a dataset by measuring its ``class_names`` attribute.
    """
    class_names = ds.class_names
    return len(class_names)

def create_tf_dataset(split_path: pathlib.Path, batch_size: int = 32):
    """
    Builds a batched dataset of 32x32 grayscale images from a directory
    that holds one sub-folder per class.
    """
    loader_options = dict(
        image_size=(32, 32),
        batch_size=batch_size,
        color_mode='grayscale',
    )
    return tf.keras.utils.image_dataset_from_directory(split_path, **loader_options)

def preprocess_dataset(ds: tf.data.Dataset, cache: bool = True) -> tf.data.Dataset :
    """
    Scales the images of ``ds`` by 1/255 as float32 and leaves the labels untouched.

    With ``cache`` enabled the scaled dataset is additionally cached and
    prefetched (AUTOTUNE buffer); otherwise it is returned right after scaling.
    """
    def _scale(images, labels):
        # cast to float32 first, then divide by 255
        return tf.cast(images, tf.float32) / 255.0, labels

    scaled = ds.map(_scale)
    if not cache:
        return scaled
    return scaled.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

def calculate_model_size_on_disk(path: str) -> int:
    """
    Returns the size in bytes of the file stored at ``path``.
    """
    file_stats = pathlib.Path(path).stat()
    return file_stats.st_size

def calculate_model_num_parameters(model: tf.keras.Model) -> int:
    """
    Returns the parameter count reported by ``model.count_params()``.
    """
    parameter_count = model.count_params()
    return parameter_count

In [None]:
# Default hyperparameters; handed to wandb.init below as the run config.
defaults = dict(
    batch_size=32 * 4,
    epochs=100,
    optimizer="sgd",
    learning_rate=0.01,
    momentum=0.9,  # the missing comma after this entry made the whole cell a SyntaxError
    stack_n=3,
    initializer="he_normal",
    initial_num_feature_maps=16,
)

MODEL_NAME = "ResNetv1"
run = wandb.init(project="master-thesis", job_type="training", name=MODEL_NAME, config=defaults)
# Reuse the training run for the artifact download so no second run is created.
split_paths = load_data(run=run)

# hyperparameters (read back from the run config)

opt_name = wandb.config.optimizer
initializer_name = wandb.config.initializer
lr = wandb.config.learning_rate
bs = wandb.config.batch_size
epochs = wandb.config.epochs

In [None]:
# One dataset per split; load_data returns the paths in train/test/val order.
ds_train, ds_test, ds_val = (
    create_tf_dataset(path, batch_size=bs) for path in split_paths
)

# Class count is taken from the training split.
num_classes = get_number_of_classes(ds_train)

print(f"There are {num_classes} classes")
print(f"Training set has {len(ds_train)} batches")
print(f"Test set has {len(ds_test)} batches")
print(f"Validation set has {len(ds_val)} batches")

In [None]:
def residual_block(x, number_of_filters, kernel_initializer=None, match_filter_size=False):
    """
    Creates a residual block: conv-BN-ReLU-conv-BN, plus a shortcut, then ReLU.

    :x                  : input tensor, channels last (batch, height, width, channels)
    :number_of_filters  : number of filters of both 3x3 convolutions
    :kernel_initializer : Keras initializer (instance or identifier) for the
                          convolution kernels; HeNormal when None
    :match_filter_size  : when True the first convolution uses stride 2 and the
                          shortcut is subsampled and zero-padded along the channel
                          axis so that it matches the main branch
    :return             : output tensor of the block
    :raises ValueError  : if match_filter_size is set and the input already has
                          more channels than number_of_filters
    """
    # Build the default per call: an instance created in the signature would be
    # created once at definition time and shared by every block.
    if kernel_initializer is None:
        kernel_initializer = tf.keras.initializers.HeNormal()

    def fresh_initializer():
        # Give every layer its own initializer object. Keras warns that a reused
        # unseeded initializer instance may return identical values on each call.
        resolved = tf.keras.initializers.get(kernel_initializer)
        if isinstance(resolved, tf.keras.initializers.Initializer):
            return type(resolved).from_config(resolved.get_config())
        return resolved

    # Keep a handle on the input for the skip connection
    x_skip = x

    # Perform the original mapping; only the stride of the first convolution
    # depends on whether this block changes the feature-map size.
    first_strides = (2, 2) if match_filter_size else (1, 1)
    x = tf.keras.layers.Conv2D(
        number_of_filters,
        kernel_size=(3, 3),
        strides=first_strides,
        kernel_initializer=fresh_initializer(),
        padding="same",
    )(x_skip)
    x = tf.keras.layers.BatchNormalization(axis=3)(x)
    x = tf.keras.layers.Activation("relu")(x)
    x = tf.keras.layers.Conv2D(
        number_of_filters,
        kernel_size=(3, 3),
        kernel_initializer=fresh_initializer(),
        padding="same",
    )(x)
    x = tf.keras.layers.BatchNormalization(axis=3)(x)

    if match_filter_size:
        # Derive the padding from the real channel count instead of assuming
        # that the filter count exactly doubled (and is divisible by 4).
        missing_channels = number_of_filters - x_skip.shape[-1]
        if missing_channels < 0:
            raise ValueError(
                "number_of_filters must be >= the number of input channels "
                "when match_filter_size is True"
            )
        pad_before = missing_channels // 2
        pad_after = missing_channels - pad_before
        # Parameter-free shortcut: keep every second pixel, zero-pad the channels.
        x_skip = tf.keras.layers.Lambda(
            lambda t: tf.pad(
                t[:, ::2, ::2, :],
                [[0, 0], [0, 0], [0, 0], [pad_before, pad_after]],
                mode="CONSTANT",
            )
        )(x_skip)

    # Add the skip connection to the regular mapping
    x = tf.keras.layers.Add()([x, x_skip])
    x = tf.keras.layers.Activation("relu")(x)
    return x

def _copy_initializer(initializer):
    """Returns an independent copy of a Keras initializer (or the identifier's instance)."""
    resolved = tf.keras.initializers.get(initializer)
    if isinstance(resolved, tf.keras.initializers.Initializer):
        return type(resolved).from_config(resolved.get_config())
    return resolved


class _ResidualUnit(tf.keras.layers.Layer):
    """One residual unit: conv-BN-ReLU-conv-BN, parameter-free shortcut, ReLU."""

    def __init__(self, filters, downsample, kernel_initializer, **kwargs):
        super().__init__(**kwargs)
        self.filters = filters
        self.downsample = downsample
        # Sub-layers are attributes so that Keras tracks their weights.
        self.conv1 = tf.keras.layers.Conv2D(
            filters,
            kernel_size=(3, 3),
            strides=(2, 2) if downsample else (1, 1),
            kernel_initializer=_copy_initializer(kernel_initializer),
            padding="same",
        )
        self.bn1 = tf.keras.layers.BatchNormalization(axis=3)
        self.conv2 = tf.keras.layers.Conv2D(
            filters,
            kernel_size=(3, 3),
            kernel_initializer=_copy_initializer(kernel_initializer),
            padding="same",
        )
        self.bn2 = tf.keras.layers.BatchNormalization(axis=3)

    def call(self, inputs, training=None):
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = tf.nn.relu(x)
        x = self.conv2(x)
        x = self.bn2(x, training=training)

        skip = inputs
        if self.downsample:
            # Keep every second pixel and zero-pad the channel axis so the
            # shortcut matches the strided main branch.
            skip = inputs[:, ::2, ::2, :]
            missing_channels = self.filters - skip.shape[-1]
            if missing_channels < 0:
                raise ValueError(
                    "A downsampling unit needs at least as many filters as input channels"
                )
            pad_before = missing_channels // 2
            skip = tf.pad(
                skip,
                [[0, 0], [0, 0], [0, 0], [pad_before, missing_channels - pad_before]],
                mode="CONSTANT",
            )
        return tf.nn.relu(x + skip)


class ResidualBlocks(tf.keras.layers.Layer):
    """
    Stack of 3 groups of residual units, ``units`` units per group.

    Paper: "Then we use a stack of 6n layers (...) with 2n layers for each
    feature map size." 6n/2n = 3, so there are always 3 groups, and as each
    unit has 2 weighted layers every group holds n = ``units`` units.
    From the second group onwards, the first unit of a group doubles the
    number of filters and halves the spatial size.

    :filter_size         : number of filters used by the first group
    :units               : number of residual units per group
    :kernel_initializer  : Keras initializer (instance or identifier) for all
                           convolution kernels; HeNormal when None
    """

    NUM_GROUPS = 3

    def __init__(self, filter_size, units=32, kernel_initializer=None, **kwargs):
        # **kwargs (name, dtype, ...) are needed so from_config can rebuild the layer.
        super().__init__(**kwargs)
        if kernel_initializer is None:
            kernel_initializer = tf.keras.initializers.HeNormal()
        self.filter_size = filter_size
        self.units = units
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)

        # All sub-layers are created once, here. Layers created inside call()
        # are not tracked by Keras and would be rebuilt on every invocation.
        residual_units = []
        filters = filter_size  # local counter: self.filter_size is never mutated
        for group in range(self.NUM_GROUPS):
            for block in range(units):
                downsample = group > 0 and block == 0
                if downsample:
                    filters *= 2
                residual_units.append(
                    _ResidualUnit(filters, downsample, self.kernel_initializer)
                )
        self.residual_units = residual_units

    def call(self, x, training=None):
        for unit in self.residual_units:
            x = unit(x, training=training)
        return x

    def get_config(self):
        # Required for saving a model that contains this layer as a whole
        # (architecture + weights), not only its weights.
        config = super().get_config()
        config.update(
            {
                "filter_size": self.filter_size,
                "units": self.units,
                "kernel_initializer": tf.keras.initializers.serialize(
                    self.kernel_initializer
                ),
            }
        )
        return config

def get_resnet(
    num_classes: int, n_stacks: int = 3, filter_size: int = 64, initializer=None
):
    """
    Builds ResNet model with 6n + 2 weighted layers, default ResNet-20
    (1 stem convolution + 3 groups * n blocks * 2 convolutions + 1 dense layer).

    :num_classes  : number of output logits
    :n_stacks     : n, the number of residual blocks per feature-map size
    :filter_size  : number of filters in the first convolutional layer - 16, 32 or 64
    :initializer  : kernel initializer identifier or instance; when None the
                    "initializer" entry of the notebook-level ``defaults`` dict is used
    :return       : uncompiled tf.keras.Model mapping (32, 32, 1) inputs to logits
    """
    # Fall back to the notebook configuration only when nothing was passed in.
    initializer_id = defaults.get("initializer") if initializer is None else initializer

    def new_initializer():
        # Resolving a string/dict identifier yields a new instance per layer.
        return tf.keras.initializers.get(initializer_id)

    # Define model structure
    # logits are returned because Softmax is pushed to loss function.
    x_input = tf.keras.layers.Input(shape=(32, 32, 1))
    x = tf.keras.layers.Conv2D(
        filter_size,
        kernel_size=(3, 3),
        strides=(1, 1),
        kernel_initializer=new_initializer(),
        padding="same",
    )(x_input)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation("relu")(x)
    x = ResidualBlocks(filter_size, units=n_stacks, kernel_initializer=new_initializer())(x)
    # Global average pooling already yields a (batch, channels) tensor, so no
    # additional Flatten layer is needed before the classifier.
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    outputs = tf.keras.layers.Dense(num_classes, kernel_initializer=new_initializer())(x)

    # Name the model after its actual depth instead of always "ResNet-20".
    return tf.keras.Model(x_input, outputs, name=f"ResNet-{6 * n_stacks + 2}")

In [None]:
# NOTE: re-running this cell normalizes the already normalized datasets again,
# because the results are bound to the same names.
ds_train = preprocess_dataset(ds_train)
ds_val = preprocess_dataset(ds_val)
ds_test = preprocess_dataset(ds_test, cache=False)

# get_resnet is the builder defined in this notebook; the previously called
# get_resnet_v2 is not defined in it and has the same signature.
# NOTE(review): the run config holds stack_n=3 and initial_num_feature_maps=16,
# while 64 filters are hard-coded here - confirm which value is intended.
model = get_resnet(num_classes, n_stacks=3, filter_size=64)

# Forward the configured momentum; it was silently ignored before. It is only
# passed to SGD because not every optimizer accepts that argument.
optimizer_config = {'learning_rate': lr}
momentum = wandb.config.get("momentum")
if str(opt_name).lower() == "sgd" and momentum is not None:
    optimizer_config['momentum'] = momentum

opt = tf.keras.optimizers.get({
    'class_name': opt_name,
    'config': optimizer_config,
})

model.compile(
    optimizer=opt,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

wandb_callback = wandb.keras.WandbCallback(
    save_model=False,
    compute_flops=True,
)

# save the best model (highest validation accuracy)
checkpoint_path = f"./artifacts/{MODEL_NAME}.h5"
# make sure the target folder exists before the first checkpoint is written
pathlib.Path(checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=False,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
)

history = model.fit(
    ds_train,
    epochs=epochs,
    validation_data=ds_val,
    callbacks=[wandb_callback, checkpoint_callback],
)

In [None]:
# plot training and validation accuracy per epoch
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
# Use a separate name for the x axis: rebinding `epochs` would replace the
# epoch-count hyperparameter with a range and break a re-run of model.fit.
epoch_axis = range(1, len(history.history["loss"]) + 1)
ax.plot(epoch_axis, history.history["accuracy"], label="accuracy")
ax.plot(epoch_axis, history.history["val_accuracy"], label="val_accuracy")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.legend(loc="lower right")

plt.show()

In [None]:
# calculate model size on disk, flops and number of parameters

flops = wandb.run.summary["GFLOPs"]
disk_size = calculate_model_size_on_disk(f"./artifacts/{MODEL_NAME}.h5")
num_parameters = calculate_model_num_parameters(model)

# evaluate model on ds_test and log to wandb
test_loss, test_acc = model.evaluate(ds_test)

wandb.log({
    "test loss": test_loss,
    "test accuracy": test_acc,
    "number of parameters": num_parameters,
    "disk size": disk_size,
    "model flops": flops
    })


# Label index -> Polish diacritic character.
# NOTE(review): assumes these indices match the label order produced by
# image_dataset_from_directory for this dataset - TODO confirm.
diacritics = {
    62: "ą",
    63: "ć",
    64: "ę",
    65: "ł",
    66: "ń",
    67: "ó",
    68: "ś",
    69: "ź",
    70: "ż",
    71: "Ą",
    72: "Ć",
    73: "Ę",
    74: "Ł",
    75: "Ń",
    76: "Ó",
    77: "Ś",
    78: "Ź",
    79: "Ż"
}

# log test accuracy on these classes separately to wandb

diacritics_acc = {}
diacritics_log = {}
for diacritic_label, diacritic_char in diacritics.items():
    # ds_test is batched, so a filter on it would receive a whole label vector;
    # the predicate must be scalar. Filter single samples and batch them again.
    ds_test_diacritic = (
        ds_test.unbatch()
        .filter(lambda x, y: tf.equal(y, diacritic_label))
        .batch(bs)
    )
    # separate names so the overall test_loss / test_acc are not overwritten
    diacritic_loss, diacritic_accuracy = model.evaluate(ds_test_diacritic)
    diacritics_acc[diacritic_label] = {
        "loss": diacritic_loss,
        "accuracy": diacritic_accuracy,
        "label": diacritic_char,
    }
    # wandb.log expects string keys, so log flat, named metrics
    metric_prefix = f"diacritics/{diacritic_label}_{diacritic_char}"
    diacritics_log[f"{metric_prefix}/loss"] = diacritic_loss
    diacritics_log[f"{metric_prefix}/accuracy"] = diacritic_accuracy

wandb.log(diacritics_log)

In [None]:
# Upload the checkpoint file as a W&B "model" artifact, then close the run.
best_model_path = f"./artifacts/{MODEL_NAME}.h5"

artifact = wandb.Artifact(name=MODEL_NAME, type="model")
artifact.add_file(best_model_path)

run.log_artifact(artifact)
run.finish()