In [64]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
import tensorflow as tf
from tensorflow.python.data import AUTOTUNE
# from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten, Dropout, GlobalMaxPooling2D, Activation, Rescaling
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential as SequentialModel
# import keras_tuner as kt
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# import pandas as pd
# import numpy as np
import matplotlib.pyplot as plt
# import itertools

import os
# import time

from pathlib import Path

tf.random.set_seed(5263)
tf.__version__

'2.14.0'

This cell is equivalent to importing `dataset_builder.py`

In [66]:
import pandas as pd

from os import walk
from os.path import isfile, join

from pathlib import Path

# TODO: Add absolute option to choose whether paths become absolute when compiled,
#  then use that to make a function that can store the compiled labels as a singular CSV

class NoLabelFilesFoundError(Exception):
    def __init__(self, *args):
        super().__init__(*args)

def compile_label_files_in_directory(directory: Path, label_file_name: str) -> list[Path]:
    """
    Looks for all files with the name `label_file_name`, and returns all absolute paths of those files.

    :param directory: The root directory to check for label files within
    :param label_file_name: The name of the label file to look for (assumes all requested label files have this name)

    :return: A list of all `pathlib.Path` of the found label files. If no label files are found, an empty list will be returned.
    """

    if type(directory) == str:
        directory = Path(directory)
    # Include subclasses like `WindowsPath`, `PosixPath` etc
    elif not isinstance(directory, Path):
        raise ValueError("`directory` must be a `Path` to the requested dataset.")

    label_file_paths = []

    if not directory.is_dir():
        raise ValueError(f"The `directory` '{directory}' is not valid. Please check your arguments.")

    else:
        for cur_path, directories, files in walk(directory):
            for file in files:
                if file == label_file_name:
                    label_file_paths.append(Path(join(cur_path, file)))
                    break

    if not label_file_paths:
        raise NoLabelFilesFoundError("No label files were found in this directory.")

    return label_file_paths


def compile_dataset_labels(label_file_paths: list[Path]) -> pd.DataFrame:
    """
    Checks all label files within `label_file_paths`, loads each file one at a time, and concatenates all rows
    into a single `DataFrame`. All paths are made absolute to ensure consistency with usage from other libraries.

    :param label_file_paths: The paths of all label files to concatenate together. All files must be `CSV` files

    :return: A `DataFrame` with all label file rows concatenated together, with paths resolved to be absolute.
    """

    # Anything that can be iterated through in the same way as a list will work, so tuples and sets are fine
    if type(label_file_paths) in (list, tuple, set):
        if not all(isinstance(label_file, Path) for label_file in label_file_paths):
            raise ValueError("All paths within `label_file_paths` must be a `pathlib.Path`.")
    else:
        raise ValueError("A list of paths containing all label files is required.")

    dataframes = []

    for label_file in label_file_paths:
        try:
            df = pd.read_csv(label_file)
        except pd.errors.ParserError:
            raise ValueError("Invalid label file given - Ensure all files are CSV files.")

        # Get the folder the CSV resides in - it should be in the same folder as the data,
        # so we can use that to get their paths too.
        data_folder = label_file.parent
        for row_num, row in df.iterrows():
            sample_path = (data_folder / row[0]).resolve()

            if isfile(sample_path):
                df.loc[row_num, "file_name"] = str(sample_path)
            else:
                raise ValueError(f"'{row[0]}' does not resolve to a real file. Check that all files are present. "
                                 f"(File path checked: {sample_path})")

        dataframes.append(df)

    return pd.concat(dataframes, axis=0)


def dataset_labels_to_dataframe(dataset_directory: Path, label_file_name: str) -> pd.DataFrame:
    """
    Convenience function that runs `compile_label_files_in_directory` and `compile_dataset_labels`.

    :param dataset_directory: The directory of the dataset to collect and compile label files from.
    :param label_file_name: The name of the label file (assumes that the name of the file will be the same for each dataset within the directory)

    :return: A `DataFrame` that holds all entries from each label file found, with all file paths resolved to be absolute.
    """
    return compile_dataset_labels(compile_label_files_in_directory(dataset_directory, label_file_name))


In [67]:
dataset_path = Path("") / "drive" / "MyDrive" / "WA.M.AI datasets" / "FiNALE 1" / "train"
eval_dataset_path = Path("") / "drive" / "MyDrive" / "WA.M.AI datasets" / "FiNALE 1" / "eval"

df = dataset_labels_to_dataframe(dataset_path, "8_zone_presence_labels.csv")
eval_df = dataset_labels_to_dataframe(eval_dataset_path, "8_zone_presence_labels.csv")
# Dataset size
len(df.index)

6350

In [68]:
file_names = df["file_name"].values
eval_file_names = eval_df["file_name"].values

file_names[0]

'/content/drive/MyDrive/WA.M.AI datasets/FiNALE 1/train/mkdr-basic-15-played/mkdr-basic-15-played-0000000000.png'

In [69]:
labels = df.drop("file_name", axis=1)
eval_labels = eval_df.drop("file_name", axis=1)

labels.head()

Unnamed: 0,b1,b2,b3,b4,b5,b6,b7,b8
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False


In [70]:
# Initially loading dataset with paths - will be replaced with image source retroactively
dataset = tf.data.Dataset.from_tensor_slices((file_names, labels))
eval_dataset = tf.data.Dataset.from_tensor_slices((eval_file_names, eval_labels))
# Shuffle whole dataset before loading images
dataset.shuffle(buffer_size=dataset.cardinality(), reshuffle_each_iteration=True)
dataset

<_TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(8,), dtype=tf.bool, name=None))>

In [71]:
# Normalise image values to 0-1 from 0-255, and ensure all images are 400x400
IMG_SIZE = (400, 400, 3)
resize_and_rescale = tf.keras.Sequential([
  layers.Resizing(IMG_SIZE[0], IMG_SIZE[1]),
  layers.Rescaling(1./255)
])

def read_image(image_path, labels):
    # Read data from file path and resize to 400x400
    image = tf.io.read_file(image_path)
    image = tf.image.decode_png(image, channels=3)
    image = resize_and_rescale(image)
    # `decode_png` does not state the size of the image, so we force it here`
    # image.set_shape([400, 400, 3])
    # In a normal dataset, the first dimension states how many samples are in the set.
    # Since teach tensor set here represents 1 sample, we insert a new dimension (which will have length 1)
    # to state that there is 1 sample
    image = tf.expand_dims(image, 0)
    labels = [labels]

    return image, labels

dataset = dataset.map(read_image)
eval_dataset = eval_dataset.map(read_image)
# val_ds = val_ds.map(read_image)
# train_ds
dataset

<_MapDataset element_spec=(TensorSpec(shape=(1, 400, 400, 3), dtype=tf.float32, name=None), TensorSpec(shape=(1, 8), dtype=tf.bool, name=None))>

In [72]:
# https://www.tensorflow.org/tutorials/images/data_augmentation
def augment(image_label, seed):
  image, label = image_label
  # Make a new seed.
  new_seed = tf.random.split(seed, num=1)[0, :]
  # Random brightness. - NOTE: This has been adjusted as 0.5 delta creates
  # such a difference that some notes may be indiscernable,
  # and cause a garbage-in-garbage-out-loop
  image = tf.image.stateless_random_brightness(
      image, max_delta=0.25, seed=new_seed)
  image = tf.clip_by_value(image, 0, 1)
  return image, label

rng = tf.random.Generator.from_seed(9172, alg='philox')

# Create a wrapper function for updating seeds.
# @tf.function
def augment_with_seed(image, label):
  seed = rng.make_seeds(2)[0]
  image, label = augment((image, label), seed)
  return image, label
## raises `OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed.`
dataset = dataset.map(augment_with_seed, num_parallel_calls=AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [73]:
# Visualise the first 5 items in the dataset to ensure that augmentation works as expected
def visualize(original):
  fig = plt.figure()
  plt.subplot(1,2,1)
  plt.title('Original image')
  plt.imshow(original)

items = dataset.take(20)

for i in items.as_numpy_iterator():
    visualize(i[0][0])

Output hidden; open in https://colab.research.google.com to view.

In [74]:
wamai_model = SequentialModel([

    # layers.InputLayer(input_shape=(400, 400, 3)),
    layers.Conv2D(filters = 32, kernel_size = (3,3), strides = 2, input_shape = IMG_SIZE),
    layers.Activation('relu'),
    layers.BatchNormalization(),

    layers.Conv2D(filters = 64, kernel_size = (3,3), strides = 2),
    layers.Activation('relu'),
    layers.BatchNormalization(),

    layers.Conv2D(filters = 128, kernel_size = (3,3), strides = 2),
    layers.Activation('relu'),
    layers.BatchNormalization(),

    layers.MaxPool2D(pool_size = (2, 2)),
    layers.Conv2D(filters = 32, kernel_size = (3,3)),
    layers.Activation('relu'),
    layers.BatchNormalization(),

    layers.MaxPool2D(pool_size = (2, 2)),
    layers.Flatten(),

    layers.Dense(units = 512),
    layers.Activation('relu'),

    layers.Dense(units = 512),
    layers.Activation('relu'),

    layers.Dense(units = 8),
    layers.Activation('sigmoid')

])

wamai_model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_8 (Conv2D)           (None, 199, 199, 32)      896       
                                                                 
 activation_14 (Activation)  (None, 199, 199, 32)      0         
                                                                 
 batch_normalization_8 (Bat  (None, 199, 199, 32)      128       
 chNormalization)                                                
                                                                 
 conv2d_9 (Conv2D)           (None, 99, 99, 64)        18496     
                                                                 
 activation_15 (Activation)  (None, 99, 99, 64)        0         
                                                                 
 batch_normalization_9 (Bat  (None, 99, 99, 64)        256       
 chNormalization)                                     

In [75]:
wamai_model.compile(optimizer=tf.keras.optimizers.Adam(),
                    loss='binary_crossentropy',
                    metrics=["accuracy"])

In [76]:
model_folder = Path("") / "drive" / "MyDrive" / "WA.M.AI models" / "FiNALE proto 1"
# model_folder_string = str(model_folder)
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

checkpoint_filepath = str(model_folder / "WA.M.AI-p-finale-epoch{epoch:02d}-l{val_loss:.2f}-acc{val_accuracy:.4f}.keras")

# Save a new model every time it improves
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    monitor='val_loss',
    filepath=checkpoint_filepath,
    save_best_only=False)
# Stop training once the model has not improved on the validation dataset for 7 epochs
# early_stopping_callback = tf.keras.callbacks.EarlyStopping(patience=20)

wamai_model.fit(dataset, epochs=200, callbacks=[model_checkpoint_callback], validation_data=eval_dataset)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
 398/6350 [>.............................] - ETA: 1:04 - loss: 0.0062 - accuracy: 0.4347

KeyboardInterrupt: ignored