# Dataset overview

In this notebook we review class counts in train and validation splits

In [2]:
import wandb
import pathlib
import shutil
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from typing import List


def load_data(run: wandb.sdk.wandb_run.Run) -> pathlib.Path:
    """
    Unpacks data from an artifact into a folder and returns the path to the folder.
    """

    artifact_name = f"letters_splits"
    artifact = run.use_artifact(f"master-thesis/{artifact_name}:latest")
    artifact_dir = pathlib.Path(
        f"./artifacts/{artifact.name.replace(':', '-')}"
    ).resolve()
    if not artifact_dir.exists():
        artifact_dir = artifact.download()
        artifact_dir = pathlib.Path(artifact_dir).resolve()
        for split_file in artifact_dir.iterdir():
            if split_file.name.endswith(".tar.gz"):
                split = split_file.name.replace(".tar.gz", "")
                shutil.unpack_archive(split_file, artifact_dir / split, format="gztar")

    return [artifact_dir / split for split in ["train", "test", "val"]]

In [3]:
run = wandb.init(project="master-thesis", job_type="preprocessing")
split_paths = load_data(run=run)

ds_train = tf.keras.utils.image_dataset_from_directory(
        split_paths[0],
        image_size=(32, 32),
        color_mode="grayscale",
    )

ds_test = tf.keras.utils.image_dataset_from_directory(
        split_paths[1],
        image_size=(32, 32),
        color_mode="grayscale",
    )

ds_val = tf.keras.utils.image_dataset_from_directory(
        split_paths[2],
        image_size=(32, 32),
        color_mode="grayscale",
    )

number_of_classes = len(ds_train.class_names)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/wflis/.netrc


[34m[1mwandb[0m: Downloading large artifact letters_splits:latest, 86.98MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:7.3


Found 446491 files belonging to 89 classes.


2023-01-15 19:20:50.057010: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-01-15 19:20:50.057065: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-01-15 19:20:50.057083: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ITEM-S127495): /proc/driver/nvidia/version does not exist
2023-01-15 19:20:50.057381: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found 55773 files belonging to 89 classes.
Found 55773 files belonging to 89 classes.


In [4]:
char_id_to_class_name = {
        0: "0",
        1: "1",
        2: "2",
        3: "3",
        4: "4",
        5: "5",
        6: "6",
        7: "7",
        8: "8",
        9: "9",
        10: "a",
        11: "b",
        12: "c",
        13: "d",
        14: "e",
        15: "f",
        16: "g",
        17: "h",
        18: "i",
        19: "j",
        20: "k",
        21: "l",
        22: "m",
        23: "n",
        24: "o",
        25: "p",
        26: "q",
        27: "r",
        28: "s",
        29: "t",
        30: "u",
        31: "v",
        32: "w",
        33: "x",
        34: "y",
        35: "z",
        36: "A",
        37: "B",
        38: "C",
        39: "D",
        40: "E",
        41: "F",
        42: "G",
        43: "H",
        44: "I",
        45: "J",
        46: "K",
        47: "L",
        48: "M",
        49: "N",
        50: "O",
        51: "P",
        52: "Q",
        53: "R",
        54: "S",
        55: "T",
        56: "U",
        57: "V",
        58: "W",
        59: "X",
        60: "Y",
        61: "Z",
        # then lowercase letters of the Polish alphabet: ą, ć, ę, ł, ń, ó, ś, ź, ż
        62: "ą",
        63: "ć",
        64: "ę",
        65: "ł",
        66: "ń",
        67: "ó",
        68: "ś",
        69: "ź",
        70: "ż",
        # then uppercase letters of the Polish alphabet: Ą, Ć, Ę, Ł, Ń, Ó, Ś, Ź, Ż
        71: "Ą",
        72: "Ć",
        73: "Ę",
        74: "Ł",
        75: "Ń",
        76: "Ó",
        77: "Ś",
        78: "Ź",
        79: "Ż",
        # then special characters: + - : ; $ ! ? @
        80: "+",
        81: "-",
        82: ":",
        83: ";",
        84: "$",
        85: "!",
        86: "?",
        87: "@",
        88: ".",
    }

In [10]:
import math

CLASSES = list(char_id_to_class_name.keys())

# TFRecord util functions

def decode_jpeg_and_label(filename):
  bits = tf.io.read_file(filename)
  image = tf.io.decode_jpeg(bits)
  # parse flower name from containing directory
  label = tf.strings.split(tf.expand_dims(filename, axis=-1), sep='/')
  label = label.values[-2]
  return image, label

def recompress_image(image, label):
  height = tf.shape(image)[0]
  width = tf.shape(image)[1]
  image = tf.cast(image, tf.uint8)
  image = tf.image.encode_jpeg(image, optimize_size=True, chroma_downsampling=False)
  return image, label, height, width

# Three types of data can be stored in TFRecords: bytestrings, integers and floats
# They are always stored as lists, a single data element will be a list of size 1

def _bytestring_feature(list_of_bytestrings):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=list_of_bytestrings))

def _int_feature(list_of_ints): # int64
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list_of_ints))

def _float_feature(list_of_floats): # float32
  return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats))

def to_tfrecord(tfrec_filewriter, img_bytes, label, height, width):  
  class_num = np.argmax(np.array(CLASSES)==label) # 'roses' => 2 (order defined in CLASSES)
  one_hot_class = np.eye(len(CLASSES))[class_num]     # [0, 0, 1, 0, 0] for class #2, roses

  feature = {
      "image": _bytestring_feature([img_bytes]), # one image in the list
      "class": _int_feature([class_num]),        # one class in the list
      
      # additional (not very useful) fields to demonstrate TFRecord writing/reading of different types of data
      "label":         _bytestring_feature([label]),          # fixed length (1) list of strings, the text label
      "size":          _int_feature([height, width]),         # fixed length (2) list of ints
      "one_hot_class": _float_feature(one_hot_class.tolist()) # variable length  list of floats, n=len(CLASSES)
  }
  return tf.train.Example(features=tf.train.Features(feature=feature))

def write_tfrecords(split_path, data_dir, shards=16, output_dir="./datasets_tfrecords"):
  output_dir = pathlib.Path(output_dir).resolve()
  output_dir.mkdir(exist_ok=True)

  DATA_OUTPUT = str(output_dir) 
  DATA_PATTERN = f"{split_path}/*/*.png" 
  SHARDS = shards
  AUTOTUNE = tf.data.AUTOTUNE

  nb_images = len(tf.io.gfile.glob(DATA_PATTERN))
  shard_size = math.ceil(1.0 * nb_images / SHARDS)
  print("Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each.".format(nb_images, SHARDS, shard_size))

  filenames = tf.data.Dataset.list_files(DATA_PATTERN, seed=35155) # This also shuffles the images
  dataset1 = filenames.map(decode_jpeg_and_label, num_parallel_calls=AUTOTUNE)

  dataset3 = dataset1.map(recompress_image, num_parallel_calls=AUTOTUNE)
  dataset3 = dataset3.batch(shard_size) # sharding: there will be one "batch" of images per file 

  print("Writing TFRecords")
  for shard, (image, label, height, width) in enumerate(dataset3):
    # batch size used as shard size here
    shard_size = image.numpy().shape[0]
    # good practice to have the number of records in the filename
    filename = DATA_OUTPUT + "{:02d}-{}.tfrec".format(shard, shard_size)
    
    with tf.io.TFRecordWriter(filename) as out_file:
      for i in range(shard_size):
        example = to_tfrecord(out_file,
                              image.numpy()[i], # re-compressed image: already a byte string
                              label.numpy()[i],
                              height.numpy()[i],
                              width.numpy()[i])
        out_file.write(example.SerializeToString())
      print("Wrote file {} containing {} records".format(filename, shard_size))

In [24]:
output_dir = pathlib.Path("./datasets_tfrecords").resolve()
output_dir.mkdir(exist_ok=True)

shards=16
output_dir="./datasets_tfrecords"

split_path = split_paths[0]
DATA_OUTPUT = str(output_dir) 
DATA_PATTERN = f"{split_path}/*/*.png" 
SHARDS = shards
AUTOTUNE = tf.data.AUTOTUNE

nb_images = len(tf.io.gfile.glob(DATA_PATTERN))
shard_size = math.ceil(1.0 * nb_images / SHARDS)
print("Pattern matches {} images which will be rewritten as {} .tfrec files containing {} images each.".format(nb_images, SHARDS, shard_size))

Pattern matches 446491 images which will be rewritten as 16 .tfrec files containing 27906 images each.


In [20]:
filenames = tf.data.Dataset.list_files(DATA_PATTERN, seed=35155) # This also shuffles the images
dataset1 = filenames.map(decode_jpeg_and_label, num_parallel_calls=AUTOTUNE)

<ParallelMapDataset element_spec=(TensorSpec(shape=(None, None, None), dtype=tf.uint8, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>

In [21]:
dataset3 = dataset1.map(recompress_image, num_parallel_calls=AUTOTUNE)
dataset3 = dataset3.batch(shard_size) # sharding: there will be one "batch" of images per file 

In [23]:
print("Writing TFRecords")
for shard, (image, label, height, width) in enumerate(dataset3):
    # batch size used as shard size here
    shard_size = image.numpy().shape[0]
    # good practice to have the number of records in the filename
    filename = DATA_OUTPUT + "/{:02d}-{}.tfrec".format(shard, shard_size)


    with tf.io.TFRecordWriter(filename) as out_file:
        for i in range(shard_size):
            '''
            example = to_tfrecord(out_file,
                                    image.numpy()[i], # re-compressed image: already a byte string
                                    label.numpy()[i],
                                    height.numpy()[i],
                                    width.numpy()[i])
            '''
            #out_file.write(example.SerializeToString())
            print("Wrote file {} containing {} records".format(filename, shard_size))

Writing TFRecords
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec containing 13953 records
Wrote file ./datasets_tfrecords/00-13953.tfrec

KeyboardInterrupt: 

In [11]:
# save as TFRecords and upload to WandB

tfrecord_dir = "./datasets_tfrecords"
for split_path in split_paths:
  write_tfrecords(split_path, tfrecord_dir)

artifact = wandb.Artifact("letters_splits_tfds", type="dataset", description="Dataset splits in tf.data.TFRecord format")
artifact.add_dir(tfrecord_dir)
run.log_artifact(artifact)

NotImplementedError: Non-relative patterns are unsupported

In [None]:
# save datasets on disk then upload to wandb as artifacts

output_dir = pathlib.Path("./datasets").resolve()
output_dir.mkdir(exist_ok=True)

ds_train.save(str(output_dir / "train"), compression="GZIP")
ds_val.save(str(output_dir / "val"), compression="GZIP")
ds_test.save(str(output_dir / "test"), compression="GZIP")

artifact = wandb.Artifact("letters_splits_tfds", type="dataset", description="Dataset splits in tf.data.Dataset format")
artifact.add_dir(output_dir)
run.log_artifact(artifact)

In [None]:
# calculate class count for each split
train_class_count = np.zeros(number_of_classes)
for _, label in ds_train:
    train_class_count += tf.math.bincount(label, minlength=number_of_classes)

val_class_count = np.zeros(number_of_classes)
for _, label in ds_val:
    val_class_count += tf.math.bincount(label, minlength=number_of_classes)

# plot class count for each split
plt.bar(ds_train.class_names, train_class_count)
plt.title("Train")
plt.show()

plt.bar(ds_val.class_names, val_class_count)

In [None]:
# log class count for each split to wandb

wandb.log({"train_class_count": wandb.Histogram(train_class_count)})
wandb.log({"val_class_count": wandb.Histogram(val_class_count)})
