# Dataset overview

In this notebook we review the class counts in the train and validation splits.

In [1]:
import pathlib
import shutil
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import wandb


def load_data(run: wandb.sdk.wandb_run.Run) -> List[tf.data.Dataset]:
    """
    Downloads datasets from a wandb artifact and loads them into a list of tf.data.Datasets.

    The ``master-thesis/letters_splits_tfds:latest`` artifact is requested through
    ``run.use_artifact``. It is downloaded unless the expected local directory
    under ``./artifacts`` already exists.

    Args:
        run: wandb run used to resolve the artifact.

    Returns:
        The loaded datasets in the order ``[train, test, val]``.
    """

    artifact_name = "letters_splits_tfds"
    artifact = run.use_artifact(f"master-thesis/{artifact_name}:latest")

    # Directory where a previous download is expected to live
    # (artifact name with ":" replaced by "-").
    artifact_dir = pathlib.Path(
        f"./artifacts/{artifact.name.replace(':', '-')}"
    ).resolve()
    if not artifact_dir.exists():
        # Nothing at the expected location: download and use the directory wandb reports.
        artifact_dir = pathlib.Path(artifact.download()).resolve()

    # Every split is read from its own sub-directory using GZIP compression.
    return [
        tf.data.Dataset.load(str(artifact_dir / split), compression="GZIP")
        for split in ("train", "test", "val")
    ]


2023-01-14 10:30:50.098943: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-14 10:30:51.384342: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda-11.2/lib64:
2023-01-14 10:30:51.384410: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda-11.2/lib64:


In [2]:
# Start the wandb run that tracks this notebook and fetch the three splits.
run = wandb.init(project="master-thesis", job_type="preprocessing")
split_paths = load_data(run=run)

# NOTE(review): load_data is documented as returning tf.data.Dataset objects,
# while image_dataset_from_directory below receives each entry as a directory
# - confirm which of the two is intended.
# Entries are consumed in the order train, test, val; every split is read as
# 32x32 grayscale images.
ds_train, ds_test, ds_val = (
    tf.keras.utils.image_dataset_from_directory(
        split_path,
        image_size=(32, 32),
        color_mode="grayscale",
    )
    for split_path in split_paths[:3]
)

# Number of distinct labels, taken from the class names of the train split.
number_of_classes = len(ds_train.class_names)

[34m[1mwandb[0m: Currently logged in as: [33mgratkadlafana[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact letters_splits:latest, 86.98MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.3


Found 446491 files belonging to 89 classes.


2023-01-14 10:32:33.291689: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-01-14 10:32:33.291708: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (wiktor-on-linux): /proc/driver/nvidia/version does not exist
2023-01-14 10:32:33.293038: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found 55773 files belonging to 89 classes.
Found 55773 files belonging to 89 classes.


In [4]:
# Write every split to ./datasets and upload that directory as a wandb artifact.

output_dir = pathlib.Path("./datasets").resolve()
output_dir.mkdir(exist_ok=True)

# One GZIP-compressed sub-directory per split, written in the order train, val, test.
for split_name, split_ds in (("train", ds_train), ("val", ds_val), ("test", ds_test)):
    split_ds.save(str(output_dir / split_name), compression="GZIP")

artifact = wandb.Artifact(
    "letters_splits_tfds",
    type="dataset",
    description="Dataset splits in tf.data.Dataset format",
)
artifact.add_dir(output_dir)
run.log_artifact(artifact)

[34m[1mwandb[0m: Adding directory to artifact (/home/wiktor/code/master-thesis/notebooks/datasets)... Done. 0.2s


<wandb.sdk.wandb_artifacts.Artifact at 0x7fa0f0aa8ee0>

In [None]:
def count_classes(dataset, num_classes):
    """Accumulate per-class label counts over every batch of ``dataset``.

    Args:
        dataset: Iterable yielding ``(images, labels)`` pairs; only the labels
            are used. Labels are assumed to be non-negative integer class
            indices - TODO confirm.
        num_classes: Length of the returned count vector.

    Returns:
        A NumPy array of length ``num_classes`` whose i-th entry is the number
        of labels equal to ``i``.
    """
    counts = np.zeros(num_classes)
    for _, label in dataset:
        # minlength keeps every batch's histogram the same length, so batches
        # that miss the highest classes can still be summed.
        counts += tf.math.bincount(label, minlength=num_classes).numpy()
    return counts


# calculate class count for each split
train_class_count = count_classes(ds_train, number_of_classes)
val_class_count = count_classes(ds_val, number_of_classes)

# plot class count for each split
for split_title, class_names, class_count in (
    ("Train", ds_train.class_names, train_class_count),
    ("Validation", ds_val.class_names, val_class_count),
):
    # Wide figure and rotated tick labels so the many class names stay readable.
    fig, ax = plt.subplots(figsize=(20, 5))
    ax.bar(class_names, class_count)
    ax.set(title=split_title, xlabel="Class", ylabel="Number of samples")
    ax.tick_params(axis="x", labelrotation=90)
    plt.show()

In [None]:
# log class count for each split to wandb
# NOTE(review): presumably wandb.Histogram bins the count values themselves
# rather than drawing one bar per class - confirm this is the intended chart.
for metric_key, class_count in (
    ("train_class_count", train_class_count),
    ("val_class_count", val_class_count),
):
    # One wandb.log call per metric, as before.
    wandb.log({metric_key: wandb.Histogram(class_count)})
