In [1]:
import yaml
from pathlib import Path
from torchvision import datasets, transforms
import numpy as np


In [3]:
# Location of the experiment configuration, relative to the notebook's
# working directory.
CONFIG_PATH = Path("../config.yaml")

# Explicit UTF-8: without it `open` falls back to the platform locale
# encoding, which can mis-decode non-ASCII characters in the YAML file.
with CONFIG_PATH.open("r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Fail fast with a readable message instead of a TypeError/KeyError in the
# dataset cell, which reads everything from cfg["data"]. An empty YAML file
# makes safe_load return None.
assert isinstance(cfg, dict) and "data" in cfg, (
    f"{CONFIG_PATH} must be a YAML mapping with a top-level 'data' section"
)


In [None]:
# Preprocessing applied to every sample: convert to a tensor, then normalize
# with the mean/std taken from the config.
norm_cfg = cfg["data"]["normalize"]
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=norm_cfg["mean"], std=norm_cfg["std"]),
])

# The two splits are built with identical arguments except for the `train`
# flag, so the shared ones are collected once.
cifar_kwargs = dict(
    root=cfg["data"]["data_dir"],
    download=cfg["data"]["download"],
    transform=transform,
)

train_ds = datasets.CIFAR10(train=True, **cifar_kwargs)
test_ds = datasets.CIFAR10(train=False, **cifar_kwargs)


## basic integrity checks

In [13]:
# Each assert carries a message so that a failure explains which invariant
# broke without having to re-run the cell under a debugger.
assert len(train_ds) > 0, "training split is empty"
assert len(test_ds) > 0, "test split is empty"
assert train_ds.classes == test_ds.classes, (
    "class lists differ between splits: "
    f"train={train_ds.classes} test={test_ds.classes}"
)

print("dataset size ok")
print("num classes:", len(train_ds.classes))


dataset size ok
num classes: 10


## schema validation


In [9]:
# Per-sample tensor shape that this notebook requires after `transform`.
EXPECTED_SHAPE = (3, 32, 32)

# `x` / `y` stay bound to the first training sample; the value-range cell
# reuses `x`.
x, y = train_ds[0]

# Validate the first sample of *both* splits: they come from separate
# constructor calls, so checking only the training split leaves the test
# split's schema unverified.
for split_name, (sample, label) in (("train", (x, y)), ("test", test_ds[0])):
    assert tuple(sample.shape) == EXPECTED_SHAPE, (
        f"{split_name}: expected sample shape {EXPECTED_SHAPE}, "
        f"got {tuple(sample.shape)}"
    )
    assert isinstance(label, int), (
        f"{split_name}: expected int label, got {type(label).__name__}"
    )

print("schema validated")


schema validated


## value range validation

In [10]:
# Loose sanity bounds for normalized pixel values (a guard against gross
# mis-normalization, not exact theoretical limits).
VALUE_MIN, VALUE_MAX = -5, 5
# How many evenly spaced training images to check besides the first one.
N_RANGE_SAMPLES = 256

x_np = x.numpy()  # first training sample, loaded in the schema-validation cell

# One image says little about the whole split, so also check a deterministic,
# evenly spaced subset. It is small enough to keep the cell fast.
range_idx = np.linspace(
    0, len(train_ds) - 1, num=min(N_RANGE_SAMPLES, len(train_ds)), dtype=int
)
x_sampled_np = np.stack([train_ds[int(i)][0].numpy() for i in range_idx])

for arr_name, arr in (
    ("train_ds[0]", x_np),
    ("sampled training images", x_sampled_np),
):
    assert np.isfinite(arr).all(), f"{arr_name}: found NaN or inf values"
    assert arr.min() >= VALUE_MIN, (
        f"{arr_name}: min {arr.min()} is below {VALUE_MIN}"
    )
    assert arr.max() <= VALUE_MAX, (
        f"{arr_name}: max {arr.max()} is above {VALUE_MAX}"
    )

print("value ranges ok")


value ranges ok


## label distribution check

In [11]:
# Read the labels straight from the dataset's `targets` list. Indexing
# `train_ds[i]` would load and transform every image only to discard it.
labels = list(train_ds.targets)
unique, counts = np.unique(labels, return_counts=True)

# `.tolist()` yields plain Python ints, so the printed dict reads `{0: 5000, ...}`
# regardless of how the installed NumPy version formats its scalar types.
dist = dict(zip(unique.tolist(), counts.tolist()))

# Every class index must occur at least once in the training split.
expected_labels = set(range(len(train_ds.classes)))
assert set(dist) == expected_labels, (
    f"label set {sorted(dist)} does not match expected {sorted(expected_labels)}"
)

print(dist)


{0: 5000, 1: 5000, 2: 5000, 3: 5000, 4: 5000, 5: 5000, 6: 5000, 7: 5000, 8: 5000, 9: 5000}


## final validation summary

In [12]:
# Summary of the dataset sizes and class count. When the notebook is run top
# to bottom, a failed assert in an earlier cell stops execution before this
# cell is reached.
validation_report = dict(
    train_size=len(train_ds),
    test_size=len(test_ds),
    num_classes=len(train_ds.classes),
    status="PASSED",
)

validation_report


{'train_size': 50000,
 'test_size': 10000,
 'num_classes': 10,
 'status': 'PASSED'}