In [1]:
import logging

logging.basicConfig()

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

import math
from datasets import load_dataset
import sadl
from sadl import xp
from tqdm import tqdm



In [2]:
N_TRAIN_SAMPLES = 60_000
N_TEST_SAMPLES = 10_000

BATCH_SIZE = 256

N_TRAIN_BATCHES = math.ceil(N_TRAIN_SAMPLES / BATCH_SIZE) # mnist train has 60k images
N_TEST_BATCHES = math.ceil(N_TEST_SAMPLES / BATCH_SIZE) # mnist test has 10k images

N_EPOCHS = 10

DEVICE = "cpu"

In [3]:
ds = load_dataset("ylecun/mnist")



In [4]:
def normalize(examples):
    # we could also use sadl.tensor here, but xp (numpy/cupy) is sufficient because we just transform the data once
    pixel_values = [xp.array(img, dtype=xp.float32).flatten() for img in examples["image"]]
    examples["pixel_values"] = [(pv / 255.0 - 0.1307) / 0.3081 for pv in pixel_values]
    return examples


In [5]:
ds_train = ds["train"].map(normalize, remove_columns=["image"], batched=True)
ds_eval = ds["test"].map(normalize, remove_columns=["image"], batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [6]:
def to_sadl_tensors(batch, onehot=True):
    x = sadl.tensor(batch["pixel_values"], dtype=xp.float32)
    y = sadl.tensor(xp.eye(10)[batch["label"]] if onehot else batch["label"])
    return x, y

In [7]:
model = sadl.Mlp([
    sadl.Linear(dim_in=784, dim_out=784),
    sadl.Linear(dim_in=784, dim_out=10),
])
log_softmax = sadl.LogSoftmax()

In [None]:
optimizer = sadl.Adam(params=list(model.parameters), lr=1e-3)

In [None]:
model = model.copy_to_device(device=DEVICE)
log_softmax = log_softmax.copy_to_device(device=DEVICE)
optimizer = optimizer.copy_to_device(device=DEVICE)

In [11]:
@sadl.no_grad_fn
def eval(model, ds_eval) -> float:
    n_correct = 0
    n_seen = 0

    for batch in tqdm(
        ds_eval.iter(batch_size=BATCH_SIZE),
        desc=f"Evaluating",
        total=N_TEST_BATCHES,
    ):
        x, y, = to_sadl_tensors(batch, onehot=False)

        x = x.copy_to_device(device=DEVICE)
        y = y.copy_to_device(device=DEVICE)

        logits = model(x)

        n_correct += xp.sum(logits.argmax(axis=-1) == y).item()
        n_seen += y.shape[0]


    return n_correct / n_seen
    

In [12]:
for epoch in range(N_EPOCHS):

    ds_train_iter = ds_train.shuffle(seed=epoch).iter(batch_size=BATCH_SIZE)
    
    for batch in tqdm(
        ds_train_iter,
        desc=f"Epoch {epoch+1}",
        total=N_TRAIN_BATCHES,
    ):

        optimizer.zero_grad()

        x, y, = to_sadl_tensors(batch)

        x = x.copy_to_device(device=DEVICE)
        y = y.copy_to_device(device=DEVICE)

        logits = model(x)

        loss = -xp.mean(xp.sum(log_softmax(logits) * y, axis=-1))

        optimizer.backward(loss=loss)
        optimizer.step()


    eval_accuracy = eval(model, ds_eval)

    logger.info(f"Train loss: {loss.item()}")
    logger.info(f"Eval accuracy: {eval_accuracy*100:.2f}%")


Epoch 1: 100%|██████████| 235/235 [00:11<00:00, 20.88it/s]
Evaluating: 100%|██████████| 40/40 [00:01<00:00, 22.93it/s]
INFO:__main__:Train loss: 1.586968335241684
INFO:__main__:Eval accuracy: 60.35%
Epoch 2: 100%|██████████| 235/235 [00:11<00:00, 20.90it/s]
Evaluating: 100%|██████████| 40/40 [00:01<00:00, 22.88it/s]
INFO:__main__:Train loss: 1.2990329332054729
INFO:__main__:Eval accuracy: 74.41%
Epoch 3: 100%|██████████| 235/235 [00:11<00:00, 20.73it/s]
Evaluating: 100%|██████████| 40/40 [00:01<00:00, 22.01it/s]
INFO:__main__:Train loss: 1.054761533942816
INFO:__main__:Eval accuracy: 78.80%
Epoch 4: 100%|██████████| 235/235 [00:11<00:00, 20.80it/s]
Evaluating: 100%|██████████| 40/40 [00:01<00:00, 22.54it/s]
INFO:__main__:Train loss: 0.7989026072572353
INFO:__main__:Eval accuracy: 81.74%
Epoch 5: 100%|██████████| 235/235 [00:11<00:00, 20.66it/s]
Evaluating: 100%|██████████| 40/40 [00:01<00:00, 22.58it/s]
INFO:__main__:Train loss: 0.8073050154381131
INFO:__main__:Eval accuracy: 83.16%
Ep