
# Target: ≥99.4% test acc within 15 epochs, with ≤8000 params, clean modular blocks.


## Imports, Device selection & training config.

In [1]:

import os, math, random, time
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.nn.functional as F
from tqdm import tqdm
from torchsummary import summary

def pick_device(pref="auto"):
    """Resolve a device preference string to a ``torch.device``.

    ``"auto"`` tries CUDA first, then MPS. ``"cuda"`` and ``"mps"`` try only
    the named backend. Any other value, or an unavailable backend, yields CPU.
    """
    # Availability checks are kept lazy so a backend is only probed when it
    # is actually a candidate for the requested preference.
    probes = {
        "cuda": torch.cuda.is_available,
        "mps": lambda: hasattr(torch.backends, "mps") and torch.backends.mps.is_available(),
    }

    if pref == "auto":
        candidates = ("cuda", "mps")
    elif pref == "cuda" or pref == "mps":
        candidates = (pref,)
    else:
        candidates = ()

    for backend in candidates:
        if probes[backend]():
            return torch.device(backend)
    return torch.device("cpu")

def seed_everything(seed=42, deterministic=False):
    """Seed Python's and PyTorch's random number generators.

    Args:
        seed: Integer seed passed to ``random.seed``, ``torch.manual_seed``
            and ``torch.cuda.manual_seed_all``.
        deterministic: When False (the default, and the original behaviour)
            cuDNN runs with ``deterministic=False`` and ``benchmark=True``,
            which favours speed; on CUDA this means runs may not be
            bit-for-bit repeatable even with a fixed seed. When True the two
            flags are flipped (``deterministic=True``, ``benchmark=False``)
            to favour repeatability.
    """
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    deterministic = bool(deterministic)
    torch.backends.cudnn.deterministic = deterministic
    # The cuDNN autotuner is the opposite trade-off to determinism, so it is
    # enabled only when determinism is not requested.
    torch.backends.cudnn.benchmark = not deterministic

def count_params(model):
    """Return the total number of elements across the model's trainable parameters."""
    n_trainable = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue  # frozen parameters are excluded from the count
        n_trainable += param.numel()
    return n_trainable

@dataclass
class TrainCfg:
    """Hyper-parameters and runtime settings shared by the notebook's cells.

    A single instance (``cfg``) is created right after this class and is read
    by the data-loader cell and by ``train_epochs``.
    """
    epochs: int = 15          # number of training epochs; assignment goal: ≤15
    batch_size: int = 128     # batch size of the training DataLoader
    max_lr: float = 0.3       # OneCycle peak LR (tune 0.2–0.35 if needed); SGD starts at max_lr/10
    weight_decay: float = 0.0     # passed to optim.SGD
    label_smoothing: float = 0.02 # passed to nn.CrossEntropyLoss
    device_pref: str = "auto"     # forwarded to pick_device(): "auto", "cuda", "mps", anything else -> CPU
    seed: int = 42                # forwarded to seed_everything()

# Build the default configuration, resolve the compute device from it, and
# seed the RNGs before any model or data loader is created.
cfg = TrainCfg()
device = pick_device(cfg.device_pref)
seed_everything(cfg.seed)
device  # bare last expression: shown as the cell's output



device(type='mps')

## Data loader initialization

In [2]:
# %%
# Canonical MNIST normalization statistics; the training split additionally
# gets a tiny random rotation to boost generalization.
mean, std = (0.1307,), (0.3081,)

train_tfms = transforms.Compose(
    [
        transforms.RandomRotation(7, fill=0),  # Session6 Code-9 style augmentation (≈5–7°)
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
)
# The evaluation pipeline has no augmentation: tensor conversion + normalization only.
test_tfms = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)])

# Settings common to both dataset splits / both loaders.
_mnist_kwargs = dict(root="./data", download=True)
_loader_kwargs = dict(num_workers=2, pin_memory=False)

train_ds = datasets.MNIST(train=True, transform=train_tfms, **_mnist_kwargs)
test_ds = datasets.MNIST(train=False, transform=test_tfms, **_mnist_kwargs)

train_loader = DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_ds, batch_size=1024, shuffle=False, **_loader_kwargs)

len(train_ds), len(test_ds)


(60000, 10000)

## CNN Architecture - Primary(1/3)

In [3]:
dropout_value = 0.1
class ModelPrimary(nn.Module):
    """First (over-budget) CNN attempt for 1x28x28 inputs.

    Layout, with spatial sizes for a 28x28 input:
      conv3x3 1->16 (26) -> conv3x3 16->32 (24) -> conv1x1 32->10 (24)
      -> MaxPool (12) -> conv3x3 10->16 (10) -> 16->16 (8) -> 16->16 (6)
      -> conv3x3 16->16 with padding=1 (6) -> AvgPool(6) (1) -> conv1x1 16->10.

    Every 3x3 stage is Conv (no bias) -> ReLU -> BatchNorm -> Dropout.

    Args:
        p_drop: Dropout probability used in every 3x3 stage. ``None`` (the
            default) uses the module-level ``dropout_value``, read at
            construction time.

    ``forward`` returns log-probabilities (``log_softmax``) of shape [N, 10],
    unlike the other two models in this notebook, which return raw logits.
    """

    @staticmethod
    def _conv_block(c_in, c_out, p_drop, padding=0):
        """Build one Conv3x3 (no bias) -> ReLU -> BatchNorm -> Dropout stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=(3, 3), padding=padding, bias=False),
            nn.ReLU(),
            nn.BatchNorm2d(c_out),
            nn.Dropout(p_drop),
        )

    def __init__(self, p_drop=None):
        super(ModelPrimary, self).__init__()
        if p_drop is None:
            p_drop = dropout_value

        # Input block and first convolution block.
        self.convblock1 = self._conv_block(1, 16, p_drop)   # output_size = 26
        self.convblock2 = self._conv_block(16, 32, p_drop)  # output_size = 24

        # Transition block: 1x1 conv squeezes channels, then spatial pooling.
        self.convblock3 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=10, kernel_size=(1, 1), padding=0, bias=False),
        )  # output_size = 24
        self.pool1 = nn.MaxPool2d(2, 2)  # output_size = 12

        # Second convolution block.
        self.convblock4 = self._conv_block(10, 16, p_drop)             # output_size = 10
        self.convblock5 = self._conv_block(16, 16, p_drop)             # output_size = 8
        self.convblock6 = self._conv_block(16, 16, p_drop)             # output_size = 6
        self.convblock7 = self._conv_block(16, 16, p_drop, padding=1)  # output_size = 6

        # Output block: average over the 6x6 map, then a 1x1 conv acts as the classifier.
        self.gap = nn.Sequential(
            nn.AvgPool2d(kernel_size=6)
        )  # output_size = 1
        self.convblock8 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=10, kernel_size=(1, 1), padding=0, bias=False),
        )

    def forward(self, x):
        """Return log-probabilities of shape [N, 10] for a batch of images."""
        x = self.convblock1(x)
        x = self.convblock2(x)
        x = self.convblock3(x)
        x = self.pool1(x)
        x = self.convblock4(x)
        x = self.convblock5(x)
        x = self.convblock6(x)
        x = self.convblock7(x)
        x = self.gap(x)
        x = self.convblock8(x)

        x = x.view(-1, 10)
        return F.log_softmax(x, dim=-1)

# Report the trainable-parameter count of ModelPrimary.
n_params = count_params(ModelPrimary().to(device))
print("Params:", n_params)
# Budget check is left disabled for this model: the recorded output (13808)
# is above the 8000-parameter limit, so the assert would fail.
# assert n_params <= 8000, "Parameter budget exceeded!"

Params: 13808


## CNN Architecture - Intermediate attempts (2/3)

In [4]:
class Model_Intermediate(nn.Module):
    """Second CNN attempt for 1x28x28 inputs (goal: ≤8000 params, ≤15 epochs, ≥99.4%).

    Two stages of [Conv3x3(pad=1) -> BN -> ReLU] x2 -> MaxPool -> Conv1x1 -> Dropout,
    followed by global average pooling and a 1x1 conv classifier.
    Spatial size goes 28 -> 14 -> 7 -> 1. Channels go 1 -> 16 -> 16 -> 12 ->
    20 -> 20 -> 24 -> 10.

    Args:
        p_drop: Dropout probability applied after each 1x1 transition (default 0.05).

    ``forward`` returns raw logits of shape [N, 10].
    """

    def __init__(self, p_drop=0.05):
        super().__init__()

        # Stage A: operates at 28x28, ends at 14x14 with 12 channels.
        self.c1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.b1 = nn.BatchNorm2d(num_features=16)
        self.c2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1)
        self.b2 = nn.BatchNorm2d(num_features=16)
        self.p1 = nn.MaxPool2d(kernel_size=2)
        self.t1 = nn.Conv2d(in_channels=16, out_channels=12, kernel_size=1)
        self.d1 = nn.Dropout(p=p_drop)

        # Stage B: operates at 14x14, ends at 7x7 with 24 channels.
        self.c3 = nn.Conv2d(in_channels=12, out_channels=20, kernel_size=3, padding=1)
        self.b3 = nn.BatchNorm2d(num_features=20)
        self.c4 = nn.Conv2d(in_channels=20, out_channels=20, kernel_size=3, padding=1)
        self.b4 = nn.BatchNorm2d(num_features=20)
        self.p2 = nn.MaxPool2d(kernel_size=2)
        self.t2 = nn.Conv2d(in_channels=20, out_channels=24, kernel_size=1)
        self.d2 = nn.Dropout(p=p_drop)

        # Head: global average pool to 1x1, then a 1x1 conv produces 10 scores.
        self.gap = nn.AdaptiveAvgPool2d(output_size=1)
        self.cls = nn.Conv2d(in_channels=24, out_channels=10, kernel_size=1)

    def forward(self, x):
        """Return logits of shape [N, 10] for a batch of images."""
        feats = x

        # Stage A: 28x28x16 -> 14x14x12
        for conv, norm in ((self.c1, self.b1), (self.c2, self.b2)):
            feats = F.relu(norm(conv(feats)))
        feats = self.d1(self.t1(self.p1(feats)))

        # Stage B: 14x14x20 -> 7x7x24
        for conv, norm in ((self.c3, self.b3), (self.c4, self.b4)):
            feats = F.relu(norm(conv(feats)))
        feats = self.d2(self.t2(self.p2(feats)))

        # Head: [N,24,1,1] -> [N,10,1,1] -> [N,10]
        scores = self.cls(self.gap(feats))
        scores = scores.squeeze(-1)
        return scores.squeeze(-1)

# Report the trainable-parameter count of Model_Intermediate.
n_params = count_params(Model_Intermediate().to(device))
print("Params:", n_params)
# Budget check is left disabled for this model: the recorded output (9382)
# is above the 8000-parameter limit, so the assert would fail.
# assert n_params <= 8000, "Parameter budget exceeded!"

Params: 9382


## CNN Architecture - Final(3/3)

In [6]:
class NetSub8K(nn.Module):
    """Final sub-8000-parameter CNN for 1x28x28 inputs.

    Channel topology:
      1→7→7→10 → MaxPool → 10→10→12 → MaxPool → 12→16→18 → GAP → Linear(18→num_classes)
    Every conv is 3x3 without bias, followed by BatchNorm and an in-place ReLU.
    The first two groups use padding=0 (28→26→24→22, pool →11, 11→9→7, pool →3);
    the last group uses padding=1 so the 3x3 map is preserved.

    ``forward`` returns logits, so pair it with ``nn.CrossEntropyLoss``.
    """

    @staticmethod
    def _conv_bn_relu(c_in, c_out, padding):
        """Return the three layers of one Conv3x3 (no bias) -> BN -> ReLU unit."""
        return [
            nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, padding=padding, bias=False),
            nn.BatchNorm2d(c_out),
            nn.ReLU(inplace=True),
        ]

    def __init__(self, num_classes: int = 10):
        super().__init__()
        unit = self._conv_bn_relu

        # Group A: 28x28 -> 26x26 -> 24x24 -> 22x22, then pooled to 11x11.
        self.convblock1 = nn.Sequential(*unit(1, 7, 0), *unit(7, 7, 0), *unit(7, 10, 0))
        self.pool1 = nn.MaxPool2d(2, 2)

        # Group B: 11x11 -> 9x9 -> 7x7, then pooled to 3x3.
        self.convblock2 = nn.Sequential(*unit(10, 10, 0), *unit(10, 12, 0))
        self.pool2 = nn.MaxPool2d(2, 2)

        # Group C: padding=1 keeps the 3x3 map while widening channels.
        self.convblock3 = nn.Sequential(*unit(12, 16, 1), *unit(16, 18, 1))

        # Head: global average pool (3x3 -> 1x1) followed by a linear classifier.
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(18, num_classes, bias=True)

    def forward(self, x):
        """Return logits of shape [N, num_classes] for a batch of images."""
        feature_stages = (
            self.convblock1,  # 28 -> 22
            self.pool1,       # 22 -> 11
            self.convblock2,  # 11 -> 7
            self.pool2,       # 7 -> 3
            self.convblock3,  # 3 -> 3
            self.gap,         # -> [N,18,1,1]
        )
        for stage in feature_stages:
            x = stage(x)
        flat = x.view(x.size(0), -1)  # -> [N,18]
        return self.fc(flat)          # -> [N,num_classes] (logits)


# Report the trainable-parameter count of NetSub8K and enforce the budget.
n_params = count_params(NetSub8K().to(device))
print("Params:", n_params)
# Unlike the two earlier models, this check is active: the cell fails if the
# final architecture exceeds 8000 trainable parameters.
assert n_params <= 8000, "Parameter budget exceeded!"

Params: 7784


## Defining the training loop + optimizer

In [7]:
@torch.no_grad()
def evaluate(model, loader, loss_fn):
    """Score ``model`` on every batch of ``loader`` and log the result.

    Relies on module-level state: batches are moved to the global ``device``,
    and the epoch's average loss / accuracy (in percent) are appended to the
    global lists ``test_losses`` and ``test_acc``.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        loader: Iterable of ``(inputs, targets)`` batches.
        loss_fn: Callable ``loss_fn(scores, targets)`` returning a scalar
            tensor. Each batch loss is weighted by the batch size, which
            yields the per-sample average when ``loss_fn`` returns a batch
            mean.

    Returns:
        ``(accuracy, avg_loss)`` with accuracy as a fraction in [0, 1].

    Raises:
        ValueError: If ``loader`` yields no samples. Nothing is appended to
            the history lists in that case.
    """
    model.eval()
    total, correct, loss_sum = 0, 0, 0.0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        logits = model(x)
        loss   = loss_fn(logits, y)
        loss_sum += loss.item() * x.size(0)
        correct  += (logits.argmax(1) == y).sum().item()
        total    += y.size(0)

    # Fail with a clear message instead of a ZeroDivisionError below.
    if total == 0:
        raise ValueError("evaluate() received an empty loader: there are no samples to score")

    avg_loss = loss_sum / total
    acc      = 100.0 * correct / total
    test_losses.append(avg_loss)
    test_acc.append(acc)
    print(f"\nTest set: Average loss: {avg_loss:.4f}, Accuracy: {correct}/{total} ({acc:.2f}%)\n")
    return acc/100.0, avg_loss  # accuracy is returned as a fraction, but logged in percent

def train_epochs(model, train_loader, test_loader, cfg: TrainCfg):
    """Train ``model`` for ``cfg.epochs`` epochs with SGD + OneCycleLR.

    After every epoch the model is scored on ``test_loader`` via
    ``evaluate``. Relies on module-level state: batches are moved to the
    global ``device``, and per-epoch training loss / accuracy (in percent)
    are appended to the global lists ``train_losses`` and ``train_acc``
    (``evaluate`` appends to ``test_losses`` / ``test_acc``).

    Args:
        model: Module returning per-class scores; trained in place.
        train_loader: Sized iterable of ``(inputs, targets)`` training batches.
        test_loader: Iterable of ``(inputs, targets)`` evaluation batches.
        cfg: Supplies ``epochs``, ``max_lr``, ``weight_decay`` and
            ``label_smoothing``.

    Returns:
        The highest per-epoch accuracy measured on ``test_loader``, as a
        fraction in [0, 1].
    """
    # Starting LR is max_lr / div_factor; the same value is given to SGD and
    # to OneCycleLR so the two cannot drift apart.
    div_factor = 10.0

    loss_fn = nn.CrossEntropyLoss(label_smoothing=cfg.label_smoothing)
    opt  = optim.SGD(model.parameters(),
                     lr=cfg.max_lr/div_factor, momentum=0.9, weight_decay=cfg.weight_decay, nesterov=True)
    steps_per_epoch = len(train_loader)
    sched = optim.lr_scheduler.OneCycleLR(
        opt, max_lr=cfg.max_lr, epochs=cfg.epochs, steps_per_epoch=steps_per_epoch,
        pct_start=0.2, anneal_strategy="cos", div_factor=div_factor, final_div_factor=100.0
    )

    best_acc = 0.0
    for epoch in range(1, cfg.epochs + 1):
        model.train()
        correct, processed = 0, 0

        pbar = tqdm(enumerate(train_loader), total=len(train_loader),
                    desc=f"Epoch {epoch}/{cfg.epochs}")

        epoch_loss_sum = 0.0
        for batch_idx, (x, y) in pbar:
            x, y = x.to(device), y.to(device)

            opt.zero_grad(set_to_none=True)
            logits = model(x)
            loss   = loss_fn(logits, y)
            loss.backward()
            opt.step()
            sched.step()  # OneCycleLR: step PER BATCH

            # Read the scalar loss once; it feeds both the epoch sum and the
            # progress-bar text, so there is one host sync per batch, not two.
            batch_loss = loss.item()
            epoch_loss_sum += batch_loss
            preds = logits.argmax(dim=1)
            correct   += (preds == y).sum().item()
            processed += y.size(0)

            pbar.set_description(
                desc=f"Epoch {epoch}/{cfg.epochs} | "
                     f"Loss={batch_loss:.4f} Batch_id={batch_idx} "
                     f"Accuracy={100.0*correct/processed:0.2f}% "
                     f"LR={sched.get_last_lr()[0]:.4f}"
            )

        # Epoch aggregates: mean of the per-batch losses, accuracy in percent.
        avg_train_loss = epoch_loss_sum / len(train_loader)
        avg_train_acc  = 100.0 * correct / processed
        train_losses.append(avg_train_loss)
        train_acc.append(avg_train_acc)

        # Only the accuracy is needed here; evaluate() records the loss itself.
        acc, _ = evaluate(model, test_loader, loss_fn)
        best_acc = max(best_acc, acc)

    return best_acc

## Training Loop



### Training - Model Primary

In [8]:

# Reset the module-level history lists: train_epochs() and evaluate() append
# to these globals by name, so they must exist (and be empty) before each run.
train_losses, test_losses = [], []
train_acc,    test_acc    = [], []
model = ModelPrimary().to(device)
best = train_epochs(model, train_loader, test_loader, cfg)  # best per-epoch test accuracy, as a fraction
print(f"Best accuracy over {cfg.epochs} epochs: {best*100:.2f}%")

Epoch 1/15 | Loss=0.2260 Batch_id=468 Accuracy=91.82% LR=0.0976: 100%|█| 469/469



Test set: Average loss: 0.2189, Accuracy: 9782/10000 (97.82%)



Epoch 2/15 | Loss=0.2023 Batch_id=468 Accuracy=98.20% LR=0.2327: 100%|█| 469/469



Test set: Average loss: 0.1825, Accuracy: 9892/10000 (98.92%)



Epoch 3/15 | Loss=0.1920 Batch_id=468 Accuracy=98.47% LR=0.3000: 100%|█| 469/469



Test set: Average loss: 0.1713, Accuracy: 9929/10000 (99.29%)



Epoch 4/15 | Loss=0.1717 Batch_id=468 Accuracy=98.71% LR=0.2949: 100%|█| 469/469



Test set: Average loss: 0.1671, Accuracy: 9937/10000 (99.37%)



Epoch 5/15 | Loss=0.1724 Batch_id=468 Accuracy=98.85% LR=0.2799: 100%|█| 469/469



Test set: Average loss: 0.1665, Accuracy: 9924/10000 (99.24%)



Epoch 6/15 | Loss=0.1618 Batch_id=468 Accuracy=98.89% LR=0.2561: 100%|█| 469/469



Test set: Average loss: 0.1651, Accuracy: 9932/10000 (99.32%)



Epoch 7/15 | Loss=0.1787 Batch_id=468 Accuracy=98.94% LR=0.2250: 100%|█| 469/469



Test set: Average loss: 0.1637, Accuracy: 9931/10000 (99.31%)



Epoch 8/15 | Loss=0.1801 Batch_id=468 Accuracy=99.04% LR=0.1889: 100%|█| 469/469



Test set: Average loss: 0.1620, Accuracy: 9931/10000 (99.31%)



Epoch 9/15 | Loss=0.1864 Batch_id=468 Accuracy=99.09% LR=0.1501: 100%|█| 469/469



Test set: Average loss: 0.1621, Accuracy: 9927/10000 (99.27%)



Epoch 10/15 | Loss=0.2327 Batch_id=468 Accuracy=99.12% LR=0.1113: 100%|█| 469/46



Test set: Average loss: 0.1598, Accuracy: 9945/10000 (99.45%)



Epoch 11/15 | Loss=0.1967 Batch_id=468 Accuracy=99.17% LR=0.0752: 100%|█| 469/46



Test set: Average loss: 0.1590, Accuracy: 9937/10000 (99.37%)



Epoch 12/15 | Loss=0.1541 Batch_id=468 Accuracy=99.23% LR=0.0441: 100%|█| 469/46



Test set: Average loss: 0.1573, Accuracy: 9946/10000 (99.46%)



Epoch 13/15 | Loss=0.1785 Batch_id=468 Accuracy=99.34% LR=0.0203: 100%|█| 469/46



Test set: Average loss: 0.1546, Accuracy: 9958/10000 (99.58%)



Epoch 14/15 | Loss=0.1518 Batch_id=468 Accuracy=99.32% LR=0.0054: 100%|█| 469/46



Test set: Average loss: 0.1542, Accuracy: 9955/10000 (99.55%)



Epoch 15/15 | Loss=0.1630 Batch_id=468 Accuracy=99.40% LR=0.0003: 100%|█| 469/46



Test set: Average loss: 0.1542, Accuracy: 9950/10000 (99.50%)

Best accuracy over 15 epochs: 99.58%


### Training - Model Intermediate

In [9]:

# Reset the module-level history lists: train_epochs() and evaluate() append
# to these globals by name, so the previous model's history is discarded here.
train_losses, test_losses = [], []
train_acc,    test_acc    = [], []
model = Model_Intermediate().to(device)
best = train_epochs(model, train_loader, test_loader, cfg)  # best per-epoch test accuracy, as a fraction
print(f"Best accuracy over {cfg.epochs} epochs: {best*100:.2f}%")

Epoch 1/15 | Loss=0.2309 Batch_id=468 Accuracy=83.53% LR=0.0976: 100%|█| 469/469



Test set: Average loss: 0.2627, Accuracy: 9742/10000 (97.42%)



Epoch 2/15 | Loss=0.2304 Batch_id=468 Accuracy=97.02% LR=0.2327: 100%|█| 469/469



Test set: Average loss: 0.2271, Accuracy: 9828/10000 (98.28%)



Epoch 3/15 | Loss=0.2130 Batch_id=468 Accuracy=97.71% LR=0.3000: 100%|█| 469/469



Test set: Average loss: 0.2342, Accuracy: 9798/10000 (97.98%)



Epoch 4/15 | Loss=0.2609 Batch_id=468 Accuracy=98.04% LR=0.2949: 100%|█| 469/469



Test set: Average loss: 0.2125, Accuracy: 9828/10000 (98.28%)



Epoch 5/15 | Loss=0.1724 Batch_id=468 Accuracy=98.26% LR=0.2799: 100%|█| 469/469



Test set: Average loss: 0.1966, Accuracy: 9871/10000 (98.71%)



Epoch 6/15 | Loss=0.1909 Batch_id=468 Accuracy=98.49% LR=0.2561: 100%|█| 469/469



Test set: Average loss: 0.2156, Accuracy: 9849/10000 (98.49%)



Epoch 7/15 | Loss=0.1927 Batch_id=468 Accuracy=98.60% LR=0.2250: 100%|█| 469/469



Test set: Average loss: 0.1937, Accuracy: 9891/10000 (98.91%)



Epoch 8/15 | Loss=0.2105 Batch_id=468 Accuracy=98.75% LR=0.1889: 100%|█| 469/469



Test set: Average loss: 0.1871, Accuracy: 9910/10000 (99.10%)



Epoch 9/15 | Loss=0.1742 Batch_id=468 Accuracy=98.87% LR=0.1501: 100%|█| 469/469



Test set: Average loss: 0.1908, Accuracy: 9888/10000 (98.88%)



Epoch 10/15 | Loss=0.2040 Batch_id=468 Accuracy=98.91% LR=0.1113: 100%|█| 469/46



Test set: Average loss: 0.1837, Accuracy: 9916/10000 (99.16%)



Epoch 11/15 | Loss=0.1815 Batch_id=468 Accuracy=98.96% LR=0.0752: 100%|█| 469/46



Test set: Average loss: 0.1815, Accuracy: 9908/10000 (99.08%)



Epoch 12/15 | Loss=0.1633 Batch_id=468 Accuracy=99.06% LR=0.0441: 100%|█| 469/46



Test set: Average loss: 0.1792, Accuracy: 9918/10000 (99.18%)



Epoch 13/15 | Loss=0.1988 Batch_id=468 Accuracy=99.08% LR=0.0203: 100%|█| 469/46



Test set: Average loss: 0.1767, Accuracy: 9925/10000 (99.25%)



Epoch 14/15 | Loss=0.1934 Batch_id=468 Accuracy=99.22% LR=0.0054: 100%|█| 469/46



Test set: Average loss: 0.1747, Accuracy: 9927/10000 (99.27%)



Epoch 15/15 | Loss=0.2040 Batch_id=468 Accuracy=99.22% LR=0.0003: 100%|█| 469/46



Test set: Average loss: 0.1744, Accuracy: 9927/10000 (99.27%)

Best accuracy over 15 epochs: 99.27%


### Training - Model Final(NetSub8K)

In [10]:
# Reset the module-level history lists: train_epochs() and evaluate() append
# to these globals by name, so the previous model's history is discarded here.
train_losses, test_losses = [], []
train_acc,    test_acc    = [], []
model = NetSub8K().to(device)
best = train_epochs(model, train_loader, test_loader, cfg)  # best per-epoch test accuracy, as a fraction
print(f"Best accuracy over {cfg.epochs} epochs: {best*100:.2f}%")

Epoch 1/15 | Loss=0.1985 Batch_id=468 Accuracy=92.06% LR=0.0976: 100%|█| 469/469



Test set: Average loss: 0.2190, Accuracy: 9797/10000 (97.97%)



Epoch 2/15 | Loss=0.1791 Batch_id=468 Accuracy=98.12% LR=0.2327: 100%|█| 469/469



Test set: Average loss: 0.1878, Accuracy: 9856/10000 (98.56%)



Epoch 3/15 | Loss=0.1829 Batch_id=468 Accuracy=98.46% LR=0.3000: 100%|█| 469/469



Test set: Average loss: 0.1794, Accuracy: 9865/10000 (98.65%)



Epoch 4/15 | Loss=0.2195 Batch_id=468 Accuracy=98.66% LR=0.2949: 100%|█| 469/469



Test set: Average loss: 0.1736, Accuracy: 9888/10000 (98.88%)



Epoch 5/15 | Loss=0.2033 Batch_id=468 Accuracy=98.83% LR=0.2799: 100%|█| 469/469



Test set: Average loss: 0.1628, Accuracy: 9926/10000 (99.26%)



Epoch 6/15 | Loss=0.1872 Batch_id=468 Accuracy=98.96% LR=0.2561: 100%|█| 469/469



Test set: Average loss: 0.1632, Accuracy: 9917/10000 (99.17%)



Epoch 7/15 | Loss=0.1920 Batch_id=468 Accuracy=99.06% LR=0.2250: 100%|█| 469/469



Test set: Average loss: 0.1643, Accuracy: 9913/10000 (99.13%)



Epoch 8/15 | Loss=0.1409 Batch_id=468 Accuracy=99.14% LR=0.1889: 100%|█| 469/469



Test set: Average loss: 0.1584, Accuracy: 9936/10000 (99.36%)



Epoch 9/15 | Loss=0.1748 Batch_id=468 Accuracy=99.14% LR=0.1501: 100%|█| 469/469



Test set: Average loss: 0.1631, Accuracy: 9919/10000 (99.19%)



Epoch 10/15 | Loss=0.2004 Batch_id=468 Accuracy=99.23% LR=0.1113: 100%|█| 469/46



Test set: Average loss: 0.1579, Accuracy: 9926/10000 (99.26%)



Epoch 11/15 | Loss=0.1412 Batch_id=468 Accuracy=99.32% LR=0.0752: 100%|█| 469/46



Test set: Average loss: 0.1556, Accuracy: 9936/10000 (99.36%)



Epoch 12/15 | Loss=0.1859 Batch_id=468 Accuracy=99.40% LR=0.0441: 100%|█| 469/46



Test set: Average loss: 0.1533, Accuracy: 9943/10000 (99.43%)



Epoch 13/15 | Loss=0.1439 Batch_id=468 Accuracy=99.44% LR=0.0203: 100%|█| 469/46



Test set: Average loss: 0.1518, Accuracy: 9947/10000 (99.47%)



Epoch 14/15 | Loss=0.1546 Batch_id=468 Accuracy=99.51% LR=0.0054: 100%|█| 469/46



Test set: Average loss: 0.1510, Accuracy: 9947/10000 (99.47%)



Epoch 15/15 | Loss=0.1497 Batch_id=468 Accuracy=99.58% LR=0.0003: 100%|█| 469/46



Test set: Average loss: 0.1509, Accuracy: 9944/10000 (99.44%)

Best accuracy over 15 epochs: 99.47%
