In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ----------------------------
# Basic 3x3 Convolution
# ----------------------------
def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride (default 1).

    Returns:
        The newly constructed ``nn.Conv2d`` layer.
    """
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)

# ----------------------------
# Basic Residual Block
# ----------------------------
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm layers and a shortcut.

    Args:
        in_planes: channels of the incoming tensor.
        planes: channels produced by both convolutions.
        stride: stride of the first convolution.
        downsample: optional module applied to the input to form the shortcut;
            when ``None`` the input itself is used.
        is_last: when true, ``forward`` also returns the pre-activation sum.
    """
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, downsample=None, is_last=False):
        super().__init__()
        self.is_last = is_last

        # First conv/bn pair (carries the stride), followed by the shared ReLU.
        self.conv1 = conv3x3(in_planes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)

        # Second conv/bn pair (always stride 1).
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)

        self.downsample = downsample

    def forward(self, x):
        """Return ``relu(main(x) + shortcut(x))``; plus the pre-ReLU sum if ``is_last``."""
        # Main branch: conv -> bn -> relu -> conv -> bn
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut branch: identity unless a projection module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        preact = out          # sum before the final activation
        out = F.relu(out)     # out-of-place, so `preact` keeps its values

        if not self.is_last:
            return out
        return out, preact

# ----------------------------
# ResNet CIFAR Modular
# ----------------------------
class ResNet(nn.Module):
    """ResNet assembled from a 3x3 stem and three stages of ``BasicBlock``.

    Args:
        depth: total depth; must equal ``6n + 2`` with ``n >= 1`` (each of the
            three stages then holds ``n`` blocks).
        num_filters: four channel widths: stem, stage 1, stage 2, stage 3.
        block_name: only ``'basicblock'`` (case-insensitive) is accepted.
        num_classes: size of the final linear layer's output.

    Raises:
        AssertionError: if ``block_name`` or ``depth`` is not supported.
    """

    def __init__(self, depth, num_filters=(16, 16, 32, 64), block_name='basicblock', num_classes=100):
        super().__init__()
        assert block_name.lower() == 'basicblock', "Currently only BasicBlock supported"
        assert (depth - 2) % 6 == 0, "Depth must be 6n+2 for BasicBlock"
        n = (depth - 2) // 6
        # depth == 2 (or negative depths) satisfies the modulo check but would
        # build stages whose block is not flagged `is_last`, which breaks the
        # `(out, preact)` unpacking in `forward`.
        assert n >= 1, "Depth must be at least 8 (6n+2 with n >= 1)"

        # Stem
        self.in_planes = num_filters[0]
        self.conv1 = conv3x3(3, num_filters[0])
        self.bn1 = nn.BatchNorm2d(num_filters[0])
        self.relu = nn.ReLU(inplace=True)

        # Residual layers (stages 2 and 3 halve the spatial resolution)
        self.layer1 = self._make_layer(BasicBlock, num_filters[1], n)
        self.layer2 = self._make_layer(BasicBlock, num_filters[2], n, stride=2)
        self.layer3 = self._make_layer(BasicBlock, num_filters[3], n, stride=2)

        # Fixed 8x8 window: reduces layer3's output to 1x1 (the size `fc`
        # expects after flattening) when that output is 8x8, e.g. 32x32 inputs.
        self.avgpool = nn.AvgPool2d(8)
        self.fc = nn.Linear(num_filters[3]*BasicBlock.expansion, num_classes)

        # weight init
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

        The first block receives ``stride`` and, when the stride or channel
        count changes, a 1x1 conv + batch-norm projection for its shortcut.
        Only the final block of the stage is created with ``is_last=True``.
        Updates ``self.in_planes`` to the stage's output width.
        """
        downsample = None
        if stride != 1 or self.in_planes != planes*block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_planes, planes*block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes*block.expansion)
            )

        layers = [block(self.in_planes, planes, stride, downsample, is_last=(blocks==1))]
        self.in_planes = planes*block.expansion
        for i in range(1, blocks):
            layers.append(block(self.in_planes, planes, is_last=(i==blocks-1)))
        return nn.Sequential(*layers)

    def forward(self, x, is_feat=False, preact=False):
        """Run the network.

        Args:
            x: input batch with 3 channels.
            is_feat: when true, also return the intermediate features.
            preact: with ``is_feat``, return each stage's pre-ReLU output
                instead of its post-ReLU output.

        Returns:
            The logits, or ``([f0, f1, f2, f3, f4], logits)`` when ``is_feat``
            is true. ``f0`` is the stem output, ``f1``-``f3`` the stage
            outputs and ``f4`` the flattened pooled features fed to ``fc``.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        f0 = x

        # Each stage ends with an `is_last` block, so it yields (out, preact).
        x, f1_pre = self.layer1(x)
        f1 = x
        x, f2_pre = self.layer2(x)
        f2 = x
        x, f3_pre = self.layer3(x)
        f3 = x

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        f4 = x
        x = self.fc(x)

        if is_feat:
            return ([f0,f1,f2,f3,f4], x) if not preact else ([f0,f1_pre,f2_pre,f3_pre,f4], x)
        else:
            return x

# ----------------------------
# Architecture builders
# ----------------------------
def resnet20(num_classes=100):
    """Depth-20 ResNet with the default filter widths."""
    return ResNet(20, num_classes=num_classes)


def resnet32(num_classes=100):
    """Depth-32 ResNet with widened stages (filters ``[32, 64, 128, 256]``).

    The three residual stages are 4x wider than the ``ResNet`` defaults, so
    this is NOT the default-width depth-32 network; use ``resnet32_basic``
    for that one.
    """
    # NOTE(review): the filter list must match the checkpoint being loaded.
    # An earlier note here mentioned [32, 32, 64, 128], which differs from the
    # widths actually passed below - confirm against the target checkpoint.
    return ResNet(depth=32, num_filters=[32, 64, 128, 256], block_name='basicblock', num_classes=num_classes)


def resnet56(num_classes=100):
    """Depth-56 ResNet with the default filter widths."""
    return ResNet(56, num_classes=num_classes)


def resnet110(num_classes=100):
    """Depth-110 ResNet with the default filter widths."""
    return ResNet(110, num_classes=num_classes)


def resnet32_basic(num_classes=100):
    """Depth-32 ResNet with the default filter widths."""
    return ResNet(32, num_classes=num_classes)

In [2]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# ---------------------------
# Device
# ---------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------------
# CIFAR-100 test dataset
# ---------------------------
# Per-channel statistics handed to transforms.Normalize.
mean = (0.5071, 0.4867, 0.4408)
std  = (0.2675, 0.2565, 0.2761)

test_tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)])

test_ds = datasets.CIFAR100("./data", train=False, transform=test_tf, download=True)
test_loader = DataLoader(
    test_ds,
    batch_size=128,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

# ---------------------------
# Models to evaluate
# ---------------------------
# NOTE: the values are already-instantiated networks, not constructor functions.
model_dict = {
    # "ResNet20": resnet20,
    # "ResNet32": resnet32(), # this is the resnet-32x4
    "ResNet56": resnet56(),
    "ResNet110": resnet110()
}

# Checkpoint path for each entry of `model_dict` (adjust paths if needed)
weights_dict = {
    # "ResNet20": "/path/to/resnet20.pth",
    # "ResNet32": "/kaggle/input/resnet32/pytorch/default/1/ckpt_epoch_240.pth",
    "ResNet56":  "/kaggle/input/resnet-56/ckpt_epoch_240.pth",
    "ResNet110": "/kaggle/input/resnet-110/ckpt_epoch_240.pth"
}

# ---------------------------
# Evaluation loop
# ---------------------------
for name, constructor in model_dict.items():
    print(f"Evaluating {name}...")
    model = constructor.to(device)

    # Load pretrained weights when a path is registered for this model;
    # otherwise the network is evaluated with its freshly initialised weights.
    weight_path = weights_dict.get(name)
    if weight_path:
        # NOTE(security): weights_only=False unpickles arbitrary objects -
        # only load checkpoints from a trusted source.
        checkpoint = torch.load(weight_path, map_location=device,weights_only=False)
        # The checkpoint is either a bare state dict or a dict wrapping it under 'model'.
        state_dict = checkpoint['model'] if 'model' in checkpoint else checkpoint
        model.load_state_dict(state_dict)

    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for imgs, labels in test_loader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            logits = model(imgs)
            preds = logits.argmax(1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    test_acc = 100 * correct / total
    print(f"🎯 {name} Test Accuracy on CIFAR-100: {test_acc:.2f}%\n")


100%|██████████| 169M/169M [00:16<00:00, 10.2MB/s]


Evaluating ResNet56...
🎯 ResNet56 Test Accuracy on CIFAR-100: 72.41%

Evaluating ResNet110...
🎯 ResNet110 Test Accuracy on CIFAR-100: 74.31%



# Training the student models: basic ResNet-20 / ResNet-32

import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
from torchvision.transforms import AutoAugment, AutoAugmentPolicy, RandomErasing
from tqdm.auto import tqdm
#from torchvision.models import resnet18, resnet34



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_dp = torch.cuda.device_count() > 1

# Seed the PyTorch and NumPy global RNGs.
torch.manual_seed(42)
np.random.seed(42)

mean = (0.5071, 0.4867, 0.4408)  # CIFAR-100 mean
std  = (0.2675, 0.2565, 0.2761)  # CIFAR-100 std

batch_size = 64

# One (resolution, epochs) pair per training stage; loaders are keyed by resolution.
stages = [(32, 240)]
dataloader_dict = {}

# Copied from the paper as it is.
#
# they are not using any validation sets. Training it on the entire train set!
#     train_transform = transforms.Compose([
#         transforms.RandomCrop(32, padding=4),
#         transforms.RandomHorizontalFlip(),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=mean, std=stdv),
#     ])
#     test_transform = transforms.Compose([
#         transforms.ToTensor(),
#         transforms.Normalize(mean=mean, std=stdv),
#     ])

for resolution, _ in stages:
    # Augmentation: padded random crop + horizontal flip, then normalisation.
    train_tf = transforms.Compose([
        transforms.RandomCrop(resolution,padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    # download=False: the dataset files are expected to exist under ./data already.
    train_set = datasets.CIFAR100('./data', train=True, download=False, transform=train_tf)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=True,
    )
    dataloader_dict[resolution] = {'train': train_loader}

# Test loader (fixed resolution, no augmentation)
test_tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)])

test_ds = datasets.CIFAR100('./data', train=False, transform=test_tf)
test_loader = DataLoader(
    test_ds,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
)


# The learning rate is divided by 10 at each milestone epoch (150, 180, 210).

# ---------------------------
# Test loop
# ---------------------------

def test(Test_model):
    """Return the top-1 accuracy (in percent) of ``Test_model`` on ``test_loader``.

    Uses the module-level ``test_loader`` and ``device``. The model is switched
    to eval mode and is left in that mode on return.
    """
    Test_model.eval()
    hits, seen = 0, 0

    with torch.no_grad():
        for batch_imgs, batch_labels in test_loader:
            batch_imgs = batch_imgs.to(device)
            batch_labels = batch_labels.to(device)
            predicted = Test_model(batch_imgs).argmax(1)
            hits += (predicted == batch_labels).sum().item()
            seen += batch_labels.size(0)

    return 100 * hits / seen

# ---------------------------
# Training loop
# ---------------------------

def train(model, model_type):
    """Train ``model`` with SGD + cross-entropy over every entry of ``stages``.

    Relies on the module-level ``stages``, ``dataloader_dict``, ``device`` and
    ``test``. After each epoch the model is evaluated with ``test(model)``;
    whenever that accuracy improves, the model's ``state_dict`` is written to
    ``f"resnet{model_type}_student.pth"``.

    Args:
        model: network to train; it is moved to ``device``.
        model_type: value interpolated into the checkpoint file name.
    """
    # Loss + optimizer
    lr = 0.05 # as per the paper
    criterion = nn.CrossEntropyLoss()

    # The batches below are moved to `device`, so the weights must live there too.
    model = model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)

    # Step LR schedule: decay at 150, 180, 210 epochs
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[150, 180, 210], gamma=0.1)

    # Checkpoint selection is driven by the accuracy returned by `test`.
    best_test_acc = 0.0

    for res, epochs in stages:

        print(f"\n=== Training at resolution {res}px ===")
        tr_loader = dataloader_dict[res]['train']

        for e in range(1, epochs+1):

            model.train()
            correct = total = 0

            # The progress bar reports the stage's own epoch count.
            for imgs, labels in tqdm(tr_loader, desc=f"Epoch {e}/{epochs}"):

                imgs, labels = imgs.to(device), labels.to(device)
                optimizer.zero_grad()
                logits = model(imgs)
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()

                preds = logits.argmax(1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

            train_acc = 100 * correct / total
            scheduler.step()

            # `test` leaves the model in eval mode; it is reset by model.train() above.
            test_acc = test(model)

            print(f"Epoch {e}: Train Acc = {train_acc:.2f}% Test Accuracy = {test_acc:.2f}%")

            # Save best model
            if test_acc > best_test_acc:
                best_test_acc = test_acc
                torch.save(model.state_dict(), f"resnet{model_type}_student.pth")
                # Report the test accuracy that triggered the save (not the train accuracy).
                print(f"→ Saved best model at epoch {e} with Test Acc = {test_acc:.2f}%")
    print("✅ Training completed!")

# Vanilla KD training


In [3]:
import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
from torchvision.transforms import AutoAugment, AutoAugmentPolicy, RandomErasing
from tqdm.auto import tqdm
#from torchvision.models import resnet18, resnet34
from torchvision import datasets, transforms
from torch.utils.data import DataLoader


# CIFAR-100 per-channel normalisation statistics (passed to transforms.Normalize).
mean = (0.5071, 0.4867, 0.4408)
std  = (0.2675, 0.2565, 0.2761)
# Shared DataLoader settings for the KD train and test loaders.
batch_size = 128
num_workers = 0  # forwarded as DataLoader(num_workers=...)
pin_mem = True   # forwarded as DataLoader(pin_memory=...)

@torch.no_grad()
def evaluate(model, loader, device):
    """Compute top-1 accuracy (in percent) of ``model`` over ``loader``.

    Args:
        model: network to score; switched to eval mode and left that way.
        loader: iterable yielding ``(inputs, targets)`` batches.
        device: device each batch is moved to before the forward pass.

    Returns:
        ``100.0 * correct / total`` as a float.
    """
    model.eval()
    n_hits, n_seen = 0, 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        predicted = model(inputs).argmax(1)
        n_hits += (predicted == targets).sum().item()
        n_seen += targets.size(0)
    return 100.0 * n_hits / n_seen


# Seed the PyTorch (CPU) and NumPy global RNGs.
torch.manual_seed(27)
np.random.seed(27)

# Seed the RNG of every CUDA device as well.
torch.cuda.manual_seed_all(27)
# Ask cuDNN for deterministic algorithms and turn off its benchmark-based
# algorithm selection.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False



In [4]:
# One (resolution, epochs) pair per training stage.
stages = [(32, 240)]
dataloader_dict = {}

# --- Build train loader(s) for each stage ---
for resolution, _ in stages:
    # Padded random crop + horizontal flip, then tensor conversion and normalisation.
    train_tf = transforms.Compose([
        transforms.RandomCrop(resolution, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    train_set = datasets.CIFAR100('./data', train=True, download=True, transform=train_tf)
    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_mem,
    )

    dataloader_dict[resolution] = {'train': train_loader}

# --- Single test loader (no augmentation, same normalisation) ---
test_tf = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean, std)])
test_set = datasets.CIFAR100('./data', train=False, download=True, transform=test_tf)
test_loader = DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=pin_mem,
)

# --- Pick the first (32px) stage; its loader is what KD training receives ---
resolution, _ = stages[0]          # (32, 240)
train_loader = dataloader_dict[resolution]['train']


In [5]:
# The learning rate is divided by 10 at each milestone epoch (150, 180, 210).
# ---------------------------
# Training loop
# ---------------------------

def kd_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.9):
    """Knowledge-distillation loss: ``alpha * KD + (1 - alpha) * CE``.

    ``CE`` is the cross-entropy of the student logits against ``labels``.
    ``KD`` is the batch-mean KL divergence between the temperature-softened
    teacher and student distributions, multiplied by ``T * T``.

    Args:
        student_logits: student outputs; classes along dim 1.
        teacher_logits: teacher outputs with the same layout.
        labels: ground-truth class indices for the cross-entropy term.
        T: softmax temperature applied to both sets of logits.
        alpha: weight of the distillation (KD) term; the cross-entropy term
            gets ``1 - alpha``. With the default 0.9 the hard-label term is
            therefore weighted 0.1.

    Returns:
        Scalar loss tensor.
    """
    # Hard-label loss
    hard_loss = F.cross_entropy(student_logits, labels)

    # Soft-label loss: kl_div expects log-probabilities as input and
    # probabilities as target.
    log_p_student = F.log_softmax(student_logits / T, dim=1)
    p_teacher = F.softmax(teacher_logits / T, dim=1)
    soft_loss = F.kl_div(log_p_student, p_teacher, reduction="batchmean") * (T * T)

    return (1 - alpha) * hard_loss + alpha * soft_loss

In [6]:
def train_via_KD(t_model, s_model, train_loader, test_loader, device,
                 epochs=240, base_lr=0.05, wd=5e-4, milestones=(150,180,210),
                 T=4.0, alpha=0.9, save_path="student_kd.pth"):
    """Distil a frozen teacher into a student with SGD and ``kd_loss``.

    Args:
        t_model: teacher network; moved to ``device``, put in eval mode and
            all its parameters get ``requires_grad = False``.
        s_model: student network to optimise.
        train_loader: yields ``(images, labels)`` training batches.
        test_loader: loader handed to ``evaluate`` after every epoch.
        device: device used for both models and all batches.
        epochs: number of training epochs.
        base_lr: initial SGD learning rate.
        wd: SGD weight decay.
        milestones: epochs at which the learning rate is multiplied by 0.1.
        T: temperature forwarded to ``kd_loss``.
        alpha: distillation weight forwarded to ``kd_loss``.
        save_path: where the best student ``state_dict`` is written.

    Returns:
        None. The best checkpoint (by test accuracy) is saved to ``save_path``.
    """
    # --- Teacher: inference only ---
    t_model.to(device).eval()
    for param in t_model.parameters():
        param.requires_grad = False

    # --- Student + optimisation setup ---
    s_model = s_model.to(device)
    optimizer = optim.SGD(s_model.parameters(), lr=base_lr, momentum=0.9, weight_decay=wd)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=list(milestones), gamma=0.1)

    best_test = -1.0
    for e in range(1, epochs + 1):
        s_model.train()
        loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        for imgs, labels in tqdm(train_loader, desc=f"Epoch {e}/{epochs}"):
            imgs = imgs.to(device)
            labels = labels.to(device)

            # Teacher targets need no autograd graph.
            with torch.no_grad():
                t_logits = t_model(imgs)

            s_logits = s_model(imgs)
            loss = kd_loss(s_logits, t_logits, labels, T=T, alpha=alpha)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight the batch loss by batch size so the epoch figure is a per-sample mean.
            loss_sum += loss.item() * imgs.size(0)
            n_correct += (s_logits.argmax(1) == labels).sum().item()
            n_seen += labels.size(0)

        scheduler.step()
        train_loss = loss_sum / n_seen
        train_acc = 100.0 * n_correct / n_seen
        test_acc = evaluate(s_model, test_loader, device)

        print(f"Epoch {e:3d}/{epochs} | loss {train_loss:.3f} | train {train_acc:5.2f}% | test {test_acc:5.2f}%")

        # Nothing to save unless this epoch beat the best test accuracy so far.
        if test_acc <= best_test:
            continue

        best_test = test_acc
        # Unwrap DataParallel so the saved keys carry no `module.` prefix.
        core = s_model.module if isinstance(s_model, torch.nn.DataParallel) else s_model
        torch.save(core.state_dict(), save_path)
        print(f"  ↳ Saved best @ epoch {e} (test {best_test:.2f}%) → {save_path}")

    print(f"✅ KD training finished. Best Test Acc: {best_test:.2f}%")


In [7]:
# Driver cell: distil a pretrained ResNet-56 teacher into a ResNet-20 student.
# Statement order matters: each network draws its random initial weights when
# it is constructed, so the student's init depends on the teacher being built first.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
teacher = resnet56(num_classes = 100)
# NOTE(security): weights_only=False unpickles arbitrary objects - only load
# checkpoints from a trusted source.
t_ckpt  = torch.load("/kaggle/input/resnet-56/ckpt_epoch_240.pth", map_location=device, weights_only=False)
# The checkpoint is either a bare state dict or a dict wrapping it under 'model'.
teacher.load_state_dict(t_ckpt['model'] if 'model' in t_ckpt else t_ckpt)

# The student starts from its random initialisation (no checkpoint is loaded).
student = resnet20(num_classes=100)

# Uses train_via_KD's default hyper-parameters; the best student weights are
# written to save_path.
train_via_KD(teacher, student, train_loader, test_loader, device, save_path= "56_t-20_s.pth")

Epoch 1/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   1/240 | loss 11.932 | train  9.49% | test 13.88%
  ↳ Saved best @ epoch 1 (test 13.88%) → 56_t-20_s.pth


Epoch 2/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   2/240 | loss 9.339 | train 21.52% | test 22.95%
  ↳ Saved best @ epoch 2 (test 22.95%) → 56_t-20_s.pth


Epoch 3/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   3/240 | loss 7.682 | train 31.06% | test 29.84%
  ↳ Saved best @ epoch 3 (test 29.84%) → 56_t-20_s.pth


Epoch 4/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   4/240 | loss 6.674 | train 37.74% | test 37.31%
  ↳ Saved best @ epoch 4 (test 37.31%) → 56_t-20_s.pth


Epoch 5/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   5/240 | loss 5.952 | train 42.76% | test 39.59%
  ↳ Saved best @ epoch 5 (test 39.59%) → 56_t-20_s.pth


Epoch 6/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   6/240 | loss 5.462 | train 46.29% | test 43.45%
  ↳ Saved best @ epoch 6 (test 43.45%) → 56_t-20_s.pth


Epoch 7/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   7/240 | loss 5.123 | train 49.01% | test 38.81%


Epoch 8/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   8/240 | loss 4.823 | train 51.21% | test 44.96%
  ↳ Saved best @ epoch 8 (test 44.96%) → 56_t-20_s.pth


Epoch 9/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   9/240 | loss 4.600 | train 52.86% | test 46.47%
  ↳ Saved best @ epoch 9 (test 46.47%) → 56_t-20_s.pth


Epoch 10/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  10/240 | loss 4.413 | train 54.12% | test 48.68%
  ↳ Saved best @ epoch 10 (test 48.68%) → 56_t-20_s.pth


Epoch 11/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  11/240 | loss 4.262 | train 55.48% | test 45.07%


Epoch 12/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  12/240 | loss 4.159 | train 56.41% | test 45.14%


Epoch 13/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  13/240 | loss 4.057 | train 57.25% | test 53.10%
  ↳ Saved best @ epoch 13 (test 53.10%) → 56_t-20_s.pth


Epoch 14/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  14/240 | loss 3.947 | train 57.87% | test 51.25%


Epoch 15/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  15/240 | loss 3.884 | train 58.69% | test 51.60%


Epoch 16/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  16/240 | loss 3.811 | train 59.40% | test 52.96%


Epoch 17/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  17/240 | loss 3.771 | train 59.92% | test 52.68%


Epoch 18/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  18/240 | loss 3.712 | train 60.38% | test 54.82%
  ↳ Saved best @ epoch 18 (test 54.82%) → 56_t-20_s.pth


Epoch 19/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  19/240 | loss 3.655 | train 60.56% | test 50.28%


Epoch 20/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  20/240 | loss 3.622 | train 61.30% | test 53.29%


Epoch 21/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  21/240 | loss 3.590 | train 61.23% | test 56.86%
  ↳ Saved best @ epoch 21 (test 56.86%) → 56_t-20_s.pth


Epoch 22/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  22/240 | loss 3.558 | train 61.64% | test 55.31%


Epoch 23/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  23/240 | loss 3.520 | train 61.67% | test 55.27%


Epoch 24/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  24/240 | loss 3.501 | train 62.02% | test 51.44%


Epoch 25/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  25/240 | loss 3.454 | train 62.51% | test 52.70%


Epoch 26/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  26/240 | loss 3.458 | train 62.51% | test 53.31%


Epoch 27/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  27/240 | loss 3.421 | train 62.62% | test 54.46%


Epoch 28/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  28/240 | loss 3.412 | train 63.01% | test 53.80%


Epoch 29/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  29/240 | loss 3.377 | train 63.02% | test 52.22%


Epoch 30/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  30/240 | loss 3.355 | train 63.32% | test 55.91%


Epoch 31/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  31/240 | loss 3.340 | train 63.36% | test 54.07%


Epoch 32/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  32/240 | loss 3.349 | train 63.53% | test 56.23%


Epoch 33/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  33/240 | loss 3.327 | train 63.87% | test 52.04%


Epoch 34/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  34/240 | loss 3.313 | train 63.63% | test 58.60%
  ↳ Saved best @ epoch 34 (test 58.60%) → 56_t-20_s.pth


Epoch 35/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  35/240 | loss 3.288 | train 63.64% | test 55.78%


Epoch 36/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  36/240 | loss 3.280 | train 64.03% | test 54.46%


Epoch 37/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  37/240 | loss 3.268 | train 64.04% | test 56.14%


Epoch 38/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  38/240 | loss 3.246 | train 64.04% | test 53.62%


Epoch 39/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  39/240 | loss 3.239 | train 64.43% | test 58.83%
  ↳ Saved best @ epoch 39 (test 58.83%) → 56_t-20_s.pth


Epoch 40/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  40/240 | loss 3.243 | train 64.19% | test 56.55%


Epoch 41/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  41/240 | loss 3.238 | train 64.46% | test 57.84%


Epoch 42/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  42/240 | loss 3.211 | train 64.54% | test 57.38%


Epoch 43/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  43/240 | loss 3.212 | train 64.62% | test 53.98%


Epoch 44/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  44/240 | loss 3.205 | train 64.66% | test 58.33%


Epoch 45/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  45/240 | loss 3.195 | train 64.73% | test 58.42%


Epoch 46/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  46/240 | loss 3.191 | train 64.98% | test 55.93%


Epoch 47/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  47/240 | loss 3.190 | train 65.08% | test 56.83%


Epoch 48/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  48/240 | loss 3.187 | train 64.71% | test 56.80%


Epoch 49/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  49/240 | loss 3.167 | train 64.88% | test 57.98%


Epoch 50/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  50/240 | loss 3.154 | train 65.23% | test 59.06%
  ↳ Saved best @ epoch 50 (test 59.06%) → 56_t-20_s.pth


Epoch 51/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  51/240 | loss 3.184 | train 65.07% | test 54.94%


Epoch 52/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  52/240 | loss 3.172 | train 65.00% | test 56.68%


Epoch 53/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  53/240 | loss 3.161 | train 64.89% | test 59.32%
  ↳ Saved best @ epoch 53 (test 59.32%) → 56_t-20_s.pth


Epoch 54/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  54/240 | loss 3.129 | train 65.42% | test 59.15%


Epoch 55/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  55/240 | loss 3.148 | train 65.27% | test 55.19%


Epoch 56/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  56/240 | loss 3.112 | train 65.47% | test 55.76%


Epoch 57/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  57/240 | loss 3.146 | train 65.41% | test 55.01%


Epoch 58/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  58/240 | loss 3.133 | train 65.25% | test 56.85%


Epoch 59/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  59/240 | loss 3.132 | train 65.30% | test 55.06%


Epoch 60/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  60/240 | loss 3.081 | train 65.54% | test 56.96%


Epoch 61/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  61/240 | loss 3.113 | train 65.46% | test 55.16%


Epoch 62/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  62/240 | loss 3.118 | train 65.40% | test 57.97%


Epoch 63/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  63/240 | loss 3.090 | train 65.85% | test 59.14%


Epoch 64/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  64/240 | loss 3.085 | train 65.70% | test 60.28%
  ↳ Saved best @ epoch 64 (test 60.28%) → 56_t-20_s.pth


Epoch 65/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  65/240 | loss 3.107 | train 65.32% | test 56.10%


Epoch 66/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  66/240 | loss 3.113 | train 65.63% | test 57.57%


Epoch 67/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  67/240 | loss 3.118 | train 65.45% | test 55.20%


Epoch 68/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  68/240 | loss 3.090 | train 65.63% | test 52.45%


Epoch 69/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  69/240 | loss 3.084 | train 65.56% | test 57.11%


Epoch 70/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  70/240 | loss 3.101 | train 65.57% | test 57.64%


Epoch 71/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  71/240 | loss 3.065 | train 65.91% | test 55.71%


Epoch 72/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  72/240 | loss 3.075 | train 65.76% | test 56.02%


Epoch 73/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  73/240 | loss 3.076 | train 65.78% | test 56.85%


Epoch 74/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  74/240 | loss 3.073 | train 65.73% | test 59.92%


Epoch 75/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  75/240 | loss 3.055 | train 65.96% | test 58.06%


Epoch 76/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  76/240 | loss 3.070 | train 65.84% | test 59.15%


Epoch 77/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  77/240 | loss 3.046 | train 65.99% | test 55.91%


Epoch 78/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  78/240 | loss 3.063 | train 65.76% | test 58.02%


Epoch 79/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  79/240 | loss 3.036 | train 66.28% | test 57.38%


Epoch 80/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  80/240 | loss 3.087 | train 65.75% | test 56.92%


Epoch 81/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  81/240 | loss 3.061 | train 65.88% | test 56.85%


Epoch 82/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  82/240 | loss 3.050 | train 65.89% | test 56.67%


Epoch 83/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  83/240 | loss 3.044 | train 66.18% | test 57.68%


Epoch 84/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  84/240 | loss 3.051 | train 66.11% | test 51.48%


Epoch 85/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  85/240 | loss 3.027 | train 66.08% | test 56.36%


Epoch 86/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  86/240 | loss 3.048 | train 66.21% | test 58.43%


Epoch 87/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  87/240 | loss 3.039 | train 65.87% | test 57.09%


Epoch 88/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  88/240 | loss 3.025 | train 66.34% | test 55.54%


Epoch 89/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  89/240 | loss 3.022 | train 66.32% | test 57.88%


Epoch 90/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  90/240 | loss 3.038 | train 65.96% | test 59.84%


Epoch 91/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  91/240 | loss 3.030 | train 66.29% | test 57.67%


Epoch 92/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  92/240 | loss 3.015 | train 66.34% | test 56.40%


Epoch 93/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  93/240 | loss 3.021 | train 66.32% | test 57.23%


Epoch 94/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  94/240 | loss 3.046 | train 66.13% | test 59.68%


Epoch 95/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  95/240 | loss 3.004 | train 66.20% | test 57.85%


Epoch 96/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  96/240 | loss 3.015 | train 66.43% | test 59.27%


Epoch 97/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  97/240 | loss 3.019 | train 66.27% | test 56.57%


Epoch 98/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  98/240 | loss 3.006 | train 66.53% | test 59.34%


Epoch 99/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  99/240 | loss 3.028 | train 66.30% | test 57.47%


Epoch 100/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 100/240 | loss 2.996 | train 66.46% | test 60.02%


Epoch 101/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 101/240 | loss 3.015 | train 66.60% | test 56.31%


Epoch 102/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 102/240 | loss 3.012 | train 66.34% | test 55.42%


Epoch 103/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 103/240 | loss 3.015 | train 66.19% | test 56.25%


Epoch 104/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 104/240 | loss 2.988 | train 66.44% | test 58.39%


Epoch 105/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 105/240 | loss 3.009 | train 66.26% | test 58.48%


Epoch 106/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 106/240 | loss 2.995 | train 66.40% | test 56.95%


Epoch 107/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 107/240 | loss 3.003 | train 66.51% | test 57.20%


Epoch 108/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 108/240 | loss 3.007 | train 66.42% | test 57.40%


Epoch 109/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 109/240 | loss 2.991 | train 66.60% | test 57.48%


Epoch 110/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 110/240 | loss 2.990 | train 66.43% | test 56.64%


Epoch 111/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 111/240 | loss 2.988 | train 66.59% | test 57.84%


Epoch 112/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 112/240 | loss 2.996 | train 66.85% | test 58.06%


Epoch 113/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 113/240 | loss 2.989 | train 66.66% | test 57.14%


Epoch 114/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 114/240 | loss 2.983 | train 66.61% | test 59.28%


Epoch 115/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 115/240 | loss 2.976 | train 66.50% | test 54.88%


Epoch 116/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 116/240 | loss 2.996 | train 66.53% | test 56.83%


Epoch 117/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 117/240 | loss 3.005 | train 66.44% | test 53.24%


Epoch 118/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 118/240 | loss 2.993 | train 66.44% | test 56.43%


Epoch 119/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 119/240 | loss 2.971 | train 66.51% | test 53.99%


Epoch 120/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 120/240 | loss 2.955 | train 66.69% | test 60.74%
  ↳ Saved best @ epoch 120 (test 60.74%) → 56_t-20_s.pth


Epoch 121/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 121/240 | loss 2.968 | train 66.96% | test 58.14%


Epoch 122/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 122/240 | loss 2.961 | train 66.98% | test 58.82%


Epoch 123/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 123/240 | loss 2.981 | train 66.62% | test 56.07%


Epoch 124/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 124/240 | loss 2.959 | train 66.87% | test 60.65%


Epoch 125/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 125/240 | loss 2.963 | train 66.95% | test 59.33%


Epoch 126/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 126/240 | loss 2.954 | train 66.84% | test 59.44%


Epoch 127/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 127/240 | loss 2.967 | train 66.62% | test 59.81%


Epoch 128/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 128/240 | loss 2.977 | train 66.71% | test 55.49%


Epoch 129/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 129/240 | loss 2.966 | train 66.72% | test 58.44%


Epoch 130/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 130/240 | loss 2.965 | train 66.95% | test 58.55%


Epoch 131/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 131/240 | loss 2.950 | train 66.89% | test 59.32%


Epoch 132/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 132/240 | loss 2.954 | train 67.00% | test 58.47%


Epoch 133/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 133/240 | loss 2.946 | train 66.92% | test 57.85%


Epoch 134/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 134/240 | loss 2.959 | train 66.54% | test 56.09%


Epoch 135/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 135/240 | loss 2.972 | train 66.76% | test 58.14%


Epoch 136/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 136/240 | loss 2.945 | train 67.21% | test 56.46%


Epoch 137/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 137/240 | loss 2.959 | train 66.86% | test 58.50%


Epoch 138/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 138/240 | loss 2.936 | train 67.06% | test 59.40%


Epoch 139/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 139/240 | loss 2.963 | train 66.89% | test 56.31%


Epoch 140/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 140/240 | loss 2.959 | train 66.67% | test 57.94%


Epoch 141/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 141/240 | loss 2.938 | train 66.98% | test 59.25%


Epoch 142/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 142/240 | loss 2.946 | train 66.80% | test 57.53%


Epoch 143/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 143/240 | loss 2.955 | train 66.73% | test 59.34%


Epoch 144/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 144/240 | loss 2.940 | train 67.05% | test 56.41%


Epoch 145/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 145/240 | loss 2.961 | train 67.12% | test 58.47%


Epoch 146/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 146/240 | loss 2.935 | train 67.10% | test 61.09%
  ↳ Saved best @ epoch 146 (test 61.09%) → 56_t-20_s.pth


Epoch 147/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 147/240 | loss 2.923 | train 67.17% | test 56.23%


Epoch 148/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 148/240 | loss 2.951 | train 67.10% | test 58.23%


Epoch 149/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 149/240 | loss 2.948 | train 66.95% | test 59.67%


Epoch 150/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 150/240 | loss 2.949 | train 67.02% | test 57.40%


Epoch 151/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 151/240 | loss 2.289 | train 72.79% | test 68.40%
  ↳ Saved best @ epoch 151 (test 68.40%) → 56_t-20_s.pth


Epoch 152/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 152/240 | loss 2.138 | train 74.48% | test 68.57%
  ↳ Saved best @ epoch 152 (test 68.57%) → 56_t-20_s.pth


Epoch 153/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 153/240 | loss 2.086 | train 74.89% | test 68.79%
  ↳ Saved best @ epoch 153 (test 68.79%) → 56_t-20_s.pth


Epoch 154/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 154/240 | loss 2.047 | train 75.36% | test 69.12%
  ↳ Saved best @ epoch 154 (test 69.12%) → 56_t-20_s.pth


Epoch 155/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 155/240 | loss 2.024 | train 75.47% | test 68.97%


Epoch 156/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 156/240 | loss 2.004 | train 76.00% | test 69.29%
  ↳ Saved best @ epoch 156 (test 69.29%) → 56_t-20_s.pth


Epoch 157/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 157/240 | loss 1.983 | train 76.09% | test 69.23%


Epoch 158/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 158/240 | loss 1.972 | train 76.17% | test 69.41%
  ↳ Saved best @ epoch 158 (test 69.41%) → 56_t-20_s.pth


Epoch 159/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 159/240 | loss 1.956 | train 76.71% | test 69.70%
  ↳ Saved best @ epoch 159 (test 69.70%) → 56_t-20_s.pth


Epoch 160/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 160/240 | loss 1.956 | train 76.45% | test 69.49%


Epoch 161/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 161/240 | loss 1.935 | train 76.89% | test 69.60%


Epoch 162/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 162/240 | loss 1.942 | train 76.71% | test 69.55%


Epoch 163/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 163/240 | loss 1.932 | train 77.01% | test 69.61%


Epoch 164/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 164/240 | loss 1.928 | train 76.88% | test 69.70%


Epoch 165/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 165/240 | loss 1.917 | train 77.04% | test 69.82%
  ↳ Saved best @ epoch 165 (test 69.82%) → 56_t-20_s.pth


Epoch 166/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 166/240 | loss 1.908 | train 76.95% | test 69.82%


Epoch 167/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 167/240 | loss 1.894 | train 77.12% | test 69.79%


Epoch 168/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 168/240 | loss 1.902 | train 77.38% | test 69.43%


Epoch 169/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 169/240 | loss 1.887 | train 77.65% | test 69.75%


Epoch 170/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 170/240 | loss 1.891 | train 77.44% | test 69.74%


Epoch 171/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 171/240 | loss 1.884 | train 77.61% | test 69.45%


Epoch 172/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 172/240 | loss 1.883 | train 77.78% | test 69.42%


Epoch 173/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 173/240 | loss 1.876 | train 77.71% | test 69.61%


Epoch 174/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 174/240 | loss 1.878 | train 77.68% | test 69.63%


Epoch 175/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 175/240 | loss 1.870 | train 77.94% | test 69.70%


Epoch 176/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 176/240 | loss 1.861 | train 77.90% | test 69.45%


Epoch 177/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 177/240 | loss 1.872 | train 77.90% | test 69.58%


Epoch 178/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 178/240 | loss 1.871 | train 77.96% | test 69.40%


Epoch 179/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 179/240 | loss 1.867 | train 78.20% | test 69.33%


Epoch 180/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 180/240 | loss 1.870 | train 78.05% | test 69.27%


Epoch 181/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 181/240 | loss 1.775 | train 79.05% | test 70.25%
  ↳ Saved best @ epoch 181 (test 70.25%) → 56_t-20_s.pth


Epoch 182/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 182/240 | loss 1.762 | train 79.20% | test 70.54%
  ↳ Saved best @ epoch 182 (test 70.54%) → 56_t-20_s.pth


Epoch 183/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 183/240 | loss 1.754 | train 79.25% | test 70.47%


Epoch 184/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 184/240 | loss 1.747 | train 79.42% | test 70.41%


Epoch 185/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 185/240 | loss 1.740 | train 79.52% | test 70.49%


Epoch 186/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 186/240 | loss 1.745 | train 79.26% | test 70.78%
  ↳ Saved best @ epoch 186 (test 70.78%) → 56_t-20_s.pth


Epoch 187/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 187/240 | loss 1.732 | train 79.53% | test 70.38%


Epoch 188/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 188/240 | loss 1.738 | train 79.43% | test 70.59%


Epoch 189/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 189/240 | loss 1.739 | train 79.44% | test 70.46%


Epoch 190/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 190/240 | loss 1.733 | train 79.50% | test 70.57%


Epoch 191/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 191/240 | loss 1.738 | train 79.70% | test 70.63%


Epoch 192/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 192/240 | loss 1.732 | train 79.60% | test 70.56%


Epoch 193/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 193/240 | loss 1.735 | train 79.62% | test 70.56%


Epoch 194/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 194/240 | loss 1.731 | train 79.56% | test 70.51%


Epoch 195/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 195/240 | loss 1.735 | train 79.59% | test 70.34%


Epoch 196/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 196/240 | loss 1.733 | train 79.44% | test 70.51%


Epoch 197/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 197/240 | loss 1.725 | train 79.70% | test 70.72%


Epoch 198/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 198/240 | loss 1.725 | train 79.62% | test 70.57%


Epoch 199/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 199/240 | loss 1.727 | train 79.66% | test 70.40%


Epoch 200/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 200/240 | loss 1.727 | train 79.65% | test 70.69%


Epoch 201/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 201/240 | loss 1.726 | train 79.71% | test 70.63%


Epoch 202/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 202/240 | loss 1.719 | train 79.80% | test 70.62%


Epoch 203/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 203/240 | loss 1.718 | train 79.84% | test 70.52%


Epoch 204/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 204/240 | loss 1.721 | train 79.84% | test 70.59%


Epoch 205/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 205/240 | loss 1.724 | train 79.67% | test 70.54%


Epoch 206/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 206/240 | loss 1.719 | train 79.86% | test 70.62%


Epoch 207/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 207/240 | loss 1.709 | train 79.75% | test 70.42%


Epoch 208/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 208/240 | loss 1.724 | train 79.67% | test 70.42%


Epoch 209/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 209/240 | loss 1.716 | train 79.83% | test 70.41%


Epoch 210/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 210/240 | loss 1.717 | train 79.77% | test 70.51%


Epoch 211/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 211/240 | loss 1.714 | train 79.90% | test 70.63%


Epoch 212/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 212/240 | loss 1.705 | train 80.13% | test 70.71%


Epoch 213/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 213/240 | loss 1.711 | train 79.93% | test 70.50%


Epoch 214/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 214/240 | loss 1.705 | train 79.88% | test 70.69%


Epoch 215/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 215/240 | loss 1.703 | train 80.05% | test 70.58%


Epoch 216/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 216/240 | loss 1.701 | train 80.15% | test 70.54%


Epoch 217/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 217/240 | loss 1.699 | train 80.16% | test 70.71%


Epoch 218/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 218/240 | loss 1.703 | train 79.98% | test 70.50%


Epoch 219/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 219/240 | loss 1.706 | train 80.08% | test 70.55%


Epoch 220/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 220/240 | loss 1.701 | train 79.88% | test 70.81%
  ↳ Saved best @ epoch 220 (test 70.81%) → 56_t-20_s.pth


Epoch 221/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 221/240 | loss 1.701 | train 80.00% | test 70.43%


Epoch 222/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 222/240 | loss 1.706 | train 79.90% | test 70.67%


Epoch 223/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 223/240 | loss 1.704 | train 80.06% | test 70.62%


Epoch 224/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 224/240 | loss 1.701 | train 79.91% | test 70.62%


Epoch 225/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 225/240 | loss 1.700 | train 80.16% | test 70.64%


Epoch 226/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 226/240 | loss 1.702 | train 79.92% | test 70.79%


Epoch 227/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 227/240 | loss 1.700 | train 80.21% | test 70.72%


Epoch 228/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 228/240 | loss 1.704 | train 79.89% | test 70.65%


Epoch 229/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 229/240 | loss 1.703 | train 79.89% | test 70.50%


Epoch 230/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 230/240 | loss 1.701 | train 80.10% | test 70.57%


Epoch 231/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 231/240 | loss 1.699 | train 80.11% | test 70.69%


Epoch 232/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 232/240 | loss 1.704 | train 79.90% | test 70.81%


Epoch 233/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 233/240 | loss 1.698 | train 79.99% | test 70.65%


Epoch 234/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 234/240 | loss 1.703 | train 80.05% | test 70.57%


Epoch 235/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 235/240 | loss 1.699 | train 80.11% | test 70.73%


Epoch 236/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 236/240 | loss 1.708 | train 79.92% | test 70.53%


Epoch 237/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 237/240 | loss 1.699 | train 80.09% | test 70.66%


Epoch 238/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 238/240 | loss 1.704 | train 79.87% | test 70.64%


Epoch 239/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 239/240 | loss 1.701 | train 80.07% | test 70.58%


Epoch 240/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 240/240 | loss 1.696 | train 80.12% | test 70.57%
✅ KD training finished. Best Test Acc: 70.81%


In [8]:

# --- Knowledge-distillation run: teacher from resnet110(), student from resnet20() ---

# Build the teacher network and restore its weights from the saved checkpoint.
teacher = resnet110(num_classes=100)
t_ckpt = torch.load(
    "/kaggle/input/resnet-110/ckpt_epoch_240.pth",
    map_location=device,
    # NOTE(review): weights_only=False allows full unpickling of the file,
    # so this checkpoint must come from a trusted source.
    weights_only=False,
)
# The checkpoint is either a wrapper holding the weights under 'model',
# or the state dict itself; handle both layouts.
teacher.load_state_dict(
    t_ckpt['model'] if 'model' in t_ckpt else t_ckpt
)

# Freshly initialised student network.
student = resnet20(num_classes=100)

# NOTE(review): the save path says "32_s" while the student is built with
# resnet20(); the previous run saved to "56_t-20_s.pth". Confirm whether the
# file name or the student constructor is the intended one.
train_via_KD(
    teacher,
    student,
    train_loader,
    test_loader,
    device,
    save_path="110_t-32_s.pth",
)

Epoch 1/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   1/240 | loss 14.803 | train  8.43% | test 11.31%
  ↳ Saved best @ epoch 1 (test 11.31%) → 110_t-32_s.pth


Epoch 2/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   2/240 | loss 11.942 | train 19.91% | test 21.33%
  ↳ Saved best @ epoch 2 (test 21.33%) → 110_t-32_s.pth


Epoch 3/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   3/240 | loss 9.979 | train 29.36% | test 27.78%
  ↳ Saved best @ epoch 3 (test 27.78%) → 110_t-32_s.pth


Epoch 4/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   4/240 | loss 8.752 | train 36.06% | test 33.85%
  ↳ Saved best @ epoch 4 (test 33.85%) → 110_t-32_s.pth


Epoch 5/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   5/240 | loss 7.881 | train 41.44% | test 37.23%
  ↳ Saved best @ epoch 5 (test 37.23%) → 110_t-32_s.pth


Epoch 6/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   6/240 | loss 7.247 | train 45.18% | test 37.06%


Epoch 7/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   7/240 | loss 6.785 | train 47.91% | test 45.18%
  ↳ Saved best @ epoch 7 (test 45.18%) → 110_t-32_s.pth


Epoch 8/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   8/240 | loss 6.478 | train 49.86% | test 44.83%


Epoch 9/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   9/240 | loss 6.199 | train 51.73% | test 41.07%


Epoch 10/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  10/240 | loss 5.961 | train 53.43% | test 48.70%
  ↳ Saved best @ epoch 10 (test 48.70%) → 110_t-32_s.pth


Epoch 11/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  11/240 | loss 5.809 | train 54.26% | test 47.08%


Epoch 12/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  12/240 | loss 5.641 | train 55.22% | test 47.92%


Epoch 13/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  13/240 | loss 5.559 | train 56.20% | test 47.65%


Epoch 14/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  14/240 | loss 5.412 | train 56.86% | test 48.91%
  ↳ Saved best @ epoch 14 (test 48.91%) → 110_t-32_s.pth


Epoch 15/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  15/240 | loss 5.323 | train 57.46% | test 48.12%


Epoch 16/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  16/240 | loss 5.264 | train 58.11% | test 51.75%
  ↳ Saved best @ epoch 16 (test 51.75%) → 110_t-32_s.pth


Epoch 17/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  17/240 | loss 5.138 | train 58.83% | test 53.86%
  ↳ Saved best @ epoch 17 (test 53.86%) → 110_t-32_s.pth


Epoch 18/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  18/240 | loss 5.103 | train 59.04% | test 47.97%


Epoch 19/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  19/240 | loss 5.059 | train 59.06% | test 53.86%


Epoch 20/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  20/240 | loss 4.987 | train 59.78% | test 54.22%
  ↳ Saved best @ epoch 20 (test 54.22%) → 110_t-32_s.pth


Epoch 21/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  21/240 | loss 4.971 | train 60.04% | test 53.53%


Epoch 22/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  22/240 | loss 4.876 | train 60.58% | test 51.19%


Epoch 23/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  23/240 | loss 4.849 | train 60.87% | test 53.89%


Epoch 24/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  24/240 | loss 4.800 | train 61.16% | test 53.18%


Epoch 25/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  25/240 | loss 4.789 | train 61.23% | test 51.33%


Epoch 26/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  26/240 | loss 4.765 | train 61.46% | test 54.50%
  ↳ Saved best @ epoch 26 (test 54.50%) → 110_t-32_s.pth


Epoch 27/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  27/240 | loss 4.727 | train 61.57% | test 55.29%
  ↳ Saved best @ epoch 27 (test 55.29%) → 110_t-32_s.pth


Epoch 28/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  28/240 | loss 4.713 | train 61.84% | test 55.02%


Epoch 29/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  29/240 | loss 4.698 | train 61.99% | test 53.36%


Epoch 30/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  30/240 | loss 4.677 | train 62.01% | test 55.04%


Epoch 31/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  31/240 | loss 4.657 | train 62.27% | test 52.92%


Epoch 32/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  32/240 | loss 4.607 | train 62.45% | test 53.42%


Epoch 33/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  33/240 | loss 4.602 | train 62.49% | test 53.80%


Epoch 34/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  34/240 | loss 4.581 | train 62.81% | test 52.18%


Epoch 35/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  35/240 | loss 4.538 | train 63.06% | test 53.15%


Epoch 36/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  36/240 | loss 4.531 | train 62.99% | test 54.80%


Epoch 37/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  37/240 | loss 4.551 | train 62.96% | test 55.22%


Epoch 38/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  38/240 | loss 4.504 | train 63.35% | test 51.91%


Epoch 39/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  39/240 | loss 4.520 | train 63.20% | test 53.56%


Epoch 40/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  40/240 | loss 4.482 | train 63.36% | test 55.73%
  ↳ Saved best @ epoch 40 (test 55.73%) → 110_t-32_s.pth


Epoch 41/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  41/240 | loss 4.497 | train 63.52% | test 53.00%


Epoch 42/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  42/240 | loss 4.494 | train 63.38% | test 54.90%


Epoch 43/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  43/240 | loss 4.429 | train 63.90% | test 55.14%


Epoch 44/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  44/240 | loss 4.453 | train 63.76% | test 55.92%
  ↳ Saved best @ epoch 44 (test 55.92%) → 110_t-32_s.pth


Epoch 45/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  45/240 | loss 4.434 | train 64.08% | test 53.89%


Epoch 46/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  46/240 | loss 4.425 | train 64.02% | test 55.19%


Epoch 47/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  47/240 | loss 4.415 | train 64.11% | test 55.84%


Epoch 48/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  48/240 | loss 4.387 | train 64.23% | test 56.18%
  ↳ Saved best @ epoch 48 (test 56.18%) → 110_t-32_s.pth


Epoch 49/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  49/240 | loss 4.363 | train 64.40% | test 54.15%


Epoch 50/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  50/240 | loss 4.404 | train 64.11% | test 55.52%


Epoch 51/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  51/240 | loss 4.395 | train 64.15% | test 53.34%


Epoch 52/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  52/240 | loss 4.365 | train 64.34% | test 54.16%


Epoch 53/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  53/240 | loss 4.387 | train 64.36% | test 56.07%


Epoch 54/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  54/240 | loss 4.345 | train 64.53% | test 55.82%


Epoch 55/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  55/240 | loss 4.352 | train 64.37% | test 56.30%
  ↳ Saved best @ epoch 55 (test 56.30%) → 110_t-32_s.pth


Epoch 56/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  56/240 | loss 4.343 | train 64.48% | test 56.02%


Epoch 57/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  57/240 | loss 4.338 | train 64.47% | test 55.24%


Epoch 58/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  58/240 | loss 4.355 | train 64.30% | test 54.06%


Epoch 59/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  59/240 | loss 4.325 | train 64.54% | test 57.44%
  ↳ Saved best @ epoch 59 (test 57.44%) → 110_t-32_s.pth


Epoch 60/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  60/240 | loss 4.328 | train 64.58% | test 54.67%


Epoch 61/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  61/240 | loss 4.300 | train 64.78% | test 56.31%


Epoch 62/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  62/240 | loss 4.304 | train 64.90% | test 58.82%
  ↳ Saved best @ epoch 62 (test 58.82%) → 110_t-32_s.pth


Epoch 63/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  63/240 | loss 4.295 | train 64.89% | test 57.91%


Epoch 64/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  64/240 | loss 4.297 | train 64.86% | test 58.13%


Epoch 65/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  65/240 | loss 4.302 | train 64.75% | test 54.40%


Epoch 66/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  66/240 | loss 4.262 | train 64.89% | test 56.48%


Epoch 67/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  67/240 | loss 4.262 | train 64.83% | test 55.58%


Epoch 68/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  68/240 | loss 4.269 | train 65.10% | test 55.30%


Epoch 69/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  69/240 | loss 4.251 | train 65.03% | test 57.78%


Epoch 70/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  70/240 | loss 4.260 | train 65.21% | test 56.18%


Epoch 71/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  71/240 | loss 4.247 | train 65.09% | test 55.55%


Epoch 72/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  72/240 | loss 4.262 | train 64.87% | test 55.60%


Epoch 73/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  73/240 | loss 4.265 | train 65.19% | test 54.87%


Epoch 74/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  74/240 | loss 4.257 | train 65.05% | test 56.57%


Epoch 75/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  75/240 | loss 4.247 | train 65.50% | test 57.58%


Epoch 76/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  76/240 | loss 4.199 | train 65.48% | test 55.90%


Epoch 77/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  77/240 | loss 4.249 | train 64.85% | test 56.37%


Epoch 78/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  78/240 | loss 4.238 | train 65.53% | test 49.93%


Epoch 79/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  79/240 | loss 4.242 | train 65.19% | test 59.30%
  ↳ Saved best @ epoch 79 (test 59.30%) → 110_t-32_s.pth


Epoch 80/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  80/240 | loss 4.211 | train 65.44% | test 51.03%


Epoch 81/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  81/240 | loss 4.207 | train 65.70% | test 54.53%


Epoch 82/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  82/240 | loss 4.220 | train 65.52% | test 58.65%


Epoch 83/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  83/240 | loss 4.197 | train 65.53% | test 55.94%


Epoch 84/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  84/240 | loss 4.233 | train 65.22% | test 58.02%


Epoch 85/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  85/240 | loss 4.216 | train 65.51% | test 52.13%


Epoch 86/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  86/240 | loss 4.187 | train 65.69% | test 57.35%


Epoch 87/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  87/240 | loss 4.198 | train 65.75% | test 57.38%


Epoch 88/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  88/240 | loss 4.159 | train 65.88% | test 55.97%


Epoch 89/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  89/240 | loss 4.192 | train 65.74% | test 57.73%


Epoch 90/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  90/240 | loss 4.188 | train 65.83% | test 53.13%


Epoch 91/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  91/240 | loss 4.186 | train 65.45% | test 57.16%


Epoch 92/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  92/240 | loss 4.169 | train 66.06% | test 57.86%


Epoch 93/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  93/240 | loss 4.179 | train 65.86% | test 56.83%


Epoch 94/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  94/240 | loss 4.167 | train 65.62% | test 57.72%


Epoch 95/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  95/240 | loss 4.162 | train 65.72% | test 58.04%


Epoch 96/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  96/240 | loss 4.189 | train 65.73% | test 53.42%


Epoch 97/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  97/240 | loss 4.184 | train 65.53% | test 55.50%


Epoch 98/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  98/240 | loss 4.156 | train 65.83% | test 55.15%


Epoch 99/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  99/240 | loss 4.197 | train 65.71% | test 54.82%


Epoch 100/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 100/240 | loss 4.161 | train 65.77% | test 57.29%


Epoch 101/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 101/240 | loss 4.172 | train 65.74% | test 58.14%


Epoch 102/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 102/240 | loss 4.143 | train 65.87% | test 58.78%


Epoch 103/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 103/240 | loss 4.165 | train 65.76% | test 57.08%


Epoch 104/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 104/240 | loss 4.148 | train 66.07% | test 55.63%


Epoch 105/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 105/240 | loss 4.135 | train 65.81% | test 53.26%


Epoch 106/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 106/240 | loss 4.136 | train 65.79% | test 53.72%


Epoch 107/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 107/240 | loss 4.131 | train 65.72% | test 52.05%


Epoch 108/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 108/240 | loss 4.145 | train 65.82% | test 57.22%


Epoch 109/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 109/240 | loss 4.151 | train 65.74% | test 57.75%


Epoch 110/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 110/240 | loss 4.156 | train 65.70% | test 59.40%
  ↳ Saved best @ epoch 110 (test 59.40%) → 110_t-32_s.pth


Epoch 111/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 111/240 | loss 4.146 | train 65.78% | test 55.24%


Epoch 112/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 112/240 | loss 4.121 | train 66.05% | test 58.63%


Epoch 113/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 113/240 | loss 4.139 | train 66.22% | test 58.12%


Epoch 114/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 114/240 | loss 4.104 | train 66.00% | test 57.95%


Epoch 115/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 115/240 | loss 4.170 | train 65.72% | test 55.76%


Epoch 116/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 116/240 | loss 4.153 | train 66.01% | test 56.77%


Epoch 117/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 117/240 | loss 4.142 | train 65.92% | test 56.88%


Epoch 118/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 118/240 | loss 4.101 | train 66.41% | test 56.22%


Epoch 119/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 119/240 | loss 4.100 | train 66.38% | test 57.38%


Epoch 120/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 120/240 | loss 4.112 | train 66.19% | test 57.13%


Epoch 121/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 121/240 | loss 4.103 | train 66.15% | test 56.03%


Epoch 122/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 122/240 | loss 4.107 | train 66.17% | test 55.07%


Epoch 123/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 123/240 | loss 4.112 | train 65.96% | test 55.38%


Epoch 124/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 124/240 | loss 4.119 | train 66.15% | test 57.58%


Epoch 125/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 125/240 | loss 4.115 | train 66.10% | test 56.27%


Epoch 126/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 126/240 | loss 4.115 | train 66.27% | test 58.95%


Epoch 127/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 127/240 | loss 4.124 | train 66.37% | test 58.25%


Epoch 128/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 128/240 | loss 4.117 | train 66.20% | test 57.18%


Epoch 129/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 129/240 | loss 4.094 | train 66.21% | test 58.11%


Epoch 130/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 130/240 | loss 4.115 | train 66.12% | test 57.83%


Epoch 131/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 131/240 | loss 4.096 | train 66.32% | test 59.47%
  ↳ Saved best @ epoch 131 (test 59.47%) → 110_t-32_s.pth


Epoch 132/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 132/240 | loss 4.115 | train 66.22% | test 60.60%
  ↳ Saved best @ epoch 132 (test 60.60%) → 110_t-32_s.pth


Epoch 133/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 133/240 | loss 4.107 | train 65.85% | test 54.72%


Epoch 134/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 134/240 | loss 4.114 | train 66.22% | test 58.96%


Epoch 135/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 135/240 | loss 4.085 | train 66.37% | test 57.64%


Epoch 136/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 136/240 | loss 4.098 | train 66.44% | test 57.51%


Epoch 137/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 137/240 | loss 4.092 | train 66.55% | test 57.77%


Epoch 138/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 138/240 | loss 4.076 | train 66.14% | test 57.16%


Epoch 139/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 139/240 | loss 4.049 | train 66.29% | test 57.30%


Epoch 140/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 140/240 | loss 4.097 | train 66.19% | test 55.66%


Epoch 141/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 141/240 | loss 4.080 | train 66.58% | test 54.84%


Epoch 142/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 142/240 | loss 4.084 | train 66.01% | test 54.93%


Epoch 143/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 143/240 | loss 4.068 | train 66.30% | test 54.70%


Epoch 144/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 144/240 | loss 4.094 | train 66.46% | test 57.35%


Epoch 145/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 145/240 | loss 4.089 | train 66.44% | test 56.03%


Epoch 146/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 146/240 | loss 4.109 | train 66.14% | test 57.15%


Epoch 147/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 147/240 | loss 4.081 | train 66.40% | test 54.57%


Epoch 148/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 148/240 | loss 4.091 | train 66.31% | test 56.78%


Epoch 149/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 149/240 | loss 4.085 | train 66.22% | test 58.45%


Epoch 150/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 150/240 | loss 4.055 | train 66.49% | test 56.93%


Epoch 151/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 151/240 | loss 3.158 | train 72.92% | test 68.14%
  ↳ Saved best @ epoch 151 (test 68.14%) → 110_t-32_s.pth


Epoch 152/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 152/240 | loss 2.930 | train 74.82% | test 68.66%
  ↳ Saved best @ epoch 152 (test 68.66%) → 110_t-32_s.pth


Epoch 153/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 153/240 | loss 2.865 | train 75.26% | test 68.88%
  ↳ Saved best @ epoch 153 (test 68.88%) → 110_t-32_s.pth


Epoch 154/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 154/240 | loss 2.825 | train 75.80% | test 68.63%


Epoch 155/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 155/240 | loss 2.788 | train 76.05% | test 69.28%
  ↳ Saved best @ epoch 155 (test 69.28%) → 110_t-32_s.pth


Epoch 156/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 156/240 | loss 2.765 | train 76.33% | test 69.13%


Epoch 157/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 157/240 | loss 2.735 | train 76.57% | test 68.90%


Epoch 158/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 158/240 | loss 2.722 | train 76.74% | test 69.34%
  ↳ Saved best @ epoch 158 (test 69.34%) → 110_t-32_s.pth


Epoch 159/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 159/240 | loss 2.705 | train 76.89% | test 69.47%
  ↳ Saved best @ epoch 159 (test 69.47%) → 110_t-32_s.pth


Epoch 160/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 160/240 | loss 2.683 | train 76.98% | test 68.92%


Epoch 161/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 161/240 | loss 2.668 | train 77.34% | test 69.00%


Epoch 162/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 162/240 | loss 2.657 | train 77.42% | test 68.90%


Epoch 163/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 163/240 | loss 2.637 | train 77.44% | test 69.02%


Epoch 164/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 164/240 | loss 2.636 | train 77.66% | test 69.38%


Epoch 165/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 165/240 | loss 2.632 | train 77.80% | test 69.42%


Epoch 166/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 166/240 | loss 2.621 | train 77.83% | test 69.58%
  ↳ Saved best @ epoch 166 (test 69.58%) → 110_t-32_s.pth


Epoch 167/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 167/240 | loss 2.604 | train 78.10% | test 68.58%


Epoch 168/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 168/240 | loss 2.606 | train 77.92% | test 69.63%
  ↳ Saved best @ epoch 168 (test 69.63%) → 110_t-32_s.pth


Epoch 169/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 169/240 | loss 2.601 | train 78.06% | test 69.51%


Epoch 170/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 170/240 | loss 2.601 | train 78.01% | test 69.36%


Epoch 171/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 171/240 | loss 2.593 | train 78.25% | test 69.57%


Epoch 172/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 172/240 | loss 2.579 | train 78.31% | test 69.42%


Epoch 173/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 173/240 | loss 2.574 | train 78.34% | test 69.00%


Epoch 174/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 174/240 | loss 2.578 | train 78.08% | test 69.29%


Epoch 175/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 175/240 | loss 2.575 | train 78.42% | test 69.19%


Epoch 176/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 176/240 | loss 2.563 | train 78.63% | test 68.89%


Epoch 177/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 177/240 | loss 2.562 | train 78.61% | test 69.00%


Epoch 178/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 178/240 | loss 2.564 | train 78.53% | test 69.11%


Epoch 179/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 179/240 | loss 2.548 | train 78.72% | test 68.75%


Epoch 180/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 180/240 | loss 2.556 | train 78.76% | test 68.78%


Epoch 181/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 181/240 | loss 2.437 | train 79.82% | test 70.06%
  ↳ Saved best @ epoch 181 (test 70.06%) → 110_t-32_s.pth


Epoch 182/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 182/240 | loss 2.394 | train 80.30% | test 70.17%
  ↳ Saved best @ epoch 182 (test 70.17%) → 110_t-32_s.pth


Epoch 183/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 183/240 | loss 2.382 | train 80.23% | test 70.20%
  ↳ Saved best @ epoch 183 (test 70.20%) → 110_t-32_s.pth


Epoch 184/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 184/240 | loss 2.375 | train 80.38% | test 70.27%
  ↳ Saved best @ epoch 184 (test 70.27%) → 110_t-32_s.pth


Epoch 185/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 185/240 | loss 2.359 | train 80.51% | test 70.32%
  ↳ Saved best @ epoch 185 (test 70.32%) → 110_t-32_s.pth


Epoch 186/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 186/240 | loss 2.364 | train 80.27% | test 70.40%
  ↳ Saved best @ epoch 186 (test 70.40%) → 110_t-32_s.pth


Epoch 187/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 187/240 | loss 2.360 | train 80.43% | test 70.33%


Epoch 188/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 188/240 | loss 2.348 | train 80.63% | test 70.39%


Epoch 189/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 189/240 | loss 2.357 | train 80.55% | test 70.05%


Epoch 190/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 190/240 | loss 2.351 | train 80.63% | test 70.47%
  ↳ Saved best @ epoch 190 (test 70.47%) → 110_t-32_s.pth


Epoch 191/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 191/240 | loss 2.357 | train 80.53% | test 70.22%


Epoch 192/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 192/240 | loss 2.349 | train 80.74% | test 70.41%


Epoch 193/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 193/240 | loss 2.342 | train 80.57% | test 70.33%


Epoch 194/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 194/240 | loss 2.349 | train 80.52% | test 70.38%


Epoch 195/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 195/240 | loss 2.344 | train 80.78% | test 70.17%


Epoch 196/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 196/240 | loss 2.343 | train 80.80% | test 70.30%


Epoch 197/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 197/240 | loss 2.341 | train 80.85% | test 70.14%


Epoch 198/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 198/240 | loss 2.338 | train 80.70% | test 70.21%


Epoch 199/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 199/240 | loss 2.336 | train 80.72% | test 70.31%


Epoch 200/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 200/240 | loss 2.341 | train 80.69% | test 70.36%


Epoch 201/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 201/240 | loss 2.340 | train 80.75% | test 70.44%


Epoch 202/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 202/240 | loss 2.336 | train 80.67% | test 70.25%


Epoch 203/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 203/240 | loss 2.330 | train 80.94% | test 70.31%


Epoch 204/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 204/240 | loss 2.331 | train 80.71% | test 70.19%


Epoch 205/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 205/240 | loss 2.334 | train 80.84% | test 70.41%


Epoch 206/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 206/240 | loss 2.318 | train 80.91% | test 70.19%


Epoch 207/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 207/240 | loss 2.331 | train 80.85% | test 70.32%


Epoch 208/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 208/240 | loss 2.324 | train 80.85% | test 70.45%


Epoch 209/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 209/240 | loss 2.331 | train 80.87% | test 70.28%


Epoch 210/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 210/240 | loss 2.321 | train 81.05% | test 70.35%


Epoch 211/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 211/240 | loss 2.314 | train 81.19% | test 70.25%


Epoch 212/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 212/240 | loss 2.310 | train 81.00% | test 70.18%


Epoch 213/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 213/240 | loss 2.310 | train 81.04% | test 70.25%


Epoch 214/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 214/240 | loss 2.306 | train 81.23% | test 70.30%


Epoch 215/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 215/240 | loss 2.306 | train 80.98% | test 70.36%


Epoch 216/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 216/240 | loss 2.307 | train 81.11% | test 70.26%


Epoch 217/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 217/240 | loss 2.304 | train 81.32% | test 70.23%


Epoch 218/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 218/240 | loss 2.304 | train 81.34% | test 70.25%


Epoch 219/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 219/240 | loss 2.316 | train 81.03% | test 70.35%


Epoch 220/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 220/240 | loss 2.303 | train 81.41% | test 70.25%


Epoch 221/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 221/240 | loss 2.300 | train 81.09% | test 70.42%


Epoch 222/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 222/240 | loss 2.294 | train 81.06% | test 70.39%


Epoch 223/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 223/240 | loss 2.296 | train 81.21% | test 70.50%
  ↳ Saved best @ epoch 223 (test 70.50%) → 110_t-32_s.pth


Epoch 224/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 224/240 | loss 2.297 | train 81.25% | test 70.36%


Epoch 225/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 225/240 | loss 2.302 | train 81.11% | test 70.32%


Epoch 226/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 226/240 | loss 2.291 | train 81.26% | test 70.35%


Epoch 227/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 227/240 | loss 2.304 | train 81.07% | test 70.45%


Epoch 228/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 228/240 | loss 2.305 | train 81.16% | test 70.35%


Epoch 229/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 229/240 | loss 2.317 | train 81.13% | test 70.40%


Epoch 230/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 230/240 | loss 2.308 | train 81.09% | test 70.37%


Epoch 231/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 231/240 | loss 2.309 | train 81.23% | test 70.59%
  ↳ Saved best @ epoch 231 (test 70.59%) → 110_t-32_s.pth


Epoch 232/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 232/240 | loss 2.302 | train 81.26% | test 70.15%


Epoch 233/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 233/240 | loss 2.299 | train 81.31% | test 70.31%


Epoch 234/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 234/240 | loss 2.295 | train 81.12% | test 70.42%


Epoch 235/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 235/240 | loss 2.309 | train 81.06% | test 70.23%


Epoch 236/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 236/240 | loss 2.319 | train 81.05% | test 70.46%


Epoch 237/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 237/240 | loss 2.282 | train 81.24% | test 70.45%


Epoch 238/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 238/240 | loss 2.299 | train 81.24% | test 70.43%


Epoch 239/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 239/240 | loss 2.291 | train 81.35% | test 70.41%


Epoch 240/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 240/240 | loss 2.307 | train 81.16% | test 70.43%
✅ KD training finished. Best Test Acc: 70.59%


In [9]:
# Knowledge-distillation run driven by train_via_KD: the model built by
# resnet110 acts as the teacher, the one built by resnet32_basic as the student.

# Use the GPU when CUDA reports one, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Teacher: instantiate the 100-class network, then restore its weights.
teacher = resnet110(num_classes=100)
teacher_ckpt_path = "/kaggle/input/resnet-110/ckpt_epoch_240.pth"
# NOTE: weights_only=False means full unpickling of the file, which is only
# appropriate for a checkpoint from a trusted source.
t_ckpt = torch.load(
    teacher_ckpt_path,
    map_location=device,
    weights_only=False,
)
# The file may wrap the weights under a 'model' key or be the state dict itself.
if 'model' in t_ckpt:
    teacher_state = t_ckpt['model']
else:
    teacher_state = t_ckpt
teacher.load_state_dict(teacher_state)

# Student: freshly initialised 100-class network (no checkpoint is loaded here).
student = resnet32_basic(num_classes=100)

# Launch training; the run's log reports its best checkpoint as saved to save_path.
train_via_KD(
    teacher,
    student,
    train_loader,
    test_loader,
    device,
    save_path="110_t-32_s.pth",
)

Epoch 1/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   1/240 | loss 14.883 | train  7.91% | test 11.83%
  ↳ Saved best @ epoch 1 (test 11.83%) → 110_t-32_s.pth


Epoch 2/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   2/240 | loss 12.160 | train 18.86% | test 20.39%
  ↳ Saved best @ epoch 2 (test 20.39%) → 110_t-32_s.pth


Epoch 3/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   3/240 | loss 10.197 | train 28.39% | test 26.32%
  ↳ Saved best @ epoch 3 (test 26.32%) → 110_t-32_s.pth


Epoch 4/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   4/240 | loss 8.867 | train 35.36% | test 32.83%
  ↳ Saved best @ epoch 4 (test 32.83%) → 110_t-32_s.pth


Epoch 5/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   5/240 | loss 7.895 | train 40.60% | test 31.48%


Epoch 6/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   6/240 | loss 7.171 | train 45.25% | test 38.08%
  ↳ Saved best @ epoch 6 (test 38.08%) → 110_t-32_s.pth


Epoch 7/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   7/240 | loss 6.658 | train 48.40% | test 43.67%
  ↳ Saved best @ epoch 7 (test 43.67%) → 110_t-32_s.pth


Epoch 8/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   8/240 | loss 6.279 | train 50.68% | test 42.56%


Epoch 9/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   9/240 | loss 5.948 | train 52.64% | test 43.46%


Epoch 10/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  10/240 | loss 5.686 | train 54.66% | test 49.61%
  ↳ Saved best @ epoch 10 (test 49.61%) → 110_t-32_s.pth


Epoch 11/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  11/240 | loss 5.476 | train 55.80% | test 51.06%
  ↳ Saved best @ epoch 11 (test 51.06%) → 110_t-32_s.pth


Epoch 12/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  12/240 | loss 5.319 | train 57.06% | test 46.83%


Epoch 13/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  13/240 | loss 5.182 | train 57.91% | test 53.86%
  ↳ Saved best @ epoch 13 (test 53.86%) → 110_t-32_s.pth


Epoch 14/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  14/240 | loss 5.082 | train 58.92% | test 52.20%


Epoch 15/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  15/240 | loss 4.959 | train 59.55% | test 50.23%


Epoch 16/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  16/240 | loss 4.877 | train 59.92% | test 52.51%


Epoch 17/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  17/240 | loss 4.779 | train 60.56% | test 51.47%


Epoch 18/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  18/240 | loss 4.717 | train 61.16% | test 53.91%
  ↳ Saved best @ epoch 18 (test 53.91%) → 110_t-32_s.pth


Epoch 19/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  19/240 | loss 4.637 | train 61.81% | test 54.92%
  ↳ Saved best @ epoch 19 (test 54.92%) → 110_t-32_s.pth


Epoch 20/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  20/240 | loss 4.550 | train 62.24% | test 55.77%
  ↳ Saved best @ epoch 20 (test 55.77%) → 110_t-32_s.pth


Epoch 21/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  21/240 | loss 4.511 | train 62.66% | test 52.99%


Epoch 22/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  22/240 | loss 4.467 | train 62.87% | test 51.31%


Epoch 23/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  23/240 | loss 4.438 | train 63.10% | test 53.69%


Epoch 24/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  24/240 | loss 4.359 | train 63.79% | test 52.80%


Epoch 25/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  25/240 | loss 4.359 | train 63.94% | test 56.95%
  ↳ Saved best @ epoch 25 (test 56.95%) → 110_t-32_s.pth


Epoch 26/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  26/240 | loss 4.306 | train 63.96% | test 53.03%


Epoch 27/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  27/240 | loss 4.276 | train 64.48% | test 54.42%


Epoch 28/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  28/240 | loss 4.286 | train 64.26% | test 55.66%


Epoch 29/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  29/240 | loss 4.237 | train 64.83% | test 56.55%


Epoch 30/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  30/240 | loss 4.204 | train 65.02% | test 55.56%


Epoch 31/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  31/240 | loss 4.189 | train 64.89% | test 57.15%
  ↳ Saved best @ epoch 31 (test 57.15%) → 110_t-32_s.pth


Epoch 32/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  32/240 | loss 4.151 | train 65.43% | test 57.39%
  ↳ Saved best @ epoch 32 (test 57.39%) → 110_t-32_s.pth


Epoch 33/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  33/240 | loss 4.166 | train 65.35% | test 54.85%


Epoch 34/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  34/240 | loss 4.100 | train 65.86% | test 54.45%


Epoch 35/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  35/240 | loss 4.096 | train 65.83% | test 58.73%
  ↳ Saved best @ epoch 35 (test 58.73%) → 110_t-32_s.pth


Epoch 36/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  36/240 | loss 4.077 | train 65.95% | test 52.22%


Epoch 37/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  37/240 | loss 4.060 | train 65.93% | test 56.20%


Epoch 38/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  38/240 | loss 4.040 | train 66.03% | test 55.41%


Epoch 39/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  39/240 | loss 4.032 | train 66.13% | test 57.09%


Epoch 40/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  40/240 | loss 3.983 | train 66.60% | test 60.20%
  ↳ Saved best @ epoch 40 (test 60.20%) → 110_t-32_s.pth


Epoch 41/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  41/240 | loss 4.015 | train 66.53% | test 55.24%


Epoch 42/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  42/240 | loss 4.000 | train 66.53% | test 54.97%


Epoch 43/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  43/240 | loss 3.985 | train 66.31% | test 56.01%


Epoch 44/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  44/240 | loss 3.940 | train 66.90% | test 55.28%


Epoch 45/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  45/240 | loss 3.933 | train 66.77% | test 57.53%


Epoch 46/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  46/240 | loss 3.952 | train 66.51% | test 52.93%


Epoch 47/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  47/240 | loss 3.942 | train 66.97% | test 59.18%


Epoch 48/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  48/240 | loss 3.931 | train 67.01% | test 54.16%


Epoch 49/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  49/240 | loss 3.927 | train 66.91% | test 53.49%


Epoch 50/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  50/240 | loss 3.877 | train 67.28% | test 54.59%


Epoch 51/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  51/240 | loss 3.912 | train 67.32% | test 56.02%


Epoch 52/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  52/240 | loss 3.860 | train 67.49% | test 55.38%


Epoch 53/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  53/240 | loss 3.860 | train 67.48% | test 57.65%


Epoch 54/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  54/240 | loss 3.874 | train 67.42% | test 57.80%


Epoch 55/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  55/240 | loss 3.879 | train 67.68% | test 57.00%


Epoch 56/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  56/240 | loss 3.859 | train 67.91% | test 55.44%


Epoch 57/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  57/240 | loss 3.870 | train 67.37% | test 59.76%


Epoch 58/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  58/240 | loss 3.843 | train 67.68% | test 58.49%


Epoch 59/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  59/240 | loss 3.825 | train 67.75% | test 58.64%


Epoch 60/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  60/240 | loss 3.837 | train 67.64% | test 57.21%


Epoch 61/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  61/240 | loss 3.813 | train 67.77% | test 54.68%


Epoch 62/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  62/240 | loss 3.796 | train 67.88% | test 55.53%


Epoch 63/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  63/240 | loss 3.799 | train 67.92% | test 58.85%


Epoch 64/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  64/240 | loss 3.792 | train 68.12% | test 52.87%


Epoch 65/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  65/240 | loss 3.789 | train 68.00% | test 56.16%


Epoch 66/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  66/240 | loss 3.802 | train 68.33% | test 56.56%


Epoch 67/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  67/240 | loss 3.794 | train 68.06% | test 56.82%


Epoch 68/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  68/240 | loss 3.759 | train 68.50% | test 56.05%


Epoch 69/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  69/240 | loss 3.767 | train 68.20% | test 57.03%


Epoch 70/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  70/240 | loss 3.751 | train 68.32% | test 51.96%


Epoch 71/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  71/240 | loss 3.754 | train 68.14% | test 58.87%


Epoch 72/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  72/240 | loss 3.739 | train 68.34% | test 58.10%


Epoch 73/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  73/240 | loss 3.742 | train 68.45% | test 57.30%


Epoch 74/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  74/240 | loss 3.763 | train 68.47% | test 55.42%


Epoch 75/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  75/240 | loss 3.717 | train 68.75% | test 59.64%


Epoch 76/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  76/240 | loss 3.700 | train 68.85% | test 57.55%


Epoch 77/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  77/240 | loss 3.716 | train 68.68% | test 61.41%
  ↳ Saved best @ epoch 77 (test 61.41%) → 110_t-32_s.pth


Epoch 78/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  78/240 | loss 3.704 | train 68.82% | test 57.03%


Epoch 79/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  79/240 | loss 3.734 | train 68.47% | test 57.83%


Epoch 80/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  80/240 | loss 3.704 | train 68.73% | test 59.34%


Epoch 81/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  81/240 | loss 3.687 | train 68.75% | test 60.23%


Epoch 82/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  82/240 | loss 3.701 | train 68.70% | test 58.09%


Epoch 83/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  83/240 | loss 3.648 | train 69.09% | test 59.51%


Epoch 84/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  84/240 | loss 3.666 | train 69.00% | test 58.25%


Epoch 85/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  85/240 | loss 3.670 | train 69.11% | test 60.90%


Epoch 86/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  86/240 | loss 3.670 | train 69.21% | test 57.41%


Epoch 87/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  87/240 | loss 3.711 | train 68.51% | test 59.93%


Epoch 88/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  88/240 | loss 3.654 | train 69.17% | test 60.26%


Epoch 89/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  89/240 | loss 3.660 | train 69.12% | test 59.63%


Epoch 90/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  90/240 | loss 3.680 | train 69.00% | test 58.72%


Epoch 91/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  91/240 | loss 3.683 | train 68.79% | test 56.47%


Epoch 92/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  92/240 | loss 3.659 | train 69.07% | test 58.09%


Epoch 93/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  93/240 | loss 3.647 | train 68.87% | test 59.68%


Epoch 94/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  94/240 | loss 3.669 | train 69.08% | test 50.73%


Epoch 95/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  95/240 | loss 3.644 | train 68.98% | test 59.89%


Epoch 96/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  96/240 | loss 3.646 | train 69.05% | test 57.67%


Epoch 97/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  97/240 | loss 3.667 | train 69.33% | test 59.31%


Epoch 98/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  98/240 | loss 3.631 | train 69.17% | test 61.22%


Epoch 99/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  99/240 | loss 3.619 | train 69.35% | test 57.91%


Epoch 100/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 100/240 | loss 3.648 | train 68.99% | test 56.15%


Epoch 101/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 101/240 | loss 3.655 | train 69.11% | test 58.34%


Epoch 102/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 102/240 | loss 3.656 | train 69.09% | test 59.15%


Epoch 103/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 103/240 | loss 3.623 | train 69.30% | test 61.75%
  ↳ Saved best @ epoch 103 (test 61.75%) → 110_t-32_s.pth


Epoch 104/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 104/240 | loss 3.590 | train 69.66% | test 60.00%


Epoch 105/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 105/240 | loss 3.648 | train 69.24% | test 58.96%


Epoch 106/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 106/240 | loss 3.649 | train 69.16% | test 56.60%


Epoch 107/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 107/240 | loss 3.633 | train 69.31% | test 59.28%


Epoch 108/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 108/240 | loss 3.645 | train 69.15% | test 59.10%


Epoch 109/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 109/240 | loss 3.612 | train 69.40% | test 59.93%


Epoch 110/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 110/240 | loss 3.615 | train 69.56% | test 56.30%


Epoch 111/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 111/240 | loss 3.588 | train 69.56% | test 57.28%


Epoch 112/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 112/240 | loss 3.635 | train 69.16% | test 60.34%


Epoch 113/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 113/240 | loss 3.583 | train 69.84% | test 56.23%


Epoch 114/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 114/240 | loss 3.609 | train 69.69% | test 59.65%


Epoch 115/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 115/240 | loss 3.598 | train 69.43% | test 58.56%


Epoch 116/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 116/240 | loss 3.584 | train 69.73% | test 60.05%


Epoch 117/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 117/240 | loss 3.600 | train 69.54% | test 61.62%


Epoch 118/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 118/240 | loss 3.587 | train 69.72% | test 59.48%


Epoch 119/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 119/240 | loss 3.592 | train 69.70% | test 59.56%


Epoch 120/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 120/240 | loss 3.557 | train 69.89% | test 60.49%


Epoch 121/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 121/240 | loss 3.564 | train 69.84% | test 59.18%


Epoch 122/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 122/240 | loss 3.584 | train 69.49% | test 61.42%


Epoch 123/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 123/240 | loss 3.586 | train 69.62% | test 58.27%


Epoch 124/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 124/240 | loss 3.586 | train 69.57% | test 58.34%


Epoch 125/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 125/240 | loss 3.565 | train 69.86% | test 61.34%


Epoch 126/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 126/240 | loss 3.561 | train 69.87% | test 60.67%


Epoch 127/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 127/240 | loss 3.577 | train 69.59% | test 60.57%


Epoch 128/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 128/240 | loss 3.583 | train 69.52% | test 54.80%


Epoch 129/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 129/240 | loss 3.571 | train 69.66% | test 60.85%


Epoch 130/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 130/240 | loss 3.591 | train 69.68% | test 58.88%


Epoch 131/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 131/240 | loss 3.556 | train 69.70% | test 58.87%


Epoch 132/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 132/240 | loss 3.576 | train 69.69% | test 59.73%


Epoch 133/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 133/240 | loss 3.573 | train 69.78% | test 58.12%


Epoch 134/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 134/240 | loss 3.585 | train 69.53% | test 58.19%


Epoch 135/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 135/240 | loss 3.535 | train 70.01% | test 57.89%


Epoch 136/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 136/240 | loss 3.572 | train 69.73% | test 61.25%


Epoch 137/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 137/240 | loss 3.574 | train 69.77% | test 57.51%


Epoch 138/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 138/240 | loss 3.582 | train 69.60% | test 61.15%


Epoch 139/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 139/240 | loss 3.536 | train 69.99% | test 59.16%


Epoch 140/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 140/240 | loss 3.526 | train 70.26% | test 60.66%


Epoch 141/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 141/240 | loss 3.555 | train 69.87% | test 57.23%


Epoch 142/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 142/240 | loss 3.551 | train 69.90% | test 61.60%


Epoch 143/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 143/240 | loss 3.576 | train 69.64% | test 58.63%


Epoch 144/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 144/240 | loss 3.559 | train 69.84% | test 58.98%


Epoch 145/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 145/240 | loss 3.520 | train 70.03% | test 58.72%


Epoch 146/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 146/240 | loss 3.537 | train 70.01% | test 58.44%


Epoch 147/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 147/240 | loss 3.562 | train 69.64% | test 56.93%


Epoch 148/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 148/240 | loss 3.545 | train 69.96% | test 60.88%


Epoch 149/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 149/240 | loss 3.505 | train 70.01% | test 59.98%


Epoch 150/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 150/240 | loss 3.519 | train 70.02% | test 57.92%


Epoch 151/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 151/240 | loss 2.615 | train 77.07% | test 71.08%
  ↳ Saved best @ epoch 151 (test 71.08%) → 110_t-32_s.pth


Epoch 152/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 152/240 | loss 2.365 | train 79.14% | test 72.16%
  ↳ Saved best @ epoch 152 (test 72.16%) → 110_t-32_s.pth


Epoch 153/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 153/240 | loss 2.296 | train 79.71% | test 71.73%


Epoch 154/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 154/240 | loss 2.236 | train 80.22% | test 72.35%
  ↳ Saved best @ epoch 154 (test 72.35%) → 110_t-32_s.pth


Epoch 155/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 155/240 | loss 2.202 | train 80.76% | test 72.45%
  ↳ Saved best @ epoch 155 (test 72.45%) → 110_t-32_s.pth


Epoch 156/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 156/240 | loss 2.169 | train 81.17% | test 72.76%
  ↳ Saved best @ epoch 156 (test 72.76%) → 110_t-32_s.pth


Epoch 157/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 157/240 | loss 2.140 | train 81.38% | test 72.59%


Epoch 158/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 158/240 | loss 2.128 | train 81.46% | test 72.91%
  ↳ Saved best @ epoch 158 (test 72.91%) → 110_t-32_s.pth


Epoch 159/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 159/240 | loss 2.100 | train 81.88% | test 73.11%
  ↳ Saved best @ epoch 159 (test 73.11%) → 110_t-32_s.pth


Epoch 160/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 160/240 | loss 2.075 | train 82.04% | test 73.02%


Epoch 161/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 161/240 | loss 2.068 | train 82.22% | test 72.79%


Epoch 162/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 162/240 | loss 2.056 | train 82.22% | test 72.79%


Epoch 163/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 163/240 | loss 2.039 | train 82.62% | test 72.57%


Epoch 164/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 164/240 | loss 2.025 | train 82.80% | test 72.82%


Epoch 165/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 165/240 | loss 2.022 | train 82.92% | test 73.19%
  ↳ Saved best @ epoch 165 (test 73.19%) → 110_t-32_s.pth


Epoch 166/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 166/240 | loss 2.006 | train 83.05% | test 72.67%


Epoch 167/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 167/240 | loss 1.988 | train 83.14% | test 72.57%


Epoch 168/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 168/240 | loss 1.991 | train 83.27% | test 72.68%


Epoch 169/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 169/240 | loss 1.984 | train 83.42% | test 72.90%


Epoch 170/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 170/240 | loss 1.971 | train 83.62% | test 72.59%


Epoch 171/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 171/240 | loss 1.964 | train 83.71% | test 72.47%


Epoch 172/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 172/240 | loss 1.957 | train 83.74% | test 72.81%


Epoch 173/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 173/240 | loss 1.937 | train 83.97% | test 72.69%


Epoch 174/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 174/240 | loss 1.934 | train 84.17% | test 72.55%


Epoch 175/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 175/240 | loss 1.933 | train 84.14% | test 72.81%


Epoch 176/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 176/240 | loss 1.928 | train 84.18% | test 73.03%


Epoch 177/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 177/240 | loss 1.944 | train 84.21% | test 72.94%


Epoch 178/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 178/240 | loss 1.915 | train 84.26% | test 72.60%


Epoch 179/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 179/240 | loss 1.926 | train 84.39% | test 72.58%


Epoch 180/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 180/240 | loss 1.920 | train 84.33% | test 72.50%


Epoch 181/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 181/240 | loss 1.800 | train 85.47% | test 73.44%
  ↳ Saved best @ epoch 181 (test 73.44%) → 110_t-32_s.pth


Epoch 182/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 182/240 | loss 1.780 | train 85.80% | test 73.62%
  ↳ Saved best @ epoch 182 (test 73.62%) → 110_t-32_s.pth


Epoch 183/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 183/240 | loss 1.762 | train 85.97% | test 73.67%
  ↳ Saved best @ epoch 183 (test 73.67%) → 110_t-32_s.pth


Epoch 184/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 184/240 | loss 1.758 | train 86.05% | test 73.92%
  ↳ Saved best @ epoch 184 (test 73.92%) → 110_t-32_s.pth


Epoch 185/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 185/240 | loss 1.755 | train 86.20% | test 73.74%


Epoch 186/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 186/240 | loss 1.740 | train 86.04% | test 73.45%


Epoch 187/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 187/240 | loss 1.739 | train 86.15% | test 73.59%


Epoch 188/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 188/240 | loss 1.739 | train 86.11% | test 73.63%


Epoch 189/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 189/240 | loss 1.746 | train 86.24% | test 73.71%


Epoch 190/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 190/240 | loss 1.731 | train 86.20% | test 73.75%


Epoch 191/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 191/240 | loss 1.733 | train 86.26% | test 73.64%


Epoch 192/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 192/240 | loss 1.732 | train 86.35% | test 73.72%


Epoch 193/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 193/240 | loss 1.729 | train 86.26% | test 73.64%


Epoch 194/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 194/240 | loss 1.732 | train 86.31% | test 73.61%


Epoch 195/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 195/240 | loss 1.726 | train 86.23% | test 73.78%


Epoch 196/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 196/240 | loss 1.726 | train 86.54% | test 73.75%


Epoch 197/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 197/240 | loss 1.723 | train 86.60% | test 73.71%


Epoch 198/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 198/240 | loss 1.717 | train 86.50% | test 73.76%


Epoch 199/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 199/240 | loss 1.724 | train 86.39% | test 73.77%


Epoch 200/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 200/240 | loss 1.723 | train 86.43% | test 73.90%


Epoch 201/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 201/240 | loss 1.718 | train 86.68% | test 73.60%


Epoch 202/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 202/240 | loss 1.712 | train 86.48% | test 73.63%


Epoch 203/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 203/240 | loss 1.710 | train 86.67% | test 73.64%


Epoch 204/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 204/240 | loss 1.717 | train 86.59% | test 73.74%


Epoch 205/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 205/240 | loss 1.714 | train 86.67% | test 73.74%


Epoch 206/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 206/240 | loss 1.725 | train 86.53% | test 73.68%


Epoch 207/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 207/240 | loss 1.704 | train 86.45% | test 73.79%


Epoch 208/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 208/240 | loss 1.706 | train 86.48% | test 73.62%


Epoch 209/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 209/240 | loss 1.709 | train 86.45% | test 73.66%


Epoch 210/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 210/240 | loss 1.705 | train 86.58% | test 73.74%


Epoch 211/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 211/240 | loss 1.697 | train 86.76% | test 73.79%


Epoch 212/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 212/240 | loss 1.687 | train 86.83% | test 73.78%


Epoch 213/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 213/240 | loss 1.695 | train 86.88% | test 73.90%


Epoch 214/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 214/240 | loss 1.695 | train 86.76% | test 73.70%


Epoch 215/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 215/240 | loss 1.692 | train 86.99% | test 73.73%


Epoch 216/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 216/240 | loss 1.682 | train 86.81% | test 73.85%


Epoch 217/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 217/240 | loss 1.688 | train 86.92% | test 73.85%


Epoch 218/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 218/240 | loss 1.693 | train 86.78% | test 73.85%


Epoch 219/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 219/240 | loss 1.696 | train 86.92% | test 73.86%


Epoch 220/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 220/240 | loss 1.689 | train 86.88% | test 73.77%


Epoch 221/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 221/240 | loss 1.691 | train 86.67% | test 73.70%


Epoch 222/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 222/240 | loss 1.688 | train 86.87% | test 73.80%


Epoch 223/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 223/240 | loss 1.693 | train 86.95% | test 73.71%


Epoch 224/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 224/240 | loss 1.686 | train 86.77% | test 73.80%


Epoch 225/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 225/240 | loss 1.683 | train 86.81% | test 73.81%


Epoch 226/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 226/240 | loss 1.689 | train 86.85% | test 73.76%


Epoch 227/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 227/240 | loss 1.689 | train 86.89% | test 73.57%


Epoch 228/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 228/240 | loss 1.685 | train 86.86% | test 73.84%


Epoch 229/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 229/240 | loss 1.684 | train 87.05% | test 73.80%


Epoch 230/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 230/240 | loss 1.691 | train 86.80% | test 73.69%


Epoch 231/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 231/240 | loss 1.691 | train 86.85% | test 73.61%


Epoch 232/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 232/240 | loss 1.684 | train 86.98% | test 73.81%


Epoch 233/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 233/240 | loss 1.688 | train 86.87% | test 73.71%


Epoch 234/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 234/240 | loss 1.684 | train 86.97% | test 73.76%


Epoch 235/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 235/240 | loss 1.683 | train 87.06% | test 73.61%


Epoch 236/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 236/240 | loss 1.691 | train 86.86% | test 73.87%


Epoch 237/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 237/240 | loss 1.682 | train 87.16% | test 73.95%
  ↳ Saved best @ epoch 237 (test 73.95%) → 110_t-32_s.pth


Epoch 238/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 238/240 | loss 1.689 | train 87.10% | test 73.75%


Epoch 239/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 239/240 | loss 1.680 | train 87.00% | test 73.92%


Epoch 240/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 240/240 | loss 1.685 | train 86.91% | test 73.72%
✅ KD training finished. Best Test Acc: 73.95%
