In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ----------------------------
# Basic 3x3 Convolution
# ----------------------------
def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride (default 1).

    Returns:
        A freshly initialised ``nn.Conv2d`` layer.
    """
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)

# ----------------------------
# Basic Residual Block
# ----------------------------
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 conv + batch-norm layers and a shortcut.

    Args:
        in_planes: channels of the incoming tensor.
        planes: channels produced by both convolutions.
        stride: stride of the first convolution (the second always uses 1).
        downsample: optional module applied to the input to build the shortcut;
            when ``None`` the input itself is used as the shortcut.
        is_last: when True, ``forward`` also returns the pre-activation tensor.
    """
    # Output channels are ``planes * expansion``.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, downsample=None, is_last=False):
        super().__init__()
        self.is_last = is_last
        self.conv1 = conv3x3(in_planes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample

    def forward(self, x):
        """Run the block.

        Returns:
            ``out`` (after the final ReLU), or ``(out, preact)`` when
            ``is_last`` is set, where ``preact`` is the sum of the main path
            and the shortcut before the final ReLU.
        """
        # Main path: conv -> bn -> relu -> conv -> bn.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut: identity unless a projection module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut

        # Keep a handle on the pre-activation sum; F.relu below is not
        # in-place, so this tensor is left untouched.
        preact = out
        out = F.relu(out)

        return (out, preact) if self.is_last else out

# ----------------------------
# ResNet CIFAR Modular
# ----------------------------
class ResNet(nn.Module):
    """CIFAR-style ResNet: a 3x3 stem, three stages of ``BasicBlock``s, pooling and a linear head.

    Args:
        depth: total depth; must be ``6n + 2`` with ``n >= 1``. Each of the
            three stages gets ``n`` blocks.
        num_filters: four channel widths: stem, stage 1, stage 2, stage 3.
            Only indices 0-3 are read; the list is never modified.
        block_name: only ``'basicblock'`` (case-insensitive) is accepted.
        num_classes: size of the classifier output.

    Raises:
        AssertionError: if ``block_name`` is unsupported or ``depth`` is not
            ``6n + 2`` with ``n >= 1``.
    """

    def __init__(self, depth, num_filters=[16,16,32,64], block_name='basicblock', num_classes=100):
        super().__init__()
        assert block_name.lower() == 'basicblock', "Currently only BasicBlock supported"
        assert (depth - 2) % 6 == 0, "Depth must be 6n+2 for BasicBlock"
        n = (depth - 2) // 6
        # With n < 1, _make_layer would still create one block per stage but with
        # is_last=False, so forward() would fail when unpacking (out, preact).
        # Reject such depths here with a clear message instead.
        assert n >= 1, "Depth must be at least 8 (6n+2 with n >= 1)"

        self.in_planes = num_filters[0]
        self.conv1 = conv3x3(3, num_filters[0])
        self.bn1 = nn.BatchNorm2d(num_filters[0])
        self.relu = nn.ReLU(inplace=True)

        # Residual layers; stages 2 and 3 start with a stride-2 block.
        self.layer1 = self._make_layer(BasicBlock, num_filters[1], n)
        self.layer2 = self._make_layer(BasicBlock, num_filters[2], n, stride=2)
        self.layer3 = self._make_layer(BasicBlock, num_filters[3], n, stride=2)

        # Fixed 8x8 pooling window followed by the classifier.
        self.avgpool = nn.AvgPool2d(8)
        self.fc = nn.Linear(num_filters[3]*BasicBlock.expansion, num_classes)

        # weight init
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

        The first block receives ``stride`` and, when the stride is not 1 or the
        channel count changes, a 1x1 conv + batch-norm projection for the
        shortcut. The final block of the stage is created with ``is_last=True``
        so the stage returns ``(out, preact)``. Updates ``self.in_planes``.
        """
        downsample = None
        if stride != 1 or self.in_planes != planes*block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_planes, planes*block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes*block.expansion)
            )

        layers = [block(self.in_planes, planes, stride, downsample, is_last=(blocks==1))]
        self.in_planes = planes*block.expansion
        for i in range(1, blocks):
            layers.append(block(self.in_planes, planes, is_last=(i==blocks-1)))
        return nn.Sequential(*layers)

    def forward(self, x, is_feat=False, preact=False):
        """Compute logits and, optionally, intermediate features.

        Args:
            x: input batch with 3 channels (the stem is ``conv3x3(3, ...)``).
            is_feat: when True, also return the list of intermediate features.
            preact: with ``is_feat``, return the stages' pre-activation tensors
                instead of their post-ReLU outputs.

        Returns:
            The logits, or ``(features, logits)`` when ``is_feat`` is True, where
            ``features`` is ``[stem, stage1, stage2, stage3, pooled]``.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        f0 = x

        # Each stage ends in an is_last block, hence the (out, preact) pair.
        x, f1_pre = self.layer1(x)
        f1 = x
        x, f2_pre = self.layer2(x)
        f2 = x
        x, f3_pre = self.layer3(x)
        f3 = x

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        f4 = x
        x = self.fc(x)

        if is_feat:
            return ([f0,f1,f2,f3,f4], x) if not preact else ([f0,f1_pre,f2_pre,f3_pre,f4], x)
        else:
            return x

# ----------------------------
# Architecture builders
# ----------------------------
def resnet20(num_classes=100):
    """Depth-20 ResNet (3 blocks per stage) with the default channel widths."""
    return ResNet(20, num_classes=num_classes)


def resnet32(num_classes=100):
    """Depth-32 ResNet with widened channels ``[32, 64, 128, 256]``.

    This is NOT the default-width depth-32 network (see ``resnet32_basic``):
    the three stages are four times wider than the default ``[16, 16, 32, 64]``.
    Elsewhere in this notebook it is referred to as "resnet-32x4".
    """
    # These widths must match the checkpoint that will be loaded into the model.
    wide_filters = [32, 64, 128, 256]
    return ResNet(depth=32, num_filters=wide_filters, block_name='basicblock', num_classes=num_classes)


def resnet56(num_classes=100):
    """Depth-56 ResNet (9 blocks per stage) with the default channel widths."""
    return ResNet(56, num_classes=num_classes)


def resnet110(num_classes=100):
    """Depth-110 ResNet (18 blocks per stage) with the default channel widths."""
    return ResNet(110, num_classes=num_classes)


def resnet32_basic(num_classes=100):
    """Depth-32 ResNet (5 blocks per stage) with the default channel widths."""
    return ResNet(32, num_classes=num_classes)

In [2]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# ---------------------------
# Device
# ---------------------------
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------------------------
# CIFAR-100 test dataset
# ---------------------------
# Per-channel normalisation statistics applied to every test image.
mean = (0.5071, 0.4867, 0.4408)
std  = (0.2675, 0.2565, 0.2761)

# No augmentation at test time: tensor conversion + normalisation only.
test_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

# download=True fetches the dataset into ./data if it is not there yet.
test_ds = datasets.CIFAR100("./data", train=False, transform=test_tf, download=True)
test_loader = DataLoader(test_ds, batch_size=128, shuffle=False, num_workers=2, pin_memory=True)

# ---------------------------
# Models to evaluate
# ---------------------------
# NOTE: the values are already-instantiated networks (the builders are called
# here), not constructors.
model_dict = {
    # "ResNet20": resnet20,
    # "ResNet32": resnet32(), # this is the resnet-32x4
    "ResNet56": resnet56(),
    "ResNet110": resnet110()
}

# Path mapping for pretrained weights (adjust paths if needed)
# Keys must match model_dict; a missing key means "evaluate without loading weights".
weights_dict = {
    # "ResNet20": "/path/to/resnet20.pth",
    # "ResNet32": "/kaggle/input/resnet32/pytorch/default/1/ckpt_epoch_240.pth",
    "ResNet56":  "/kaggle/input/resnet-56/ckpt_epoch_240.pth",
    "ResNet110": "/kaggle/input/resnet-110/ckpt_epoch_240.pth"
}

# ---------------------------
# Evaluation loop
# ---------------------------
# `constructor` is a model instance (see model_dict above), despite its name.
for name, constructor in model_dict.items():
    print(f"Evaluating {name}...")
    model = constructor.to(device)
    
    # Load pretrained weights if available
    weight_path = weights_dict.get(name)
    if weight_path:
        # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
        # objects; only use it with checkpoint files you trust.
        checkpoint = torch.load(weight_path, map_location=device,weights_only=False)
        # The checkpoint is either a dict holding the weights under 'model'
        # or the state_dict itself.
        if 'model' in checkpoint:
            state_dict = checkpoint['model']
        else:
            state_dict = checkpoint
        model.load_state_dict(state_dict)
    
    # Top-1 accuracy over the whole test loader, in eval mode and without gradients.
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for imgs, labels in test_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            logits = model(imgs)
            preds = logits.argmax(1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    test_acc = 100 * correct / total
    print(f"🎯 {name} Test Accuracy on CIFAR-100: {test_acc:.2f}%\n")


100%|██████████| 169M/169M [00:02<00:00, 58.3MB/s]


Evaluating ResNet56...
🎯 ResNet56 Test Accuracy on CIFAR-100: 72.41%

Evaluating ResNet110...
🎯 ResNet110 Test Accuracy on CIFAR-100: 74.31%



# Training the student models (basic ResNet-20 / ResNet-32)

import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
from torchvision.transforms import AutoAugment, AutoAugmentPolicy, RandomErasing
from tqdm.auto import tqdm
#from torchvision.models import resnet18, resnet34



# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# True when more than one GPU is visible. NOTE(review): not referenced again
# in the visible part of the notebook.
use_dp = torch.cuda.device_count() > 1
# Seed the torch and NumPy random generators.
torch.manual_seed(42)
np.random.seed(42)

mean = (0.5071, 0.4867, 0.4408)  # CIFAR-100 mean
std  = (0.2675, 0.2565, 0.2761)  # CIFAR-100 std

batch_size = 64

# Precompute DataLoaders for each resolution
# Each stage is a (resolution, epochs) pair; the comprehension is an identity
# copy, so this evaluates to [(32, 240)].
stages = [(r) for r in [(32, 240)]]
# Maps resolution -> {'train': DataLoader}.
dataloader_dict = {}

# The bare string below is not executed as code; it records the reference
# transforms the author copied for comparison.
'''
Copied from the paper as it is.

they are not using any validation sets. Training it on the entire train set!
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=stdv),
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=stdv),
    ])
    
    '''

for resolution, _ in stages:
    # Training augmentation: padded random crop + horizontal flip, then normalisation.
    train_tf = transforms.Compose([
        transforms.RandomCrop(resolution,padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    
    # download=False: expects the dataset to be present in ./data already.
    train_set = datasets.CIFAR100('./data', train=True, download=False, transform=train_tf)
    
    train_loader = DataLoader(train_set, batch_size=batch_size,
                              shuffle=True, num_workers=0, pin_memory=True)
    dataloader_dict[resolution] = {
        'train': train_loader
    }

# Test loader (fixed resolution)
# No augmentation: tensor conversion + normalisation only.
test_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

test_ds = datasets.CIFAR100('./data', train=False, transform=test_tf)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)


# The learning rate is divided by 10 at epochs 150, 180 and 210 (MultiStepLR with gamma=0.1).

# ---------------------------
# Test loop
# ---------------------------

def test(Test_model):
    """Return the top-1 accuracy (in percent) of ``Test_model`` on ``test_loader``.

    Relies on the notebook-level ``test_loader`` and ``device``. The model is
    switched to eval mode and is left in that mode on return.
    """
    Test_model.eval()
    hits, seen = 0, 0

    with torch.no_grad():
        for batch_imgs, batch_labels in test_loader:
            batch_imgs = batch_imgs.to(device)
            batch_labels = batch_labels.to(device)
            # Predicted class = index of the largest logit.
            predicted = Test_model(batch_imgs).argmax(1)
            hits += predicted.eq(batch_labels).sum().item()
            seen += batch_labels.size(0)

    return 100 * hits / seen

# ---------------------------
# Training loop
# ---------------------------

def train(model, model_type):
    """Train ``model`` with cross-entropy and checkpoint the best-scoring epoch.

    Relies on the notebook-level ``stages`` (``(resolution, epochs)`` pairs),
    ``dataloader_dict`` (train loader per resolution), ``device`` and ``test``.
    After every epoch the model is scored with ``test``; whenever the score
    improves, the weights are written to ``resnet{model_type}_student.pth``.

    Args:
        model: network to optimise; it is moved to ``device`` first.
        model_type: label interpolated into the checkpoint file name.

    Returns:
        None.
    """
    # Loss + optimizer
    lr = 0.05 # as per the paper
    # Make sure the weights live where the batches are sent (no-op if already there).
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)

    # Step LR schedule: decay at 150, 180, 210 epochs
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[150, 180, 210], gamma=0.1)

    best_test_acc = 0.0

    for res, epochs in stages:

        print(f"\n=== Training at resolution {res}px ===")
        tr_loader = dataloader_dict[res]['train']

        for e in range(1, epochs+1):

            model.train()
            correct = total = 0

            # The progress bar shows this stage's own epoch budget.
            for imgs, labels in tqdm(tr_loader, desc=f"Epoch {e}/{epochs}"):

                imgs, labels = imgs.to(device), labels.to(device)
                optimizer.zero_grad()
                logits = model(imgs)
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()

                preds = logits.argmax(1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

            train_acc = 100 * correct / total
            # The scheduler is stepped once per epoch.
            scheduler.step()

            test_acc = test(model)

            print(f"Epoch {e}: Train Acc = {train_acc:.2f}% Test Accuracy = {test_acc:.2f}%")

            # Save best model
            if test_acc > best_test_acc:
                best_test_acc = test_acc
                # Unwrap DataParallel (as train_via_KD does) so the saved keys
                # carry no "module." prefix and load into a bare model.
                to_save = model.module if isinstance(model, nn.DataParallel) else model
                torch.save(to_save.state_dict(), f"resnet{model_type}_student.pth")
                # Report the test accuracy that triggered the save.
                print(f"→ Saved best model at epoch {e} with Test Acc = {test_acc:.2f}%")
    print("✅ Training completed!")

# Vanilla knowledge-distillation (KD) training


In [3]:
import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms
from torchvision.transforms import AutoAugment, AutoAugmentPolicy, RandomErasing
from tqdm.auto import tqdm
#from torchvision.models import resnet18, resnet34
from torchvision import datasets, transforms
from torch.utils.data import DataLoader


# Per-channel normalisation statistics passed to transforms.Normalize in the loader cell.
mean = (0.5071, 0.4867, 0.4408)
std  = (0.2675, 0.2565, 0.2761)
# Shared DataLoader settings for the KD experiment.
batch_size = 128
num_workers = 0  # load batches in the main process
pin_mem = True   # forwarded to DataLoader(pin_memory=...)

@torch.no_grad()
def evaluate(model, loader, device):
    """Return the top-1 accuracy (in percent) of ``model`` over ``loader``.

    Args:
        model: network to score; switched to eval mode and left in it.
        loader: iterable yielding ``(inputs, targets)`` batches.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``100.0 * correct / total`` as a float.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Predicted class = index of the largest logit.
        n_correct += model(inputs).argmax(1).eq(targets).sum().item()
        n_seen += targets.size(0)
    return 100.0 * n_correct / n_seen


# Seed the torch (CPU) and NumPy random generators.
torch.manual_seed(27)
np.random.seed(27)

# Seed the generators of all CUDA devices.
torch.cuda.manual_seed_all(27)
# NOTE(review): with deterministic=False and benchmark=True cuDNN is free to
# pick its fastest kernels, so runs are presumably not bit-for-bit reproducible
# despite the fixed seeds. Confirm this trade-off is intended.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True



In [4]:
# Each stage is a (resolution, epochs) pair; the comprehension is an identity
# copy, so this evaluates to [(32, 240)].
stages = [(r) for r in [(32, 240)]]
# Maps resolution -> {'train': DataLoader}.
dataloader_dict = {}

# --- Build train loader(s) for each stage ---
for resolution, _ in stages:
    # Training augmentation: padded random crop + horizontal flip, then normalisation.
    train_tf = transforms.Compose([
        transforms.RandomCrop(resolution, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    train_set = datasets.CIFAR100('./data', train=True, download=True, transform=train_tf)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, pin_memory=pin_mem)

    dataloader_dict[resolution] = {'train': train_loader}

# --- Single test loader (no augmentation, normalisation only) ---
test_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
test_set = datasets.CIFAR100('./data', train=False, download=True, transform=test_tf)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False,
                         num_workers=num_workers, pin_memory=pin_mem)

# --- Pick the 32px stage's train loader (training itself is launched in a later cell) ---
resolution, _ = stages[0]          # (32, 240)
train_loader = dataloader_dict[resolution]['train']


In [5]:
# The learning rate is divided by 10 at each milestone (epochs 150, 180 and 210 by default).
# ---------------------------
# Distillation loss used by the training loop
# ---------------------------

def kd_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.9): 
    """Knowledge-distillation objective: ``alpha * KD + (1 - alpha) * CE``.

    NOTE(review): the previous docstring said "alpha = 0.1 as per the github
    repo", which contradicts the default of 0.9 used here. Presumably 0.1 was
    the weight of the cross-entropy term (``1 - alpha``); confirm against the
    reference implementation.

    Args:
        student_logits: raw student outputs; softmax is taken over dim 1.
        teacher_logits: raw teacher outputs in the same layout. They are
            detached inside this function, so no gradient reaches the teacher
            even if the caller did not run it under ``torch.no_grad()``.
        labels: ground-truth class indices for the cross-entropy term.
        T: temperature dividing both sets of logits before the softmax.
        alpha: weight of the soft (KD) term; the hard-label term gets
            ``1 - alpha``.

    Returns:
        Scalar loss tensor.
    """
    # Hard-label loss
    ce = F.cross_entropy(student_logits, labels)

    # Soft targets are constants for the optimisation: cut the graph here.
    soft_targets = F.softmax(teacher_logits.detach() / T, dim=1)

    # F.kl_div expects log-probabilities as its first argument. The result is
    # multiplied by T*T, the usual KD scaling for the temperature-softened term.
    kd = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        soft_targets,
        reduction="batchmean") * (T * T)

    return (1 - alpha) * ce + alpha * kd

In [6]:
def train_via_KD(t_model, s_model, train_loader, test_loader, device,
                 epochs=240, base_lr=0.05, wd=5e-4, milestones=(150,180,210),
                 T=4.0, alpha=0.9, save_path="student_kd.pth"):
    """Train a student network against a frozen teacher with ``kd_loss``.

    Args:
        t_model: teacher; moved to ``device``, put in eval mode, and all its
            parameters get ``requires_grad = False``.
        s_model: student to optimise; moved to ``device``.
        train_loader: yields ``(imgs, labels)`` training batches.
        test_loader: passed to ``evaluate`` after every epoch.
        device: device used for both models and all batches.
        epochs: number of training epochs.
        base_lr: initial SGD learning rate (momentum is fixed at 0.9).
        wd: SGD weight decay.
        milestones: epochs at which the learning rate is multiplied by 0.1.
        T: temperature forwarded to ``kd_loss``.
        alpha: soft-loss weight forwarded to ``kd_loss``.
        save_path: file the best student ``state_dict`` is written to.

    Returns:
        None. The checkpoint is overwritten whenever the accuracy returned by
        ``evaluate`` on ``test_loader`` improves.
    """
    # The teacher only supplies targets: inference mode, no trainable weights.
    t_model.to(device).eval()
    for param in t_model.parameters():
        param.requires_grad = False

    s_model = s_model.to(device)

    sgd = optim.SGD(s_model.parameters(), lr=base_lr, momentum=0.9, weight_decay=wd)
    lr_schedule = optim.lr_scheduler.MultiStepLR(sgd, milestones=list(milestones), gamma=0.1)

    best_test = -1.0
    for e in range(1, epochs + 1):
        s_model.train()
        loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        progress = tqdm(train_loader, desc=f"Epoch {e}/{epochs}")
        for imgs, labels in progress:
            imgs = imgs.to(device)
            labels = labels.to(device)

            # Teacher forward pass without building a graph.
            with torch.no_grad():
                t_logits = t_model(imgs)

            s_logits = s_model(imgs)
            loss = kd_loss(s_logits, t_logits, labels, T=T, alpha=alpha)

            sgd.zero_grad()
            loss.backward()
            sgd.step()

            # Weight the batch-mean loss by the batch size so the epoch
            # average below is per sample.
            loss_sum += loss.item() * imgs.size(0)
            n_correct += (s_logits.argmax(1) == labels).sum().item()
            n_seen += labels.size(0)

        # The learning-rate schedule advances once per epoch.
        lr_schedule.step()
        train_loss = loss_sum / n_seen
        train_acc  = 100.0 * n_correct / n_seen
        test_acc   = evaluate(s_model, test_loader, device)

        print(f"Epoch {e:3d}/{epochs} | loss {train_loss:.3f} | train {train_acc:5.2f}% | test {test_acc:5.2f}%")

        if test_acc > best_test:
            best_test = test_acc
            # Save the wrapped module's weights when the student is a DataParallel.
            if isinstance(s_model, torch.nn.DataParallel):
                weights = s_model.module.state_dict()
            else:
                weights = s_model.state_dict()
            torch.save(weights, save_path)
            print(f"  ↳ Saved best @ epoch {e} (test {best_test:.2f}%) → {save_path}")

    print(f"✅ KD training finished. Best Test Acc: {best_test:.2f}%")


In [7]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Teacher: ResNet-56 restored from a pretrained checkpoint.
teacher = resnet56(num_classes = 100)
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects;
# only use it with checkpoint files you trust.
t_ckpt  = torch.load("/kaggle/input/resnet-56/ckpt_epoch_240.pth", map_location=device, weights_only=False)
# The checkpoint is either a dict holding the weights under 'model' or the state_dict itself.
teacher.load_state_dict(t_ckpt['model'] if 'model' in t_ckpt else t_ckpt)

# Student: ResNet-20 with freshly initialised weights.
student = resnet20(num_classes=100)

# Distil with train_via_KD's default hyper-parameters; the best student is written to save_path.
train_via_KD(teacher, student, train_loader, test_loader, device, save_path= "56_t-20_s.pth")

Epoch 1/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   1/240 | loss 12.014 | train  9.05% | test 12.12%
  ↳ Saved best @ epoch 1 (test 12.12%) → 56_t-20_s.pth


Epoch 2/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   2/240 | loss 9.440 | train 20.84% | test 21.26%
  ↳ Saved best @ epoch 2 (test 21.26%) → 56_t-20_s.pth


Epoch 3/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   3/240 | loss 7.778 | train 30.49% | test 28.29%
  ↳ Saved best @ epoch 3 (test 28.29%) → 56_t-20_s.pth


Epoch 4/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   4/240 | loss 6.755 | train 36.88% | test 35.17%
  ↳ Saved best @ epoch 4 (test 35.17%) → 56_t-20_s.pth


Epoch 5/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   5/240 | loss 5.998 | train 42.39% | test 39.79%
  ↳ Saved best @ epoch 5 (test 39.79%) → 56_t-20_s.pth


Epoch 6/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   6/240 | loss 5.483 | train 45.98% | test 43.54%
  ↳ Saved best @ epoch 6 (test 43.54%) → 56_t-20_s.pth


Epoch 7/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   7/240 | loss 5.119 | train 48.71% | test 39.73%


Epoch 8/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   8/240 | loss 4.854 | train 50.50% | test 47.11%
  ↳ Saved best @ epoch 8 (test 47.11%) → 56_t-20_s.pth


Epoch 9/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   9/240 | loss 4.602 | train 52.53% | test 46.82%


Epoch 10/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  10/240 | loss 4.442 | train 54.05% | test 49.12%
  ↳ Saved best @ epoch 10 (test 49.12%) → 56_t-20_s.pth


Epoch 11/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  11/240 | loss 4.304 | train 54.78% | test 46.34%


Epoch 12/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  12/240 | loss 4.170 | train 56.23% | test 45.78%


Epoch 13/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  13/240 | loss 4.100 | train 56.72% | test 51.99%
  ↳ Saved best @ epoch 13 (test 51.99%) → 56_t-20_s.pth


Epoch 14/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  14/240 | loss 3.991 | train 57.68% | test 49.87%


Epoch 15/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  15/240 | loss 3.913 | train 58.38% | test 49.63%


Epoch 16/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  16/240 | loss 3.857 | train 58.93% | test 51.23%


Epoch 17/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  17/240 | loss 3.800 | train 59.26% | test 48.41%


Epoch 18/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  18/240 | loss 3.738 | train 60.04% | test 53.56%
  ↳ Saved best @ epoch 18 (test 53.56%) → 56_t-20_s.pth


Epoch 19/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  19/240 | loss 3.699 | train 60.26% | test 48.50%


Epoch 20/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  20/240 | loss 3.667 | train 60.51% | test 54.75%
  ↳ Saved best @ epoch 20 (test 54.75%) → 56_t-20_s.pth


Epoch 21/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  21/240 | loss 3.610 | train 60.92% | test 56.18%
  ↳ Saved best @ epoch 21 (test 56.18%) → 56_t-20_s.pth


Epoch 22/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  22/240 | loss 3.601 | train 61.12% | test 53.29%


Epoch 23/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  23/240 | loss 3.544 | train 61.55% | test 50.20%


Epoch 24/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  24/240 | loss 3.516 | train 61.86% | test 54.76%


Epoch 25/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  25/240 | loss 3.485 | train 61.92% | test 53.20%


Epoch 26/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  26/240 | loss 3.492 | train 61.99% | test 54.59%


Epoch 27/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  27/240 | loss 3.451 | train 62.38% | test 55.17%


Epoch 28/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  28/240 | loss 3.425 | train 62.80% | test 55.97%


Epoch 29/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  29/240 | loss 3.400 | train 62.82% | test 54.37%


Epoch 30/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  30/240 | loss 3.382 | train 63.03% | test 54.64%


Epoch 31/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  31/240 | loss 3.369 | train 62.98% | test 55.53%


Epoch 32/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  32/240 | loss 3.385 | train 63.30% | test 49.74%


Epoch 33/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  33/240 | loss 3.342 | train 63.49% | test 48.29%


Epoch 34/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  34/240 | loss 3.336 | train 63.39% | test 55.02%


Epoch 35/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  35/240 | loss 3.306 | train 63.89% | test 55.77%


Epoch 36/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  36/240 | loss 3.324 | train 63.41% | test 55.06%


Epoch 37/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  37/240 | loss 3.282 | train 64.05% | test 56.91%
  ↳ Saved best @ epoch 37 (test 56.91%) → 56_t-20_s.pth


Epoch 38/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  38/240 | loss 3.276 | train 64.22% | test 57.39%
  ↳ Saved best @ epoch 38 (test 57.39%) → 56_t-20_s.pth


Epoch 39/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  39/240 | loss 3.266 | train 63.95% | test 57.30%


Epoch 40/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  40/240 | loss 3.250 | train 64.06% | test 55.42%


Epoch 41/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  41/240 | loss 3.249 | train 64.44% | test 53.89%


Epoch 42/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  42/240 | loss 3.240 | train 64.20% | test 55.58%


Epoch 43/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  43/240 | loss 3.234 | train 64.65% | test 58.58%
  ↳ Saved best @ epoch 43 (test 58.58%) → 56_t-20_s.pth


Epoch 44/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  44/240 | loss 3.190 | train 64.66% | test 56.99%


Epoch 45/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  45/240 | loss 3.198 | train 64.71% | test 59.46%
  ↳ Saved best @ epoch 45 (test 59.46%) → 56_t-20_s.pth


Epoch 46/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  46/240 | loss 3.208 | train 64.66% | test 53.27%


Epoch 47/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  47/240 | loss 3.213 | train 64.77% | test 55.87%


Epoch 48/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  48/240 | loss 3.215 | train 64.38% | test 56.05%


Epoch 49/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  49/240 | loss 3.191 | train 64.71% | test 55.95%


Epoch 50/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  50/240 | loss 3.162 | train 65.02% | test 57.83%


Epoch 51/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  51/240 | loss 3.189 | train 64.82% | test 58.20%


Epoch 52/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  52/240 | loss 3.183 | train 65.10% | test 54.55%


Epoch 53/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  53/240 | loss 3.170 | train 64.84% | test 56.65%


Epoch 54/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  54/240 | loss 3.144 | train 65.32% | test 58.62%


Epoch 55/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  55/240 | loss 3.151 | train 65.27% | test 57.08%


Epoch 56/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  56/240 | loss 3.125 | train 65.48% | test 55.26%


Epoch 57/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  57/240 | loss 3.150 | train 65.28% | test 56.03%


Epoch 58/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  58/240 | loss 3.122 | train 65.19% | test 54.49%


Epoch 59/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  59/240 | loss 3.115 | train 65.12% | test 56.90%


Epoch 60/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  60/240 | loss 3.096 | train 65.52% | test 56.92%


Epoch 61/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  61/240 | loss 3.116 | train 65.33% | test 53.65%


Epoch 62/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  62/240 | loss 3.093 | train 65.59% | test 58.08%


Epoch 63/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  63/240 | loss 3.109 | train 65.38% | test 57.60%


Epoch 64/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  64/240 | loss 3.088 | train 65.28% | test 56.25%


Epoch 65/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  65/240 | loss 3.094 | train 65.62% | test 55.85%


Epoch 66/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  66/240 | loss 3.092 | train 65.59% | test 56.91%


Epoch 67/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  67/240 | loss 3.116 | train 65.34% | test 57.90%


Epoch 68/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  68/240 | loss 3.089 | train 65.78% | test 53.85%


Epoch 69/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  69/240 | loss 3.071 | train 65.77% | test 56.05%


Epoch 70/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  70/240 | loss 3.094 | train 65.65% | test 56.23%


Epoch 71/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  71/240 | loss 3.068 | train 65.96% | test 59.00%


Epoch 72/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  72/240 | loss 3.064 | train 65.93% | test 55.52%


Epoch 73/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  73/240 | loss 3.066 | train 66.01% | test 57.49%


Epoch 74/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  74/240 | loss 3.071 | train 65.73% | test 59.44%


Epoch 75/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  75/240 | loss 3.049 | train 66.01% | test 55.46%


Epoch 76/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  76/240 | loss 3.067 | train 65.99% | test 58.49%


Epoch 77/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  77/240 | loss 3.029 | train 66.12% | test 53.06%


Epoch 78/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  78/240 | loss 3.070 | train 65.95% | test 56.30%


Epoch 79/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  79/240 | loss 3.027 | train 66.13% | test 56.47%


Epoch 80/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  80/240 | loss 3.080 | train 66.04% | test 58.57%


Epoch 81/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  81/240 | loss 3.032 | train 66.24% | test 58.38%


Epoch 82/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  82/240 | loss 3.051 | train 65.85% | test 57.87%


Epoch 83/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  83/240 | loss 3.035 | train 66.18% | test 54.91%


Epoch 84/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  84/240 | loss 3.041 | train 66.34% | test 54.84%


Epoch 85/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  85/240 | loss 3.043 | train 65.98% | test 56.54%


Epoch 86/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  86/240 | loss 3.029 | train 65.99% | test 55.60%


Epoch 87/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  87/240 | loss 3.017 | train 66.12% | test 55.53%


Epoch 88/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  88/240 | loss 3.014 | train 66.34% | test 53.74%


Epoch 89/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  89/240 | loss 3.033 | train 66.31% | test 56.96%


Epoch 90/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  90/240 | loss 3.020 | train 66.33% | test 57.15%


Epoch 91/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  91/240 | loss 3.017 | train 66.38% | test 59.78%
  ↳ Saved best @ epoch 91 (test 59.78%) → 56_t-20_s.pth


Epoch 92/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  92/240 | loss 3.009 | train 66.41% | test 57.34%


Epoch 93/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  93/240 | loss 3.024 | train 66.25% | test 56.86%


Epoch 94/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  94/240 | loss 3.031 | train 66.15% | test 58.28%


Epoch 95/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  95/240 | loss 2.996 | train 66.40% | test 57.11%


Epoch 96/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  96/240 | loss 3.038 | train 66.18% | test 55.39%


Epoch 97/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  97/240 | loss 3.000 | train 66.38% | test 58.54%


Epoch 98/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  98/240 | loss 3.013 | train 66.49% | test 57.96%


Epoch 99/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  99/240 | loss 3.021 | train 66.22% | test 59.05%


Epoch 100/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 100/240 | loss 3.009 | train 66.40% | test 56.51%


Epoch 101/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 101/240 | loss 2.997 | train 66.62% | test 58.20%


Epoch 102/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 102/240 | loss 2.987 | train 66.62% | test 60.16%
  ↳ Saved best @ epoch 102 (test 60.16%) → 56_t-20_s.pth


Epoch 103/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 103/240 | loss 2.995 | train 66.24% | test 57.81%


Epoch 104/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 104/240 | loss 2.967 | train 66.69% | test 58.04%


Epoch 105/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 105/240 | loss 3.010 | train 66.43% | test 53.97%


Epoch 106/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 106/240 | loss 2.989 | train 66.55% | test 56.77%


Epoch 107/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 107/240 | loss 2.998 | train 66.56% | test 53.42%


Epoch 108/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 108/240 | loss 2.994 | train 66.60% | test 55.68%


Epoch 109/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 109/240 | loss 2.987 | train 66.69% | test 53.63%


Epoch 110/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 110/240 | loss 2.973 | train 66.80% | test 57.61%


Epoch 111/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 111/240 | loss 2.992 | train 66.57% | test 57.81%


Epoch 112/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 112/240 | loss 2.989 | train 66.58% | test 56.65%


Epoch 113/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 113/240 | loss 2.980 | train 66.55% | test 56.89%


Epoch 114/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 114/240 | loss 2.965 | train 66.87% | test 56.00%


Epoch 115/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 115/240 | loss 3.001 | train 66.23% | test 55.95%


Epoch 116/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 116/240 | loss 2.962 | train 66.93% | test 58.61%


Epoch 117/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 117/240 | loss 2.987 | train 66.93% | test 52.93%


Epoch 118/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 118/240 | loss 2.978 | train 66.53% | test 56.83%


Epoch 119/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 119/240 | loss 2.973 | train 66.55% | test 56.52%


Epoch 120/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 120/240 | loss 2.944 | train 66.99% | test 57.48%


Epoch 121/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 121/240 | loss 2.958 | train 67.20% | test 58.05%


Epoch 122/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 122/240 | loss 2.956 | train 66.90% | test 59.02%


Epoch 123/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 123/240 | loss 2.952 | train 66.96% | test 53.67%


Epoch 124/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 124/240 | loss 2.969 | train 66.74% | test 58.75%


Epoch 125/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 125/240 | loss 2.962 | train 66.88% | test 59.67%


Epoch 126/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 126/240 | loss 2.951 | train 66.94% | test 57.84%


Epoch 127/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 127/240 | loss 2.962 | train 66.83% | test 58.22%


Epoch 128/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 128/240 | loss 2.939 | train 67.04% | test 55.98%


Epoch 129/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 129/240 | loss 2.953 | train 66.75% | test 57.55%


Epoch 130/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 130/240 | loss 2.946 | train 66.82% | test 56.75%


Epoch 131/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 131/240 | loss 2.971 | train 66.73% | test 53.22%


Epoch 132/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 132/240 | loss 2.942 | train 66.86% | test 57.92%


Epoch 133/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 133/240 | loss 2.934 | train 67.04% | test 58.84%


Epoch 134/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 134/240 | loss 2.962 | train 66.94% | test 56.43%


Epoch 135/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 135/240 | loss 2.942 | train 67.06% | test 57.09%


Epoch 136/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 136/240 | loss 2.948 | train 67.10% | test 58.57%


Epoch 137/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 137/240 | loss 2.929 | train 67.16% | test 49.86%


Epoch 138/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 138/240 | loss 2.941 | train 66.94% | test 57.25%


Epoch 139/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 139/240 | loss 2.952 | train 67.05% | test 57.40%


Epoch 140/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 140/240 | loss 2.964 | train 66.95% | test 58.68%


Epoch 141/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 141/240 | loss 2.948 | train 67.04% | test 56.86%


Epoch 142/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 142/240 | loss 2.942 | train 66.94% | test 56.83%


Epoch 143/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 143/240 | loss 2.954 | train 67.04% | test 56.37%


Epoch 144/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 144/240 | loss 2.937 | train 66.92% | test 59.12%


Epoch 145/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 145/240 | loss 2.945 | train 67.04% | test 59.93%


Epoch 146/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 146/240 | loss 2.938 | train 67.01% | test 56.13%


Epoch 147/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 147/240 | loss 2.927 | train 67.28% | test 57.70%


Epoch 148/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 148/240 | loss 2.941 | train 66.93% | test 55.56%


Epoch 149/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 149/240 | loss 2.929 | train 67.06% | test 59.75%


Epoch 150/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 150/240 | loss 2.945 | train 66.89% | test 56.25%


Epoch 151/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 151/240 | loss 2.282 | train 73.01% | test 68.20%
  ↳ Saved best @ epoch 151 (test 68.20%) → 56_t-20_s.pth


Epoch 152/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 152/240 | loss 2.127 | train 74.50% | test 69.23%
  ↳ Saved best @ epoch 152 (test 69.23%) → 56_t-20_s.pth


Epoch 153/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 153/240 | loss 2.072 | train 75.07% | test 68.91%


Epoch 154/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 154/240 | loss 2.035 | train 75.53% | test 69.03%


Epoch 155/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 155/240 | loss 2.009 | train 75.82% | test 68.99%


Epoch 156/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 156/240 | loss 1.993 | train 75.87% | test 69.26%
  ↳ Saved best @ epoch 156 (test 69.26%) → 56_t-20_s.pth


Epoch 157/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 157/240 | loss 1.968 | train 76.29% | test 69.55%
  ↳ Saved best @ epoch 157 (test 69.55%) → 56_t-20_s.pth


Epoch 158/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 158/240 | loss 1.964 | train 76.37% | test 69.32%


Epoch 159/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 159/240 | loss 1.948 | train 76.51% | test 69.21%


Epoch 160/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 160/240 | loss 1.944 | train 76.55% | test 69.26%


Epoch 161/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 161/240 | loss 1.938 | train 76.68% | test 69.65%
  ↳ Saved best @ epoch 161 (test 69.65%) → 56_t-20_s.pth


Epoch 162/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 162/240 | loss 1.932 | train 76.82% | test 69.46%


Epoch 163/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 163/240 | loss 1.917 | train 76.99% | test 69.70%
  ↳ Saved best @ epoch 163 (test 69.70%) → 56_t-20_s.pth


Epoch 164/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 164/240 | loss 1.916 | train 76.95% | test 69.64%


Epoch 165/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 165/240 | loss 1.909 | train 77.00% | test 69.45%


Epoch 166/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 166/240 | loss 1.907 | train 77.22% | test 69.86%
  ↳ Saved best @ epoch 166 (test 69.86%) → 56_t-20_s.pth


Epoch 167/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 167/240 | loss 1.886 | train 77.39% | test 70.10%
  ↳ Saved best @ epoch 167 (test 70.10%) → 56_t-20_s.pth


Epoch 168/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 168/240 | loss 1.901 | train 77.19% | test 69.45%


Epoch 169/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 169/240 | loss 1.877 | train 77.47% | test 69.66%


Epoch 170/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 170/240 | loss 1.887 | train 77.50% | test 69.48%


Epoch 171/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 171/240 | loss 1.880 | train 77.44% | test 69.69%


Epoch 172/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 172/240 | loss 1.879 | train 77.67% | test 69.65%


Epoch 173/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 173/240 | loss 1.873 | train 77.84% | test 69.72%


Epoch 174/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 174/240 | loss 1.870 | train 77.79% | test 69.31%


Epoch 175/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 175/240 | loss 1.857 | train 78.01% | test 69.78%


Epoch 176/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 176/240 | loss 1.864 | train 78.10% | test 69.97%


Epoch 177/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 177/240 | loss 1.868 | train 77.96% | test 69.77%


Epoch 178/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 178/240 | loss 1.866 | train 77.87% | test 69.53%


Epoch 179/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 179/240 | loss 1.859 | train 78.26% | test 69.53%


Epoch 180/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 180/240 | loss 1.865 | train 78.07% | test 69.68%


Epoch 181/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 181/240 | loss 1.768 | train 79.10% | test 70.71%
  ↳ Saved best @ epoch 181 (test 70.71%) → 56_t-20_s.pth


Epoch 182/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 182/240 | loss 1.751 | train 79.20% | test 70.65%


Epoch 183/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 183/240 | loss 1.745 | train 79.48% | test 70.79%
  ↳ Saved best @ epoch 183 (test 70.79%) → 56_t-20_s.pth


Epoch 184/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 184/240 | loss 1.738 | train 79.52% | test 70.48%


Epoch 185/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 185/240 | loss 1.733 | train 79.28% | test 70.77%


Epoch 186/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 186/240 | loss 1.735 | train 79.37% | test 70.62%


Epoch 187/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 187/240 | loss 1.724 | train 79.50% | test 70.63%


Epoch 188/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 188/240 | loss 1.733 | train 79.47% | test 70.71%


Epoch 189/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 189/240 | loss 1.729 | train 79.67% | test 70.72%


Epoch 190/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 190/240 | loss 1.732 | train 79.45% | test 70.67%


Epoch 191/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 191/240 | loss 1.731 | train 79.72% | test 70.70%


Epoch 192/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 192/240 | loss 1.727 | train 79.54% | test 70.60%


Epoch 193/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 193/240 | loss 1.729 | train 79.49% | test 70.62%


Epoch 194/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 194/240 | loss 1.716 | train 79.73% | test 70.80%
  ↳ Saved best @ epoch 194 (test 70.80%) → 56_t-20_s.pth


Epoch 195/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 195/240 | loss 1.727 | train 79.67% | test 70.69%


Epoch 196/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 196/240 | loss 1.728 | train 79.61% | test 70.66%


Epoch 197/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 197/240 | loss 1.720 | train 79.60% | test 70.73%


Epoch 198/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 198/240 | loss 1.725 | train 79.57% | test 70.90%
  ↳ Saved best @ epoch 198 (test 70.90%) → 56_t-20_s.pth


Epoch 199/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 199/240 | loss 1.722 | train 79.63% | test 70.55%


Epoch 200/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 200/240 | loss 1.716 | train 79.60% | test 70.76%


Epoch 201/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 201/240 | loss 1.722 | train 79.77% | test 70.80%


Epoch 202/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 202/240 | loss 1.717 | train 79.62% | test 71.01%
  ↳ Saved best @ epoch 202 (test 71.01%) → 56_t-20_s.pth


Epoch 203/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 203/240 | loss 1.707 | train 79.83% | test 70.57%


Epoch 204/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 204/240 | loss 1.717 | train 79.80% | test 70.76%


Epoch 205/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 205/240 | loss 1.711 | train 79.88% | test 70.80%


Epoch 206/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 206/240 | loss 1.710 | train 79.79% | test 70.82%


Epoch 207/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 207/240 | loss 1.710 | train 79.68% | test 70.62%


Epoch 208/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 208/240 | loss 1.716 | train 79.88% | test 70.69%


Epoch 209/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 209/240 | loss 1.714 | train 79.79% | test 70.63%


Epoch 210/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 210/240 | loss 1.708 | train 79.59% | test 70.89%


Epoch 211/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 211/240 | loss 1.704 | train 80.19% | test 70.88%


Epoch 212/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 212/240 | loss 1.700 | train 79.98% | test 70.79%


Epoch 213/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 213/240 | loss 1.701 | train 80.06% | test 70.78%


Epoch 214/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 214/240 | loss 1.698 | train 79.97% | test 70.81%


Epoch 215/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 215/240 | loss 1.689 | train 80.13% | test 70.76%


Epoch 216/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 216/240 | loss 1.699 | train 79.93% | test 70.82%


Epoch 217/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 217/240 | loss 1.695 | train 80.26% | test 70.79%


Epoch 218/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 218/240 | loss 1.695 | train 80.04% | test 70.96%


Epoch 219/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 219/240 | loss 1.696 | train 80.07% | test 70.80%


Epoch 220/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 220/240 | loss 1.702 | train 79.96% | test 70.73%


Epoch 221/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 221/240 | loss 1.696 | train 80.24% | test 70.75%


Epoch 222/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 222/240 | loss 1.694 | train 79.94% | test 70.67%


Epoch 223/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 223/240 | loss 1.696 | train 80.03% | test 70.74%


Epoch 224/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 224/240 | loss 1.695 | train 80.02% | test 70.84%


Epoch 225/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 225/240 | loss 1.694 | train 80.11% | test 70.57%


Epoch 226/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 226/240 | loss 1.698 | train 79.99% | test 70.83%


Epoch 227/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 227/240 | loss 1.691 | train 79.95% | test 70.90%


Epoch 228/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 228/240 | loss 1.692 | train 79.94% | test 70.74%


Epoch 229/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 229/240 | loss 1.699 | train 79.94% | test 70.84%


Epoch 230/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 230/240 | loss 1.690 | train 80.06% | test 70.74%


Epoch 231/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 231/240 | loss 1.699 | train 80.05% | test 70.86%


Epoch 232/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 232/240 | loss 1.690 | train 79.99% | test 70.93%


Epoch 233/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 233/240 | loss 1.693 | train 80.18% | test 70.83%


Epoch 234/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 234/240 | loss 1.698 | train 80.07% | test 70.76%


Epoch 235/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 235/240 | loss 1.694 | train 80.11% | test 70.82%


Epoch 236/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 236/240 | loss 1.702 | train 79.97% | test 70.91%


Epoch 237/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 237/240 | loss 1.694 | train 80.02% | test 70.81%


Epoch 238/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 238/240 | loss 1.694 | train 80.07% | test 70.78%


Epoch 239/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 239/240 | loss 1.694 | train 80.13% | test 70.90%


Epoch 240/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 240/240 | loss 1.689 | train 79.98% | test 70.95%
✅ KD training finished. Best Test Acc: 71.01%


In [8]:

# ----------------------------
# Distillation run: teacher from resnet110 -> student from resnet20
# ----------------------------
# Build the teacher network and restore its weights from the saved checkpoint.
teacher = resnet110(num_classes=100)
# NOTE(review): weights_only=False allows torch.load to unpickle arbitrary
# Python objects; only keep this for checkpoint files you trust.
t_ckpt = torch.load(
    "/kaggle/input/resnet-110/ckpt_epoch_240.pth",
    map_location=device,
    weights_only=False,
)
# The checkpoint is either a wrapper holding the weights under the 'model' key
# or the state dict itself; accept both layouts.
if 'model' in t_ckpt:
    teacher.load_state_dict(t_ckpt['model'])
else:
    teacher.load_state_dict(t_ckpt)

# Fresh student network, built with the same number of output classes.
student = resnet20(num_classes=100)

# Run the distillation training; the best checkpoint is written to save_path.
train_via_KD(
    teacher,
    student,
    train_loader,
    test_loader,
    device,
    save_path="110_t-20_s.pth",
)

Epoch 1/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   1/240 | loss 14.768 | train  8.68% | test 13.79%
  ↳ Saved best @ epoch 1 (test 13.79%) → 110_t-20_s.pth


Epoch 2/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   2/240 | loss 12.182 | train 19.06% | test 17.59%
  ↳ Saved best @ epoch 2 (test 17.59%) → 110_t-20_s.pth


Epoch 3/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   3/240 | loss 10.340 | train 27.93% | test 27.76%
  ↳ Saved best @ epoch 3 (test 27.76%) → 110_t-20_s.pth


Epoch 4/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   4/240 | loss 9.118 | train 34.35% | test 33.50%
  ↳ Saved best @ epoch 4 (test 33.50%) → 110_t-20_s.pth


Epoch 5/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   5/240 | loss 8.246 | train 39.30% | test 35.69%
  ↳ Saved best @ epoch 5 (test 35.69%) → 110_t-20_s.pth


Epoch 6/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   6/240 | loss 7.526 | train 43.66% | test 39.06%
  ↳ Saved best @ epoch 6 (test 39.06%) → 110_t-20_s.pth


Epoch 7/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   7/240 | loss 7.032 | train 46.68% | test 42.67%
  ↳ Saved best @ epoch 7 (test 42.67%) → 110_t-20_s.pth


Epoch 8/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   8/240 | loss 6.672 | train 48.63% | test 39.81%


Epoch 9/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   9/240 | loss 6.351 | train 50.84% | test 44.26%
  ↳ Saved best @ epoch 9 (test 44.26%) → 110_t-20_s.pth


Epoch 10/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  10/240 | loss 6.128 | train 52.53% | test 48.40%
  ↳ Saved best @ epoch 10 (test 48.40%) → 110_t-20_s.pth


Epoch 11/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  11/240 | loss 5.917 | train 53.90% | test 46.25%


Epoch 12/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  12/240 | loss 5.732 | train 54.99% | test 48.21%


Epoch 13/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  13/240 | loss 5.649 | train 55.58% | test 49.60%
  ↳ Saved best @ epoch 13 (test 49.60%) → 110_t-20_s.pth


Epoch 14/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  14/240 | loss 5.495 | train 56.60% | test 48.85%


Epoch 15/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  15/240 | loss 5.384 | train 57.35% | test 47.24%


Epoch 16/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  16/240 | loss 5.312 | train 57.66% | test 50.50%
  ↳ Saved best @ epoch 16 (test 50.50%) → 110_t-20_s.pth


Epoch 17/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  17/240 | loss 5.218 | train 58.38% | test 52.53%
  ↳ Saved best @ epoch 17 (test 52.53%) → 110_t-20_s.pth


Epoch 18/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  18/240 | loss 5.155 | train 58.89% | test 48.62%


Epoch 19/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  19/240 | loss 5.082 | train 59.45% | test 52.69%
  ↳ Saved best @ epoch 19 (test 52.69%) → 110_t-20_s.pth


Epoch 20/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  20/240 | loss 5.022 | train 59.74% | test 54.46%
  ↳ Saved best @ epoch 20 (test 54.46%) → 110_t-20_s.pth


Epoch 21/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  21/240 | loss 5.002 | train 60.01% | test 52.55%


Epoch 22/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  22/240 | loss 4.891 | train 60.46% | test 56.12%
  ↳ Saved best @ epoch 22 (test 56.12%) → 110_t-20_s.pth


Epoch 23/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  23/240 | loss 4.874 | train 60.75% | test 55.83%


Epoch 24/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  24/240 | loss 4.820 | train 61.06% | test 53.01%


Epoch 25/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  25/240 | loss 4.826 | train 61.25% | test 51.57%


Epoch 26/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  26/240 | loss 4.788 | train 61.38% | test 53.81%


Epoch 27/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  27/240 | loss 4.728 | train 61.70% | test 54.23%


Epoch 28/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  28/240 | loss 4.717 | train 61.90% | test 54.51%


Epoch 29/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  29/240 | loss 4.689 | train 61.90% | test 55.75%


Epoch 30/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  30/240 | loss 4.679 | train 62.02% | test 53.24%


Epoch 31/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  31/240 | loss 4.657 | train 62.09% | test 53.65%


Epoch 32/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  32/240 | loss 4.612 | train 62.70% | test 54.80%


Epoch 33/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  33/240 | loss 4.607 | train 62.56% | test 54.60%


Epoch 34/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  34/240 | loss 4.599 | train 62.66% | test 51.43%


Epoch 35/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  35/240 | loss 4.558 | train 63.06% | test 52.46%


Epoch 36/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  36/240 | loss 4.541 | train 63.19% | test 55.96%


Epoch 37/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  37/240 | loss 4.552 | train 62.92% | test 55.92%


Epoch 38/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  38/240 | loss 4.512 | train 63.29% | test 53.65%


Epoch 39/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  39/240 | loss 4.539 | train 63.15% | test 55.02%


Epoch 40/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  40/240 | loss 4.493 | train 63.35% | test 54.70%


Epoch 41/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  41/240 | loss 4.482 | train 63.66% | test 55.79%


Epoch 42/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  42/240 | loss 4.491 | train 63.35% | test 54.78%


Epoch 43/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  43/240 | loss 4.454 | train 63.70% | test 55.04%


Epoch 44/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  44/240 | loss 4.452 | train 63.62% | test 54.80%


Epoch 45/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  45/240 | loss 4.446 | train 63.92% | test 55.78%


Epoch 46/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  46/240 | loss 4.438 | train 63.94% | test 55.23%


Epoch 47/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  47/240 | loss 4.431 | train 64.08% | test 54.17%


Epoch 48/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  48/240 | loss 4.386 | train 64.00% | test 55.56%


Epoch 49/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  49/240 | loss 4.387 | train 64.06% | test 56.01%


Epoch 50/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  50/240 | loss 4.402 | train 63.92% | test 50.29%


Epoch 51/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  51/240 | loss 4.392 | train 64.14% | test 55.21%


Epoch 52/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  52/240 | loss 4.380 | train 64.20% | test 54.31%


Epoch 53/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  53/240 | loss 4.385 | train 64.24% | test 56.84%
  ↳ Saved best @ epoch 53 (test 56.84%) → 110_t-20_s.pth


Epoch 54/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  54/240 | loss 4.357 | train 64.43% | test 57.97%
  ↳ Saved best @ epoch 54 (test 57.97%) → 110_t-20_s.pth


Epoch 55/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  55/240 | loss 4.365 | train 64.20% | test 59.69%
  ↳ Saved best @ epoch 55 (test 59.69%) → 110_t-20_s.pth


Epoch 56/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  56/240 | loss 4.346 | train 64.34% | test 56.04%


Epoch 57/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  57/240 | loss 4.330 | train 64.57% | test 57.14%


Epoch 58/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  58/240 | loss 4.351 | train 64.44% | test 53.96%


Epoch 59/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  59/240 | loss 4.337 | train 64.45% | test 54.04%


Epoch 60/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  60/240 | loss 4.320 | train 64.55% | test 55.34%


Epoch 61/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  61/240 | loss 4.312 | train 64.75% | test 54.84%


Epoch 62/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  62/240 | loss 4.313 | train 64.71% | test 56.33%


Epoch 63/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  63/240 | loss 4.289 | train 64.90% | test 58.37%


Epoch 64/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  64/240 | loss 4.302 | train 64.89% | test 55.32%


Epoch 65/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  65/240 | loss 4.289 | train 64.86% | test 55.80%


Epoch 66/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  66/240 | loss 4.268 | train 64.79% | test 55.61%


Epoch 67/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  67/240 | loss 4.280 | train 64.79% | test 56.14%


Epoch 68/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  68/240 | loss 4.288 | train 64.86% | test 54.57%


Epoch 69/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  69/240 | loss 4.267 | train 64.82% | test 57.43%


Epoch 70/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  70/240 | loss 4.277 | train 65.11% | test 54.61%


Epoch 71/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  71/240 | loss 4.238 | train 65.19% | test 55.82%


Epoch 72/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  72/240 | loss 4.264 | train 64.82% | test 54.73%


Epoch 73/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  73/240 | loss 4.262 | train 65.10% | test 57.33%


Epoch 74/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  74/240 | loss 4.269 | train 64.98% | test 59.04%


Epoch 75/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  75/240 | loss 4.233 | train 65.48% | test 55.90%


Epoch 76/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  76/240 | loss 4.238 | train 65.22% | test 55.32%


Epoch 77/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  77/240 | loss 4.244 | train 64.89% | test 57.94%


Epoch 78/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  78/240 | loss 4.218 | train 65.52% | test 55.86%


Epoch 79/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  79/240 | loss 4.260 | train 65.06% | test 54.16%


Epoch 80/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  80/240 | loss 4.236 | train 65.34% | test 59.28%


Epoch 81/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  81/240 | loss 4.221 | train 65.45% | test 57.83%


Epoch 82/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  82/240 | loss 4.225 | train 65.56% | test 53.34%


Epoch 83/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  83/240 | loss 4.193 | train 65.28% | test 57.54%


Epoch 84/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  84/240 | loss 4.197 | train 65.44% | test 57.32%


Epoch 85/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  85/240 | loss 4.203 | train 65.64% | test 56.32%


Epoch 86/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  86/240 | loss 4.210 | train 65.57% | test 57.35%


Epoch 87/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  87/240 | loss 4.203 | train 65.52% | test 56.25%


Epoch 88/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  88/240 | loss 4.194 | train 65.71% | test 56.06%


Epoch 89/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  89/240 | loss 4.200 | train 65.76% | test 55.73%


Epoch 90/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  90/240 | loss 4.199 | train 65.51% | test 57.23%


Epoch 91/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  91/240 | loss 4.190 | train 65.31% | test 57.30%


Epoch 92/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  92/240 | loss 4.215 | train 65.49% | test 57.78%


Epoch 93/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  93/240 | loss 4.203 | train 65.58% | test 56.74%


Epoch 94/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  94/240 | loss 4.183 | train 65.56% | test 58.82%


Epoch 95/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  95/240 | loss 4.144 | train 65.87% | test 58.20%


Epoch 96/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  96/240 | loss 4.174 | train 65.72% | test 55.97%


Epoch 97/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  97/240 | loss 4.181 | train 65.87% | test 57.16%


Epoch 98/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  98/240 | loss 4.161 | train 65.72% | test 58.59%


Epoch 99/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  99/240 | loss 4.203 | train 65.77% | test 56.94%


Epoch 100/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 100/240 | loss 4.153 | train 65.91% | test 56.12%


Epoch 101/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 101/240 | loss 4.141 | train 66.12% | test 59.92%
  ↳ Saved best @ epoch 101 (test 59.92%) → 110_t-20_s.pth


Epoch 102/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 102/240 | loss 4.156 | train 65.64% | test 54.36%


Epoch 103/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 103/240 | loss 4.152 | train 65.85% | test 59.55%


Epoch 104/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 104/240 | loss 4.148 | train 65.92% | test 50.32%


Epoch 105/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 105/240 | loss 4.144 | train 65.75% | test 55.85%


Epoch 106/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 106/240 | loss 4.133 | train 65.88% | test 50.21%


Epoch 107/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 107/240 | loss 4.100 | train 66.28% | test 54.19%


Epoch 108/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 108/240 | loss 4.174 | train 65.70% | test 55.54%


Epoch 109/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 109/240 | loss 4.148 | train 65.73% | test 57.59%


Epoch 110/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 110/240 | loss 4.158 | train 65.87% | test 57.72%


Epoch 111/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 111/240 | loss 4.115 | train 66.16% | test 57.93%


Epoch 112/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 112/240 | loss 4.138 | train 65.96% | test 58.63%


Epoch 113/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 113/240 | loss 4.117 | train 66.18% | test 59.45%


Epoch 114/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 114/240 | loss 4.112 | train 66.07% | test 56.68%


Epoch 115/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 115/240 | loss 4.162 | train 65.98% | test 57.40%


Epoch 116/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 116/240 | loss 4.143 | train 66.06% | test 58.21%


Epoch 117/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 117/240 | loss 4.139 | train 65.90% | test 58.79%


Epoch 118/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 118/240 | loss 4.084 | train 66.40% | test 56.73%


Epoch 119/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 119/240 | loss 4.131 | train 66.02% | test 57.12%


Epoch 120/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 120/240 | loss 4.106 | train 66.27% | test 58.06%


Epoch 121/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 121/240 | loss 4.103 | train 66.16% | test 57.83%


Epoch 122/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 122/240 | loss 4.142 | train 65.87% | test 57.11%


Epoch 123/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 123/240 | loss 4.110 | train 66.10% | test 55.07%


Epoch 124/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 124/240 | loss 4.109 | train 66.02% | test 59.31%


Epoch 125/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 125/240 | loss 4.104 | train 66.00% | test 56.97%


Epoch 126/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 126/240 | loss 4.112 | train 66.14% | test 58.25%


Epoch 127/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 127/240 | loss 4.122 | train 65.89% | test 58.54%


Epoch 128/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 128/240 | loss 4.106 | train 66.35% | test 57.02%


Epoch 129/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 129/240 | loss 4.086 | train 66.39% | test 56.59%


Epoch 130/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 130/240 | loss 4.109 | train 66.39% | test 57.53%


Epoch 131/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 131/240 | loss 4.091 | train 66.31% | test 57.57%


Epoch 132/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 132/240 | loss 4.089 | train 66.18% | test 58.19%


Epoch 133/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 133/240 | loss 4.094 | train 66.32% | test 57.74%


Epoch 134/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 134/240 | loss 4.099 | train 66.22% | test 57.18%


Epoch 135/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 135/240 | loss 4.095 | train 66.48% | test 57.62%


Epoch 136/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 136/240 | loss 4.091 | train 66.44% | test 57.77%


Epoch 137/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 137/240 | loss 4.102 | train 66.65% | test 59.75%


Epoch 138/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 138/240 | loss 4.086 | train 66.32% | test 58.98%


Epoch 139/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 139/240 | loss 4.074 | train 66.27% | test 50.83%


Epoch 140/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 140/240 | loss 4.096 | train 66.14% | test 58.72%


Epoch 141/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 141/240 | loss 4.107 | train 66.15% | test 59.46%


Epoch 142/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 142/240 | loss 4.089 | train 66.06% | test 54.03%


Epoch 143/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 143/240 | loss 4.051 | train 66.54% | test 59.19%


Epoch 144/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 144/240 | loss 4.093 | train 66.47% | test 59.04%


Epoch 145/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 145/240 | loss 4.073 | train 66.39% | test 57.26%


Epoch 146/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 146/240 | loss 4.098 | train 66.32% | test 60.59%
  ↳ Saved best @ epoch 146 (test 60.59%) → 110_t-20_s.pth


Epoch 147/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 147/240 | loss 4.071 | train 66.39% | test 57.83%


Epoch 148/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 148/240 | loss 4.086 | train 66.58% | test 59.06%


Epoch 149/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 149/240 | loss 4.088 | train 66.27% | test 57.40%


Epoch 150/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 150/240 | loss 4.073 | train 66.53% | test 58.30%


Epoch 151/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 151/240 | loss 3.175 | train 73.06% | test 68.75%
  ↳ Saved best @ epoch 151 (test 68.75%) → 110_t-20_s.pth


Epoch 152/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 152/240 | loss 2.945 | train 74.72% | test 69.06%
  ↳ Saved best @ epoch 152 (test 69.06%) → 110_t-20_s.pth


Epoch 153/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 153/240 | loss 2.891 | train 75.08% | test 69.07%
  ↳ Saved best @ epoch 153 (test 69.07%) → 110_t-20_s.pth


Epoch 154/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 154/240 | loss 2.839 | train 75.54% | test 69.19%
  ↳ Saved best @ epoch 154 (test 69.19%) → 110_t-20_s.pth


Epoch 155/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 155/240 | loss 2.791 | train 76.23% | test 69.42%
  ↳ Saved best @ epoch 155 (test 69.42%) → 110_t-20_s.pth


Epoch 156/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 156/240 | loss 2.783 | train 76.21% | test 69.75%
  ↳ Saved best @ epoch 156 (test 69.75%) → 110_t-20_s.pth


Epoch 157/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 157/240 | loss 2.751 | train 76.36% | test 69.62%


Epoch 158/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 158/240 | loss 2.731 | train 76.74% | test 69.64%


Epoch 159/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 159/240 | loss 2.716 | train 76.67% | test 69.89%
  ↳ Saved best @ epoch 159 (test 69.89%) → 110_t-20_s.pth


Epoch 160/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 160/240 | loss 2.702 | train 76.93% | test 69.97%
  ↳ Saved best @ epoch 160 (test 69.97%) → 110_t-20_s.pth


Epoch 161/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 161/240 | loss 2.684 | train 77.16% | test 69.63%


Epoch 162/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 162/240 | loss 2.678 | train 76.95% | test 69.79%


Epoch 163/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 163/240 | loss 2.653 | train 77.53% | test 69.43%


Epoch 164/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 164/240 | loss 2.655 | train 77.43% | test 70.04%
  ↳ Saved best @ epoch 164 (test 70.04%) → 110_t-20_s.pth


Epoch 165/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 165/240 | loss 2.646 | train 77.64% | test 70.00%


Epoch 166/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 166/240 | loss 2.639 | train 77.80% | test 70.09%
  ↳ Saved best @ epoch 166 (test 70.09%) → 110_t-20_s.pth


Epoch 167/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 167/240 | loss 2.615 | train 77.92% | test 69.59%


Epoch 168/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 168/240 | loss 2.620 | train 78.05% | test 70.15%
  ↳ Saved best @ epoch 168 (test 70.15%) → 110_t-20_s.pth


Epoch 169/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 169/240 | loss 2.624 | train 78.00% | test 69.76%


Epoch 170/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 170/240 | loss 2.607 | train 78.07% | test 69.39%


Epoch 171/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 171/240 | loss 2.597 | train 78.15% | test 70.00%


Epoch 172/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 172/240 | loss 2.587 | train 78.23% | test 70.20%
  ↳ Saved best @ epoch 172 (test 70.20%) → 110_t-20_s.pth


Epoch 173/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 173/240 | loss 2.587 | train 78.29% | test 69.74%


Epoch 174/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 174/240 | loss 2.593 | train 78.24% | test 69.69%


Epoch 175/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 175/240 | loss 2.575 | train 78.54% | test 69.43%


Epoch 176/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 176/240 | loss 2.580 | train 78.43% | test 69.45%


Epoch 177/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 177/240 | loss 2.573 | train 78.64% | test 69.55%


Epoch 178/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 178/240 | loss 2.576 | train 78.42% | test 69.34%


Epoch 179/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 179/240 | loss 2.566 | train 78.76% | test 69.73%


Epoch 180/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 180/240 | loss 2.569 | train 78.70% | test 69.92%


Epoch 181/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 181/240 | loss 2.440 | train 79.90% | test 70.67%
  ↳ Saved best @ epoch 181 (test 70.67%) → 110_t-20_s.pth


Epoch 182/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 182/240 | loss 2.408 | train 80.18% | test 70.91%
  ↳ Saved best @ epoch 182 (test 70.91%) → 110_t-20_s.pth


Epoch 183/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 183/240 | loss 2.396 | train 80.16% | test 70.91%


Epoch 184/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 184/240 | loss 2.386 | train 80.11% | test 71.00%
  ↳ Saved best @ epoch 184 (test 71.00%) → 110_t-20_s.pth


Epoch 185/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 185/240 | loss 2.380 | train 80.34% | test 70.94%


Epoch 186/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 186/240 | loss 2.376 | train 80.17% | test 70.83%


Epoch 187/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 187/240 | loss 2.384 | train 80.16% | test 70.97%


Epoch 188/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 188/240 | loss 2.369 | train 80.46% | test 71.15%
  ↳ Saved best @ epoch 188 (test 71.15%) → 110_t-20_s.pth


Epoch 189/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 189/240 | loss 2.362 | train 80.62% | test 70.98%


Epoch 190/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 190/240 | loss 2.364 | train 80.40% | test 70.99%


Epoch 191/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 191/240 | loss 2.362 | train 80.65% | test 70.94%


Epoch 192/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 192/240 | loss 2.364 | train 80.54% | test 70.99%


Epoch 193/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 193/240 | loss 2.361 | train 80.48% | test 70.73%


Epoch 194/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 194/240 | loss 2.360 | train 80.55% | test 70.98%


Epoch 195/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 195/240 | loss 2.360 | train 80.54% | test 70.97%


Epoch 196/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 196/240 | loss 2.352 | train 80.74% | test 70.91%


Epoch 197/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 197/240 | loss 2.352 | train 80.74% | test 70.88%


Epoch 198/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 198/240 | loss 2.354 | train 80.73% | test 71.10%


Epoch 199/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 199/240 | loss 2.348 | train 80.65% | test 70.97%


Epoch 200/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 200/240 | loss 2.353 | train 80.71% | test 70.91%


Epoch 201/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 201/240 | loss 2.345 | train 80.72% | test 70.95%


Epoch 202/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 202/240 | loss 2.348 | train 80.54% | test 70.97%


Epoch 203/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 203/240 | loss 2.345 | train 80.62% | test 71.04%


Epoch 204/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 204/240 | loss 2.345 | train 80.57% | test 71.08%


Epoch 205/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 205/240 | loss 2.341 | train 80.66% | test 71.00%


Epoch 206/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 206/240 | loss 2.345 | train 80.84% | test 71.05%


Epoch 207/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 207/240 | loss 2.336 | train 80.83% | test 70.98%


Epoch 208/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 208/240 | loss 2.333 | train 80.82% | test 70.91%


Epoch 209/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 209/240 | loss 2.346 | train 80.82% | test 71.12%


Epoch 210/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 210/240 | loss 2.331 | train 80.81% | test 70.90%


Epoch 211/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 211/240 | loss 2.319 | train 81.02% | test 71.15%


Epoch 212/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 212/240 | loss 2.331 | train 80.92% | test 70.87%


Epoch 213/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 213/240 | loss 2.328 | train 81.13% | test 71.08%


Epoch 214/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 214/240 | loss 2.318 | train 81.25% | test 71.14%


Epoch 215/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 215/240 | loss 2.320 | train 81.09% | test 71.14%


Epoch 216/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 216/240 | loss 2.317 | train 81.06% | test 71.04%


Epoch 217/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 217/240 | loss 2.327 | train 80.96% | test 71.04%


Epoch 218/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 218/240 | loss 2.314 | train 81.13% | test 71.00%


Epoch 219/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 219/240 | loss 2.327 | train 80.70% | test 71.07%


Epoch 220/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 220/240 | loss 2.321 | train 81.16% | test 71.20%
  ↳ Saved best @ epoch 220 (test 71.20%) → 110_t-20_s.pth


Epoch 221/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 221/240 | loss 2.316 | train 81.04% | test 70.99%


Epoch 222/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 222/240 | loss 2.307 | train 81.15% | test 71.01%


Epoch 223/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 223/240 | loss 2.320 | train 81.07% | test 71.23%
  ↳ Saved best @ epoch 223 (test 71.23%) → 110_t-20_s.pth


Epoch 224/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 224/240 | loss 2.316 | train 81.14% | test 71.09%


Epoch 225/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 225/240 | loss 2.319 | train 80.99% | test 71.11%


Epoch 226/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 226/240 | loss 2.314 | train 81.11% | test 71.15%


Epoch 227/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 227/240 | loss 2.312 | train 81.27% | test 71.20%


Epoch 228/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 228/240 | loss 2.313 | train 81.19% | test 71.09%


Epoch 229/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 229/240 | loss 2.332 | train 80.82% | test 71.18%


Epoch 230/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 230/240 | loss 2.322 | train 81.00% | test 71.20%


Epoch 231/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 231/240 | loss 2.320 | train 80.97% | test 71.13%


Epoch 232/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 232/240 | loss 2.314 | train 81.24% | test 70.94%


Epoch 233/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 233/240 | loss 2.316 | train 80.97% | test 71.18%


Epoch 234/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 234/240 | loss 2.314 | train 80.81% | test 71.09%


Epoch 235/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 235/240 | loss 2.323 | train 81.04% | test 71.18%


Epoch 236/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 236/240 | loss 2.330 | train 81.00% | test 71.20%


Epoch 237/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 237/240 | loss 2.301 | train 81.11% | test 71.08%


Epoch 238/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 238/240 | loss 2.314 | train 81.01% | test 71.15%


Epoch 239/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 239/240 | loss 2.309 | train 81.16% | test 71.20%


Epoch 240/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 240/240 | loss 2.316 | train 80.94% | test 71.05%
✅ KD training finished. Best Test Acc: 71.23%


In [9]:
# ----------------------------
# KD run: resnet110 teacher -> resnet32_basic student (100 classes)
# ----------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the teacher network and read its checkpoint onto `device`.
teacher = resnet110(num_classes=100)
# NOTE(review): weights_only=False lets torch.load run the full pickle
# unpickler, which can execute arbitrary code; only use it with checkpoint
# files you trust.
t_ckpt = torch.load(
    "/kaggle/input/resnet-110/ckpt_epoch_240.pth",
    map_location=device,
    weights_only=False,
)

# The checkpoint is either a wrapper dict holding the weights under 'model'
# or the bare state dict itself; accept both layouts.
if 'model' in t_ckpt:
    teacher_weights = t_ckpt['model']
else:
    teacher_weights = t_ckpt
teacher.load_state_dict(teacher_weights)

# Freshly constructed student; no checkpoint is loaded into it in this cell.
student = resnet32_basic(num_classes=100)

# Launch distillation with the helper defined earlier in the notebook.
train_via_KD(
    teacher,
    student,
    train_loader,
    test_loader,
    device,
    save_path="110_t-32_s.pth",
)

Epoch 1/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   1/240 | loss 14.864 | train  7.82% | test 12.00%
  ↳ Saved best @ epoch 1 (test 12.00%) → 110_t-32_s.pth


Epoch 2/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   2/240 | loss 12.175 | train 18.45% | test 18.66%
  ↳ Saved best @ epoch 2 (test 18.66%) → 110_t-32_s.pth


Epoch 3/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   3/240 | loss 10.209 | train 28.25% | test 27.17%
  ↳ Saved best @ epoch 3 (test 27.17%) → 110_t-32_s.pth


Epoch 4/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   4/240 | loss 8.865 | train 35.63% | test 32.20%
  ↳ Saved best @ epoch 4 (test 32.20%) → 110_t-32_s.pth


Epoch 5/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   5/240 | loss 7.878 | train 41.24% | test 36.28%
  ↳ Saved best @ epoch 5 (test 36.28%) → 110_t-32_s.pth


Epoch 6/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   6/240 | loss 7.113 | train 45.68% | test 38.56%
  ↳ Saved best @ epoch 6 (test 38.56%) → 110_t-32_s.pth


Epoch 7/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   7/240 | loss 6.599 | train 49.27% | test 45.43%
  ↳ Saved best @ epoch 7 (test 45.43%) → 110_t-32_s.pth


Epoch 8/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   8/240 | loss 6.181 | train 51.35% | test 36.79%


Epoch 9/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch   9/240 | loss 5.876 | train 53.25% | test 47.87%
  ↳ Saved best @ epoch 9 (test 47.87%) → 110_t-32_s.pth


Epoch 10/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  10/240 | loss 5.630 | train 55.12% | test 46.92%


Epoch 11/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  11/240 | loss 5.369 | train 56.58% | test 50.97%
  ↳ Saved best @ epoch 11 (test 50.97%) → 110_t-32_s.pth


Epoch 12/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  12/240 | loss 5.247 | train 57.79% | test 51.35%
  ↳ Saved best @ epoch 12 (test 51.35%) → 110_t-32_s.pth


Epoch 13/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  13/240 | loss 5.116 | train 58.38% | test 53.96%
  ↳ Saved best @ epoch 13 (test 53.96%) → 110_t-32_s.pth


Epoch 14/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  14/240 | loss 5.015 | train 59.52% | test 46.42%


Epoch 15/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  15/240 | loss 4.903 | train 59.95% | test 53.97%
  ↳ Saved best @ epoch 15 (test 53.97%) → 110_t-32_s.pth


Epoch 16/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  16/240 | loss 4.812 | train 60.47% | test 53.70%


Epoch 17/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  17/240 | loss 4.697 | train 61.39% | test 49.45%


Epoch 18/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  18/240 | loss 4.626 | train 61.85% | test 54.83%
  ↳ Saved best @ epoch 18 (test 54.83%) → 110_t-32_s.pth


Epoch 19/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  19/240 | loss 4.575 | train 62.59% | test 49.86%


Epoch 20/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  20/240 | loss 4.500 | train 62.50% | test 51.63%


Epoch 21/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  21/240 | loss 4.457 | train 63.17% | test 51.62%


Epoch 22/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  22/240 | loss 4.420 | train 63.20% | test 52.08%


Epoch 23/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  23/240 | loss 4.390 | train 63.87% | test 55.93%
  ↳ Saved best @ epoch 23 (test 55.93%) → 110_t-32_s.pth


Epoch 24/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  24/240 | loss 4.327 | train 64.00% | test 57.13%
  ↳ Saved best @ epoch 24 (test 57.13%) → 110_t-32_s.pth


Epoch 25/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  25/240 | loss 4.284 | train 64.29% | test 57.92%
  ↳ Saved best @ epoch 25 (test 57.92%) → 110_t-32_s.pth


Epoch 26/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  26/240 | loss 4.243 | train 64.50% | test 52.83%


Epoch 27/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  27/240 | loss 4.246 | train 64.79% | test 55.34%


Epoch 28/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  28/240 | loss 4.232 | train 64.85% | test 56.49%


Epoch 29/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  29/240 | loss 4.187 | train 65.05% | test 58.11%
  ↳ Saved best @ epoch 29 (test 58.11%) → 110_t-32_s.pth


Epoch 30/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  30/240 | loss 4.170 | train 65.36% | test 52.79%


Epoch 31/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  31/240 | loss 4.143 | train 65.35% | test 58.31%
  ↳ Saved best @ epoch 31 (test 58.31%) → 110_t-32_s.pth


Epoch 32/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  32/240 | loss 4.106 | train 65.58% | test 56.13%


Epoch 33/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  33/240 | loss 4.114 | train 65.77% | test 55.74%


Epoch 34/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  34/240 | loss 4.039 | train 66.42% | test 57.29%


Epoch 35/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  35/240 | loss 4.079 | train 65.79% | test 56.80%


Epoch 36/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  36/240 | loss 4.053 | train 66.12% | test 57.94%


Epoch 37/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  37/240 | loss 4.039 | train 66.22% | test 59.68%
  ↳ Saved best @ epoch 37 (test 59.68%) → 110_t-32_s.pth


Epoch 38/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  38/240 | loss 3.983 | train 66.56% | test 55.68%


Epoch 39/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  39/240 | loss 3.981 | train 66.58% | test 55.90%


Epoch 40/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  40/240 | loss 3.976 | train 66.92% | test 57.49%


Epoch 41/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  41/240 | loss 3.976 | train 66.66% | test 56.21%


Epoch 42/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  42/240 | loss 3.959 | train 66.67% | test 56.18%


Epoch 43/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  43/240 | loss 3.924 | train 66.87% | test 57.77%


Epoch 44/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  44/240 | loss 3.925 | train 66.80% | test 57.31%


Epoch 45/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  45/240 | loss 3.916 | train 67.05% | test 51.92%


Epoch 46/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  46/240 | loss 3.903 | train 67.24% | test 53.39%


Epoch 47/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  47/240 | loss 3.912 | train 67.12% | test 57.29%


Epoch 48/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  48/240 | loss 3.904 | train 67.16% | test 56.70%


Epoch 49/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  49/240 | loss 3.895 | train 67.31% | test 53.82%


Epoch 50/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  50/240 | loss 3.883 | train 67.14% | test 58.22%


Epoch 51/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  51/240 | loss 3.847 | train 67.68% | test 56.12%


Epoch 52/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  52/240 | loss 3.835 | train 67.72% | test 57.20%


Epoch 53/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  53/240 | loss 3.842 | train 67.83% | test 55.18%


Epoch 54/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  54/240 | loss 3.834 | train 67.68% | test 57.71%


Epoch 55/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  55/240 | loss 3.820 | train 67.77% | test 59.15%


Epoch 56/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  56/240 | loss 3.839 | train 67.81% | test 58.70%


Epoch 57/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  57/240 | loss 3.854 | train 67.67% | test 59.51%


Epoch 58/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  58/240 | loss 3.803 | train 67.80% | test 59.35%


Epoch 59/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  59/240 | loss 3.792 | train 67.92% | test 56.04%


Epoch 60/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  60/240 | loss 3.795 | train 67.84% | test 58.55%


Epoch 61/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  61/240 | loss 3.765 | train 67.88% | test 57.72%


Epoch 62/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  62/240 | loss 3.792 | train 68.32% | test 57.09%


Epoch 63/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  63/240 | loss 3.778 | train 68.13% | test 59.77%
  ↳ Saved best @ epoch 63 (test 59.77%) → 110_t-32_s.pth


Epoch 64/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  64/240 | loss 3.727 | train 68.55% | test 56.85%


Epoch 65/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  65/240 | loss 3.794 | train 67.97% | test 57.47%


Epoch 66/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  66/240 | loss 3.763 | train 68.37% | test 59.01%


Epoch 67/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  67/240 | loss 3.775 | train 68.23% | test 57.35%


Epoch 68/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  68/240 | loss 3.733 | train 68.61% | test 54.79%


Epoch 69/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  69/240 | loss 3.757 | train 68.46% | test 54.70%


Epoch 70/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  70/240 | loss 3.753 | train 68.30% | test 51.50%


Epoch 71/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  71/240 | loss 3.742 | train 68.45% | test 54.42%


Epoch 72/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  72/240 | loss 3.715 | train 68.64% | test 55.48%


Epoch 73/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  73/240 | loss 3.713 | train 68.53% | test 58.90%


Epoch 74/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  74/240 | loss 3.716 | train 68.45% | test 61.27%
  ↳ Saved best @ epoch 74 (test 61.27%) → 110_t-32_s.pth


Epoch 75/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  75/240 | loss 3.713 | train 68.73% | test 56.88%


Epoch 76/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  76/240 | loss 3.683 | train 68.80% | test 53.79%


Epoch 77/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  77/240 | loss 3.703 | train 68.61% | test 59.75%


Epoch 78/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  78/240 | loss 3.700 | train 68.47% | test 59.46%


Epoch 79/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  79/240 | loss 3.701 | train 68.74% | test 55.93%


Epoch 80/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  80/240 | loss 3.715 | train 68.52% | test 61.24%


Epoch 81/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  81/240 | loss 3.698 | train 68.81% | test 61.82%
  ↳ Saved best @ epoch 81 (test 61.82%) → 110_t-32_s.pth


Epoch 82/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  82/240 | loss 3.689 | train 68.92% | test 60.13%


Epoch 83/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  83/240 | loss 3.655 | train 68.99% | test 60.16%


Epoch 84/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  84/240 | loss 3.663 | train 68.84% | test 61.01%


Epoch 85/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  85/240 | loss 3.661 | train 68.87% | test 58.10%


Epoch 86/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  86/240 | loss 3.657 | train 69.08% | test 58.85%


Epoch 87/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  87/240 | loss 3.699 | train 68.48% | test 57.67%


Epoch 88/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  88/240 | loss 3.663 | train 68.93% | test 57.31%


Epoch 89/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  89/240 | loss 3.638 | train 69.22% | test 58.49%


Epoch 90/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  90/240 | loss 3.651 | train 69.03% | test 59.83%


Epoch 91/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  91/240 | loss 3.659 | train 69.12% | test 60.28%


Epoch 92/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  92/240 | loss 3.676 | train 68.88% | test 60.00%


Epoch 93/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  93/240 | loss 3.644 | train 68.98% | test 59.25%


Epoch 94/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  94/240 | loss 3.653 | train 69.16% | test 57.23%


Epoch 95/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  95/240 | loss 3.656 | train 69.14% | test 60.55%


Epoch 96/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  96/240 | loss 3.635 | train 69.13% | test 59.88%


Epoch 97/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  97/240 | loss 3.639 | train 69.21% | test 58.20%


Epoch 98/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  98/240 | loss 3.633 | train 69.21% | test 58.14%


Epoch 99/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch  99/240 | loss 3.641 | train 69.14% | test 57.94%


Epoch 100/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 100/240 | loss 3.646 | train 69.40% | test 60.55%


Epoch 101/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 101/240 | loss 3.644 | train 69.24% | test 60.28%


Epoch 102/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 102/240 | loss 3.641 | train 69.13% | test 58.16%


Epoch 103/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 103/240 | loss 3.633 | train 69.44% | test 57.80%


Epoch 104/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 104/240 | loss 3.609 | train 69.45% | test 59.41%


Epoch 105/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 105/240 | loss 3.666 | train 69.36% | test 59.87%


Epoch 106/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 106/240 | loss 3.605 | train 69.81% | test 55.60%


Epoch 107/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 107/240 | loss 3.628 | train 69.49% | test 58.71%


Epoch 108/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 108/240 | loss 3.639 | train 69.19% | test 57.39%


Epoch 109/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 109/240 | loss 3.601 | train 69.42% | test 58.21%


Epoch 110/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 110/240 | loss 3.600 | train 69.65% | test 58.85%


Epoch 111/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 111/240 | loss 3.588 | train 69.79% | test 57.65%


Epoch 112/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 112/240 | loss 3.618 | train 69.25% | test 58.39%


Epoch 113/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 113/240 | loss 3.580 | train 69.77% | test 59.24%


Epoch 114/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 114/240 | loss 3.616 | train 69.58% | test 60.22%


Epoch 115/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 115/240 | loss 3.566 | train 69.59% | test 60.94%


Epoch 116/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 116/240 | loss 3.551 | train 69.99% | test 60.36%


Epoch 117/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 117/240 | loss 3.601 | train 69.33% | test 59.32%


Epoch 118/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 118/240 | loss 3.599 | train 69.40% | test 55.79%


Epoch 119/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 119/240 | loss 3.605 | train 69.55% | test 58.57%


Epoch 120/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 120/240 | loss 3.559 | train 69.50% | test 61.31%


Epoch 121/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 121/240 | loss 3.569 | train 69.60% | test 60.25%


Epoch 122/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 122/240 | loss 3.590 | train 69.81% | test 58.12%


Epoch 123/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 123/240 | loss 3.576 | train 69.74% | test 57.08%


Epoch 124/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 124/240 | loss 3.586 | train 69.50% | test 60.00%


Epoch 125/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 125/240 | loss 3.543 | train 69.85% | test 58.18%


Epoch 126/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 126/240 | loss 3.562 | train 69.83% | test 60.42%


Epoch 127/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 127/240 | loss 3.589 | train 69.70% | test 59.60%


Epoch 128/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 128/240 | loss 3.581 | train 69.76% | test 56.08%


Epoch 129/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 129/240 | loss 3.546 | train 69.77% | test 57.99%


Epoch 130/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 130/240 | loss 3.605 | train 69.60% | test 59.05%


Epoch 131/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 131/240 | loss 3.538 | train 70.01% | test 57.59%


Epoch 132/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 132/240 | loss 3.540 | train 69.99% | test 57.53%


Epoch 133/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 133/240 | loss 3.571 | train 69.59% | test 59.29%


Epoch 134/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 134/240 | loss 3.571 | train 69.79% | test 59.34%


Epoch 135/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 135/240 | loss 3.528 | train 69.85% | test 57.57%


Epoch 136/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 136/240 | loss 3.562 | train 69.96% | test 58.30%


Epoch 137/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 137/240 | loss 3.566 | train 69.74% | test 59.37%


Epoch 138/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 138/240 | loss 3.576 | train 69.47% | test 61.75%


Epoch 139/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 139/240 | loss 3.546 | train 69.96% | test 57.77%


Epoch 140/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 140/240 | loss 3.524 | train 70.02% | test 60.01%


Epoch 141/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 141/240 | loss 3.534 | train 70.12% | test 57.48%


Epoch 142/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 142/240 | loss 3.542 | train 69.91% | test 61.79%


Epoch 143/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 143/240 | loss 3.547 | train 69.70% | test 58.04%


Epoch 144/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 144/240 | loss 3.534 | train 69.98% | test 54.47%


Epoch 145/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 145/240 | loss 3.513 | train 70.19% | test 58.70%


Epoch 146/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 146/240 | loss 3.523 | train 70.21% | test 60.02%


Epoch 147/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 147/240 | loss 3.542 | train 69.91% | test 58.91%


Epoch 148/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 148/240 | loss 3.534 | train 70.08% | test 57.65%


Epoch 149/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 149/240 | loss 3.518 | train 70.01% | test 59.97%


Epoch 150/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 150/240 | loss 3.534 | train 70.04% | test 57.38%


Epoch 151/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 151/240 | loss 2.602 | train 77.27% | test 70.95%
  ↳ Saved best @ epoch 151 (test 70.95%) → 110_t-32_s.pth


Epoch 152/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 152/240 | loss 2.365 | train 79.07% | test 71.36%
  ↳ Saved best @ epoch 152 (test 71.36%) → 110_t-32_s.pth


Epoch 153/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 153/240 | loss 2.290 | train 79.85% | test 72.04%
  ↳ Saved best @ epoch 153 (test 72.04%) → 110_t-32_s.pth


Epoch 154/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 154/240 | loss 2.233 | train 80.30% | test 71.82%


Epoch 155/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 155/240 | loss 2.202 | train 80.75% | test 71.74%


Epoch 156/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 156/240 | loss 2.174 | train 81.10% | test 72.10%
  ↳ Saved best @ epoch 156 (test 72.10%) → 110_t-32_s.pth


Epoch 157/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 157/240 | loss 2.142 | train 81.29% | test 72.48%
  ↳ Saved best @ epoch 157 (test 72.48%) → 110_t-32_s.pth


Epoch 158/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 158/240 | loss 2.125 | train 81.59% | test 71.89%


Epoch 159/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 159/240 | loss 2.094 | train 81.81% | test 71.94%


Epoch 160/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 160/240 | loss 2.076 | train 82.07% | test 72.63%
  ↳ Saved best @ epoch 160 (test 72.63%) → 110_t-32_s.pth


Epoch 161/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 161/240 | loss 2.058 | train 82.24% | test 72.18%


Epoch 162/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 162/240 | loss 2.044 | train 82.45% | test 72.11%


Epoch 163/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 163/240 | loss 2.037 | train 82.34% | test 72.09%


Epoch 164/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 164/240 | loss 2.020 | train 82.58% | test 72.41%


Epoch 165/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 165/240 | loss 2.010 | train 82.84% | test 72.33%


Epoch 166/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 166/240 | loss 2.008 | train 82.94% | test 72.01%


Epoch 167/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 167/240 | loss 1.988 | train 83.21% | test 72.12%


Epoch 168/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 168/240 | loss 1.972 | train 83.34% | test 71.93%


Epoch 169/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 169/240 | loss 1.982 | train 83.35% | test 72.05%


Epoch 170/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 170/240 | loss 1.968 | train 83.46% | test 72.17%


Epoch 171/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 171/240 | loss 1.958 | train 83.74% | test 72.05%


Epoch 172/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 172/240 | loss 1.950 | train 83.74% | test 72.37%


Epoch 173/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 173/240 | loss 1.938 | train 83.77% | test 72.01%


Epoch 174/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 174/240 | loss 1.925 | train 84.07% | test 71.92%


Epoch 175/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 175/240 | loss 1.927 | train 84.27% | test 72.39%


Epoch 176/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 176/240 | loss 1.924 | train 84.06% | test 72.32%


Epoch 177/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 177/240 | loss 1.934 | train 84.01% | test 72.21%


Epoch 178/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 178/240 | loss 1.918 | train 84.21% | test 72.20%


Epoch 179/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 179/240 | loss 1.916 | train 84.35% | test 71.33%


Epoch 180/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 180/240 | loss 1.910 | train 84.29% | test 72.30%


Epoch 181/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 181/240 | loss 1.802 | train 85.46% | test 73.09%
  ↳ Saved best @ epoch 181 (test 73.09%) → 110_t-32_s.pth


Epoch 182/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 182/240 | loss 1.772 | train 85.76% | test 73.03%


Epoch 183/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 183/240 | loss 1.761 | train 85.79% | test 73.16%
  ↳ Saved best @ epoch 183 (test 73.16%) → 110_t-32_s.pth


Epoch 184/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 184/240 | loss 1.755 | train 86.00% | test 73.32%
  ↳ Saved best @ epoch 184 (test 73.32%) → 110_t-32_s.pth


Epoch 185/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 185/240 | loss 1.753 | train 85.94% | test 73.09%


Epoch 186/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 186/240 | loss 1.739 | train 85.96% | test 73.23%


Epoch 187/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 187/240 | loss 1.738 | train 86.17% | test 73.25%


Epoch 188/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 188/240 | loss 1.737 | train 86.13% | test 73.19%


Epoch 189/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 189/240 | loss 1.745 | train 86.11% | test 73.24%


Epoch 190/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 190/240 | loss 1.730 | train 86.09% | test 73.05%


Epoch 191/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 191/240 | loss 1.728 | train 86.28% | test 72.97%


Epoch 192/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 192/240 | loss 1.724 | train 86.40% | test 73.15%


Epoch 193/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 193/240 | loss 1.726 | train 86.16% | test 72.99%


Epoch 194/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 194/240 | loss 1.725 | train 86.27% | test 73.11%


Epoch 195/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 195/240 | loss 1.725 | train 86.26% | test 73.26%


Epoch 196/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 196/240 | loss 1.721 | train 86.35% | test 73.06%


Epoch 197/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 197/240 | loss 1.724 | train 86.53% | test 73.15%


Epoch 198/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 198/240 | loss 1.716 | train 86.63% | test 73.07%


Epoch 199/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 199/240 | loss 1.721 | train 86.49% | test 73.23%


Epoch 200/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 200/240 | loss 1.720 | train 86.30% | test 73.09%


Epoch 201/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 201/240 | loss 1.716 | train 86.43% | test 73.16%


Epoch 202/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 202/240 | loss 1.713 | train 86.57% | test 73.04%


Epoch 203/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 203/240 | loss 1.712 | train 86.54% | test 73.17%


Epoch 204/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 204/240 | loss 1.713 | train 86.51% | test 72.96%


Epoch 205/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 205/240 | loss 1.708 | train 86.57% | test 73.02%


Epoch 206/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 206/240 | loss 1.712 | train 86.61% | test 73.08%


Epoch 207/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 207/240 | loss 1.701 | train 86.49% | test 73.10%


Epoch 208/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 208/240 | loss 1.701 | train 86.64% | test 73.34%
  ↳ Saved best @ epoch 208 (test 73.34%) → 110_t-32_s.pth


Epoch 209/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 209/240 | loss 1.708 | train 86.62% | test 73.25%


Epoch 210/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 210/240 | loss 1.701 | train 86.73% | test 73.19%


Epoch 211/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 211/240 | loss 1.689 | train 86.82% | test 73.14%


Epoch 212/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 212/240 | loss 1.686 | train 86.77% | test 73.34%


Epoch 213/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 213/240 | loss 1.691 | train 86.78% | test 73.02%


Epoch 214/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 214/240 | loss 1.689 | train 86.92% | test 73.25%


Epoch 215/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 215/240 | loss 1.690 | train 86.88% | test 73.22%


Epoch 216/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 216/240 | loss 1.680 | train 86.96% | test 73.09%


Epoch 217/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 217/240 | loss 1.689 | train 86.81% | test 72.91%


Epoch 218/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 218/240 | loss 1.692 | train 87.03% | test 73.05%


Epoch 219/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 219/240 | loss 1.687 | train 87.06% | test 73.17%


Epoch 220/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 220/240 | loss 1.685 | train 87.01% | test 73.37%
  ↳ Saved best @ epoch 220 (test 73.37%) → 110_t-32_s.pth


Epoch 221/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 221/240 | loss 1.685 | train 87.01% | test 73.29%


Epoch 222/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 222/240 | loss 1.689 | train 86.80% | test 72.99%


Epoch 223/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 223/240 | loss 1.684 | train 86.90% | test 73.15%


Epoch 224/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 224/240 | loss 1.684 | train 86.69% | test 73.30%


Epoch 225/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 225/240 | loss 1.687 | train 86.88% | test 73.17%


Epoch 226/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 226/240 | loss 1.687 | train 86.90% | test 73.01%


Epoch 227/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 227/240 | loss 1.683 | train 86.95% | test 73.05%


Epoch 228/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 228/240 | loss 1.682 | train 86.95% | test 73.17%


Epoch 229/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 229/240 | loss 1.682 | train 86.91% | test 73.21%


Epoch 230/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 230/240 | loss 1.680 | train 86.93% | test 73.21%


Epoch 231/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 231/240 | loss 1.683 | train 86.70% | test 73.20%


Epoch 232/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 232/240 | loss 1.682 | train 86.98% | test 72.99%


Epoch 233/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 233/240 | loss 1.684 | train 86.98% | test 73.14%


Epoch 234/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 234/240 | loss 1.683 | train 87.02% | test 73.13%


Epoch 235/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 235/240 | loss 1.685 | train 86.91% | test 72.96%


Epoch 236/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 236/240 | loss 1.689 | train 86.77% | test 73.11%


Epoch 237/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 237/240 | loss 1.685 | train 86.88% | test 73.12%


Epoch 238/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 238/240 | loss 1.687 | train 86.93% | test 72.99%


Epoch 239/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 239/240 | loss 1.680 | train 86.94% | test 73.13%


Epoch 240/240:   0%|          | 0/391 [00:00<?, ?it/s]

Epoch 240/240 | loss 1.682 | train 86.96% | test 73.16%
✅ KD training finished. Best Test Acc: 73.37%
