In [1]:
import torch
from torch import nn, Tensor
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import CIFAR100
from typing import Optional, Callable
import os
import timm
import numpy as np
import pandas as pd
from torchvision.transforms import v2
from torch.backends import cudnn
from torch import GradScaler
from torch import optim
from tqdm import tqdm
# import wandb
from datetime import datetime

# ============ Configuration ============
# Single source of hyper-parameters for the whole script. Keys are read by name below;
# some keys are informational only (see the per-key notes).
config = {
    "dataset": "cifar100_noisy",
    "model": "resnet18",
    "pretrained": "imagenet",  # Only printed in log messages; the model is always created with pretrained=True
    "epochs": 100,
    "batch_size": 128,
    "lr": 0.001,
    "momentum": 0.9,  # Read only by the SGD branch of the optimizer selection
    "weight_decay": 0.0005,
    "nesterov": True,  # Read only by the SGD branch of the optimizer selection
    "label_smoothing": 0.1,
    "optimizer": "adamw",
    "scheduler": "cosine",
    "cosine_eta_min": 5e-6,
    "early_stop_patience": 12,
    "early_stop_mode": "max",
    "early_stop_min_delta": 0.1,  # Compared against validation accuracy, which is expressed in percent
    "device": "cuda",
    "mixed_precision": True,
    "wandb_project": "cifar100-noisy-competition",
    "upscale_size": 224,  # Upscale images from 32x32 to 224x224
    "aug_alpha": 0.5,          # Alpha of the Beta(alpha, alpha) distribution the mix ratio is drawn from
    "cutmix_prob": 1.0,        # NOTE: not read by the code in this file; mixing is applied to every batch
    "switch_epoch": 25,        # 0-based epochs 0-24: MixUp, epochs 25 and later: CutMix
    "warmup_epochs": 5,        # Small-loss filtering starts once the 0-based epoch reaches this value
    "loss_threshold": 2.5,     # Initial per-sample cross-entropy cut-off for small-loss filtering
    "dynamic_threshold_decay": 0.997  # Threshold is multiplied by this after every post-warm-up epoch
}

# Global runtime state shared by train(), val() and inference().
device = torch.device(config["device"])
print(f"Using device: {device}")
cudnn.benchmark = True
# Passed to both DataLoaders below.
pin_memory = True
# Single switch for autocast and the gradient scaler.
enable_half = config["mixed_precision"]  # Disable for CPU, it is slower!
scaler = GradScaler(device, enabled=enable_half)

class SimpleCachedDataset(Dataset):
    """Materialise every sample of a dataset into an in-memory tuple.

    The wrapped dataset is iterated exactly once in the constructor; after
    that, lookups are plain tuple indexing. No transform is applied here,
    neither at construction nor at access time.
    """

    def __init__(self, dataset):
        # One full pass over the source; samples are stored as-is.
        self.data = tuple(dataset)

    def __getitem__(self, i):
        return self.data[i]

    def __len__(self):
        return len(self.data)

class PreprocessedDataset(Dataset):
    """
    Run ``transform`` over every image once and keep the results in memory.

    The constructor walks ``dataset`` (expected to yield ``(image, target)``
    pairs) a single time, applies ``transform`` to each image and stores the
    outputs and targets in two parallel lists. What is stored is exactly what
    ``transform`` returns, so any random augmentation has to be applied by a
    wrapper at access time rather than here.
    """
    def __init__(self, dataset, transform):
        print(f"Preprocessing {len(dataset)} images (this happens once)...")

        # Single pass: transform the image, carry the target through untouched.
        cached_pairs = [
            (transform(img), target)
            for img, target in tqdm(dataset, desc="Caching", leave=False)
        ]
        self.data = [image for image, _ in cached_pairs]
        self.targets = [label for _, label in cached_pairs]

        print(f"Cached {len(self.data)} preprocessed images")

    def __getitem__(self, i):
        return self.data[i], self.targets[i]

    def __len__(self):
        return len(self.data)

class AugmentationWrapper(Dataset):
    """Dataset view that runs an optional transform on each image at access time.

    ``preprocessed_dataset`` must return ``(image, target)`` pairs. When
    ``runtime_transforms`` is ``None`` the pair is returned unchanged;
    otherwise only the image is passed through the transform.
    """

    def __init__(self, preprocessed_dataset, runtime_transforms):
        self.dataset = preprocessed_dataset
        self.runtime_transforms = runtime_transforms

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        img_tensor, target = self.dataset[i]
        if self.runtime_transforms is None:
            return img_tensor, target
        # The transform is re-evaluated on every access, so random
        # transforms produce a fresh result each time.
        return self.runtime_transforms(img_tensor), target

class CIFAR100_noisy_fine(Dataset):
    """
    CIFAR-100 whose training labels are replaced by the noisy fine labels.

    See https://github.com/UCSC-REAL/cifar-10-100n, https://www.noisylabels.com/ and `Learning with Noisy Labels
    Revisited: A Study Using Real-World Human Annotations`.

    Args:
        root: Directory handed to ``CIFAR100``. When ``train`` is true it must
            also contain ``CIFAR-100-noisy.npz``.
        train: Select the training split (noisy labels) or the test split
            (labels as provided by ``CIFAR100``).
        transform: Optional callable applied to the image in ``__getitem__``.
            ``None`` returns the stored image unchanged.
        download: Forwarded to ``CIFAR100``.

    Raises:
        FileNotFoundError: ``train`` is true and the noisy-label file is missing.
        RuntimeError: The ``clean_label`` array in the noisy-label file does
            not match the labels of the loaded CIFAR-100 split.
    """

    def __init__(
        self, root: str, train: bool, transform: Optional[Callable], download: bool
    ):
        # The base dataset is read without a transform so the raw images are
        # stored; ``transform`` is applied lazily in ``__getitem__``.
        cifar100 = CIFAR100(
            root=root, train=train, transform=None, download=download
        )
        data, targets = tuple(zip(*cifar100))

        if train:
            noisy_label_file = os.path.join(root, "CIFAR-100-noisy.npz")
            if not os.path.isfile(noisy_label_file):
                raise FileNotFoundError(
                    f"{type(self).__name__} need {noisy_label_file} to be used!"
                )

            # np.load on an .npz keeps the archive open until it is closed;
            # the context manager releases the handle once the arrays are read.
            with np.load(noisy_label_file) as noise_file:
                # Sanity check: the file must describe the same samples, in
                # the same order, as the split that was just loaded.
                if not np.array_equal(noise_file["clean_label"], targets):
                    raise RuntimeError("Clean labels do not match!")
                targets = noise_file["noisy_label"]

        self.data = data
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, i: int):
        img, target = self.data[i], self.targets[i]
        if self.transform is not None:
            img = self.transform(img)
        return img, target


class EarlyStopping:
    """Track a validation metric and signal when it has stopped improving.

    Call the instance once per epoch with ``(score, epoch)``. The first call
    only records the baseline. Afterwards a score counts as an improvement
    when it beats the best score by more than ``min_delta`` (above it for
    ``mode='max'``, below it for any other mode). Once ``patience``
    consecutive non-improving calls have been seen, ``early_stop`` becomes
    ``True`` and stays ``True``.
    """
    def __init__(self, patience=10, min_delta=0.0, mode='max'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.best_score = None
        self.best_epoch = 0
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, epoch):
        # First observation: nothing to compare against yet.
        if self.best_score is None:
            self.best_score, self.best_epoch = score, epoch
            return False

        if self.mode == 'max':
            # Higher is better (e.g. accuracy).
            improved = score > self.best_score + self.min_delta
        else:
            # Lower is better (e.g. loss).
            improved = score < self.best_score - self.min_delta

        if improved:
            self.best_score, self.best_epoch = score, epoch
            self.counter = 0
        else:
            self.counter += 1

        if self.counter >= self.patience:
            self.early_stop = True

        return self.early_stop


# === PREPROCESSING (applied once and cached) ===
# Only deterministic, spatial transforms - stores uint8 tensors (saves memory!)
preprocess_transforms = v2.Compose([
    v2.ToImage(),
    v2.Resize(config["upscale_size"]),  # Upscale from 32x32 to config["upscale_size"] (224x224)
])

# === RUNTIME AUGMENTATION (applied at each epoch) ===
# Random transforms for training (includes normalization at the end)
train_runtime_transforms = v2.Compose([
    # Pads by 4 px on each side, then crops back to the upscaled size (random shift).
    v2.RandomCrop(config["upscale_size"], padding=4),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    v2.RandomRotation(15),
    v2.ToDtype(torch.float32, scale=True),  # Convert to float [0,1]
    # These are the standard ImageNet mean/std values, not CIFAR-100 statistics.
    v2.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    # Erasing runs after normalization, on the float tensor.
    v2.RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3))
])

# Test set: only dtype conversion and normalization needed (spatial transforms already done).
# Uses the same mean/std as the training pipeline.
test_runtime_transforms = v2.Compose([
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

# Load raw datasets
# Train split carries the noisy labels; the test split keeps the labels provided by CIFAR100.
print("Loading datasets...")
train_set_raw = CIFAR100_noisy_fine('/kaggle/input/fii-atnn-2025-project-noisy-cifar-100/fii-atnn-2024-project-noisy-cifar-100', download=False, train=True, transform=None)
test_set_raw = CIFAR100_noisy_fine('/kaggle/input/fii-atnn-2025-project-noisy-cifar-100/fii-atnn-2024-project-noisy-cifar-100', download=False, train=False, transform=None)

# Cache the raw (image, label) pairs in memory
train_set_cached = SimpleCachedDataset(train_set_raw)
test_set_cached = SimpleCachedDataset(test_set_raw)

# Preprocess and cache as tensors (done once!)
print("\n[TRAIN SET]")
train_set_preprocessed = PreprocessedDataset(train_set_cached, preprocess_transforms)
print("\n[TEST SET]")
test_set_preprocessed = PreprocessedDataset(test_set_cached, preprocess_transforms)

# Add runtime transforms: random augmentation + normalization for train,
# dtype conversion + normalization only for test.
train_set = AugmentationWrapper(train_set_preprocessed, train_runtime_transforms)
test_set = AugmentationWrapper(test_set_preprocessed, test_runtime_transforms)

print(f"\nTrain set ready: {len(train_set)} samples (with runtime augmentation)")
print(f"Test set ready: {len(test_set)} samples (fully cached)\n")

# The test loader is not shuffled, so prediction order follows dataset order;
# the submission IDs written later rely on that.
train_loader = DataLoader(train_set, batch_size=config["batch_size"], shuffle=True, pin_memory=pin_memory,num_workers=2,persistent_workers=True)
test_loader = DataLoader(test_set, batch_size=500, pin_memory=pin_memory,num_workers=2,persistent_workers=True)

# Build the backbone named in config["model"] with a 100-class head.
# NOTE: pretrained=True is hard-coded; config["pretrained"] only appears in the log message.
print(f"Loading model: {config['model']} (pretrained on {config['pretrained']})")
model = timm.create_model(config["model"], pretrained=True, num_classes=100)
model = model.to(device)

# Parameter counts are printed for information only.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}\n")

# Label smoothing helps with noisy labels
# This criterion is used for the training loss and for the validation loss.
criterion = nn.CrossEntropyLoss(label_smoothing=config["label_smoothing"])

# Create optimizer based on config
# The optimizer name is matched case-insensitively; anything other than
# "adamw" or "sgd" raises ValueError. Both branches request the fused implementation.
if config["optimizer"].lower() == "adamw":
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config["lr"],
        weight_decay=config["weight_decay"],
        fused=True
    )
    print(f"Optimizer: AdamW (lr={config['lr']}, weight_decay={config['weight_decay']})")
elif config["optimizer"].lower() == "sgd":
    # momentum and nesterov from config are only consumed here.
    optimizer = optim.SGD(
        model.parameters(), 
        lr=config["lr"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=config["nesterov"],
        fused=True
    )
    print(f"Optimizer: SGD (lr={config['lr']}, momentum={config['momentum']}, weight_decay={config['weight_decay']}, nesterov={config['nesterov']})")
else:
    raise ValueError(f"Unknown optimizer: {config['optimizer']}. Supported: 'sgd', 'adamw'")

# Learning rate scheduler
# Unlike the optimizer name, the scheduler name is compared case-sensitively.
# Any value other than "steplr" or "cosine" silently disables scheduling.
if config["scheduler"] == "steplr":
    # step_size / gamma are optional config keys; the defaults here are 30 and 0.1.
    scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config.get("step_size", 30),
        gamma=config.get("gamma", 0.1)
    )
    print(f"Scheduler: StepLR (step_size={config.get('step_size', 30)}, gamma={config.get('gamma', 0.1)})")
elif config["scheduler"] == "cosine":
    # T_max is the configured epoch count; the training loop steps the scheduler once per epoch.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=config["epochs"],
        eta_min=config.get("cosine_eta_min", 1e-6)
    )
    print(f"Scheduler: CosineAnnealingLR (T_max={config['epochs']}, eta_min={config.get('cosine_eta_min', 1e-6)})")
else:
    scheduler = None
    print("Scheduler: None")

# === CUTMIX HELPER FUNCTION ===
def rand_bbox(size, lam):
    """Sample a random box for CutMix, clipped to the tensor's last two dims.

    ``size`` is a 4-element shape; the box's first coordinate pair is bounded
    by ``size[2]`` and the second by ``size[3]``. Each side of the box is
    ``sqrt(1 - lam)`` times the matching extent before clipping, so the box
    covers roughly a ``1 - lam`` fraction of the area. The centre is drawn
    uniformly with ``np.random.randint``.

    Returns ``(bbx1, bby1, bbx2, bby2)``.
    """
    extent_x, extent_y = size[2], size[3]
    side_ratio = np.sqrt(1. - lam)
    half_w = int(extent_x * side_ratio) // 2
    half_h = int(extent_y * side_ratio) // 2

    # Centre: x is drawn before y, keeping the RNG stream order fixed.
    centre_x = np.random.randint(extent_x)
    centre_y = np.random.randint(extent_y)

    # Clipping can shrink the box when the centre lies near a border.
    return (
        np.clip(centre_x - half_w, 0, extent_x),
        np.clip(centre_y - half_h, 0, extent_y),
        np.clip(centre_x + half_w, 0, extent_x),
        np.clip(centre_y + half_h, 0, extent_y),
    )

# Per-sample loss cut-off for small-loss filtering. train() rebinds this
# module-level value, multiplying it by config["dynamic_threshold_decay"]
# at the end of every post-warm-up epoch.
loss_threshold = config["loss_threshold"]

def train(epoch):
    """Run one training epoch and return ``(epoch_loss, epoch_acc, aug_mode)``.

    Per batch:
      1. Once ``epoch >= config["warmup_epochs"]``, a no-grad forward pass
         computes the plain (unsmoothed) per-sample cross-entropy and only
         samples below the global ``loss_threshold`` are kept. Batches left
         with fewer than two samples are skipped.
      2. The kept samples are mixed with a random permutation of themselves:
         MixUp before ``config["switch_epoch"]``, CutMix from then on.
      3. The loss is the ``lam``-weighted sum of ``criterion`` against both
         label sets, followed by one scaled optimizer step.

    Args:
        epoch: 0-based epoch index.

    Returns:
        ``epoch_loss``: sample-weighted mean of the mixed loss.
        ``epoch_acc``: percentage of mixed inputs whose prediction equals the
        un-permuted label (so it underestimates accuracy on clean inputs).
        ``aug_mode``: ``"MixUp"`` or ``"CutMix"``.
        If no sample was trained on in the whole epoch (every batch skipped
        or an empty loader), ``epoch_loss`` is ``nan`` and ``epoch_acc`` is
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    global loss_threshold

    print(f"\nEpoch {epoch+1}/{config['epochs']}")
    model.train()
    correct = 0
    total = 0
    running_loss = 0.0

    use_cutmix = epoch >= config["switch_epoch"]
    aug_mode = "CutMix" if use_cutmix else "MixUp"
    filtering_active = epoch >= config["warmup_epochs"]

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)

        # SMALL-LOSS FILTERING (Only after warm-up)
        if filtering_active:
            # NOTE(review): this extra forward runs with the model in train
            # mode; confirm that running it outside eval mode is intended.
            with torch.no_grad():
                with torch.autocast(device.type, enabled=enable_half):
                    raw_outputs = model(inputs)
                    sample_losses = torch.nn.functional.cross_entropy(raw_outputs, targets, reduction='none')

                mask = sample_losses < loss_threshold

            # Mixing needs at least two samples to pair up.
            if mask.sum() < 2:
                continue
            inputs = inputs[mask]
            targets = targets[mask]

        # Pair every sample with a randomly chosen partner from the same batch.
        rand_index = torch.randperm(inputs.size(0)).to(device)
        target_a = targets
        target_b = targets[rand_index]
        lam = np.random.beta(config["aug_alpha"], config["aug_alpha"])

        if use_cutmix:
            bbx1, bby1, bbx2, bby2 = rand_bbox(inputs.size(), lam)
            # Clipping may have shrunk the box, so recompute lam from its real area.
            lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (inputs.size()[-1] * inputs.size()[-2]))
            # The indexed right-hand side is a copy, so the in-place paste is safe.
            inputs[:, :, bbx1:bbx2, bby1:bby2] = inputs[rand_index, :, bbx1:bbx2, bby1:bby2]
        else:
            inputs = lam * inputs + (1 - lam) * inputs[rand_index, :]

        with torch.autocast(device.type, enabled=enable_half):
            outputs = model(inputs)
            loss = lam * criterion(outputs, target_a) + (1 - lam) * criterion(outputs, target_b)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        running_loss += loss.item() * inputs.size(0)
        predicted = outputs.argmax(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    # UPDATE THRESHOLD
    if filtering_active:
        loss_threshold *= config["dynamic_threshold_decay"]

    # Nothing was trained on this epoch: report it without dividing by zero.
    if total == 0:
        return float("nan"), 0.0, aug_mode

    epoch_loss = running_loss / total
    epoch_acc = 100.0 * correct / total
    return epoch_loss, epoch_acc, aug_mode

@torch.inference_mode()
def val():
    """Evaluate the model on ``test_loader``.

    Returns ``(mean_loss, accuracy_percent)``, where the loss is ``criterion``
    averaged over samples and the accuracy is in the range 0-100.
    """
    model.eval()
    loss_sum = 0.0
    n_seen = 0
    n_correct = 0

    for batch, labels in test_loader:
        batch = batch.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        with torch.autocast(device.type, enabled=enable_half):
            logits = model(batch)
            batch_loss = criterion(logits, labels)

        # criterion returns a batch mean; weight it by batch size so the
        # final division gives a per-sample mean.
        loss_sum += batch_loss.item() * batch.size(0)
        n_seen += labels.size(0)
        n_correct += (logits.argmax(1) == labels).sum().item()

    return loss_sum / n_seen, 100.0 * n_correct / n_seen

@torch.inference_mode()
def inference():
    """Predict a class index for every sample in ``test_loader``.

    Returns a flat list of ints in loader order; the labels yielded by the
    loader are ignored.
    """
    model.eval()
    predictions = []

    for batch, _ in test_loader:
        batch = batch.to(device, non_blocking=True)
        with torch.autocast(device.type, enabled=enable_half):
            logits = model(batch)
        predictions += logits.argmax(1).tolist()

    return predictions

# Run name for experiment tracking (the WandB init below is currently commented out,
# so run_name is not used by active code in this file).
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_name = f"cifar100noisy_{config['model']}_{config['optimizer']}_lr{config['lr']}_bs{config['batch_size']}_{timestamp}"

# wandb.init(
#     project=config["wandb_project"],
#     name=run_name,
#     config=config
# )

# Best validation accuracy (percent) seen so far and its 0-based epoch;
# updated by the training loop on any strict improvement.
best = 0.0
best_epoch = 0

# Initialize early stopping
early_stopping = EarlyStopping(
    patience=config["early_stop_patience"],
    min_delta=config["early_stop_min_delta"],
    mode=config["early_stop_mode"]
)

# Banner summarising the run configuration.
print(f"\n{'='*70}")
print(f"Starting Training - {config['epochs']} epochs")
print(f"Model: {config['model']} (pretrained on {config['pretrained']})")
print(f"Optimizer: {config['optimizer'].upper()}, LR: {config['lr']}, Batch Size: {config['batch_size']}")
if config["optimizer"].lower() == "sgd":
    print(f"Momentum: {config['momentum']}, Nesterov: {config['nesterov']}")
print(f"Weight Decay: {config['weight_decay']}, Label Smoothing: {config['label_smoothing']}")
print(f"Scheduler: {config['scheduler']}")
print(f"Early Stopping: Enabled (patience={config['early_stop_patience']}, mode={config['early_stop_mode']})")
print(f"{'='*70}\n")

# Main loop: train one epoch, evaluate, step the scheduler, checkpoint on
# improvement, then ask the early-stopping tracker whether to stop.
# NOTE(review): val() evaluates on test_loader, so checkpoint selection and
# early stopping are driven by the test split - confirm this is intended.
with tqdm(range(config["epochs"])) as tbar:
    for epoch in tbar:
        train_loss, train_acc, mode = train(epoch)
        val_loss, val_acc = val()
        
        # Update learning rate
        # The scheduler is stepped before the LR is read, so current_lr is the
        # rate the NEXT epoch will use, not the one just trained with.
        if scheduler is not None:
            scheduler.step()
            current_lr = scheduler.get_last_lr()[0]
        else:
            current_lr = config["lr"]
        
        # Checkpoint on any strict improvement. EarlyStopping additionally
        # requires the gain to exceed min_delta, so its best_epoch can differ
        # from the best_epoch tracked here.
        if val_acc > best:
            best = val_acc
            best_epoch = epoch
            # Save best model
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': val_acc,
                'val_loss': val_loss,
            }
            if scheduler is not None:
                checkpoint['scheduler_state_dict'] = scheduler.state_dict()
            torch.save(checkpoint, './best_model.pth')
        
        # Log to WandB
        # wandb.log({
        #     "epoch": epoch,
        #     "train_loss": train_loss,
        #     "train_acc": train_acc,
        #     "val_loss": val_loss,
        #     "val_acc": val_acc,
        #     "best_val_acc": best,
        #     "lr": current_lr,
        #     "aug_mode": 1 if mode == "CutMix" else 0
        # })
        
        # The same status line goes to the progress bar and to stdout.
        tbar.set_description(f"Epoch {epoch+1}/{config['epochs']} | Train: {train_acc:.2f}% | Val: {val_acc:.2f}% | Best: {best:.2f}% | LR: {current_lr:.6f}")
        print(f"Epoch {epoch+1}/{config['epochs']} | Train: {train_acc:.2f}% | Val: {val_acc:.2f}% | Best: {best:.2f}% | LR: {current_lr:.6f}")
        
        # Early stopping check
        if early_stopping(val_acc, epoch):
            print(f"\n{'='*60}")
            print(f"Early stopping triggered at epoch {epoch+1}")
            print(f"Best Val Accuracy: {best:.2f}% at epoch {best_epoch+1}")
            print(f"No improvement for {config['early_stop_patience']} epochs")
            print(f"{'='*60}\n")
            break
    

print(f"\n{'='*60}")
print("Training Complete!")
print(f"Best Val Accuracy: {best:.2f}% at epoch {best_epoch+1}")
print("Loading best model for inference...")
print(f"{'='*60}\n")

# Load best model for inference
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all this checkpoint holds, and avoids the torch.load warning about
# the implicit default. map_location places the tensors on the active device.
checkpoint = torch.load('./best_model.pth', map_location=device, weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
print(f"Best model loaded (Epoch {checkpoint['epoch']+1}, Val Acc: {checkpoint['val_acc']:.2f}%)\n")

# Generate submission: one row per test sample, with ID equal to the sample's
# position in the order inference() returns its predictions.
print("Generating predictions...")
predicted_labels = inference()

data = {
    "ID": list(range(len(predicted_labels))),
    "target": list(predicted_labels),
}

df = pd.DataFrame(data)
df.to_csv("/kaggle/working/submission.csv", index=False)

# Log final results to WandB
# wandb.summary["final_best_val_acc"] = best
# wandb.summary["best_epoch"] = best_epoch
# wandb.summary["total_epochs"] = config["epochs"]
# if scheduler is not None:
#     wandb.summary["final_lr"] = scheduler.get_last_lr()[0]
# else:
#     wandb.summary["final_lr"] = config["lr"]
# wandb.summary["early_stopped"] = early_stopping.early_stop
# wandb.summary["epochs_trained"] = best_epoch + 1 if early_stopping.early_stop else config["epochs"]

# Final recap. The submission path printed here is the absolute path the
# CSV is actually written to, so the message is correct regardless of the
# current working directory.
print(f"\n{'='*60}")
print("Submission saved to: /kaggle/working/submission.csv")
print("Best model saved to: ./best_model.pth")
print(f"Best Val Accuracy: {best:.2f}% (Epoch {best_epoch+1})")
if early_stopping.early_stop:
    print("Training stopped early (patience reached)")
print(f"{'='*60}\n")

# Finish WandB run
# 

Using device: cuda
Loading datasets...

[TRAIN SET]
Preprocessing 50000 images (this happens once)...


                                                                

Cached 50000 preprocessed images

[TEST SET]
Preprocessing 10000 images (this happens once)...


                                                               

Cached 10000 preprocessed images

Train set ready: 50000 samples (with runtime augmentation)
Test set ready: 10000 samples (fully cached)

Loading model: resnet18 (pretrained on imagenet)


model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Total parameters: 11,227,812
Trainable parameters: 11,227,812

Optimizer: AdamW (lr=0.001, weight_decay=0.0005)
Scheduler: CosineAnnealingLR (T_max=100, eta_min=5e-06)

Starting Training - 100 epochs
Model: resnet18 (pretrained on imagenet)
Optimizer: ADAMW, LR: 0.001, Batch Size: 128
Weight Decay: 0.0005, Label Smoothing: 0.1
Scheduler: cosine
Early Stopping: Enabled (patience=12, mode=max)



  0%|          | 0/100 [00:00<?, ?it/s]


Epoch 1/100


Epoch 1/100 | Train: 14.17% | Val: 56.71% | Best: 56.71% | LR: 0.001000:   1%|          | 1/100 [03:44<6:09:38, 224.03s/it]

Epoch 1/100 | Train: 14.17% | Val: 56.71% | Best: 56.71% | LR: 0.001000

Epoch 2/100


Epoch 2/100 | Train: 20.40% | Val: 61.90% | Best: 61.90% | LR: 0.000999:   2%|▏         | 2/100 [07:15<5:53:58, 216.72s/it]

Epoch 2/100 | Train: 20.40% | Val: 61.90% | Best: 61.90% | LR: 0.000999

Epoch 3/100


Epoch 3/100 | Train: 22.96% | Val: 63.60% | Best: 63.60% | LR: 0.000998:   3%|▎         | 3/100 [10:52<5:50:28, 216.79s/it]

Epoch 3/100 | Train: 22.96% | Val: 63.60% | Best: 63.60% | LR: 0.000998

Epoch 4/100


Epoch 4/100 | Train: 24.83% | Val: 65.62% | Best: 65.62% | LR: 0.000996:   4%|▍         | 4/100 [14:27<5:45:32, 215.97s/it]

Epoch 4/100 | Train: 24.83% | Val: 65.62% | Best: 65.62% | LR: 0.000996

Epoch 5/100


Epoch 5/100 | Train: 25.78% | Val: 65.93% | Best: 65.93% | LR: 0.000994:   5%|▌         | 5/100 [18:02<5:41:33, 215.72s/it]

Epoch 5/100 | Train: 25.78% | Val: 65.93% | Best: 65.93% | LR: 0.000994

Epoch 6/100


Epoch 6/100 | Train: 37.44% | Val: 64.33% | Best: 65.93% | LR: 0.000991:   6%|▌         | 6/100 [21:50<5:44:38, 219.98s/it]

Epoch 6/100 | Train: 37.44% | Val: 64.33% | Best: 65.93% | LR: 0.000991

Epoch 7/100


Epoch 7/100 | Train: 39.41% | Val: 66.25% | Best: 66.25% | LR: 0.000988:   7%|▋         | 7/100 [25:25<5:38:32, 218.42s/it]

Epoch 7/100 | Train: 39.41% | Val: 66.25% | Best: 66.25% | LR: 0.000988

Epoch 8/100


Epoch 8/100 | Train: 38.81% | Val: 66.83% | Best: 66.83% | LR: 0.000984:   8%|▊         | 8/100 [29:00<5:33:11, 217.30s/it]

Epoch 8/100 | Train: 38.81% | Val: 66.83% | Best: 66.83% | LR: 0.000984

Epoch 9/100


Epoch 9/100 | Train: 41.92% | Val: 68.55% | Best: 68.55% | LR: 0.000980:   9%|▉         | 9/100 [32:35<5:28:26, 216.56s/it]

Epoch 9/100 | Train: 41.92% | Val: 68.55% | Best: 68.55% | LR: 0.000980

Epoch 10/100


Epoch 10/100 | Train: 37.78% | Val: 67.60% | Best: 68.55% | LR: 0.000976:  10%|█         | 10/100 [36:09<5:23:38, 215.76s/it]

Epoch 10/100 | Train: 37.78% | Val: 67.60% | Best: 68.55% | LR: 0.000976

Epoch 11/100


Epoch 11/100 | Train: 40.78% | Val: 67.83% | Best: 68.55% | LR: 0.000971:  11%|█         | 11/100 [39:44<5:19:27, 215.36s/it]

Epoch 11/100 | Train: 40.78% | Val: 67.83% | Best: 68.55% | LR: 0.000971

Epoch 12/100


Epoch 12/100 | Train: 38.09% | Val: 67.55% | Best: 68.55% | LR: 0.000965:  12%|█▏        | 12/100 [43:18<5:15:27, 215.08s/it]

Epoch 12/100 | Train: 38.09% | Val: 67.55% | Best: 68.55% | LR: 0.000965

Epoch 13/100


Epoch 13/100 | Train: 41.74% | Val: 68.35% | Best: 68.55% | LR: 0.000959:  13%|█▎        | 13/100 [46:52<5:11:27, 214.80s/it]

Epoch 13/100 | Train: 41.74% | Val: 68.35% | Best: 68.55% | LR: 0.000959

Epoch 14/100


Epoch 14/100 | Train: 39.47% | Val: 68.20% | Best: 68.55% | LR: 0.000953:  14%|█▍        | 14/100 [50:26<5:07:23, 214.46s/it]

Epoch 14/100 | Train: 39.47% | Val: 68.20% | Best: 68.55% | LR: 0.000953

Epoch 15/100


Epoch 15/100 | Train: 40.41% | Val: 69.10% | Best: 69.10% | LR: 0.000946:  15%|█▌        | 15/100 [54:01<5:03:52, 214.50s/it]

Epoch 15/100 | Train: 40.41% | Val: 69.10% | Best: 69.10% | LR: 0.000946

Epoch 16/100


Epoch 16/100 | Train: 43.42% | Val: 68.59% | Best: 69.10% | LR: 0.000938:  16%|█▌        | 16/100 [57:33<4:59:19, 213.80s/it]

Epoch 16/100 | Train: 43.42% | Val: 68.59% | Best: 69.10% | LR: 0.000938

Epoch 17/100


Epoch 17/100 | Train: 42.85% | Val: 69.07% | Best: 69.10% | LR: 0.000931:  17%|█▋        | 17/100 [1:01:07<4:55:58, 213.96s/it]

Epoch 17/100 | Train: 42.85% | Val: 69.07% | Best: 69.10% | LR: 0.000931

Epoch 18/100


Epoch 18/100 | Train: 42.87% | Val: 68.82% | Best: 69.10% | LR: 0.000923:  18%|█▊        | 18/100 [1:04:42<4:52:51, 214.28s/it]

Epoch 18/100 | Train: 42.87% | Val: 68.82% | Best: 69.10% | LR: 0.000923

Epoch 19/100


Epoch 19/100 | Train: 40.93% | Val: 68.31% | Best: 69.10% | LR: 0.000914:  19%|█▉        | 19/100 [1:08:15<4:48:45, 213.90s/it]

Epoch 19/100 | Train: 40.93% | Val: 68.31% | Best: 69.10% | LR: 0.000914

Epoch 20/100


Epoch 20/100 | Train: 42.55% | Val: 68.93% | Best: 69.10% | LR: 0.000905:  20%|██        | 20/100 [1:11:50<4:45:25, 214.07s/it]

Epoch 20/100 | Train: 42.55% | Val: 68.93% | Best: 69.10% | LR: 0.000905

Epoch 21/100


Epoch 21/100 | Train: 43.96% | Val: 68.58% | Best: 69.10% | LR: 0.000896:  21%|██        | 21/100 [1:15:24<4:42:06, 214.26s/it]

Epoch 21/100 | Train: 43.96% | Val: 68.58% | Best: 69.10% | LR: 0.000896

Epoch 22/100


Epoch 22/100 | Train: 42.29% | Val: 68.74% | Best: 69.10% | LR: 0.000886:  22%|██▏       | 22/100 [1:18:59<4:38:45, 214.43s/it]

Epoch 22/100 | Train: 42.29% | Val: 68.74% | Best: 69.10% | LR: 0.000886

Epoch 23/100


Epoch 23/100 | Train: 45.19% | Val: 69.04% | Best: 69.10% | LR: 0.000876:  23%|██▎       | 23/100 [1:22:33<4:35:06, 214.37s/it]

Epoch 23/100 | Train: 45.19% | Val: 69.04% | Best: 69.10% | LR: 0.000876

Epoch 24/100


Epoch 24/100 | Train: 46.30% | Val: 68.61% | Best: 69.10% | LR: 0.000865:  24%|██▍       | 24/100 [1:26:06<4:31:00, 213.95s/it]

Epoch 24/100 | Train: 46.30% | Val: 68.61% | Best: 69.10% | LR: 0.000865

Epoch 25/100


Epoch 25/100 | Train: 41.85% | Val: 68.41% | Best: 69.10% | LR: 0.000854:  25%|██▌       | 25/100 [1:29:40<4:27:27, 213.97s/it]

Epoch 25/100 | Train: 41.85% | Val: 68.41% | Best: 69.10% | LR: 0.000854

Epoch 26/100


Epoch 26/100 | Train: 57.79% | Val: 70.48% | Best: 70.48% | LR: 0.000843:  26%|██▌       | 26/100 [1:33:16<4:24:31, 214.48s/it]

Epoch 26/100 | Train: 57.79% | Val: 70.48% | Best: 70.48% | LR: 0.000843

Epoch 27/100


Epoch 27/100 | Train: 57.32% | Val: 70.06% | Best: 70.48% | LR: 0.000832:  27%|██▋       | 27/100 [1:36:50<4:20:47, 214.35s/it]

Epoch 27/100 | Train: 57.32% | Val: 70.06% | Best: 70.48% | LR: 0.000832

Epoch 28/100


Epoch 28/100 | Train: 60.13% | Val: 70.22% | Best: 70.48% | LR: 0.000820:  28%|██▊       | 28/100 [1:40:26<4:17:56, 214.96s/it]

Epoch 28/100 | Train: 60.13% | Val: 70.22% | Best: 70.48% | LR: 0.000820

Epoch 29/100


Epoch 29/100 | Train: 57.77% | Val: 70.57% | Best: 70.57% | LR: 0.000807:  29%|██▉       | 29/100 [1:44:03<4:14:48, 215.32s/it]

Epoch 29/100 | Train: 57.77% | Val: 70.57% | Best: 70.57% | LR: 0.000807

Epoch 30/100


Epoch 30/100 | Train: 60.63% | Val: 70.22% | Best: 70.57% | LR: 0.000795:  30%|███       | 30/100 [1:47:38<4:11:09, 215.28s/it]

Epoch 30/100 | Train: 60.63% | Val: 70.22% | Best: 70.57% | LR: 0.000795

Epoch 31/100


Epoch 31/100 | Train: 58.73% | Val: 70.04% | Best: 70.57% | LR: 0.000782:  31%|███       | 31/100 [1:51:12<4:07:19, 215.06s/it]

Epoch 31/100 | Train: 58.73% | Val: 70.04% | Best: 70.57% | LR: 0.000782

Epoch 32/100


Epoch 32/100 | Train: 60.43% | Val: 70.48% | Best: 70.57% | LR: 0.000769:  32%|███▏      | 32/100 [1:54:47<4:03:27, 214.82s/it]

Epoch 32/100 | Train: 60.43% | Val: 70.48% | Best: 70.57% | LR: 0.000769

Epoch 33/100


Epoch 33/100 | Train: 60.89% | Val: 70.34% | Best: 70.57% | LR: 0.000756:  33%|███▎      | 33/100 [1:58:22<4:00:10, 215.08s/it]

Epoch 33/100 | Train: 60.89% | Val: 70.34% | Best: 70.57% | LR: 0.000756

Epoch 34/100


Epoch 34/100 | Train: 61.70% | Val: 69.39% | Best: 70.57% | LR: 0.000742:  34%|███▍      | 34/100 [2:01:56<3:56:03, 214.59s/it]

Epoch 34/100 | Train: 61.70% | Val: 69.39% | Best: 70.57% | LR: 0.000742

Epoch 35/100


Epoch 35/100 | Train: 60.92% | Val: 71.05% | Best: 71.05% | LR: 0.000728:  35%|███▌      | 35/100 [2:05:33<3:53:11, 215.26s/it]

Epoch 35/100 | Train: 60.92% | Val: 71.05% | Best: 71.05% | LR: 0.000728

Epoch 36/100


Epoch 36/100 | Train: 62.90% | Val: 70.92% | Best: 71.05% | LR: 0.000714:  36%|███▌      | 36/100 [2:09:08<3:49:33, 215.21s/it]

Epoch 36/100 | Train: 62.90% | Val: 70.92% | Best: 71.05% | LR: 0.000714

Epoch 37/100


Epoch 37/100 | Train: 60.48% | Val: 70.59% | Best: 71.05% | LR: 0.000700:  37%|███▋      | 37/100 [2:12:43<3:46:07, 215.36s/it]

Epoch 37/100 | Train: 60.48% | Val: 70.59% | Best: 71.05% | LR: 0.000700

Epoch 38/100


Epoch 38/100 | Train: 64.46% | Val: 70.47% | Best: 71.05% | LR: 0.000686:  38%|███▊      | 38/100 [2:16:20<3:42:47, 215.60s/it]

Epoch 38/100 | Train: 64.46% | Val: 70.47% | Best: 71.05% | LR: 0.000686

Epoch 39/100


Epoch 39/100 | Train: 61.58% | Val: 70.05% | Best: 71.05% | LR: 0.000671:  39%|███▉      | 39/100 [2:19:55<3:39:18, 215.71s/it]

Epoch 39/100 | Train: 61.58% | Val: 70.05% | Best: 71.05% | LR: 0.000671

Epoch 40/100


Epoch 40/100 | Train: 61.86% | Val: 70.52% | Best: 71.05% | LR: 0.000656:  40%|████      | 40/100 [2:23:30<3:35:28, 215.47s/it]

Epoch 40/100 | Train: 61.86% | Val: 70.52% | Best: 71.05% | LR: 0.000656

Epoch 41/100


Epoch 41/100 | Train: 60.89% | Val: 70.28% | Best: 71.05% | LR: 0.000641:  41%|████      | 41/100 [2:27:05<3:31:39, 215.25s/it]

Epoch 41/100 | Train: 60.89% | Val: 70.28% | Best: 71.05% | LR: 0.000641

Epoch 42/100


Epoch 42/100 | Train: 61.47% | Val: 70.51% | Best: 71.05% | LR: 0.000626:  42%|████▏     | 42/100 [2:30:41<3:28:16, 215.46s/it]

Epoch 42/100 | Train: 61.47% | Val: 70.51% | Best: 71.05% | LR: 0.000626

Epoch 43/100


Epoch 43/100 | Train: 64.55% | Val: 70.36% | Best: 71.05% | LR: 0.000611:  43%|████▎     | 43/100 [2:34:17<3:24:54, 215.68s/it]

Epoch 43/100 | Train: 64.55% | Val: 70.36% | Best: 71.05% | LR: 0.000611

Epoch 44/100


Epoch 44/100 | Train: 63.81% | Val: 71.03% | Best: 71.05% | LR: 0.000596:  44%|████▍     | 44/100 [2:37:52<3:20:57, 215.32s/it]

Epoch 44/100 | Train: 63.81% | Val: 71.03% | Best: 71.05% | LR: 0.000596

Epoch 45/100


Epoch 45/100 | Train: 60.35% | Val: 69.90% | Best: 71.05% | LR: 0.000580:  45%|████▌     | 45/100 [2:41:27<3:17:29, 215.44s/it]

Epoch 45/100 | Train: 60.35% | Val: 69.90% | Best: 71.05% | LR: 0.000580

Epoch 46/100


Epoch 46/100 | Train: 63.26% | Val: 70.46% | Best: 71.05% | LR: 0.000565:  46%|████▌     | 46/100 [2:45:02<3:13:34, 215.09s/it]

Epoch 46/100 | Train: 63.26% | Val: 70.46% | Best: 71.05% | LR: 0.000565

Epoch 47/100


Epoch 47/100 | Train: 62.07% | Val: 70.15% | Best: 71.05% | LR: 0.000549:  46%|████▌     | 46/100 [2:48:37<3:17:56, 219.94s/it]
  checkpoint = torch.load('./best_model.pth')


Epoch 47/100 | Train: 62.07% | Val: 70.15% | Best: 71.05% | LR: 0.000549

Early stopping triggered at epoch 47
Best Val Accuracy: 71.05% at epoch 35
No improvement for 12 epochs


Training Complete!
Best Val Accuracy: 71.05% at epoch 35
Loading best model for inference...

Best model loaded (Epoch 35, Val Acc: 71.05%)

Generating predictions...

Submission saved to: ./submission.csv
Best model saved to: ./best_model.pth
Best Val Accuracy: 71.05% (Epoch 35)
Training stopped early (patience reached)

