In [1]:
import os
import torch
import timm
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from typing import Optional, Callable

from torch import nn, Tensor, optim
from torch.utils.data import Dataset, DataLoader
from torch.backends import cudnn
from torch import GradScaler
import torch.nn.functional as F

from torchvision.datasets import CIFAR100
from torchvision.transforms import v2
from sklearn.neighbors import NearestNeighbors

# ============ Configuration ============
config = {
    "dataset": "cifar100_noisy",
    "model": "resnet18",
    "pretrained": "imagenet",
    "epochs": 100,
    "batch_size": 128,
    "lr": 0.001,
    "momentum": 0.9,
    "weight_decay": 0.02,
    "nesterov": True,
    "label_smoothing": 0.1,
    "optimizer": "adamw",
    "scheduler": "warm_restarts",     
    "warm_restarts_T0": 25,              
    "warm_restarts_mult": 1,             
    "cosine_eta_min": 1e-5,
    "scheduler_step_per_batch": True,
    "early_stop_patience": 35,
    "early_stop_mode": "max",
    "early_stop_min_delta": 0.01,
    "device": "cuda",
    "mixed_precision": True,
    "wandb_project": "cifar100-noisy-competition",
    "upscale_size": 224,  
    "aug_alpha": 0.5,          
    "cutmix_prob": 1.0,        
    "switch_epoch": 30,        
    "warmup_epochs": 5,
    "loss_threshold": 2.8,
    "dynamic_threshold_decay": 0.997,
    "soft_alpha": 0.45
}

device = torch.device(config["device"])
print(f"Using device: {device}")
cudnn.benchmark = True
pin_memory = True
enable_half = config["mixed_precision"]  # Disable for CPU, it is slower!
scaler = GradScaler(device, enabled=enable_half)

def get_refinement_metadata(dataset, device):
    print("\nExtracting Features for Analysis...")
    embed_model = timm.create_model('resnet18', pretrained=True, num_classes=0).to(device)
    embed_model.eval()
    
    analysis_transform = v2.Compose([
        v2.ToImage(), v2.Resize(224), v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    features, labels = [], np.array(dataset.targets)
    with torch.no_grad():
        for i in tqdm(range(len(dataset)), desc="Analyzing Data"):
            img, _ = dataset[i]
            img_t = analysis_transform(img).unsqueeze(0).to(device)
            features.append(embed_model(img_t).cpu().numpy())
    
    features = np.concatenate(features, axis=0)
    
    print("Running k-NN Agreement & Class Balancing...")
    K = 10
    knn = NearestNeighbors(n_neighbors=K+1, metric='cosine').fit(features)
    _, indices = knn.kneighbors(features)
    
    refurbished_labels = []
    agreement_scores = []
    for i in range(len(features)):
        neighbor_labels = labels[indices[i, 1:]]
        counts = np.bincount(neighbor_labels, minlength=100)
        refurbished_labels.append(np.argmax(counts))
        agreement_scores.append(counts[labels[i]] / K)
    
    agreement_scores = np.array(agreement_scores)
    refurbished_labels = np.array(refurbished_labels)

    stable_mask = agreement_scores >= 0.9
    counts_per_class = np.bincount(labels[stable_mask], minlength=100)
    rare_classes = np.where(counts_per_class < 20)[0] # Target classes below 20 samples
    
    for c in rare_classes:
        # Bolster rare classes using medium agreement samples (above 75%)
        potential_candidates = np.where((agreement_scores >= 0.75) & (refurbished_labels == c))[0]
        agreement_scores[potential_candidates] = 0.95 # Promote to "Stable" status
            
    del embed_model
    torch.cuda.empty_cache()
    
    return {"agreement": agreement_scores, "refurbished": refurbished_labels}

class SimpleCachedDataset(Dataset):
    def __init__(self, dataset):
        # Runtime transforms are not implemented in this simple cached dataset.
        self.data = tuple([x for x in dataset])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

class PreprocessedDataset(Dataset):
    """
    Cache preprocessed tensors - apply transforms once and store results.
    
    PERFORMANCE OPTIMIZATION:
    - Applies deterministic transforms (ToImage, Resize) once at startup
    - Stores uint8 tensors (4x less memory than float32)
    - Random augmentations applied at runtime each epoch
    - Test set only needs normalization at runtime (huge speedup!)
    """
    def __init__(self, dataset, transform):
        print(f"Preprocessing {len(dataset)} images (this happens once)...")
        self.data = []
        self.targets = []
        
        for img, target in tqdm(dataset, desc="Caching", leave=False):
            transformed = transform(img)
            self.data.append(transformed)
            self.targets.append(target)
        
        print(f"Cached {len(self.data)} preprocessed images")
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        return self.data[i], self.targets[i]

class RefinedAugmentationWrapper(Dataset):
    def __init__(self, preprocessed_dataset, runtime_transforms, refinement_map=None, alpha=0.4):
        self.dataset = preprocessed_dataset
        self.runtime_transforms = runtime_transforms
        self.refinement_map = refinement_map
        self.alpha = alpha
        
    def __len__(self):
        return len(self.dataset)
        
    def __getitem__(self, i):
        img_tensor, original_target = self.dataset[i]
        if self.runtime_transforms is not None:
            img_tensor = self.runtime_transforms(img_tensor)
        
        # Default: Hard label, full weight
        target_a = original_target
        target_b = original_target
        lam_target = 1.0
        weight = 1.0 

        if self.refinement_map is not None:
            score = self.refinement_map["agreement"][i]
            refurbished = int(self.refinement_map["refurbished"][i])
            
            # SOFT LABELING LOGIC
            # If k-NN and Original disagree (Noise/Overlap), we blend them.
            if score < 0.7:
                target_a = original_target
                target_b = refurbished
                lam_target = 1.0 - self.alpha  # e.g., 0.6 original, 0.4 refurbished
                weight = max(score, 0.4)       # Loss is scaled by k-NN confidence
            else:
                # Highly stable samples
                target_a = original_target
                target_b = original_target
                lam_target = 1.0
                weight = 1.0
                
        # We return two targets and a lambda to handle the "Soft Label" in the loss function
        return img_tensor, target_a, target_b, lam_target, weight

class CIFAR100_noisy_fine(Dataset):
    """
    See https://github.com/UCSC-REAL/cifar-10-100n, https://www.noisylabels.com/ and `Learning with Noisy Labels
    Revisited: A Study Using Real-World Human Annotations`.
    """

    def __init__(
        self, root: str, train: bool, transform: Optional[Callable], download: bool
    ):
        cifar100 = CIFAR100(
            root=root, train=train, transform=None, download=download
        )
        data, targets = tuple(zip(*cifar100))

        if train:
            noisy_label_file = os.path.join(root, "CIFAR-100-noisy.npz")
            if not os.path.isfile(noisy_label_file):
                raise FileNotFoundError(
                    f"{type(self).__name__} need {noisy_label_file} to be used!"
                )

            noise_file = np.load(noisy_label_file)
            if not np.array_equal(noise_file["clean_label"], targets):
                raise RuntimeError("Clean labels do not match!")
            targets = noise_file["noisy_label"]

        self.data = data
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, i: int):
        return self.data[i], self.targets[i]


class EarlyStopping:
    """Early stopping to stop training when validation metric doesn't improve."""
    def __init__(self, patience=10, min_delta=0.0, mode='max'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_epoch = 0
        
    def __call__(self, score, epoch):
        if self.best_score is None:
            self.best_score = score
            self.best_epoch = epoch
            return False
        
        if self.mode == 'max':
            # For accuracy (higher is better)
            if score > self.best_score + self.min_delta:
                self.best_score = score
                self.best_epoch = epoch
                self.counter = 0
            else:
                self.counter += 1
        else:
            # For loss (lower is better)
            if score < self.best_score - self.min_delta:
                self.best_score = score
                self.best_epoch = epoch
                self.counter = 0
            else:
                self.counter += 1
        
        if self.counter >= self.patience:
            self.early_stop = True
        
        return self.early_stop


# === PREPROCESSING (applied once and cached) ===
# Only deterministic, spatial transforms - stores uint8 tensors (saves memory!)
preprocess_transforms = v2.Compose([
    v2.ToImage(),
    v2.Resize(config["upscale_size"]),  # Upscale from 32x32 to 128x128
])

# === RUNTIME AUGMENTATION (applied at each epoch) ===
# Random transforms for training (includes normalization at the end)
train_runtime_transforms = v2.Compose([
    v2.RandomCrop(config["upscale_size"], padding=4),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    v2.RandomRotation(15),
    v2.ToDtype(torch.float32, scale=True),  # Convert to float [0,1]
    v2.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),  # CIFAR-100 stats
    v2.RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3))
])

# Test set: only normalization needed (spatial transforms already done)
test_runtime_transforms = v2.Compose([
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

# Load raw datasets
print("Loading datasets...")
train_set_raw = CIFAR100_noisy_fine('/kaggle/input/fii-atnn-2025-project-noisy-cifar-100/fii-atnn-2024-project-noisy-cifar-100', download=False, train=True, transform=None)
test_set_raw = CIFAR100_noisy_fine('/kaggle/input/fii-atnn-2025-project-noisy-cifar-100/fii-atnn-2024-project-noisy-cifar-100', download=False, train=False, transform=None)

# Cache raw PIL images (fast, lightweight)
train_set_cached = SimpleCachedDataset(train_set_raw)
test_set_cached = SimpleCachedDataset(test_set_raw)

# Preprocess and cache as tensors (done once!)
print("\n[TRAIN SET]")
train_set_preprocessed = PreprocessedDataset(train_set_cached, preprocess_transforms)
print("\n[TEST SET]")
test_set_preprocessed = PreprocessedDataset(test_set_cached, preprocess_transforms)

refinement_map = get_refinement_metadata(train_set_raw, device)

# Add runtime augmentations (applied each epoch for train, none for test)
train_set = RefinedAugmentationWrapper(train_set_preprocessed, train_runtime_transforms, refinement_map=refinement_map)
test_set = RefinedAugmentationWrapper(test_set_preprocessed, test_runtime_transforms)

print(f"\nTrain set ready: {len(train_set)} samples (with runtime augmentation)")
print(f"Test set ready: {len(test_set)} samples (fully cached)\n")

train_loader = DataLoader(train_set, batch_size=config["batch_size"], shuffle=True, pin_memory=pin_memory,num_workers=2,persistent_workers=True)
test_loader = DataLoader(test_set, batch_size=500, pin_memory=pin_memory,num_workers=2,persistent_workers=True)

# Load ResNet18 pretrained on ImageNet
print(f"Loading model: {config['model']} (pretrained on {config['pretrained']})")
model = timm.create_model(config["model"], pretrained=True, num_classes=100)
model = model.to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}\n")

# Label smoothing helps with noisy labels
criterion = nn.CrossEntropyLoss(label_smoothing=config["label_smoothing"])

# Create optimizer based on config
if config["optimizer"].lower() == "adamw":
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config["lr"],
        weight_decay=config["weight_decay"],
        fused=True
    )
    print(f"Optimizer: AdamW (lr={config['lr']}, weight_decay={config['weight_decay']})")
elif config["optimizer"].lower() == "sgd":
    optimizer = optim.SGD(
        model.parameters(), 
        lr=config["lr"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=config["nesterov"],
        fused=True
    )
    print(f"Optimizer: SGD (lr={config['lr']}, momentum={config['momentum']}, weight_decay={config['weight_decay']}, nesterov={config['nesterov']})")
else:
    raise ValueError(f"Unknown optimizer: {config['optimizer']}. Supported: 'sgd', 'adamw'")

# Learning rate scheduler
if config["scheduler"] == "steplr":
    scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config.get("step_size", 30),
        gamma=config.get("gamma", 0.1)
    )
    print(f"Scheduler: StepLR (step_size={config.get('step_size', 30)}, gamma={config.get('gamma', 0.1)})")

elif config["scheduler"] == "cosine":
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=config["epochs"],
        eta_min=config.get("cosine_eta_min", 1e-6)
    )
    print(f"Scheduler: CosineAnnealingLR (T_max={config['epochs']}, eta_min={config['cosine_eta_min']})")

elif config["scheduler"] == "warm_restarts":
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=config["warm_restarts_T0"],         
        T_mult=config.get("warm_restarts_mult", 1), 
        eta_min=config.get("cosine_eta_min", 1e-6)
    )
    # Highlight the per-batch setting so you know it's active
    step_mode = "Per-Batch" if config.get("scheduler_step_per_batch") else "Per-Epoch"
    print(f"Scheduler: WarmRestarts (T_0={config['warm_restarts_T0']}, T_mult={config['warm_restarts_mult']}, Step: {step_mode})")

else:
    scheduler = None
    print("Scheduler: None")

# === CUTMIX HELPER FUNCTION ===
def rand_bbox(size, lam):
    """Generates a random bounding box for CutMix."""
    W = size[2]
    H = size[3]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # uniform
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2

loss_threshold = config["loss_threshold"]

def train(epoch):
    print(f"\nEpoch {epoch+1}/{config['epochs']}")
    model.train()
    correct = 0
    total = 0
    running_loss = 0.0
    global loss_threshold
    
    # DYNAMIC ALPHA LOGIC
    # Starts at 0.35 and scales to 0.60 by epoch 50
    # This ensures we don't trust the refurbished labels until the features are sharp.
    current_soft_alpha = 0.35 + (min(epoch, 50) / 50) * 0.25
    train_loader.dataset.alpha = current_soft_alpha
    
    initial_batch_count = 0 
    use_cutmix = epoch >= config["switch_epoch"]
    aug_mode = "CutMix" if use_cutmix else "MixUp"
    
    for batch_idx, (inputs, t_a, t_b, lam_t, weights) in enumerate(train_loader):
        inputs = inputs.to(device, non_blocking=True)
        t_a = t_a.to(device, non_blocking=True)
        t_b = t_b.to(device, non_blocking=True)
        lam_t = lam_t.unsqueeze(1).to(device, non_blocking=True) 
        weights = weights.to(device, non_blocking=True)
        
        initial_batch_count += inputs.size(0)

        # DYNAMIC FILTERING
        if epoch >= config["warmup_epochs"]:
            with torch.no_grad():
                with torch.autocast(device.type, enabled=enable_half):
                    raw_outputs = model(inputs)
                    sample_losses = F.cross_entropy(raw_outputs, t_a, reduction='none')
                mask = sample_losses < loss_threshold
            if mask.sum() < 2: continue
            inputs, t_a, t_b, lam_t, weights = inputs[mask], t_a[mask], t_b[mask], lam_t[mask], weights[mask]

        # AUGMENTATION
        rand_index = torch.randperm(inputs.size(0)).to(device)
        lam_aug = np.random.beta(config["aug_alpha"], config["aug_alpha"])

        if use_cutmix:
            bbx1, bby1, bbx2, bby2 = rand_bbox(inputs.size(), lam_aug)
            lam_aug = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (inputs.size()[-1] * inputs.size()[-2]))
            inputs[:, :, bbx1:bbx2, bby1:bby2] = inputs[rand_index, :, bbx1:bbx2, bby1:bby2]
        else:
            inputs = lam_aug * inputs + (1 - lam_aug) * inputs[rand_index, :]

        # FORWARD & SOFT-LOSS
        with torch.autocast(device.type, enabled=enable_half):
            outputs = model(inputs)
            
            def get_soft_loss(out, target_a, target_b, l_t):
                return l_t.view(-1) * criterion(out, target_a) + (1 - l_t.view(-1)) * criterion(out, target_b)

            loss_current = get_soft_loss(outputs, t_a, t_b, lam_t)
            loss_shuffled = get_soft_loss(outputs, t_a[rand_index], t_b[rand_index], lam_t[rand_index])
            
            mixed_loss = lam_aug * loss_current + (1 - lam_aug) * loss_shuffled
            batch_weights = lam_aug * weights + (1 - lam_aug) * weights[rand_index]
            loss = (mixed_loss * batch_weights).mean()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        # DECAYING WARM RESTARTS
        if config.get("scheduler_step_per_batch") and scheduler is not None:
            cycle_idx = epoch // config["warm_restarts_T0"]
            cycle_decay = 0.8 ** cycle_idx 
            
            # Use base_lrs to force the scheduler to restart at a lower peak (e.g. 0.0008 instead of 0.001)
            scheduler.base_lrs = [config["lr"] * cycle_decay for _ in range(len(optimizer.param_groups))]
            
            scheduler.step(epoch + batch_idx / len(train_loader))

        running_loss += loss.item() * inputs.size(0)
        total += t_a.size(0)
        correct += outputs.argmax(1).eq(t_a).sum().item()

    if epoch >= config["warmup_epochs"]:
        loss_threshold *= config["dynamic_threshold_decay"]
    
    epoch_acc = 100.0 * correct / total
    keep_rate = (total / initial_batch_count) * 100
    
    # Debug print for Alpha
    print(f"Keep Rate: {keep_rate:.2f}% | Alpha: {current_soft_alpha:.3f} | Mode: {aug_mode}")
    return running_loss / total, epoch_acc, aug_mode

@torch.inference_mode()
def val():
    model.eval()
    correct = 0
    total = 0
    running_loss = 0.0

    for inputs, t_a, t_b, lam_t, _ in test_loader:
        inputs, targets = inputs.to(device), t_a.to(device)
        with torch.autocast(device.type, enabled=enable_half):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        running_loss += loss.item() * inputs.size(0)
        predicted = outputs.argmax(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    
    epoch_loss = running_loss / total
    epoch_acc = 100.0 * correct / total
    return epoch_loss, epoch_acc

@torch.inference_mode()
def inference():
    model.eval()
    
    labels = []
    
    for inputs, _, _, _, _ in test_loader:
        inputs = inputs.to(device, non_blocking=True)
        with torch.autocast(device.type, enabled=enable_half):
            outputs = model(inputs)

        predicted = outputs.argmax(1).tolist()
        labels.extend(predicted)
    
    return labels

# Initialize WandB
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_name = f"cifar100noisy_{config['model']}_{config['optimizer']}_lr{config['lr']}_bs{config['batch_size']}_{timestamp}"

# wandb.init(
#     project=config["wandb_project"],
#     name=run_name,
#     config=config
# )

best = 0.0
best_epoch = 0

# Initialize early stopping
early_stopping = EarlyStopping(
    patience=config["early_stop_patience"],
    min_delta=config["early_stop_min_delta"],
    mode=config["early_stop_mode"]
)

print(f"\n{'='*70}")
print(f"Starting Training - {config['epochs']} epochs")
print(f"Model: {config['model']} (pretrained on {config['pretrained']})")
print(f"Optimizer: {config['optimizer'].upper()}, LR: {config['lr']}, Batch Size: {config['batch_size']}")
if config["optimizer"].lower() == "sgd":
    print(f"Momentum: {config['momentum']}, Nesterov: {config['nesterov']}")
print(f"Weight Decay: {config['weight_decay']}, Label Smoothing: {config['label_smoothing']}")
print(f"Scheduler: {config['scheduler']}")
print(f"Early Stopping: Enabled (patience={config['early_stop_patience']}, mode={config['early_stop_mode']})")
print(f"{'='*70}\n")

with tqdm(range(config["epochs"])) as tbar:
    for epoch in tbar:
        # 1. Run Train (includes internal per-batch scheduler steps)
        train_loss, train_acc, mode = train(epoch)
        
        # 2. Run Validation
        val_loss, val_acc = val()
        
        # 3. Handle LR logging (step only if NOT per-batch)
        if scheduler is not None:
            if not config.get("scheduler_step_per_batch"):
                scheduler.step()
            current_lr = scheduler.get_last_lr()[0]
        else:
            current_lr = config["lr"]
        
        # 4. Checkpointing
        if val_acc > best:
            best = val_acc
            best_epoch = epoch
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': val_acc,
                'val_loss': val_loss,
            }
            if scheduler is not None:
                checkpoint['scheduler_state_dict'] = scheduler.state_dict()
            torch.save(checkpoint, './best_model.pth')
        
        # 5. Update Progress Bar & Console
        status = f"Epoch {epoch+1}/{config['epochs']} | Train: {train_acc:.2f}% | Val: {val_acc:.2f}% | Best: {best:.2f}% | LR: {current_lr:.6f}"
        tbar.set_description(status)
        print(status)
        
        # 6. Early stopping check
        if early_stopping(val_acc, epoch):
            print(f"\n{'='*60}")
            print(f"Early stopping triggered at epoch {epoch+1}")
            print(f"Best Val Accuracy: {best:.2f}% at epoch {best_epoch+1}")
            print(f"{'='*60}\n")
            break
    

print(f"\n{'='*60}")
print(f"Training Complete!")
print(f"Best Val Accuracy: {best:.2f}% at epoch {best_epoch+1}")
print(f"Loading best model for inference...")
print(f"{'='*60}\n")

# Load best model for inference
checkpoint = torch.load('./best_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
print(f"Best model loaded (Epoch {checkpoint['epoch']+1}, Val Acc: {checkpoint['val_acc']:.2f}%)\n")

# Generate submission
data = {
    "ID": [],
    "target": []
}

print("Generating predictions...")
for i, label in enumerate(inference()):
    data["ID"].append(i)
    data["target"].append(label)

df = pd.DataFrame(data)
df.to_csv("/kaggle/working/submission.csv", index=False)

# Log final results to WandB
# wandb.summary["final_best_val_acc"] = best
# wandb.summary["best_epoch"] = best_epoch
# wandb.summary["total_epochs"] = config["epochs"]
# if scheduler is not None:
#     wandb.summary["final_lr"] = scheduler.get_last_lr()[0]
# else:
#     wandb.summary["final_lr"] = config["lr"]
# wandb.summary["early_stopped"] = early_stopping.early_stop
# wandb.summary["epochs_trained"] = best_epoch + 1 if early_stopping.early_stop else config["epochs"]

print(f"\n{'='*60}")
print(f"Submission saved to: ./submission.csv")
print(f"Best model saved to: ./best_model.pth")
print(f"Best Val Accuracy: {best:.2f}% (Epoch {best_epoch+1})")
if early_stopping.early_stop:
    print(f"Training stopped early (patience reached)")
print(f"{'='*60}\n")

# Finish WandB run
#

Using device: cuda
Loading datasets...

[TRAIN SET]
Preprocessing 50000 images (this happens once)...


                                                                

Cached 50000 preprocessed images

[TEST SET]
Preprocessing 10000 images (this happens once)...


                                                               

Cached 10000 preprocessed images

Extracting Features for Analysis...


model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Analyzing Data: 100%|██████████| 50000/50000 [04:05<00:00, 203.94it/s]


Running k-NN Agreement & Class Balancing...

Train set ready: 50000 samples (with runtime augmentation)
Test set ready: 10000 samples (fully cached)

Loading model: resnet18 (pretrained on imagenet)
Total parameters: 11,227,812
Trainable parameters: 11,227,812

Optimizer: AdamW (lr=0.001, weight_decay=0.02)
Scheduler: WarmRestarts (T_0=25, T_mult=1, Step: Per-Batch)

Starting Training - 100 epochs
Model: resnet18 (pretrained on imagenet)
Optimizer: ADAMW, LR: 0.001, Batch Size: 128
Weight Decay: 0.02, Label Smoothing: 0.1
Scheduler: warm_restarts
Early Stopping: Enabled (patience=35, mode=max)



  0%|          | 0/100 [00:00<?, ?it/s]


Epoch 1/100
Keep Rate: 100.00% | Alpha: 0.350 | Mode: MixUp


Epoch 1/100 | Train: 14.81% | Val: 54.61% | Best: 54.61% | LR: 0.000996:   1%|          | 1/100 [04:15<7:00:57, 255.12s/it]

Epoch 1/100 | Train: 14.81% | Val: 54.61% | Best: 54.61% | LR: 0.000996

Epoch 2/100
Keep Rate: 100.00% | Alpha: 0.355 | Mode: MixUp


Epoch 2/100 | Train: 23.10% | Val: 60.55% | Best: 60.55% | LR: 0.000984:   2%|▏         | 2/100 [08:20<6:47:24, 249.43s/it]

Epoch 2/100 | Train: 23.10% | Val: 60.55% | Best: 60.55% | LR: 0.000984

Epoch 3/100
Keep Rate: 100.00% | Alpha: 0.360 | Mode: MixUp


Epoch 3/100 | Train: 24.43% | Val: 63.54% | Best: 63.54% | LR: 0.000965:   3%|▎         | 3/100 [12:26<6:40:54, 247.99s/it]

Epoch 3/100 | Train: 24.43% | Val: 63.54% | Best: 63.54% | LR: 0.000965

Epoch 4/100
Keep Rate: 100.00% | Alpha: 0.365 | Mode: MixUp


Epoch 4/100 | Train: 23.85% | Val: 65.54% | Best: 65.54% | LR: 0.000939:   4%|▍         | 4/100 [16:34<6:36:37, 247.89s/it]

Epoch 4/100 | Train: 23.85% | Val: 65.54% | Best: 65.54% | LR: 0.000939

Epoch 5/100
Keep Rate: 100.00% | Alpha: 0.370 | Mode: MixUp


Epoch 5/100 | Train: 26.19% | Val: 66.23% | Best: 66.23% | LR: 0.000906:   5%|▌         | 5/100 [20:40<6:31:33, 247.31s/it]

Epoch 5/100 | Train: 26.19% | Val: 66.23% | Best: 66.23% | LR: 0.000906

Epoch 6/100
Keep Rate: 69.30% | Alpha: 0.375 | Mode: MixUp


Epoch 6/100 | Train: 33.55% | Val: 66.03% | Best: 66.23% | LR: 0.000866:   6%|▌         | 6/100 [25:01<6:34:44, 251.96s/it]

Epoch 6/100 | Train: 33.55% | Val: 66.03% | Best: 66.23% | LR: 0.000866

Epoch 7/100
Keep Rate: 69.29% | Alpha: 0.380 | Mode: MixUp


Epoch 7/100 | Train: 35.96% | Val: 66.42% | Best: 66.42% | LR: 0.000821:   7%|▋         | 7/100 [29:09<6:28:27, 250.61s/it]

Epoch 7/100 | Train: 35.96% | Val: 66.42% | Best: 66.42% | LR: 0.000821

Epoch 8/100
Keep Rate: 69.91% | Alpha: 0.385 | Mode: MixUp


Epoch 8/100 | Train: 39.93% | Val: 67.32% | Best: 67.32% | LR: 0.000770:   8%|▊         | 8/100 [33:17<6:22:53, 249.71s/it]

Epoch 8/100 | Train: 39.93% | Val: 67.32% | Best: 67.32% | LR: 0.000770

Epoch 9/100
Keep Rate: 70.45% | Alpha: 0.390 | Mode: MixUp


Epoch 9/100 | Train: 35.30% | Val: 68.35% | Best: 68.35% | LR: 0.000716:   9%|▉         | 9/100 [37:23<6:17:03, 248.61s/it]

Epoch 9/100 | Train: 35.30% | Val: 68.35% | Best: 68.35% | LR: 0.000716

Epoch 10/100
Keep Rate: 70.74% | Alpha: 0.395 | Mode: MixUp


Epoch 10/100 | Train: 38.01% | Val: 68.33% | Best: 68.35% | LR: 0.000658:  10%|█         | 10/100 [41:30<6:11:58, 247.98s/it]

Epoch 10/100 | Train: 38.01% | Val: 68.33% | Best: 68.35% | LR: 0.000658

Epoch 11/100
Keep Rate: 71.19% | Alpha: 0.400 | Mode: MixUp


Epoch 11/100 | Train: 34.19% | Val: 68.65% | Best: 68.65% | LR: 0.000598:  11%|█         | 11/100 [45:37<6:07:22, 247.67s/it]

Epoch 11/100 | Train: 34.19% | Val: 68.65% | Best: 68.65% | LR: 0.000598

Epoch 12/100
Keep Rate: 71.47% | Alpha: 0.405 | Mode: MixUp


Epoch 12/100 | Train: 37.24% | Val: 69.38% | Best: 69.38% | LR: 0.000536:  12%|█▏        | 12/100 [49:43<6:02:35, 247.22s/it]

Epoch 12/100 | Train: 37.24% | Val: 69.38% | Best: 69.38% | LR: 0.000536

Epoch 13/100
Keep Rate: 71.90% | Alpha: 0.410 | Mode: MixUp


Epoch 13/100 | Train: 42.83% | Val: 70.25% | Best: 70.25% | LR: 0.000474:  13%|█▎        | 13/100 [53:50<5:58:19, 247.12s/it]

Epoch 13/100 | Train: 42.83% | Val: 70.25% | Best: 70.25% | LR: 0.000474

Epoch 14/100
Keep Rate: 72.30% | Alpha: 0.415 | Mode: MixUp


Epoch 14/100 | Train: 39.11% | Val: 70.02% | Best: 70.25% | LR: 0.000412:  14%|█▍        | 14/100 [57:56<5:53:47, 246.83s/it]

Epoch 14/100 | Train: 39.11% | Val: 70.02% | Best: 70.25% | LR: 0.000412

Epoch 15/100
Keep Rate: 72.65% | Alpha: 0.420 | Mode: MixUp


Epoch 15/100 | Train: 41.34% | Val: 69.65% | Best: 70.25% | LR: 0.000352:  15%|█▌        | 15/100 [1:02:03<5:49:38, 246.81s/it]

Epoch 15/100 | Train: 41.34% | Val: 69.65% | Best: 70.25% | LR: 0.000352

Epoch 16/100
Keep Rate: 72.79% | Alpha: 0.425 | Mode: MixUp


Epoch 16/100 | Train: 38.96% | Val: 70.03% | Best: 70.25% | LR: 0.000294:  16%|█▌        | 16/100 [1:06:09<5:45:30, 246.80s/it]

Epoch 16/100 | Train: 38.96% | Val: 70.03% | Best: 70.25% | LR: 0.000294

Epoch 17/100
Keep Rate: 73.09% | Alpha: 0.430 | Mode: MixUp


Epoch 17/100 | Train: 38.31% | Val: 70.17% | Best: 70.25% | LR: 0.000240:  17%|█▋        | 17/100 [1:10:17<5:41:31, 246.89s/it]

Epoch 17/100 | Train: 38.31% | Val: 70.17% | Best: 70.25% | LR: 0.000240

Epoch 18/100
Keep Rate: 73.23% | Alpha: 0.435 | Mode: MixUp


Epoch 18/100 | Train: 40.79% | Val: 70.42% | Best: 70.42% | LR: 0.000190:  18%|█▊        | 18/100 [1:14:23<5:37:08, 246.69s/it]

Epoch 18/100 | Train: 40.79% | Val: 70.42% | Best: 70.42% | LR: 0.000190

Epoch 19/100
Keep Rate: 73.27% | Alpha: 0.440 | Mode: MixUp


Epoch 19/100 | Train: 39.82% | Val: 70.49% | Best: 70.49% | LR: 0.000144:  19%|█▉        | 19/100 [1:18:31<5:33:34, 247.10s/it]

Epoch 19/100 | Train: 39.82% | Val: 70.49% | Best: 70.49% | LR: 0.000144

Epoch 20/100
Keep Rate: 73.45% | Alpha: 0.445 | Mode: MixUp


Epoch 20/100 | Train: 43.48% | Val: 70.94% | Best: 70.94% | LR: 0.000105:  20%|██        | 20/100 [1:22:37<5:29:16, 246.95s/it]

Epoch 20/100 | Train: 43.48% | Val: 70.94% | Best: 70.94% | LR: 0.000105

Epoch 21/100
Keep Rate: 73.60% | Alpha: 0.450 | Mode: MixUp


Epoch 21/100 | Train: 40.06% | Val: 70.84% | Best: 70.94% | LR: 0.000071:  21%|██        | 21/100 [1:26:44<5:25:03, 246.88s/it]

Epoch 21/100 | Train: 40.06% | Val: 70.84% | Best: 70.94% | LR: 0.000071

Epoch 22/100
Keep Rate: 73.74% | Alpha: 0.455 | Mode: MixUp


Epoch 22/100 | Train: 44.22% | Val: 71.32% | Best: 71.32% | LR: 0.000045:  22%|██▏       | 22/100 [1:30:51<5:20:43, 246.71s/it]

Epoch 22/100 | Train: 44.22% | Val: 71.32% | Best: 71.32% | LR: 0.000045

Epoch 23/100
Keep Rate: 73.74% | Alpha: 0.460 | Mode: MixUp


Epoch 23/100 | Train: 40.00% | Val: 71.06% | Best: 71.32% | LR: 0.000026:  23%|██▎       | 23/100 [1:34:57<5:16:26, 246.58s/it]

Epoch 23/100 | Train: 40.00% | Val: 71.06% | Best: 71.32% | LR: 0.000026

Epoch 24/100
Keep Rate: 73.74% | Alpha: 0.465 | Mode: MixUp


Epoch 24/100 | Train: 43.10% | Val: 70.99% | Best: 71.32% | LR: 0.000014:  24%|██▍       | 24/100 [1:39:02<5:11:54, 246.24s/it]

Epoch 24/100 | Train: 43.10% | Val: 70.99% | Best: 71.32% | LR: 0.000014

Epoch 25/100
Keep Rate: 73.51% | Alpha: 0.470 | Mode: MixUp


Epoch 25/100 | Train: 41.75% | Val: 71.10% | Best: 71.32% | LR: 0.000010:  25%|██▌       | 25/100 [1:43:06<5:06:48, 245.45s/it]

Epoch 25/100 | Train: 41.75% | Val: 71.10% | Best: 71.32% | LR: 0.000010

Epoch 26/100
Keep Rate: 71.95% | Alpha: 0.475 | Mode: MixUp


Epoch 26/100 | Train: 37.02% | Val: 68.74% | Best: 71.32% | LR: 0.000797:  26%|██▌       | 26/100 [1:47:11<5:02:46, 245.49s/it]

Epoch 26/100 | Train: 37.02% | Val: 68.74% | Best: 71.32% | LR: 0.000797

Epoch 27/100
Keep Rate: 71.92% | Alpha: 0.480 | Mode: MixUp


Epoch 27/100 | Train: 38.07% | Val: 68.33% | Best: 71.32% | LR: 0.000788:  27%|██▋       | 27/100 [1:51:20<4:59:58, 246.56s/it]

Epoch 27/100 | Train: 38.07% | Val: 68.33% | Best: 71.32% | LR: 0.000788

Epoch 28/100
Keep Rate: 71.79% | Alpha: 0.485 | Mode: MixUp


Epoch 28/100 | Train: 38.88% | Val: 68.50% | Best: 71.32% | LR: 0.000772:  28%|██▊       | 28/100 [1:55:27<4:55:47, 246.49s/it]

Epoch 28/100 | Train: 38.88% | Val: 68.50% | Best: 71.32% | LR: 0.000772

Epoch 29/100
Keep Rate: 71.92% | Alpha: 0.490 | Mode: MixUp


Epoch 29/100 | Train: 39.80% | Val: 69.18% | Best: 71.32% | LR: 0.000751:  29%|██▉       | 29/100 [1:59:34<4:51:46, 246.58s/it]

Epoch 29/100 | Train: 39.80% | Val: 69.18% | Best: 71.32% | LR: 0.000751

Epoch 30/100
Keep Rate: 72.24% | Alpha: 0.495 | Mode: MixUp


Epoch 30/100 | Train: 40.56% | Val: 69.02% | Best: 71.32% | LR: 0.000725:  30%|███       | 30/100 [2:03:41<4:47:59, 246.86s/it]

Epoch 30/100 | Train: 40.56% | Val: 69.02% | Best: 71.32% | LR: 0.000725

Epoch 31/100
Keep Rate: 72.27% | Alpha: 0.500 | Mode: CutMix


Epoch 31/100 | Train: 56.07% | Val: 69.64% | Best: 71.32% | LR: 0.000693:  31%|███       | 31/100 [2:07:47<4:43:30, 246.53s/it]

Epoch 31/100 | Train: 56.07% | Val: 69.64% | Best: 71.32% | LR: 0.000693

Epoch 32/100
Keep Rate: 72.18% | Alpha: 0.505 | Mode: CutMix


Epoch 32/100 | Train: 56.63% | Val: 69.82% | Best: 71.32% | LR: 0.000657:  32%|███▏      | 32/100 [2:11:54<4:39:36, 246.71s/it]

Epoch 32/100 | Train: 56.63% | Val: 69.82% | Best: 71.32% | LR: 0.000657

Epoch 33/100
Keep Rate: 72.45% | Alpha: 0.510 | Mode: CutMix


Epoch 33/100 | Train: 56.04% | Val: 68.94% | Best: 71.32% | LR: 0.000617:  33%|███▎      | 33/100 [2:16:01<4:35:34, 246.78s/it]

Epoch 33/100 | Train: 56.04% | Val: 68.94% | Best: 71.32% | LR: 0.000617

Epoch 34/100
Keep Rate: 72.61% | Alpha: 0.515 | Mode: CutMix


Epoch 34/100 | Train: 56.53% | Val: 69.46% | Best: 71.32% | LR: 0.000573:  34%|███▍      | 34/100 [2:20:06<4:30:50, 246.21s/it]

Epoch 34/100 | Train: 56.53% | Val: 69.46% | Best: 71.32% | LR: 0.000573

Epoch 35/100
Keep Rate: 72.72% | Alpha: 0.520 | Mode: CutMix


Epoch 35/100 | Train: 60.82% | Val: 70.89% | Best: 71.32% | LR: 0.000527:  35%|███▌      | 35/100 [2:24:11<4:26:28, 245.98s/it]

Epoch 35/100 | Train: 60.82% | Val: 70.89% | Best: 71.32% | LR: 0.000527

Epoch 36/100
Keep Rate: 72.96% | Alpha: 0.525 | Mode: CutMix


Epoch 36/100 | Train: 58.21% | Val: 69.53% | Best: 71.32% | LR: 0.000479:  36%|███▌      | 36/100 [2:28:18<4:22:33, 246.15s/it]

Epoch 36/100 | Train: 58.21% | Val: 69.53% | Best: 71.32% | LR: 0.000479

Epoch 37/100
Keep Rate: 73.14% | Alpha: 0.530 | Mode: CutMix


Epoch 37/100 | Train: 55.67% | Val: 70.21% | Best: 71.32% | LR: 0.000430:  37%|███▋      | 37/100 [2:32:27<4:19:24, 247.05s/it]

Epoch 37/100 | Train: 55.67% | Val: 70.21% | Best: 71.32% | LR: 0.000430

Epoch 38/100
Keep Rate: 73.26% | Alpha: 0.535 | Mode: CutMix


Epoch 38/100 | Train: 59.12% | Val: 70.42% | Best: 71.32% | LR: 0.000380:  38%|███▊      | 38/100 [2:36:33<4:15:07, 246.89s/it]

Epoch 38/100 | Train: 59.12% | Val: 70.42% | Best: 71.32% | LR: 0.000380

Epoch 39/100
Keep Rate: 73.33% | Alpha: 0.540 | Mode: CutMix


Epoch 39/100 | Train: 58.60% | Val: 70.74% | Best: 71.32% | LR: 0.000331:  39%|███▉      | 39/100 [2:40:41<4:11:12, 247.09s/it]

Epoch 39/100 | Train: 58.60% | Val: 70.74% | Best: 71.32% | LR: 0.000331

Epoch 40/100
Keep Rate: 73.58% | Alpha: 0.545 | Mode: CutMix


Epoch 40/100 | Train: 60.38% | Val: 71.08% | Best: 71.32% | LR: 0.000283:  40%|████      | 40/100 [2:44:49<4:07:15, 247.26s/it]

Epoch 40/100 | Train: 60.38% | Val: 71.08% | Best: 71.32% | LR: 0.000283

Epoch 41/100
Keep Rate: 73.59% | Alpha: 0.550 | Mode: CutMix


Epoch 41/100 | Train: 58.07% | Val: 70.91% | Best: 71.32% | LR: 0.000237:  41%|████      | 41/100 [2:48:56<4:03:03, 247.18s/it]

Epoch 41/100 | Train: 58.07% | Val: 70.91% | Best: 71.32% | LR: 0.000237

Epoch 42/100
Keep Rate: 73.67% | Alpha: 0.555 | Mode: CutMix


Epoch 42/100 | Train: 60.61% | Val: 71.11% | Best: 71.32% | LR: 0.000193:  42%|████▏     | 42/100 [2:53:02<3:58:39, 246.89s/it]

Epoch 42/100 | Train: 60.61% | Val: 71.11% | Best: 71.32% | LR: 0.000193

Epoch 43/100
Keep Rate: 73.95% | Alpha: 0.560 | Mode: CutMix


Epoch 43/100 | Train: 60.32% | Val: 71.25% | Best: 71.32% | LR: 0.000153:  43%|████▎     | 43/100 [2:57:09<3:54:44, 247.10s/it]

Epoch 43/100 | Train: 60.32% | Val: 71.25% | Best: 71.32% | LR: 0.000153

Epoch 44/100
Keep Rate: 73.97% | Alpha: 0.565 | Mode: CutMix


Epoch 44/100 | Train: 61.91% | Val: 71.39% | Best: 71.39% | LR: 0.000117:  44%|████▍     | 44/100 [3:01:17<3:50:46, 247.26s/it]

Epoch 44/100 | Train: 61.91% | Val: 71.39% | Best: 71.39% | LR: 0.000117

Epoch 45/100
Keep Rate: 74.08% | Alpha: 0.570 | Mode: CutMix


Epoch 45/100 | Train: 62.43% | Val: 71.29% | Best: 71.39% | LR: 0.000086:  45%|████▌     | 45/100 [3:05:25<3:46:44, 247.36s/it]

Epoch 45/100 | Train: 62.43% | Val: 71.29% | Best: 71.39% | LR: 0.000086

Epoch 46/100
Keep Rate: 74.10% | Alpha: 0.575 | Mode: CutMix


Epoch 46/100 | Train: 61.79% | Val: 71.08% | Best: 71.39% | LR: 0.000059:  46%|████▌     | 46/100 [3:09:31<3:42:19, 247.03s/it]

Epoch 46/100 | Train: 61.79% | Val: 71.08% | Best: 71.39% | LR: 0.000059

Epoch 47/100
Keep Rate: 74.17% | Alpha: 0.580 | Mode: CutMix


Epoch 47/100 | Train: 61.92% | Val: 71.24% | Best: 71.39% | LR: 0.000038:  47%|████▋     | 47/100 [3:13:37<3:38:00, 246.81s/it]

Epoch 47/100 | Train: 61.92% | Val: 71.24% | Best: 71.39% | LR: 0.000038

Epoch 48/100
Keep Rate: 74.19% | Alpha: 0.585 | Mode: CutMix


Epoch 48/100 | Train: 62.55% | Val: 71.34% | Best: 71.39% | LR: 0.000022:  48%|████▊     | 48/100 [3:17:45<3:34:04, 247.01s/it]

Epoch 48/100 | Train: 62.55% | Val: 71.34% | Best: 71.39% | LR: 0.000022

Epoch 49/100
Keep Rate: 74.25% | Alpha: 0.590 | Mode: CutMix


Epoch 49/100 | Train: 63.10% | Val: 71.37% | Best: 71.39% | LR: 0.000013:  49%|████▉     | 49/100 [3:21:52<3:29:54, 246.94s/it]

Epoch 49/100 | Train: 63.10% | Val: 71.37% | Best: 71.39% | LR: 0.000013

Epoch 50/100
Keep Rate: 74.14% | Alpha: 0.595 | Mode: CutMix


Epoch 50/100 | Train: 59.33% | Val: 71.37% | Best: 71.39% | LR: 0.000010:  50%|█████     | 50/100 [3:25:58<3:25:40, 246.82s/it]

Epoch 50/100 | Train: 59.33% | Val: 71.37% | Best: 71.39% | LR: 0.000010

Epoch 51/100
Keep Rate: 73.24% | Alpha: 0.600 | Mode: CutMix


Epoch 51/100 | Train: 58.64% | Val: 69.99% | Best: 71.39% | LR: 0.000638:  51%|█████     | 51/100 [3:30:05<3:21:30, 246.75s/it]

Epoch 51/100 | Train: 58.64% | Val: 69.99% | Best: 71.39% | LR: 0.000638

Epoch 52/100
Keep Rate: 72.96% | Alpha: 0.600 | Mode: CutMix


Epoch 52/100 | Train: 57.46% | Val: 70.03% | Best: 71.39% | LR: 0.000630:  52%|█████▏    | 52/100 [3:34:10<3:17:08, 246.43s/it]

Epoch 52/100 | Train: 57.46% | Val: 70.03% | Best: 71.39% | LR: 0.000630

Epoch 53/100
Keep Rate: 73.00% | Alpha: 0.600 | Mode: CutMix


Epoch 53/100 | Train: 62.58% | Val: 69.73% | Best: 71.39% | LR: 0.000618:  53%|█████▎    | 53/100 [3:38:18<3:13:13, 246.66s/it]

Epoch 53/100 | Train: 62.58% | Val: 69.73% | Best: 71.39% | LR: 0.000618

Epoch 54/100
Keep Rate: 73.16% | Alpha: 0.600 | Mode: CutMix


Epoch 54/100 | Train: 59.90% | Val: 69.82% | Best: 71.39% | LR: 0.000601:  54%|█████▍    | 54/100 [3:42:24<3:09:06, 246.67s/it]

Epoch 54/100 | Train: 59.90% | Val: 69.82% | Best: 71.39% | LR: 0.000601

Epoch 55/100
Keep Rate: 73.23% | Alpha: 0.600 | Mode: CutMix


Epoch 55/100 | Train: 60.74% | Val: 69.83% | Best: 71.39% | LR: 0.000580:  55%|█████▌    | 55/100 [3:46:31<3:04:55, 246.58s/it]

Epoch 55/100 | Train: 60.74% | Val: 69.83% | Best: 71.39% | LR: 0.000580

Epoch 56/100
Keep Rate: 73.34% | Alpha: 0.600 | Mode: CutMix


Epoch 56/100 | Train: 62.78% | Val: 69.91% | Best: 71.39% | LR: 0.000555:  56%|█████▌    | 56/100 [3:50:36<3:00:31, 246.17s/it]

Epoch 56/100 | Train: 62.78% | Val: 69.91% | Best: 71.39% | LR: 0.000555

Epoch 57/100
Keep Rate: 73.40% | Alpha: 0.600 | Mode: CutMix


Epoch 57/100 | Train: 61.33% | Val: 70.34% | Best: 71.39% | LR: 0.000526:  57%|█████▋    | 57/100 [3:54:42<2:56:28, 246.26s/it]

Epoch 57/100 | Train: 61.33% | Val: 70.34% | Best: 71.39% | LR: 0.000526

Epoch 58/100
Keep Rate: 73.65% | Alpha: 0.600 | Mode: CutMix


Epoch 58/100 | Train: 60.24% | Val: 69.73% | Best: 71.39% | LR: 0.000494:  58%|█████▊    | 58/100 [3:58:49<2:52:31, 246.46s/it]

Epoch 58/100 | Train: 60.24% | Val: 69.73% | Best: 71.39% | LR: 0.000494

Epoch 59/100
Keep Rate: 73.63% | Alpha: 0.600 | Mode: CutMix


Epoch 59/100 | Train: 60.75% | Val: 69.51% | Best: 71.39% | LR: 0.000459:  59%|█████▉    | 59/100 [4:02:56<2:48:30, 246.60s/it]

Epoch 59/100 | Train: 60.75% | Val: 69.51% | Best: 71.39% | LR: 0.000459

Epoch 60/100
Keep Rate: 73.75% | Alpha: 0.600 | Mode: CutMix


Epoch 60/100 | Train: 61.32% | Val: 69.75% | Best: 71.39% | LR: 0.000422:  60%|██████    | 60/100 [4:07:04<2:44:39, 247.00s/it]

Epoch 60/100 | Train: 61.32% | Val: 69.75% | Best: 71.39% | LR: 0.000422

Epoch 61/100
Keep Rate: 73.86% | Alpha: 0.600 | Mode: CutMix


Epoch 61/100 | Train: 60.82% | Val: 69.98% | Best: 71.39% | LR: 0.000384:  61%|██████    | 61/100 [4:11:11<2:40:36, 247.09s/it]

Epoch 61/100 | Train: 60.82% | Val: 69.98% | Best: 71.39% | LR: 0.000384

Epoch 62/100
Keep Rate: 74.01% | Alpha: 0.600 | Mode: CutMix


Epoch 62/100 | Train: 62.31% | Val: 69.94% | Best: 71.39% | LR: 0.000345:  62%|██████▏   | 62/100 [4:15:19<2:36:38, 247.34s/it]

Epoch 62/100 | Train: 62.31% | Val: 69.94% | Best: 71.39% | LR: 0.000345

Epoch 63/100
Keep Rate: 74.16% | Alpha: 0.600 | Mode: CutMix


Epoch 63/100 | Train: 61.68% | Val: 69.87% | Best: 71.39% | LR: 0.000305:  63%|██████▎   | 63/100 [4:19:26<2:32:28, 247.26s/it]

Epoch 63/100 | Train: 61.68% | Val: 69.87% | Best: 71.39% | LR: 0.000305

Epoch 64/100
Keep Rate: 74.20% | Alpha: 0.600 | Mode: CutMix


Epoch 64/100 | Train: 59.42% | Val: 70.11% | Best: 71.39% | LR: 0.000266:  64%|██████▍   | 64/100 [4:23:34<2:28:20, 247.23s/it]

Epoch 64/100 | Train: 59.42% | Val: 70.11% | Best: 71.39% | LR: 0.000266

Epoch 65/100
Keep Rate: 74.19% | Alpha: 0.600 | Mode: CutMix


Epoch 65/100 | Train: 63.70% | Val: 70.08% | Best: 71.39% | LR: 0.000228:  65%|██████▌   | 65/100 [4:27:40<2:24:02, 246.92s/it]

Epoch 65/100 | Train: 63.70% | Val: 70.08% | Best: 71.39% | LR: 0.000228

Epoch 66/100
Keep Rate: 74.48% | Alpha: 0.600 | Mode: CutMix


Epoch 66/100 | Train: 63.18% | Val: 70.50% | Best: 71.39% | LR: 0.000191:  66%|██████▌   | 66/100 [4:31:47<2:19:57, 246.98s/it]

Epoch 66/100 | Train: 63.18% | Val: 70.50% | Best: 71.39% | LR: 0.000191

Epoch 67/100
Keep Rate: 74.54% | Alpha: 0.600 | Mode: CutMix


Epoch 67/100 | Train: 63.83% | Val: 70.41% | Best: 71.39% | LR: 0.000156:  67%|██████▋   | 67/100 [4:35:55<2:16:04, 247.40s/it]

Epoch 67/100 | Train: 63.83% | Val: 70.41% | Best: 71.39% | LR: 0.000156

Epoch 68/100
Keep Rate: 74.52% | Alpha: 0.600 | Mode: CutMix


Epoch 68/100 | Train: 62.41% | Val: 70.30% | Best: 71.39% | LR: 0.000124:  68%|██████▊   | 68/100 [4:40:02<2:11:50, 247.19s/it]

Epoch 68/100 | Train: 62.41% | Val: 70.30% | Best: 71.39% | LR: 0.000124

Epoch 69/100
Keep Rate: 74.67% | Alpha: 0.600 | Mode: CutMix


Epoch 69/100 | Train: 62.68% | Val: 70.46% | Best: 71.39% | LR: 0.000095:  69%|██████▉   | 69/100 [4:44:08<2:07:34, 246.93s/it]

Epoch 69/100 | Train: 62.68% | Val: 70.46% | Best: 71.39% | LR: 0.000095

Epoch 70/100
Keep Rate: 74.65% | Alpha: 0.600 | Mode: CutMix


Epoch 70/100 | Train: 63.80% | Val: 70.36% | Best: 71.39% | LR: 0.000070:  70%|███████   | 70/100 [4:48:14<2:03:21, 246.72s/it]

Epoch 70/100 | Train: 63.80% | Val: 70.36% | Best: 71.39% | LR: 0.000070

Epoch 71/100
Keep Rate: 74.85% | Alpha: 0.600 | Mode: CutMix


Epoch 71/100 | Train: 63.71% | Val: 70.45% | Best: 71.39% | LR: 0.000049:  71%|███████   | 71/100 [4:52:21<1:59:16, 246.77s/it]

Epoch 71/100 | Train: 63.71% | Val: 70.45% | Best: 71.39% | LR: 0.000049

Epoch 72/100
Keep Rate: 74.78% | Alpha: 0.600 | Mode: CutMix


Epoch 72/100 | Train: 62.57% | Val: 70.50% | Best: 71.39% | LR: 0.000032:  72%|███████▏  | 72/100 [4:56:28<1:55:08, 246.72s/it]

Epoch 72/100 | Train: 62.57% | Val: 70.50% | Best: 71.39% | LR: 0.000032

Epoch 73/100
Keep Rate: 74.73% | Alpha: 0.600 | Mode: CutMix


Epoch 73/100 | Train: 63.57% | Val: 70.51% | Best: 71.39% | LR: 0.000020:  73%|███████▎  | 73/100 [5:00:35<1:51:00, 246.68s/it]

Epoch 73/100 | Train: 63.57% | Val: 70.51% | Best: 71.39% | LR: 0.000020

Epoch 74/100
Keep Rate: 74.72% | Alpha: 0.600 | Mode: CutMix


Epoch 74/100 | Train: 63.09% | Val: 70.82% | Best: 71.39% | LR: 0.000012:  74%|███████▍  | 74/100 [5:04:41<1:46:55, 246.76s/it]

Epoch 74/100 | Train: 63.09% | Val: 70.82% | Best: 71.39% | LR: 0.000012

Epoch 75/100
Keep Rate: 74.70% | Alpha: 0.600 | Mode: CutMix


Epoch 75/100 | Train: 62.61% | Val: 70.84% | Best: 71.39% | LR: 0.000010:  75%|███████▌  | 75/100 [5:08:49<1:42:52, 246.90s/it]

Epoch 75/100 | Train: 62.61% | Val: 70.84% | Best: 71.39% | LR: 0.000010

Epoch 76/100
Keep Rate: 74.08% | Alpha: 0.600 | Mode: CutMix


Epoch 76/100 | Train: 58.72% | Val: 70.48% | Best: 71.39% | LR: 0.000510:  76%|███████▌  | 76/100 [5:12:55<1:38:42, 246.77s/it]

Epoch 76/100 | Train: 58.72% | Val: 70.48% | Best: 71.39% | LR: 0.000510

Epoch 77/100
Keep Rate: 73.94% | Alpha: 0.600 | Mode: CutMix


Epoch 77/100 | Train: 62.43% | Val: 69.83% | Best: 71.39% | LR: 0.000504:  77%|███████▋  | 77/100 [5:17:01<1:34:31, 246.58s/it]

Epoch 77/100 | Train: 62.43% | Val: 69.83% | Best: 71.39% | LR: 0.000504

Epoch 78/100
Keep Rate: 74.00% | Alpha: 0.600 | Mode: CutMix


Epoch 78/100 | Train: 63.37% | Val: 69.62% | Best: 71.39% | LR: 0.000494:  78%|███████▊  | 78/100 [5:21:08<1:30:24, 246.57s/it]

Epoch 78/100 | Train: 63.37% | Val: 69.62% | Best: 71.39% | LR: 0.000494

Epoch 79/100
Keep Rate: 73.89% | Alpha: 0.600 | Mode: CutMix


Epoch 79/100 | Train: 65.34% | Val: 69.25% | Best: 71.39% | LR: 0.000481:  78%|███████▊  | 78/100 [5:25:14<1:31:44, 250.19s/it]
  checkpoint = torch.load('./best_model.pth')


Epoch 79/100 | Train: 65.34% | Val: 69.25% | Best: 71.39% | LR: 0.000481

Early stopping triggered at epoch 79
Best Val Accuracy: 71.39% at epoch 44


Training Complete!
Best Val Accuracy: 71.39% at epoch 44
Loading best model for inference...

Best model loaded (Epoch 44, Val Acc: 71.39%)

Generating predictions...

Submission saved to: ./submission.csv
Best model saved to: ./best_model.pth
Best Val Accuracy: 71.39% (Epoch 44)
Training stopped early (patience reached)

