In [1]:
import os
import torch
import timm
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from typing import Optional, Callable

from torch import nn, Tensor, optim
from torch.utils.data import Dataset, DataLoader
from torch.backends import cudnn
from torch import GradScaler
import torch.nn.functional as F

from torchvision.datasets import CIFAR100
from torchvision.transforms import v2
from sklearn.neighbors import NearestNeighbors

# ============ Configuration ============
config = {
    "dataset": "cifar100_noisy",
    "model": "resnet18",
    "pretrained": "imagenet",
    "epochs": 100,
    "batch_size": 128,
    "lr": 0.0005,
    "momentum": 0.9,
    "weight_decay": 0.05,
    "nesterov": True,
    "label_smoothing": 0.1,
    "optimizer": "adamw",
    "scheduler": "warm_restarts",     
    "warm_restarts_T0": 30,              
    "warm_restarts_mult": 1,             
    "cosine_eta_min": 1e-6,
    "scheduler_step_per_batch": True,
    "early_stop_patience": 35,
    "early_stop_mode": "max",
    "early_stop_min_delta": 0.05,
    "device": "cuda",
    "mixed_precision": True,
    "wandb_project": "cifar100-noisy-competition",
    "upscale_size": 224,  
    "aug_alpha": 0.5,          
    "cutmix_prob": 1.0,        
    "switch_epoch": 35,        
    "warmup_epochs": 10,
    "loss_threshold": 3.0,
    "dynamic_threshold_decay": 0.995,
    "soft_alpha": 0.4
}

device = torch.device(config["device"])
print(f"Using device: {device}")
cudnn.benchmark = True
pin_memory = True
enable_half = config["mixed_precision"]  # Disable for CPU, it is slower!
scaler = GradScaler(device, enabled=enable_half)

def get_refinement_metadata(dataset, device):
    print("\nExtracting Features for Analysis...")
    embed_model = timm.create_model('resnet18', pretrained=True, num_classes=0).to(device)
    embed_model.eval()
    
    analysis_transform = v2.Compose([
        v2.ToImage(), v2.Resize(224), v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    features, labels = [], np.array(dataset.targets)
    with torch.no_grad():
        for i in tqdm(range(len(dataset)), desc="Analyzing Data"):
            img, _ = dataset[i]
            img_t = analysis_transform(img).unsqueeze(0).to(device)
            features.append(embed_model(img_t).cpu().numpy())
    
    features = np.concatenate(features, axis=0)
    
    print("Running k-NN Agreement & Class Balancing...")
    K = 10
    knn = NearestNeighbors(n_neighbors=K+1, metric='cosine').fit(features)
    _, indices = knn.kneighbors(features)
    
    refurbished_labels = []
    agreement_scores = []
    for i in range(len(features)):
        neighbor_labels = labels[indices[i, 1:]]
        counts = np.bincount(neighbor_labels, minlength=100)
        refurbished_labels.append(np.argmax(counts))
        agreement_scores.append(counts[labels[i]] / K)
    
    agreement_scores = np.array(agreement_scores)
    refurbished_labels = np.array(refurbished_labels)

    stable_mask = agreement_scores >= 0.9
    counts_per_class = np.bincount(labels[stable_mask], minlength=100)
    rare_classes = np.where(counts_per_class < 20)[0] # Target classes below 20 samples
    
    for c in rare_classes:
        # Bolster rare classes using medium agreement samples (above 75%)
        potential_candidates = np.where((agreement_scores >= 0.75) & (refurbished_labels == c))[0]
        agreement_scores[potential_candidates] = 0.95 # Promote to "Stable" status
            
    del embed_model
    torch.cuda.empty_cache()
    
    return {"agreement": agreement_scores, "refurbished": refurbished_labels}

class SimpleCachedDataset(Dataset):
    def __init__(self, dataset):
        # Runtime transforms are not implemented in this simple cached dataset.
        self.data = tuple([x for x in dataset])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

class PreprocessedDataset(Dataset):
    """
    Cache preprocessed tensors - apply transforms once and store results.
    
    PERFORMANCE OPTIMIZATION:
    - Applies deterministic transforms (ToImage, Resize) once at startup
    - Stores uint8 tensors (4x less memory than float32)
    - Random augmentations applied at runtime each epoch
    - Test set only needs normalization at runtime (huge speedup!)
    """
    def __init__(self, dataset, transform):
        print(f"Preprocessing {len(dataset)} images (this happens once)...")
        self.data = []
        self.targets = []
        
        for img, target in tqdm(dataset, desc="Caching", leave=False):
            transformed = transform(img)
            self.data.append(transformed)
            self.targets.append(target)
        
        print(f"Cached {len(self.data)} preprocessed images")
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        return self.data[i], self.targets[i]

class RefinedAugmentationWrapper(Dataset):
    def __init__(self, preprocessed_dataset, runtime_transforms, refinement_map=None, alpha=0.4):
        self.dataset = preprocessed_dataset
        self.runtime_transforms = runtime_transforms
        self.refinement_map = refinement_map
        self.alpha = alpha
        
    def __len__(self):
        return len(self.dataset)
        
    def __getitem__(self, i):
        img_tensor, original_target = self.dataset[i]
        if self.runtime_transforms is not None:
            img_tensor = self.runtime_transforms(img_tensor)
        
        # Default: Hard label, full weight
        target_a = original_target
        target_b = original_target
        lam_target = 1.0
        weight = 1.0 

        if self.refinement_map is not None:
            score = self.refinement_map["agreement"][i]
            refurbished = int(self.refinement_map["refurbished"][i])
            
            # SOFT LABELING LOGIC
            # If k-NN and Original disagree (Noise/Overlap), we blend them.
            if score < 0.7:
                target_a = original_target
                target_b = refurbished
                lam_target = 1.0 - self.alpha  # e.g., 0.6 original, 0.4 refurbished
                weight = max(score, 0.4)       # Loss is scaled by k-NN confidence
            else:
                # Highly stable samples
                target_a = original_target
                target_b = original_target
                lam_target = 1.0
                weight = 1.0
                
        # We return two targets and a lambda to handle the "Soft Label" in the loss function
        return img_tensor, target_a, target_b, lam_target, weight

class CIFAR100_noisy_fine(Dataset):
    """
    See https://github.com/UCSC-REAL/cifar-10-100n, https://www.noisylabels.com/ and `Learning with Noisy Labels
    Revisited: A Study Using Real-World Human Annotations`.
    """

    def __init__(
        self, root: str, train: bool, transform: Optional[Callable], download: bool
    ):
        cifar100 = CIFAR100(
            root=root, train=train, transform=None, download=download
        )
        data, targets = tuple(zip(*cifar100))

        if train:
            noisy_label_file = os.path.join(root, "CIFAR-100-noisy.npz")
            if not os.path.isfile(noisy_label_file):
                raise FileNotFoundError(
                    f"{type(self).__name__} need {noisy_label_file} to be used!"
                )

            noise_file = np.load(noisy_label_file)
            if not np.array_equal(noise_file["clean_label"], targets):
                raise RuntimeError("Clean labels do not match!")
            targets = noise_file["noisy_label"]

        self.data = data
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, i: int):
        return self.data[i], self.targets[i]


class EarlyStopping:
    """Early stopping to stop training when validation metric doesn't improve."""
    def __init__(self, patience=10, min_delta=0.0, mode='max'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.best_epoch = 0
        
    def __call__(self, score, epoch):
        if self.best_score is None:
            self.best_score = score
            self.best_epoch = epoch
            return False
        
        if self.mode == 'max':
            # For accuracy (higher is better)
            if score > self.best_score + self.min_delta:
                self.best_score = score
                self.best_epoch = epoch
                self.counter = 0
            else:
                self.counter += 1
        else:
            # For loss (lower is better)
            if score < self.best_score - self.min_delta:
                self.best_score = score
                self.best_epoch = epoch
                self.counter = 0
            else:
                self.counter += 1
        
        if self.counter >= self.patience:
            self.early_stop = True
        
        return self.early_stop


# === PREPROCESSING (applied once and cached) ===
# Only deterministic, spatial transforms - stores uint8 tensors (saves memory!)
preprocess_transforms = v2.Compose([
    v2.ToImage(),
    v2.Resize(config["upscale_size"]),  # Upscale from 32x32 to 128x128
])

# === RUNTIME AUGMENTATION (applied at each epoch) ===
# Random transforms for training (includes normalization at the end)
train_runtime_transforms = v2.Compose([
    v2.RandomCrop(config["upscale_size"], padding=4),
    v2.RandomHorizontalFlip(p=0.5),
    v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    v2.RandomRotation(15),
    v2.ToDtype(torch.float32, scale=True),  # Convert to float [0,1]
    v2.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),  # CIFAR-100 stats
    v2.RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3))
])

# Test set: only normalization needed (spatial transforms already done)
test_runtime_transforms = v2.Compose([
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

# Load raw datasets
print("Loading datasets...")
train_set_raw = CIFAR100_noisy_fine('/kaggle/input/fii-atnn-2025-project-noisy-cifar-100/fii-atnn-2024-project-noisy-cifar-100', download=False, train=True, transform=None)
test_set_raw = CIFAR100_noisy_fine('/kaggle/input/fii-atnn-2025-project-noisy-cifar-100/fii-atnn-2024-project-noisy-cifar-100', download=False, train=False, transform=None)

# Cache raw PIL images (fast, lightweight)
train_set_cached = SimpleCachedDataset(train_set_raw)
test_set_cached = SimpleCachedDataset(test_set_raw)

# Preprocess and cache as tensors (done once!)
print("\n[TRAIN SET]")
train_set_preprocessed = PreprocessedDataset(train_set_cached, preprocess_transforms)
print("\n[TEST SET]")
test_set_preprocessed = PreprocessedDataset(test_set_cached, preprocess_transforms)

refinement_map = get_refinement_metadata(train_set_raw, device)

# Add runtime augmentations (applied each epoch for train, none for test)
train_set = RefinedAugmentationWrapper(train_set_preprocessed, train_runtime_transforms, refinement_map=refinement_map)
test_set = RefinedAugmentationWrapper(test_set_preprocessed, test_runtime_transforms)

print(f"\nTrain set ready: {len(train_set)} samples (with runtime augmentation)")
print(f"Test set ready: {len(test_set)} samples (fully cached)\n")

train_loader = DataLoader(train_set, batch_size=config["batch_size"], shuffle=True, pin_memory=pin_memory,num_workers=2,persistent_workers=True)
test_loader = DataLoader(test_set, batch_size=500, pin_memory=pin_memory,num_workers=2,persistent_workers=True)

# Load ResNet18 pretrained on ImageNet
print(f"Loading model: {config['model']} (pretrained on {config['pretrained']})")
model = timm.create_model(config["model"], pretrained=True, num_classes=100)
model = model.to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}\n")

# Label smoothing helps with noisy labels
criterion = nn.CrossEntropyLoss(label_smoothing=config["label_smoothing"])

# Create optimizer based on config
if config["optimizer"].lower() == "adamw":
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config["lr"],
        weight_decay=config["weight_decay"],
        fused=True
    )
    print(f"Optimizer: AdamW (lr={config['lr']}, weight_decay={config['weight_decay']})")
elif config["optimizer"].lower() == "sgd":
    optimizer = optim.SGD(
        model.parameters(), 
        lr=config["lr"],
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=config["nesterov"],
        fused=True
    )
    print(f"Optimizer: SGD (lr={config['lr']}, momentum={config['momentum']}, weight_decay={config['weight_decay']}, nesterov={config['nesterov']})")
else:
    raise ValueError(f"Unknown optimizer: {config['optimizer']}. Supported: 'sgd', 'adamw'")

# Learning rate scheduler
if config["scheduler"] == "steplr":
    scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config.get("step_size", 30),
        gamma=config.get("gamma", 0.1)
    )
    print(f"Scheduler: StepLR (step_size={config.get('step_size', 30)}, gamma={config.get('gamma', 0.1)})")

elif config["scheduler"] == "cosine":
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=config["epochs"],
        eta_min=config.get("cosine_eta_min", 1e-6)
    )
    print(f"Scheduler: CosineAnnealingLR (T_max={config['epochs']}, eta_min={config['cosine_eta_min']})")

elif config["scheduler"] == "warm_restarts":
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer,
        T_0=config["warm_restarts_T0"],         
        T_mult=config.get("warm_restarts_mult", 1), 
        eta_min=config.get("cosine_eta_min", 1e-6)
    )
    # Highlight the per-batch setting so you know it's active
    step_mode = "Per-Batch" if config.get("scheduler_step_per_batch") else "Per-Epoch"
    print(f"Scheduler: WarmRestarts (T_0={config['warm_restarts_T0']}, T_mult={config['warm_restarts_mult']}, Step: {step_mode})")

else:
    scheduler = None
    print("Scheduler: None")

# === CUTMIX HELPER FUNCTION ===
def rand_bbox(size, lam):
    """Generates a random bounding box for CutMix."""
    W = size[2]
    H = size[3]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # uniform
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2

loss_threshold = config["loss_threshold"]

def train(epoch):
    print(f"\nEpoch {epoch+1}/{config['epochs']}")
    model.train()
    correct = 0
    total = 0
    running_loss = 0.0
    global loss_threshold
    
    initial_batch_count = 0 
    use_cutmix = epoch >= config["switch_epoch"]
    aug_mode = "CutMix" if use_cutmix else "MixUp"
    
    # Note: Updated unpack to include target_a, target_b, and lam_target
    for batch_idx, (inputs, t_a, t_b, lam_t, weights) in enumerate(train_loader):
        inputs = inputs.to(device, non_blocking=True)
        t_a = t_a.to(device, non_blocking=True)
        t_b = t_b.to(device, non_blocking=True)
        lam_t = lam_t.unsqueeze(1).to(device, non_blocking=True) # [B, 1]
        weights = weights.to(device, non_blocking=True)
        
        initial_batch_count += inputs.size(0)

        # 1. Dynamic Filtering (Optional, keep for extreme outliers)
        if epoch >= config["warmup_epochs"]:
            with torch.no_grad():
                with torch.autocast(device.type, enabled=enable_half):
                    raw_outputs = model(inputs)
                    # Use t_a as primary for loss filtering
                    sample_losses = F.cross_entropy(raw_outputs, t_a, reduction='none')
                mask = sample_losses < loss_threshold
            if mask.sum() < 2: continue
            inputs, t_a, t_b, lam_t, weights = inputs[mask], t_a[mask], t_b[mask], lam_t[mask], weights[mask]

        # 2. Augmentation (MixUp/CutMix)
        rand_index = torch.randperm(inputs.size(0)).to(device)
        lam_aug = np.random.beta(config["aug_alpha"], config["aug_alpha"])

        if use_cutmix:
            bbx1, bby1, bbx2, bby2 = rand_bbox(inputs.size(), lam_aug)
            lam_aug = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (inputs.size()[-1] * inputs.size()[-2]))
            inputs[:, :, bbx1:bbx2, bby1:bby2] = inputs[rand_index, :, bbx1:bbx2, bby1:bby2]
        else:
            inputs = lam_aug * inputs + (1 - lam_aug) * inputs[rand_index, :]

        # 3. Forward & Soft-Loss
        with torch.autocast(device.type, enabled=enable_half):
            outputs = model(inputs)
            
            # Combine the two internal soft targets (Original vs Refurbished)
            # Loss = lam_aug * (lam_t*loss(t_a) + (1-lam_t)*loss(t_b)) + (1-lam_aug)*MixedSamples...
            def get_soft_loss(out, target_a, target_b, l_t):
                return l_t.view(-1) * criterion(out, target_a) + (1 - l_t.view(-1)) * criterion(out, target_b)

            loss_current = get_soft_loss(outputs, t_a, t_b, lam_t)
            loss_shuffled = get_soft_loss(outputs, t_a[rand_index], t_b[rand_index], lam_t[rand_index])
            
            mixed_loss = lam_aug * loss_current + (1 - lam_aug) * loss_shuffled
            
            # Apply k-NN weights and mean
            batch_weights = lam_aug * weights + (1 - lam_aug) * weights[rand_index]
            loss = (mixed_loss * batch_weights).mean()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        if config.get("scheduler_step_per_batch") and scheduler is not None:
            scheduler.step(epoch + batch_idx / len(train_loader))

        running_loss += loss.item() * inputs.size(0)
        total += t_a.size(0)
        correct += outputs.argmax(1).eq(t_a).sum().item()

    if epoch >= config["warmup_epochs"]:
        loss_threshold *= config["dynamic_threshold_decay"]
    
    epoch_acc = 100.0 * correct / total
    keep_rate = (total / initial_batch_count) * 100
    print(f"Keep Rate: {keep_rate:.2f}% | Threshold: {loss_threshold:.4f} | Mode: {aug_mode}")
    return running_loss / total, epoch_acc, aug_mode

@torch.inference_mode()
def val():
    model.eval()
    correct = 0
    total = 0
    running_loss = 0.0

    for inputs, t_a, t_b, lam_t, _ in test_loader:
        inputs, targets = inputs.to(device), t_a.to(device)
        with torch.autocast(device.type, enabled=enable_half):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        running_loss += loss.item() * inputs.size(0)
        predicted = outputs.argmax(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
    
    epoch_loss = running_loss / total
    epoch_acc = 100.0 * correct / total
    return epoch_loss, epoch_acc

@torch.inference_mode()
def inference():
    model.eval()
    
    labels = []
    
    for inputs, _, _, _, _ in test_loader:
        inputs = inputs.to(device, non_blocking=True)
        with torch.autocast(device.type, enabled=enable_half):
            outputs = model(inputs)

        predicted = outputs.argmax(1).tolist()
        labels.extend(predicted)
    
    return labels

# Initialize WandB
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_name = f"cifar100noisy_{config['model']}_{config['optimizer']}_lr{config['lr']}_bs{config['batch_size']}_{timestamp}"

# wandb.init(
#     project=config["wandb_project"],
#     name=run_name,
#     config=config
# )

best = 0.0
best_epoch = 0

# Initialize early stopping
early_stopping = EarlyStopping(
    patience=config["early_stop_patience"],
    min_delta=config["early_stop_min_delta"],
    mode=config["early_stop_mode"]
)

print(f"\n{'='*70}")
print(f"Starting Training - {config['epochs']} epochs")
print(f"Model: {config['model']} (pretrained on {config['pretrained']})")
print(f"Optimizer: {config['optimizer'].upper()}, LR: {config['lr']}, Batch Size: {config['batch_size']}")
if config["optimizer"].lower() == "sgd":
    print(f"Momentum: {config['momentum']}, Nesterov: {config['nesterov']}")
print(f"Weight Decay: {config['weight_decay']}, Label Smoothing: {config['label_smoothing']}")
print(f"Scheduler: {config['scheduler']}")
print(f"Early Stopping: Enabled (patience={config['early_stop_patience']}, mode={config['early_stop_mode']})")
print(f"{'='*70}\n")

with tqdm(range(config["epochs"])) as tbar:
    for epoch in tbar:
        # 1. Run Train (includes internal per-batch scheduler steps)
        train_loss, train_acc, mode = train(epoch)
        
        # 2. Run Validation
        val_loss, val_acc = val()
        
        # 3. Handle LR logging (step only if NOT per-batch)
        if scheduler is not None:
            if not config.get("scheduler_step_per_batch"):
                scheduler.step()
            current_lr = scheduler.get_last_lr()[0]
        else:
            current_lr = config["lr"]
        
        # 4. Checkpointing
        if val_acc > best:
            best = val_acc
            best_epoch = epoch
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': val_acc,
                'val_loss': val_loss,
            }
            if scheduler is not None:
                checkpoint['scheduler_state_dict'] = scheduler.state_dict()
            torch.save(checkpoint, './best_model.pth')
        
        # 5. Update Progress Bar & Console
        status = f"Epoch {epoch+1}/{config['epochs']} | Train: {train_acc:.2f}% | Val: {val_acc:.2f}% | Best: {best:.2f}% | LR: {current_lr:.6f}"
        tbar.set_description(status)
        print(status)
        
        # 6. Early stopping check
        if early_stopping(val_acc, epoch):
            print(f"\n{'='*60}")
            print(f"Early stopping triggered at epoch {epoch+1}")
            print(f"Best Val Accuracy: {best:.2f}% at epoch {best_epoch+1}")
            print(f"{'='*60}\n")
            break
    

print(f"\n{'='*60}")
print(f"Training Complete!")
print(f"Best Val Accuracy: {best:.2f}% at epoch {best_epoch+1}")
print(f"Loading best model for inference...")
print(f"{'='*60}\n")

# Load best model for inference
checkpoint = torch.load('./best_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
print(f"Best model loaded (Epoch {checkpoint['epoch']+1}, Val Acc: {checkpoint['val_acc']:.2f}%)\n")

# Generate submission
data = {
    "ID": [],
    "target": []
}

print("Generating predictions...")
for i, label in enumerate(inference()):
    data["ID"].append(i)
    data["target"].append(label)

df = pd.DataFrame(data)
df.to_csv("/kaggle/working/submission.csv", index=False)

# Log final results to WandB
# wandb.summary["final_best_val_acc"] = best
# wandb.summary["best_epoch"] = best_epoch
# wandb.summary["total_epochs"] = config["epochs"]
# if scheduler is not None:
#     wandb.summary["final_lr"] = scheduler.get_last_lr()[0]
# else:
#     wandb.summary["final_lr"] = config["lr"]
# wandb.summary["early_stopped"] = early_stopping.early_stop
# wandb.summary["epochs_trained"] = best_epoch + 1 if early_stopping.early_stop else config["epochs"]

print(f"\n{'='*60}")
print(f"Submission saved to: ./submission.csv")
print(f"Best model saved to: ./best_model.pth")
print(f"Best Val Accuracy: {best:.2f}% (Epoch {best_epoch+1})")
if early_stopping.early_stop:
    print(f"Training stopped early (patience reached)")
print(f"{'='*60}\n")

# Finish WandB run
# 

Using device: cuda
Loading datasets...

[TRAIN SET]
Preprocessing 50000 images (this happens once)...


                                                                

Cached 50000 preprocessed images

[TEST SET]
Preprocessing 10000 images (this happens once)...


                                                               

Cached 10000 preprocessed images

Extracting Features for Analysis...


model.safetensors:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

Analyzing Data: 100%|██████████| 50000/50000 [03:45<00:00, 221.69it/s]


Running k-NN Agreement & Class Balancing...

Train set ready: 50000 samples (with runtime augmentation)
Test set ready: 10000 samples (fully cached)

Loading model: resnet18 (pretrained on imagenet)
Total parameters: 11,227,812
Trainable parameters: 11,227,812

Optimizer: AdamW (lr=0.0005, weight_decay=0.05)
Scheduler: WarmRestarts (T_0=30, T_mult=1, Step: Per-Batch)

Starting Training - 100 epochs
Model: resnet18 (pretrained on imagenet)
Optimizer: ADAMW, LR: 0.0005, Batch Size: 128
Weight Decay: 0.05, Label Smoothing: 0.1
Scheduler: warm_restarts
Early Stopping: Enabled (patience=35, mode=max)



  0%|          | 0/100 [00:00<?, ?it/s]


Epoch 1/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 1/100 | Train: 13.74% | Val: 54.14% | Best: 54.14% | LR: 0.000499:   1%|          | 1/100 [03:52<6:23:42, 232.55s/it]

Epoch 1/100 | Train: 13.74% | Val: 54.14% | Best: 54.14% | LR: 0.000499

Epoch 2/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 2/100 | Train: 21.26% | Val: 60.68% | Best: 60.68% | LR: 0.000495:   2%|▏         | 2/100 [07:33<6:09:06, 225.98s/it]

Epoch 2/100 | Train: 21.26% | Val: 60.68% | Best: 60.68% | LR: 0.000495

Epoch 3/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 3/100 | Train: 22.11% | Val: 63.62% | Best: 63.62% | LR: 0.000488:   3%|▎         | 3/100 [11:16<6:02:29, 224.22s/it]

Epoch 3/100 | Train: 22.11% | Val: 63.62% | Best: 63.62% | LR: 0.000488

Epoch 4/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 4/100 | Train: 24.40% | Val: 64.83% | Best: 64.83% | LR: 0.000478:   4%|▍         | 4/100 [14:56<5:56:34, 222.86s/it]

Epoch 4/100 | Train: 24.40% | Val: 64.83% | Best: 64.83% | LR: 0.000478

Epoch 5/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 5/100 | Train: 24.04% | Val: 66.52% | Best: 66.52% | LR: 0.000467:   5%|▌         | 5/100 [18:35<5:50:22, 221.29s/it]

Epoch 5/100 | Train: 24.04% | Val: 66.52% | Best: 66.52% | LR: 0.000467

Epoch 6/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 6/100 | Train: 26.58% | Val: 67.38% | Best: 67.38% | LR: 0.000452:   6%|▌         | 6/100 [22:14<5:45:19, 220.42s/it]

Epoch 6/100 | Train: 26.58% | Val: 67.38% | Best: 67.38% | LR: 0.000452

Epoch 7/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 7/100 | Train: 28.07% | Val: 66.78% | Best: 67.38% | LR: 0.000436:   7%|▋         | 7/100 [25:55<5:42:20, 220.86s/it]

Epoch 7/100 | Train: 28.07% | Val: 66.78% | Best: 67.38% | LR: 0.000436

Epoch 8/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 8/100 | Train: 24.41% | Val: 68.69% | Best: 68.69% | LR: 0.000417:   8%|▊         | 8/100 [29:39<5:39:50, 221.64s/it]

Epoch 8/100 | Train: 24.41% | Val: 68.69% | Best: 68.69% | LR: 0.000417

Epoch 9/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 9/100 | Train: 27.97% | Val: 69.11% | Best: 69.11% | LR: 0.000397:   9%|▉         | 9/100 [33:19<5:35:21, 221.11s/it]

Epoch 9/100 | Train: 27.97% | Val: 69.11% | Best: 69.11% | LR: 0.000397

Epoch 10/100
Keep Rate: 100.00% | Threshold: 3.0000 | Mode: MixUp


Epoch 10/100 | Train: 26.79% | Val: 68.00% | Best: 69.11% | LR: 0.000375:  10%|█         | 10/100 [36:58<5:30:53, 220.60s/it]

Epoch 10/100 | Train: 26.79% | Val: 68.00% | Best: 69.11% | LR: 0.000375

Epoch 11/100
Keep Rate: 74.83% | Threshold: 2.9850 | Mode: MixUp


Epoch 11/100 | Train: 37.60% | Val: 69.42% | Best: 69.42% | LR: 0.000352:  11%|█         | 11/100 [40:55<5:34:34, 225.55s/it]

Epoch 11/100 | Train: 37.60% | Val: 69.42% | Best: 69.42% | LR: 0.000352

Epoch 12/100
Keep Rate: 74.53% | Threshold: 2.9701 | Mode: MixUp


Epoch 12/100 | Train: 39.35% | Val: 69.83% | Best: 69.83% | LR: 0.000328:  12%|█▏        | 12/100 [44:34<5:28:06, 223.71s/it]

Epoch 12/100 | Train: 39.35% | Val: 69.83% | Best: 69.83% | LR: 0.000328

Epoch 13/100
Keep Rate: 74.73% | Threshold: 2.9552 | Mode: MixUp


Epoch 13/100 | Train: 34.99% | Val: 69.89% | Best: 69.89% | LR: 0.000302:  13%|█▎        | 13/100 [48:11<5:21:08, 221.48s/it]

Epoch 13/100 | Train: 34.99% | Val: 69.89% | Best: 69.89% | LR: 0.000302

Epoch 14/100
Keep Rate: 74.76% | Threshold: 2.9404 | Mode: MixUp


Epoch 14/100 | Train: 35.28% | Val: 70.22% | Best: 70.22% | LR: 0.000277:  14%|█▍        | 14/100 [51:48<5:15:40, 220.24s/it]

Epoch 14/100 | Train: 35.28% | Val: 70.22% | Best: 70.22% | LR: 0.000277

Epoch 15/100
Keep Rate: 74.87% | Threshold: 2.9257 | Mode: MixUp


Epoch 15/100 | Train: 36.83% | Val: 70.67% | Best: 70.67% | LR: 0.000251:  15%|█▌        | 15/100 [55:23<5:09:49, 218.70s/it]

Epoch 15/100 | Train: 36.83% | Val: 70.67% | Best: 70.67% | LR: 0.000251

Epoch 16/100
Keep Rate: 74.90% | Threshold: 2.9111 | Mode: MixUp


Epoch 16/100 | Train: 37.53% | Val: 70.34% | Best: 70.67% | LR: 0.000224:  16%|█▌        | 16/100 [58:58<5:04:22, 217.41s/it]

Epoch 16/100 | Train: 37.53% | Val: 70.34% | Best: 70.67% | LR: 0.000224

Epoch 17/100
Keep Rate: 75.07% | Threshold: 2.8966 | Mode: MixUp


Epoch 17/100 | Train: 35.58% | Val: 70.29% | Best: 70.67% | LR: 0.000199:  17%|█▋        | 17/100 [1:02:32<4:59:25, 216.45s/it]

Epoch 17/100 | Train: 35.58% | Val: 70.29% | Best: 70.67% | LR: 0.000199

Epoch 18/100
Keep Rate: 75.25% | Threshold: 2.8821 | Mode: MixUp


Epoch 18/100 | Train: 36.84% | Val: 70.60% | Best: 70.67% | LR: 0.000173:  18%|█▊        | 18/100 [1:06:07<4:55:20, 216.11s/it]

Epoch 18/100 | Train: 36.84% | Val: 70.60% | Best: 70.67% | LR: 0.000173

Epoch 19/100
Keep Rate: 75.41% | Threshold: 2.8677 | Mode: MixUp


Epoch 19/100 | Train: 37.70% | Val: 70.77% | Best: 70.77% | LR: 0.000149:  19%|█▉        | 19/100 [1:09:42<4:51:15, 215.75s/it]

Epoch 19/100 | Train: 37.70% | Val: 70.77% | Best: 70.77% | LR: 0.000149

Epoch 20/100
Keep Rate: 75.17% | Threshold: 2.8533 | Mode: MixUp


Epoch 20/100 | Train: 38.01% | Val: 70.76% | Best: 70.77% | LR: 0.000126:  20%|██        | 20/100 [1:13:17<4:47:14, 215.44s/it]

Epoch 20/100 | Train: 38.01% | Val: 70.76% | Best: 70.77% | LR: 0.000126

Epoch 21/100
Keep Rate: 75.42% | Threshold: 2.8391 | Mode: MixUp


Epoch 21/100 | Train: 40.85% | Val: 70.96% | Best: 70.96% | LR: 0.000104:  21%|██        | 21/100 [1:16:52<4:43:44, 215.50s/it]

Epoch 21/100 | Train: 40.85% | Val: 70.96% | Best: 70.96% | LR: 0.000104

Epoch 22/100
Keep Rate: 75.34% | Threshold: 2.8249 | Mode: MixUp


Epoch 22/100 | Train: 40.28% | Val: 70.86% | Best: 70.96% | LR: 0.000084:  22%|██▏       | 22/100 [1:20:25<4:39:03, 214.67s/it]

Epoch 22/100 | Train: 40.28% | Val: 70.86% | Best: 70.96% | LR: 0.000084

Epoch 23/100
Keep Rate: 75.28% | Threshold: 2.8107 | Mode: MixUp


Epoch 23/100 | Train: 38.92% | Val: 70.88% | Best: 70.96% | LR: 0.000065:  23%|██▎       | 23/100 [1:23:58<4:34:57, 214.26s/it]

Epoch 23/100 | Train: 38.92% | Val: 70.88% | Best: 70.96% | LR: 0.000065

Epoch 24/100
Keep Rate: 75.41% | Threshold: 2.7967 | Mode: MixUp


Epoch 24/100 | Train: 39.95% | Val: 71.48% | Best: 71.48% | LR: 0.000049:  24%|██▍       | 24/100 [1:27:33<4:31:31, 214.36s/it]

Epoch 24/100 | Train: 39.95% | Val: 71.48% | Best: 71.48% | LR: 0.000049

Epoch 25/100
Keep Rate: 75.39% | Threshold: 2.7827 | Mode: MixUp


Epoch 25/100 | Train: 37.97% | Val: 71.61% | Best: 71.61% | LR: 0.000034:  25%|██▌       | 25/100 [1:31:07<4:27:48, 214.25s/it]

Epoch 25/100 | Train: 37.97% | Val: 71.61% | Best: 71.61% | LR: 0.000034

Epoch 26/100
Keep Rate: 75.42% | Threshold: 2.7688 | Mode: MixUp


Epoch 26/100 | Train: 40.37% | Val: 71.41% | Best: 71.61% | LR: 0.000023:  26%|██▌       | 26/100 [1:34:42<4:24:22, 214.35s/it]

Epoch 26/100 | Train: 40.37% | Val: 71.41% | Best: 71.61% | LR: 0.000023

Epoch 27/100
Keep Rate: 75.49% | Threshold: 2.7549 | Mode: MixUp


Epoch 27/100 | Train: 40.89% | Val: 71.58% | Best: 71.61% | LR: 0.000013:  27%|██▋       | 27/100 [1:38:20<4:22:20, 215.62s/it]

Epoch 27/100 | Train: 40.89% | Val: 71.58% | Best: 71.61% | LR: 0.000013

Epoch 28/100
Keep Rate: 75.20% | Threshold: 2.7412 | Mode: MixUp


Epoch 28/100 | Train: 39.27% | Val: 71.36% | Best: 71.61% | LR: 0.000006:  28%|██▊       | 28/100 [1:41:54<4:17:54, 214.93s/it]

Epoch 28/100 | Train: 39.27% | Val: 71.36% | Best: 71.61% | LR: 0.000006

Epoch 29/100
Keep Rate: 75.14% | Threshold: 2.7275 | Mode: MixUp


Epoch 29/100 | Train: 39.79% | Val: 71.38% | Best: 71.61% | LR: 0.000002:  29%|██▉       | 29/100 [1:45:27<4:13:44, 214.43s/it]

Epoch 29/100 | Train: 39.79% | Val: 71.38% | Best: 71.61% | LR: 0.000002

Epoch 30/100
Keep Rate: 75.15% | Threshold: 2.7138 | Mode: MixUp


Epoch 30/100 | Train: 39.49% | Val: 71.08% | Best: 71.61% | LR: 0.000001:  30%|███       | 30/100 [1:49:01<4:10:04, 214.35s/it]

Epoch 30/100 | Train: 39.49% | Val: 71.08% | Best: 71.61% | LR: 0.000001

Epoch 31/100
Keep Rate: 73.48% | Threshold: 2.7003 | Mode: MixUp


Epoch 31/100 | Train: 39.90% | Val: 69.16% | Best: 71.61% | LR: 0.000499:  31%|███       | 31/100 [1:52:35<4:06:21, 214.22s/it]

Epoch 31/100 | Train: 39.90% | Val: 69.16% | Best: 71.61% | LR: 0.000499

Epoch 32/100
Keep Rate: 73.21% | Threshold: 2.6868 | Mode: MixUp


Epoch 32/100 | Train: 39.83% | Val: 69.12% | Best: 71.61% | LR: 0.000495:  32%|███▏      | 32/100 [1:56:09<4:02:50, 214.28s/it]

Epoch 32/100 | Train: 39.83% | Val: 69.12% | Best: 71.61% | LR: 0.000495

Epoch 33/100
Keep Rate: 73.16% | Threshold: 2.6733 | Mode: MixUp


Epoch 33/100 | Train: 37.96% | Val: 69.69% | Best: 71.61% | LR: 0.000488:  33%|███▎      | 33/100 [1:59:42<3:58:53, 213.93s/it]

Epoch 33/100 | Train: 37.96% | Val: 69.69% | Best: 71.61% | LR: 0.000488

Epoch 34/100
Keep Rate: 73.11% | Threshold: 2.6600 | Mode: MixUp


Epoch 34/100 | Train: 37.68% | Val: 69.33% | Best: 71.61% | LR: 0.000478:  34%|███▍      | 34/100 [2:03:16<3:55:07, 213.75s/it]

Epoch 34/100 | Train: 37.68% | Val: 69.33% | Best: 71.61% | LR: 0.000478

Epoch 35/100
Keep Rate: 73.10% | Threshold: 2.6467 | Mode: MixUp


Epoch 35/100 | Train: 39.10% | Val: 68.84% | Best: 71.61% | LR: 0.000467:  35%|███▌      | 35/100 [2:06:58<3:54:16, 216.26s/it]

Epoch 35/100 | Train: 39.10% | Val: 68.84% | Best: 71.61% | LR: 0.000467

Epoch 36/100
Keep Rate: 73.05% | Threshold: 2.6334 | Mode: CutMix


Epoch 36/100 | Train: 52.97% | Val: 70.05% | Best: 71.61% | LR: 0.000452:  36%|███▌      | 36/100 [2:11:13<4:02:59, 227.81s/it]

Epoch 36/100 | Train: 52.97% | Val: 70.05% | Best: 71.61% | LR: 0.000452

Epoch 37/100
Keep Rate: 72.93% | Threshold: 2.6203 | Mode: CutMix


Epoch 37/100 | Train: 54.55% | Val: 70.34% | Best: 71.61% | LR: 0.000436:  37%|███▋      | 37/100 [2:15:00<3:58:56, 227.56s/it]

Epoch 37/100 | Train: 54.55% | Val: 70.34% | Best: 71.61% | LR: 0.000436

Epoch 38/100
Keep Rate: 73.08% | Threshold: 2.6072 | Mode: CutMix


Epoch 38/100 | Train: 54.45% | Val: 70.60% | Best: 71.61% | LR: 0.000417:  38%|███▊      | 38/100 [2:18:49<3:55:36, 228.01s/it]

Epoch 38/100 | Train: 54.45% | Val: 70.60% | Best: 71.61% | LR: 0.000417

Epoch 39/100
Keep Rate: 72.99% | Threshold: 2.5941 | Mode: CutMix


Epoch 39/100 | Train: 55.19% | Val: 69.84% | Best: 71.61% | LR: 0.000397:  39%|███▉      | 39/100 [2:22:37<3:51:56, 228.15s/it]

Epoch 39/100 | Train: 55.19% | Val: 69.84% | Best: 71.61% | LR: 0.000397

Epoch 40/100
Keep Rate: 73.06% | Threshold: 2.5812 | Mode: CutMix


Epoch 40/100 | Train: 57.71% | Val: 70.73% | Best: 71.61% | LR: 0.000375:  40%|████      | 40/100 [2:26:23<3:47:29, 227.49s/it]

Epoch 40/100 | Train: 57.71% | Val: 70.73% | Best: 71.61% | LR: 0.000375

Epoch 41/100
Keep Rate: 73.08% | Threshold: 2.5682 | Mode: CutMix


Epoch 41/100 | Train: 55.72% | Val: 71.12% | Best: 71.61% | LR: 0.000352:  41%|████      | 41/100 [2:30:07<3:42:46, 226.55s/it]

Epoch 41/100 | Train: 55.72% | Val: 71.12% | Best: 71.61% | LR: 0.000352

Epoch 42/100
Keep Rate: 73.35% | Threshold: 2.5554 | Mode: CutMix


Epoch 42/100 | Train: 55.85% | Val: 70.12% | Best: 71.61% | LR: 0.000328:  42%|████▏     | 42/100 [2:33:55<3:39:23, 226.96s/it]

Epoch 42/100 | Train: 55.85% | Val: 70.12% | Best: 71.61% | LR: 0.000328

Epoch 43/100
Keep Rate: 73.43% | Threshold: 2.5426 | Mode: CutMix


Epoch 43/100 | Train: 53.28% | Val: 70.74% | Best: 71.61% | LR: 0.000302:  43%|████▎     | 43/100 [2:37:41<3:35:19, 226.65s/it]

Epoch 43/100 | Train: 53.28% | Val: 70.74% | Best: 71.61% | LR: 0.000302

Epoch 44/100
Keep Rate: 73.37% | Threshold: 2.5299 | Mode: CutMix


Epoch 44/100 | Train: 58.69% | Val: 70.82% | Best: 71.61% | LR: 0.000277:  44%|████▍     | 44/100 [2:41:26<3:30:59, 226.07s/it]

Epoch 44/100 | Train: 58.69% | Val: 70.82% | Best: 71.61% | LR: 0.000277

Epoch 45/100
Keep Rate: 73.61% | Threshold: 2.5173 | Mode: CutMix


Epoch 45/100 | Train: 57.85% | Val: 71.28% | Best: 71.61% | LR: 0.000251:  45%|████▌     | 45/100 [2:45:12<3:27:11, 226.04s/it]

Epoch 45/100 | Train: 57.85% | Val: 71.28% | Best: 71.61% | LR: 0.000251

Epoch 46/100
Keep Rate: 73.67% | Threshold: 2.5047 | Mode: CutMix


Epoch 46/100 | Train: 56.50% | Val: 71.53% | Best: 71.61% | LR: 0.000224:  46%|████▌     | 46/100 [2:49:01<3:24:20, 227.04s/it]

Epoch 46/100 | Train: 56.50% | Val: 71.53% | Best: 71.61% | LR: 0.000224

Epoch 47/100
Keep Rate: 73.89% | Threshold: 2.4922 | Mode: CutMix


Epoch 47/100 | Train: 58.68% | Val: 71.50% | Best: 71.61% | LR: 0.000199:  47%|████▋     | 47/100 [2:53:05<3:25:00, 232.08s/it]

Epoch 47/100 | Train: 58.68% | Val: 71.50% | Best: 71.61% | LR: 0.000199

Epoch 48/100
Keep Rate: 73.86% | Threshold: 2.4797 | Mode: CutMix


Epoch 48/100 | Train: 59.49% | Val: 71.34% | Best: 71.61% | LR: 0.000173:  48%|████▊     | 48/100 [2:57:33<3:30:21, 242.72s/it]

Epoch 48/100 | Train: 59.49% | Val: 71.34% | Best: 71.61% | LR: 0.000173

Epoch 49/100
Keep Rate: 73.91% | Threshold: 2.4673 | Mode: CutMix


Epoch 49/100 | Train: 57.84% | Val: 71.88% | Best: 71.88% | LR: 0.000149:  49%|████▉     | 49/100 [3:01:31<3:25:17, 241.53s/it]

Epoch 49/100 | Train: 57.84% | Val: 71.88% | Best: 71.88% | LR: 0.000149

Epoch 50/100
Keep Rate: 74.00% | Threshold: 2.4550 | Mode: CutMix


Epoch 50/100 | Train: 59.73% | Val: 71.58% | Best: 71.88% | LR: 0.000126:  50%|█████     | 50/100 [3:05:14<3:16:37, 235.95s/it]

Epoch 50/100 | Train: 59.73% | Val: 71.58% | Best: 71.88% | LR: 0.000126

Epoch 51/100
Keep Rate: 73.93% | Threshold: 2.4427 | Mode: CutMix


Epoch 51/100 | Train: 59.10% | Val: 71.67% | Best: 71.88% | LR: 0.000104:  51%|█████     | 51/100 [3:09:01<3:10:21, 233.09s/it]

Epoch 51/100 | Train: 59.10% | Val: 71.67% | Best: 71.88% | LR: 0.000104

Epoch 52/100
Keep Rate: 74.15% | Threshold: 2.4305 | Mode: CutMix


Epoch 52/100 | Train: 57.47% | Val: 71.28% | Best: 71.88% | LR: 0.000084:  52%|█████▏    | 52/100 [3:13:08<3:09:45, 237.19s/it]

Epoch 52/100 | Train: 57.47% | Val: 71.28% | Best: 71.88% | LR: 0.000084

Epoch 53/100
Keep Rate: 74.23% | Threshold: 2.4183 | Mode: CutMix


Epoch 53/100 | Train: 60.00% | Val: 71.53% | Best: 71.88% | LR: 0.000065:  53%|█████▎    | 53/100 [3:16:54<3:03:19, 234.04s/it]

Epoch 53/100 | Train: 60.00% | Val: 71.53% | Best: 71.88% | LR: 0.000065

Epoch 54/100
Keep Rate: 74.15% | Threshold: 2.4062 | Mode: CutMix


Epoch 54/100 | Train: 58.47% | Val: 71.49% | Best: 71.88% | LR: 0.000049:  54%|█████▍    | 54/100 [3:20:42<2:57:53, 232.04s/it]

Epoch 54/100 | Train: 58.47% | Val: 71.49% | Best: 71.88% | LR: 0.000049

Epoch 55/100
Keep Rate: 74.05% | Threshold: 2.3942 | Mode: CutMix


Epoch 55/100 | Train: 60.17% | Val: 71.49% | Best: 71.88% | LR: 0.000034:  55%|█████▌    | 55/100 [3:24:30<2:53:09, 230.87s/it]

Epoch 55/100 | Train: 60.17% | Val: 71.49% | Best: 71.88% | LR: 0.000034

Epoch 56/100
Keep Rate: 74.16% | Threshold: 2.3822 | Mode: CutMix


Epoch 56/100 | Train: 60.41% | Val: 71.47% | Best: 71.88% | LR: 0.000023:  56%|█████▌    | 56/100 [3:28:15<2:48:04, 229.19s/it]

Epoch 56/100 | Train: 60.41% | Val: 71.47% | Best: 71.88% | LR: 0.000023

Epoch 57/100
Keep Rate: 74.10% | Threshold: 2.3703 | Mode: CutMix


Epoch 57/100 | Train: 57.97% | Val: 71.48% | Best: 71.88% | LR: 0.000013:  57%|█████▋    | 57/100 [3:32:11<2:45:37, 231.11s/it]

Epoch 57/100 | Train: 57.97% | Val: 71.48% | Best: 71.88% | LR: 0.000013

Epoch 58/100
Keep Rate: 73.99% | Threshold: 2.3585 | Mode: CutMix


Epoch 58/100 | Train: 60.36% | Val: 71.84% | Best: 71.88% | LR: 0.000006:  58%|█████▊    | 58/100 [3:35:54<2:40:12, 228.86s/it]

Epoch 58/100 | Train: 60.36% | Val: 71.84% | Best: 71.88% | LR: 0.000006

Epoch 59/100
Keep Rate: 74.05% | Threshold: 2.3467 | Mode: CutMix


Epoch 59/100 | Train: 57.83% | Val: 71.74% | Best: 71.88% | LR: 0.000002:  59%|█████▉    | 59/100 [3:39:35<2:34:46, 226.51s/it]

Epoch 59/100 | Train: 57.83% | Val: 71.74% | Best: 71.88% | LR: 0.000002

Epoch 60/100
Keep Rate: 73.95% | Threshold: 2.3349 | Mode: CutMix


Epoch 60/100 | Train: 61.53% | Val: 71.62% | Best: 71.88% | LR: 0.000001:  60%|██████    | 60/100 [3:43:20<2:30:41, 226.04s/it]

Epoch 60/100 | Train: 61.53% | Val: 71.62% | Best: 71.88% | LR: 0.000001

Epoch 61/100
Keep Rate: 72.48% | Threshold: 2.3233 | Mode: CutMix


Epoch 61/100 | Train: 58.07% | Val: 69.42% | Best: 71.88% | LR: 0.000499:  61%|██████    | 61/100 [3:47:04<2:26:27, 225.31s/it]

Epoch 61/100 | Train: 58.07% | Val: 69.42% | Best: 71.88% | LR: 0.000499

Epoch 62/100
Keep Rate: 72.03% | Threshold: 2.3116 | Mode: CutMix


Epoch 62/100 | Train: 58.91% | Val: 69.50% | Best: 71.88% | LR: 0.000495:  62%|██████▏   | 62/100 [3:50:55<2:23:47, 227.03s/it]

Epoch 62/100 | Train: 58.91% | Val: 69.50% | Best: 71.88% | LR: 0.000495

Epoch 63/100
Keep Rate: 72.22% | Threshold: 2.3001 | Mode: CutMix


Epoch 63/100 | Train: 57.16% | Val: 69.56% | Best: 71.88% | LR: 0.000488:  63%|██████▎   | 63/100 [3:54:48<2:21:03, 228.76s/it]

Epoch 63/100 | Train: 57.16% | Val: 69.56% | Best: 71.88% | LR: 0.000488

Epoch 64/100
Keep Rate: 71.98% | Threshold: 2.2886 | Mode: CutMix


Epoch 64/100 | Train: 58.46% | Val: 70.25% | Best: 71.88% | LR: 0.000478:  64%|██████▍   | 64/100 [3:58:39<2:17:47, 229.65s/it]

Epoch 64/100 | Train: 58.46% | Val: 70.25% | Best: 71.88% | LR: 0.000478

Epoch 65/100
Keep Rate: 72.03% | Threshold: 2.2771 | Mode: CutMix


Epoch 65/100 | Train: 60.48% | Val: 71.15% | Best: 71.88% | LR: 0.000467:  65%|██████▌   | 65/100 [4:02:30<2:14:09, 229.99s/it]

Epoch 65/100 | Train: 60.48% | Val: 71.15% | Best: 71.88% | LR: 0.000467

Epoch 66/100
Keep Rate: 71.96% | Threshold: 2.2658 | Mode: CutMix


Epoch 66/100 | Train: 60.77% | Val: 69.74% | Best: 71.88% | LR: 0.000452:  66%|██████▌   | 66/100 [4:06:22<2:10:36, 230.49s/it]

Epoch 66/100 | Train: 60.77% | Val: 69.74% | Best: 71.88% | LR: 0.000452

Epoch 67/100
Keep Rate: 72.04% | Threshold: 2.2544 | Mode: CutMix


Epoch 67/100 | Train: 60.54% | Val: 70.13% | Best: 71.88% | LR: 0.000436:  67%|██████▋   | 67/100 [4:10:13<2:06:56, 230.80s/it]

Epoch 67/100 | Train: 60.54% | Val: 70.13% | Best: 71.88% | LR: 0.000436

Epoch 68/100
Keep Rate: 72.51% | Threshold: 2.2432 | Mode: CutMix


Epoch 68/100 | Train: 58.81% | Val: 69.38% | Best: 71.88% | LR: 0.000417:  68%|██████▊   | 68/100 [4:13:59<2:02:16, 229.27s/it]

Epoch 68/100 | Train: 58.81% | Val: 69.38% | Best: 71.88% | LR: 0.000417

Epoch 69/100
Keep Rate: 72.29% | Threshold: 2.2319 | Mode: CutMix


Epoch 69/100 | Train: 60.61% | Val: 69.97% | Best: 71.88% | LR: 0.000397:  69%|██████▉   | 69/100 [4:17:59<2:00:02, 232.34s/it]

Epoch 69/100 | Train: 60.61% | Val: 69.97% | Best: 71.88% | LR: 0.000397

Epoch 70/100
Keep Rate: 72.32% | Threshold: 2.2208 | Mode: CutMix


Epoch 70/100 | Train: 58.79% | Val: 69.57% | Best: 71.88% | LR: 0.000375:  70%|███████   | 70/100 [4:21:58<1:57:12, 234.42s/it]

Epoch 70/100 | Train: 58.79% | Val: 69.57% | Best: 71.88% | LR: 0.000375

Epoch 71/100
Keep Rate: 72.35% | Threshold: 2.2097 | Mode: CutMix


Epoch 71/100 | Train: 58.95% | Val: 70.28% | Best: 71.88% | LR: 0.000352:  71%|███████   | 71/100 [4:25:47<1:52:30, 232.77s/it]

Epoch 71/100 | Train: 58.95% | Val: 70.28% | Best: 71.88% | LR: 0.000352

Epoch 72/100
Keep Rate: 72.67% | Threshold: 2.1986 | Mode: CutMix


Epoch 72/100 | Train: 60.00% | Val: 69.74% | Best: 71.88% | LR: 0.000328:  72%|███████▏  | 72/100 [4:29:37<1:48:14, 231.95s/it]

Epoch 72/100 | Train: 60.00% | Val: 69.74% | Best: 71.88% | LR: 0.000328

Epoch 73/100
Keep Rate: 72.76% | Threshold: 2.1876 | Mode: CutMix


Epoch 73/100 | Train: 57.35% | Val: 69.73% | Best: 71.88% | LR: 0.000302:  73%|███████▎  | 73/100 [4:33:29<1:44:22, 231.94s/it]

Epoch 73/100 | Train: 57.35% | Val: 69.73% | Best: 71.88% | LR: 0.000302

Epoch 74/100
Keep Rate: 73.01% | Threshold: 2.1767 | Mode: CutMix


Epoch 74/100 | Train: 63.27% | Val: 70.59% | Best: 71.88% | LR: 0.000277:  74%|███████▍  | 74/100 [4:37:17<1:40:05, 231.00s/it]

Epoch 74/100 | Train: 63.27% | Val: 70.59% | Best: 71.88% | LR: 0.000277

Epoch 75/100
Keep Rate: 72.85% | Threshold: 2.1658 | Mode: CutMix


Epoch 75/100 | Train: 61.43% | Val: 70.50% | Best: 71.88% | LR: 0.000251:  75%|███████▌  | 75/100 [4:41:05<1:35:48, 229.92s/it]

Epoch 75/100 | Train: 61.43% | Val: 70.50% | Best: 71.88% | LR: 0.000251

Epoch 76/100
Keep Rate: 73.03% | Threshold: 2.1550 | Mode: CutMix


Epoch 76/100 | Train: 62.08% | Val: 70.09% | Best: 71.88% | LR: 0.000224:  76%|███████▌  | 76/100 [4:44:54<1:31:54, 229.76s/it]

Epoch 76/100 | Train: 62.08% | Val: 70.09% | Best: 71.88% | LR: 0.000224

Epoch 77/100
Keep Rate: 73.24% | Threshold: 2.1442 | Mode: CutMix


Epoch 77/100 | Train: 62.78% | Val: 70.54% | Best: 71.88% | LR: 0.000199:  77%|███████▋  | 77/100 [4:48:40<1:27:39, 228.69s/it]

Epoch 77/100 | Train: 62.78% | Val: 70.54% | Best: 71.88% | LR: 0.000199

Epoch 78/100
Keep Rate: 73.31% | Threshold: 2.1335 | Mode: CutMix


Epoch 78/100 | Train: 63.28% | Val: 70.47% | Best: 71.88% | LR: 0.000173:  78%|███████▊  | 78/100 [4:52:29<1:23:52, 228.74s/it]

Epoch 78/100 | Train: 63.28% | Val: 70.47% | Best: 71.88% | LR: 0.000173

Epoch 79/100
Keep Rate: 73.39% | Threshold: 2.1228 | Mode: CutMix


Epoch 79/100 | Train: 62.78% | Val: 70.77% | Best: 71.88% | LR: 0.000149:  79%|███████▉  | 79/100 [4:56:17<1:19:57, 228.43s/it]

Epoch 79/100 | Train: 62.78% | Val: 70.77% | Best: 71.88% | LR: 0.000149

Epoch 80/100
Keep Rate: 73.39% | Threshold: 2.1122 | Mode: CutMix


Epoch 80/100 | Train: 62.94% | Val: 70.84% | Best: 71.88% | LR: 0.000126:  80%|████████  | 80/100 [5:00:04<1:16:00, 228.01s/it]

Epoch 80/100 | Train: 62.94% | Val: 70.84% | Best: 71.88% | LR: 0.000126

Epoch 81/100
Keep Rate: 73.55% | Threshold: 2.1016 | Mode: CutMix


Epoch 81/100 | Train: 65.35% | Val: 70.49% | Best: 71.88% | LR: 0.000104:  81%|████████  | 81/100 [5:03:55<1:12:30, 228.97s/it]

Epoch 81/100 | Train: 65.35% | Val: 70.49% | Best: 71.88% | LR: 0.000104

Epoch 82/100
Keep Rate: 73.69% | Threshold: 2.0911 | Mode: CutMix


Epoch 82/100 | Train: 62.80% | Val: 70.52% | Best: 71.88% | LR: 0.000084:  82%|████████▏ | 82/100 [5:07:45<1:08:47, 229.33s/it]

Epoch 82/100 | Train: 62.80% | Val: 70.52% | Best: 71.88% | LR: 0.000084

Epoch 83/100
Keep Rate: 73.58% | Threshold: 2.0807 | Mode: CutMix


Epoch 83/100 | Train: 63.66% | Val: 70.71% | Best: 71.88% | LR: 0.000065:  83%|████████▎ | 83/100 [5:11:34<1:04:55, 229.16s/it]

Epoch 83/100 | Train: 63.66% | Val: 70.71% | Best: 71.88% | LR: 0.000065

Epoch 84/100
Keep Rate: 73.66% | Threshold: 2.0703 | Mode: CutMix


Epoch 84/100 | Train: 64.89% | Val: 70.66% | Best: 71.88% | LR: 0.000049:  83%|████████▎ | 83/100 [5:15:28<1:04:36, 228.06s/it]
  checkpoint = torch.load('./best_model.pth')


Epoch 84/100 | Train: 64.89% | Val: 70.66% | Best: 71.88% | LR: 0.000049

Early stopping triggered at epoch 84
Best Val Accuracy: 71.88% at epoch 49


Training Complete!
Best Val Accuracy: 71.88% at epoch 49
Loading best model for inference...

Best model loaded (Epoch 49, Val Acc: 71.88%)

Generating predictions...

Submission saved to: ./submission.csv
Best model saved to: ./best_model.pth
Best Val Accuracy: 71.88% (Epoch 49)
Training stopped early (patience reached)

