In [2]:
import os
import gc
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import models
from sklearn.metrics import classification_report, f1_score
import numpy as np
from tqdm import tqdm
import pandas as pd

# =====================================================
# CONFIGURATION
# =====================================================
DATA_DIR = "./data/processed"
SAVE_DIR = "./resnet-18"
os.makedirs(SAVE_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

EPOCHS = 20
PATIENCE = 4
BATCH_SIZE = 64
LEARNING_RATE = 1e-4
TRAIN_LIMIT = 10000
TEST_LIMIT = 2000
VAL_SPLIT = 0.1  # 10% for validation

# =====================================================
# HELPER FUNCTIONS
# =====================================================

def load_dataset(dataset_name, split_type, aug_type):
    """Load dataset from pickle file."""
    path = os.path.join(DATA_DIR, f"{dataset_name}_{split_type}", f"{aug_type}.pkl")
    print(f"Loading: {path}")
    with open(path, "rb") as f:
        data = pickle.load(f)
    return data["images"], data["labels"]

def preprocess_images(images, dataset_type):
    """Convert images to tensor format and normalize."""
    # Ensure 4D: (N, H, W, C)
    if images.ndim == 3:
        images = np.expand_dims(images, axis=-1)
    
    # Transpose to PyTorch format: (N, C, H, W)
    images = images.transpose(0, 3, 1, 2)
    
    # Convert to tensor and normalize to [0, 1]
    tensor = torch.tensor(images, dtype=torch.float32)
    if tensor.max() > 1.0:
        tensor = tensor / 255.0
    
    # Convert grayscale to RGB by repeating channels
    if dataset_type == "grayscale" and tensor.shape[1] == 1:
        tensor = tensor.repeat(1, 3, 1, 1)
    
    return tensor

def get_model(num_classes, pretrained=True):
    """Initialize ResNet-18 model."""
    if pretrained:
        model = models.resnet18(weights="IMAGENET1K_V1")
    else:
        model = models.resnet18(weights=None)
    
    # Replace final layer for our number of classes
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model.to(device)

def train_model(model, train_loader, val_loader, epochs=EPOCHS, patience=PATIENCE):
    """Train model with early stopping."""
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    
    best_val_f1 = 0
    patience_counter = 0
    best_state = None
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        train_correct = 0
        train_total = 0
        
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
        for X, y in pbar:
            X, y = X.to(device), y.to(device)
            
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            train_total += y.size(0)
            train_correct += (predicted == y).sum().item()
            
            pbar.set_postfix({'loss': f'{loss.item():.4f}'})
        
        avg_loss = running_loss / len(train_loader)
        train_acc = 100 * train_correct / train_total
        
        # Validation phase
        val_f1, val_acc = evaluate_model(model, val_loader, return_accuracy=True)
        
        print(f"Epoch {epoch+1}/{epochs} - "
              f"Loss: {avg_loss:.4f}, "
              f"Train Acc: {train_acc:.2f}%, "
              f"Val Acc: {val_acc:.2f}%, "
              f"Val F1: {val_f1:.4f}")
        
        # Early stopping check
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            patience_counter = 0
            best_state = model.state_dict().copy()
            print(f"  ✓ New best validation F1: {best_val_f1:.4f}")
        else:
            patience_counter += 1
            print(f"  No improvement ({patience_counter}/{patience})")
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break
    
    # Load best model
    if best_state is not None:
        model.load_state_dict(best_state)
        print(f"\nLoaded best model with Val F1: {best_val_f1:.4f}")
    
    return model, best_val_f1

def evaluate_model(model, data_loader, return_accuracy=False, silent=False):
    """Evaluate model on given data loader."""
    model.eval()
    preds, true = [], []
    
    with torch.no_grad():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            _, predicted = torch.max(outputs, 1)
            preds.extend(predicted.cpu().numpy())
            true.extend(y.cpu().numpy())
    
    f1 = f1_score(true, preds, average="macro")
    
    if return_accuracy:
        acc = 100 * np.mean(np.array(true) == np.array(preds))
        return f1, acc
    
    if not silent:
        report = classification_report(true, preds, digits=4)
        print(report)
        return f1, true, preds
    
    return f1

def clear_memory():
    """Clear GPU and CPU memory."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# =====================================================
# MAIN EXECUTION
# =====================================================

datasets = {
    "mnist": {"classes": 10, "type": "grayscale"},
    "cifar10": {"classes": 10, "type": "color"}
}

train_types = ["original", "mixed_augmented", "combined_augmented"]
test_types = ["original", "all_combined", "noise", "occlusion_25", "rotation_15", "scaling_0.8"]

results = []

for dataset_name, info in datasets.items():
    print(f"\n{'='*80}")
    print(f"PROCESSING DATASET: {dataset_name.upper()}")
    print(f"{'='*80}\n")
    
    for train_aug in train_types:
        print(f"\n{'*'*60}")
        print(f"Training configuration: {train_aug}")
        print(f"{'*'*60}\n")
        
        # Load training data ONCE
        train_images, train_labels = load_dataset(dataset_name, "train", train_aug)
        train_images = train_images[:TRAIN_LIMIT]
        train_labels = train_labels[:TRAIN_LIMIT]
        
        print(f"Training samples: {len(train_images)}")
        print(f"Image shape: {train_images.shape}")
        print(f"Label distribution: {np.bincount(train_labels)}")
        
        # Preprocess training data
        X_train = preprocess_images(train_images, info["type"])
        y_train = torch.tensor(train_labels, dtype=torch.long)
        
        # Create train/val split
        full_dataset = TensorDataset(X_train, y_train)
        train_size = int((1 - VAL_SPLIT) * len(full_dataset))
        val_size = len(full_dataset) - train_size
        
        # Use fixed seed for reproducibility
        torch.manual_seed(42)
        train_subset, val_subset = torch.utils.data.random_split(
            full_dataset, [train_size, val_size]
        )
        
        train_loader = DataLoader(
            train_subset, 
            batch_size=BATCH_SIZE, 
            shuffle=True,
            num_workers=0,
            pin_memory=True if torch.cuda.is_available() else False
        )
        val_loader = DataLoader(
            val_subset, 
            batch_size=BATCH_SIZE,
            num_workers=0,
            pin_memory=True if torch.cuda.is_available() else False
        )
        
        print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}\n")
        
        # Train model ONCE per training configuration
        model = get_model(info["classes"])
        model, best_val_f1 = train_model(model, train_loader, val_loader)
        
        # Evaluate on all test sets
        for test_aug in test_types:
            print(f"\n{'─'*60}")
            print(f"Evaluating on test set: {test_aug}")
            print(f"{'─'*60}")
            
            # Load test data
            test_images, test_labels = load_dataset(dataset_name, "test", test_aug)
            test_images = test_images[:TEST_LIMIT]
            test_labels = test_labels[:TEST_LIMIT]
            
            print(f"Test samples: {len(test_images)}")
            
            # Preprocess test data
            X_test = preprocess_images(test_images, info["type"])
            y_test = torch.tensor(test_labels, dtype=torch.long)
            
            test_loader = DataLoader(
                TensorDataset(X_test, y_test), 
                batch_size=BATCH_SIZE,
                num_workers=0,
                pin_memory=True if torch.cuda.is_available() else False
            )
            
            # Evaluate
            test_f1, y_true, y_pred = evaluate_model(model, test_loader)
            test_acc = 100 * np.mean(np.array(y_true) == np.array(y_pred))
            
            print(f"\nTest Accuracy: {test_acc:.2f}%")
            print(f"Test F1 Score: {test_f1:.4f}")
            
            # Save detailed report
            report = classification_report(y_true, y_pred, digits=4)
            report_path = os.path.join(
                SAVE_DIR, 
                f"{dataset_name}_train-{train_aug}_test-{test_aug}.txt"
            )
            with open(report_path, "w") as f:
                f.write(f"Dataset: {dataset_name}\n")
                f.write(f"Training Type: {train_aug}\n")
                f.write(f"Test Type: {test_aug}\n")
                f.write(f"{'='*60}\n\n")
                f.write(report)
                f.write(f"\n{'='*60}\n")
                f.write(f"Best Validation F1: {best_val_f1:.4f}\n")
                f.write(f"Test F1: {test_f1:.4f}\n")
                f.write(f"Test Accuracy: {test_acc:.2f}%\n")
            
            # Store results
            results.append({
                "dataset": dataset_name,
                "train_type": train_aug,
                "test_type": test_aug,
                "best_val_f1": best_val_f1,
                "test_f1": test_f1,
                "test_accuracy": test_acc
            })
            
            # Clean up test data
            del X_test, y_test, test_loader, test_images, test_labels
            clear_memory()
        
        # Clean up model and training data after all test evaluations
        del model, X_train, y_train, train_loader, val_loader, train_images, train_labels
        del train_subset, val_subset, full_dataset
        clear_memory()
        
        print(f"\n✓ Completed training configuration: {train_aug}\n")


Using device: cuda

PROCESSING DATASET: MNIST


************************************************************
Training configuration: original
************************************************************

Loading: ./data/processed\mnist_train\original.pkl
Training samples: 10000
Image shape: (10000, 128, 128, 1)
Label distribution: [1001 1127  991 1032  980  863 1014 1070  944  978]
Train batches: 141, Val batches: 16



Epoch 1/20: 100%|██████████| 141/141 [00:04<00:00, 32.41it/s, loss=0.0972]


Epoch 1/20 - Loss: 0.1761, Train Acc: 95.23%, Val Acc: 98.30%, Val F1: 0.9829
  ✓ New best validation F1: 0.9829


Epoch 2/20: 100%|██████████| 141/141 [00:04<00:00, 34.94it/s, loss=0.0025]


Epoch 2/20 - Loss: 0.0210, Train Acc: 99.53%, Val Acc: 99.00%, Val F1: 0.9896
  ✓ New best validation F1: 0.9896


Epoch 3/20: 100%|██████████| 141/141 [00:04<00:00, 34.96it/s, loss=0.0115]


Epoch 3/20 - Loss: 0.0103, Train Acc: 99.72%, Val Acc: 98.80%, Val F1: 0.9879
  No improvement (1/4)


Epoch 4/20: 100%|██████████| 141/141 [00:04<00:00, 33.14it/s, loss=0.0026]


Epoch 4/20 - Loss: 0.0068, Train Acc: 99.82%, Val Acc: 98.50%, Val F1: 0.9851
  No improvement (2/4)


Epoch 5/20: 100%|██████████| 141/141 [00:04<00:00, 34.29it/s, loss=0.0281]


Epoch 5/20 - Loss: 0.0067, Train Acc: 99.86%, Val Acc: 99.00%, Val F1: 0.9898
  ✓ New best validation F1: 0.9898


Epoch 6/20: 100%|██████████| 141/141 [00:08<00:00, 16.52it/s, loss=0.0031]


Epoch 6/20 - Loss: 0.0032, Train Acc: 99.94%, Val Acc: 98.70%, Val F1: 0.9868
  No improvement (1/4)


Epoch 7/20: 100%|██████████| 141/141 [00:04<00:00, 33.24it/s, loss=0.0011]


Epoch 7/20 - Loss: 0.0050, Train Acc: 99.88%, Val Acc: 98.40%, Val F1: 0.9840
  No improvement (2/4)


Epoch 8/20: 100%|██████████| 141/141 [00:04<00:00, 33.87it/s, loss=0.0014]


Epoch 8/20 - Loss: 0.0116, Train Acc: 99.71%, Val Acc: 99.10%, Val F1: 0.9910
  ✓ New best validation F1: 0.9910


Epoch 9/20: 100%|██████████| 141/141 [00:04<00:00, 34.82it/s, loss=0.0002]


Epoch 9/20 - Loss: 0.0046, Train Acc: 99.89%, Val Acc: 98.80%, Val F1: 0.9877
  No improvement (1/4)


Epoch 10/20: 100%|██████████| 141/141 [00:04<00:00, 34.62it/s, loss=0.0008]


Epoch 10/20 - Loss: 0.0054, Train Acc: 99.88%, Val Acc: 99.10%, Val F1: 0.9906
  No improvement (2/4)


Epoch 11/20: 100%|██████████| 141/141 [00:04<00:00, 32.30it/s, loss=0.0003]


Epoch 11/20 - Loss: 0.0018, Train Acc: 99.96%, Val Acc: 99.10%, Val F1: 0.9908
  No improvement (3/4)


Epoch 12/20: 100%|██████████| 141/141 [00:04<00:00, 34.60it/s, loss=0.0001]


Epoch 12/20 - Loss: 0.0010, Train Acc: 99.99%, Val Acc: 99.10%, Val F1: 0.9906
  No improvement (4/4)
Early stopping triggered at epoch 12

Loaded best model with Val F1: 0.9910

────────────────────────────────────────────────────────────
Evaluating on test set: original
────────────────────────────────────────────────────────────
Loading: ./data/processed\mnist_test\original.pkl
Test samples: 2000
              precision    recall  f1-score   support

           0     0.9831    0.9943    0.9886       175
           1     0.9957    1.0000    0.9979       234
           2     0.9954    0.9863    0.9908       219
           3     0.9904    0.9952    0.9928       207
           4     0.9908    0.9908    0.9908       217
           5     0.9943    0.9777    0.9859       179
           6     0.9777    0.9831    0.9804       178
           7     0.9854    0.9902    0.9878       205
           8     0.9896    0.9948    0.9922       192
           9     0.9896    0.9794    0.9845       194

 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0     1.0000    0.0457    0.0874       175
           1     0.0000    0.0000    0.0000       234
           2     1.0000    0.0731    0.1362       219
           3     0.1257    1.0000    0.2233       207
           4     0.0000    0.0000    0.0000       217
           5     0.0000    0.0000    0.0000       179
           6     1.0000    0.0618    0.1164       178
           7     0.0000    0.0000    0.0000       205
           8     0.2799    0.4635    0.3490       192
           9     0.0000    0.0000    0.0000       194

    accuracy                         0.1655      2000
   macro avg     0.3406    0.1644    0.0912      2000
weighted avg     0.3259    0.1655    0.0895      2000


Test Accuracy: 16.55%
Test F1 Score: 0.0912

────────────────────────────────────────────────────────────
Evaluating on test set: occlusion_25
────────────────────────────────────────────────────────────
Loading: ./data/processed\mnist_test

Epoch 1/20: 100%|██████████| 141/141 [00:04<00:00, 33.32it/s, loss=0.0577]


Epoch 1/20 - Loss: 0.1755, Train Acc: 95.24%, Val Acc: 98.20%, Val F1: 0.9817
  ✓ New best validation F1: 0.9817


Epoch 2/20: 100%|██████████| 141/141 [00:04<00:00, 34.41it/s, loss=0.0143]


Epoch 2/20 - Loss: 0.0190, Train Acc: 99.47%, Val Acc: 98.80%, Val F1: 0.9879
  ✓ New best validation F1: 0.9879


Epoch 3/20: 100%|██████████| 141/141 [00:04<00:00, 34.84it/s, loss=0.0005]


Epoch 3/20 - Loss: 0.0076, Train Acc: 99.83%, Val Acc: 98.60%, Val F1: 0.9863
  No improvement (1/4)


Epoch 4/20: 100%|██████████| 141/141 [00:04<00:00, 34.28it/s, loss=0.0012]


Epoch 4/20 - Loss: 0.0068, Train Acc: 99.82%, Val Acc: 99.10%, Val F1: 0.9910
  ✓ New best validation F1: 0.9910


Epoch 5/20: 100%|██████████| 141/141 [00:04<00:00, 34.47it/s, loss=0.0041]


Epoch 5/20 - Loss: 0.0037, Train Acc: 99.92%, Val Acc: 98.50%, Val F1: 0.9845
  No improvement (1/4)


Epoch 6/20: 100%|██████████| 141/141 [00:04<00:00, 35.04it/s, loss=0.0021]


Epoch 6/20 - Loss: 0.0058, Train Acc: 99.87%, Val Acc: 98.90%, Val F1: 0.9888
  No improvement (2/4)


Epoch 7/20: 100%|██████████| 141/141 [00:04<00:00, 34.68it/s, loss=0.0024]


Epoch 7/20 - Loss: 0.0111, Train Acc: 99.59%, Val Acc: 98.00%, Val F1: 0.9802
  No improvement (3/4)


Epoch 8/20: 100%|██████████| 141/141 [00:04<00:00, 34.43it/s, loss=0.0022]


Epoch 8/20 - Loss: 0.0109, Train Acc: 99.68%, Val Acc: 98.60%, Val F1: 0.9861
  No improvement (4/4)
Early stopping triggered at epoch 8

Loaded best model with Val F1: 0.9910

────────────────────────────────────────────────────────────
Evaluating on test set: original
────────────────────────────────────────────────────────────
Loading: ./data/processed\mnist_test\original.pkl
Test samples: 2000
              precision    recall  f1-score   support

           0     0.9943    0.9943    0.9943       175
           1     0.9791    1.0000    0.9894       234
           2     0.9907    0.9680    0.9792       219
           3     0.9951    0.9807    0.9878       207
           4     0.9772    0.9862    0.9817       217
           5     0.9833    0.9888    0.9861       179
           6     0.9944    0.9888    0.9915       178
           7     0.9619    0.9854    0.9735       205
           8     0.9947    0.9792    0.9869       192
           9     0.9845    0.9794    0.9819       194

   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0     0.6303    0.5943    0.6118       175
           1     0.2812    1.0000    0.4390       234
           2     1.0000    0.0091    0.0181       219
           3     0.0000    0.0000    0.0000       207
           4     0.0000    0.0000    0.0000       217
           5     0.0000    0.0000    0.0000       179
           6     0.0000    0.0000    0.0000       178
           7     0.0000    0.0000    0.0000       205
           8     0.1888    0.9844    0.3168       192
           9     0.0000    0.0000    0.0000       194

    accuracy                         0.2645      2000
   macro avg     0.2100    0.2588    0.1386      2000
weighted avg     0.2157    0.2645    0.1373      2000


Test Accuracy: 26.45%
Test F1 Score: 0.1386

────────────────────────────────────────────────────────────
Evaluating on test set: occlusion_25
────────────────────────────────────────────────────────────
Loading: ./data/processed\mnist_test

Epoch 1/20: 100%|██████████| 141/141 [00:04<00:00, 33.38it/s, loss=0.2667]


Epoch 1/20 - Loss: 0.3383, Train Acc: 90.03%, Val Acc: 97.20%, Val F1: 0.9715
  ✓ New best validation F1: 0.9715


Epoch 2/20: 100%|██████████| 141/141 [00:04<00:00, 34.95it/s, loss=0.1051]


Epoch 2/20 - Loss: 0.0447, Train Acc: 98.76%, Val Acc: 97.20%, Val F1: 0.9718
  ✓ New best validation F1: 0.9718


Epoch 3/20: 100%|██████████| 141/141 [00:04<00:00, 35.06it/s, loss=0.0040]


Epoch 3/20 - Loss: 0.0161, Train Acc: 99.59%, Val Acc: 97.60%, Val F1: 0.9750
  ✓ New best validation F1: 0.9750


Epoch 4/20: 100%|██████████| 141/141 [00:04<00:00, 34.46it/s, loss=0.0117]


Epoch 4/20 - Loss: 0.0075, Train Acc: 99.87%, Val Acc: 97.40%, Val F1: 0.9736
  No improvement (1/4)


Epoch 5/20: 100%|██████████| 141/141 [00:04<00:00, 34.66it/s, loss=0.0028]


Epoch 5/20 - Loss: 0.0083, Train Acc: 99.77%, Val Acc: 97.20%, Val F1: 0.9714
  No improvement (2/4)


Epoch 6/20: 100%|██████████| 141/141 [00:03<00:00, 35.54it/s, loss=0.0010]


Epoch 6/20 - Loss: 0.0088, Train Acc: 99.83%, Val Acc: 97.40%, Val F1: 0.9735
  No improvement (3/4)


Epoch 7/20: 100%|██████████| 141/141 [00:04<00:00, 35.07it/s, loss=0.0009]


Epoch 7/20 - Loss: 0.0051, Train Acc: 99.90%, Val Acc: 96.30%, Val F1: 0.9625
  No improvement (4/4)
Early stopping triggered at epoch 7

Loaded best model with Val F1: 0.9750

────────────────────────────────────────────────────────────
Evaluating on test set: original
────────────────────────────────────────────────────────────
Loading: ./data/processed\mnist_test\original.pkl
Test samples: 2000
              precision    recall  f1-score   support

           0     0.9162    1.0000    0.9563       175
           1     0.9912    0.9573    0.9739       234
           2     0.9945    0.8219    0.9000       219
           3     1.0000    0.8599    0.9247       207
           4     0.9791    0.8618    0.9167       217
           5     0.8693    0.9665    0.9153       179
           6     0.9545    0.9438    0.9492       178
           7     0.8904    0.9902    0.9376       205
           8     0.6971    0.9948    0.8197       192
           9     0.9808    0.7887    0.8743       194

   

Epoch 1/20: 100%|██████████| 141/141 [00:04<00:00, 32.03it/s, loss=0.6038]


Epoch 1/20 - Loss: 0.6546, Train Acc: 78.83%, Val Acc: 89.40%, Val F1: 0.8919
  ✓ New best validation F1: 0.8919


Epoch 2/20: 100%|██████████| 141/141 [00:04<00:00, 33.94it/s, loss=0.1954]


Epoch 2/20 - Loss: 0.1255, Train Acc: 96.59%, Val Acc: 89.20%, Val F1: 0.8907
  No improvement (1/4)


Epoch 3/20: 100%|██████████| 141/141 [00:04<00:00, 34.77it/s, loss=0.0276]


Epoch 3/20 - Loss: 0.0309, Train Acc: 99.58%, Val Acc: 91.10%, Val F1: 0.9096
  ✓ New best validation F1: 0.9096


Epoch 4/20: 100%|██████████| 141/141 [00:04<00:00, 31.69it/s, loss=0.0132]


Epoch 4/20 - Loss: 0.0111, Train Acc: 99.94%, Val Acc: 91.50%, Val F1: 0.9140
  ✓ New best validation F1: 0.9140


Epoch 5/20: 100%|██████████| 141/141 [00:04<00:00, 33.06it/s, loss=0.0040]


Epoch 5/20 - Loss: 0.0056, Train Acc: 99.97%, Val Acc: 91.90%, Val F1: 0.9175
  ✓ New best validation F1: 0.9175


Epoch 6/20: 100%|██████████| 141/141 [00:07<00:00, 18.17it/s, loss=0.0018]


Epoch 6/20 - Loss: 0.0038, Train Acc: 99.99%, Val Acc: 91.80%, Val F1: 0.9170
  No improvement (1/4)


Epoch 7/20: 100%|██████████| 141/141 [00:08<00:00, 17.08it/s, loss=0.0073]


Epoch 7/20 - Loss: 0.0023, Train Acc: 100.00%, Val Acc: 92.30%, Val F1: 0.9218
  ✓ New best validation F1: 0.9218


Epoch 8/20: 100%|██████████| 141/141 [00:09<00:00, 14.26it/s, loss=0.0010]


Epoch 8/20 - Loss: 0.0021, Train Acc: 100.00%, Val Acc: 92.40%, Val F1: 0.9230
  ✓ New best validation F1: 0.9230


Epoch 9/20: 100%|██████████| 141/141 [00:08<00:00, 15.97it/s, loss=0.0017]


Epoch 9/20 - Loss: 0.0022, Train Acc: 99.99%, Val Acc: 92.50%, Val F1: 0.9238
  ✓ New best validation F1: 0.9238


Epoch 10/20: 100%|██████████| 141/141 [00:09<00:00, 15.65it/s, loss=0.0018]


Epoch 10/20 - Loss: 0.0014, Train Acc: 100.00%, Val Acc: 92.50%, Val F1: 0.9241
  ✓ New best validation F1: 0.9241


Epoch 11/20: 100%|██████████| 141/141 [00:10<00:00, 13.82it/s, loss=0.0007]


Epoch 11/20 - Loss: 0.0010, Train Acc: 100.00%, Val Acc: 92.60%, Val F1: 0.9248
  ✓ New best validation F1: 0.9248


Epoch 12/20: 100%|██████████| 141/141 [00:08<00:00, 16.90it/s, loss=0.0006]


Epoch 12/20 - Loss: 0.0010, Train Acc: 100.00%, Val Acc: 92.30%, Val F1: 0.9219
  No improvement (1/4)


Epoch 13/20: 100%|██████████| 141/141 [00:08<00:00, 16.85it/s, loss=0.0009]


Epoch 13/20 - Loss: 0.0008, Train Acc: 100.00%, Val Acc: 92.50%, Val F1: 0.9242
  No improvement (2/4)


Epoch 14/20: 100%|██████████| 141/141 [00:05<00:00, 27.37it/s, loss=0.0007]


Epoch 14/20 - Loss: 0.0025, Train Acc: 99.96%, Val Acc: 90.50%, Val F1: 0.9034
  No improvement (3/4)


Epoch 15/20: 100%|██████████| 141/141 [00:04<00:00, 33.49it/s, loss=0.1814]


Epoch 15/20 - Loss: 0.2090, Train Acc: 93.40%, Val Acc: 86.50%, Val F1: 0.8637
  No improvement (4/4)
Early stopping triggered at epoch 15

Loaded best model with Val F1: 0.9248

────────────────────────────────────────────────────────────
Evaluating on test set: original
────────────────────────────────────────────────────────────
Loading: ./data/processed\cifar10_test\original.pkl
Test samples: 2000
              precision    recall  f1-score   support

           0     0.7788    0.8980    0.8341       196
           1     0.8383    0.9949    0.9099       198
           2     0.7436    0.8923    0.8112       195
           3     0.7358    0.7839    0.7591       199
           4     0.9119    0.8889    0.9003       198
           5     0.8921    0.6703    0.7654       185
           6     0.9447    0.8704    0.9060       216
           7     0.8812    0.9223    0.9013       193
           8     0.9479    0.8387    0.8900       217
           9     0.9821    0.8128    0.8895       203


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0     0.1070    1.0000    0.1933       196
           1     0.8421    0.0808    0.1475       198
           2     0.2031    0.0667    0.1004       195
           3     0.3750    0.0151    0.0290       199
           4     0.0000    0.0000    0.0000       198
           5     1.0000    0.0054    0.0108       185
           6     0.5526    0.1944    0.2877       216
           7     0.0000    0.0000    0.0000       193
           8     0.0000    0.0000    0.0000       217
           9     0.0000    0.0000    0.0000       203

    accuracy                         0.1355      2000
   macro avg     0.3080    0.1362    0.0769      2000
weighted avg     0.3032    0.1355    0.0783      2000


Test Accuracy: 13.55%
Test F1 Score: 0.0769

────────────────────────────────────────────────────────────
Evaluating on test set: noise
────────────────────────────────────────────────────────────
Loading: ./data/processed\cifar10_test\nois

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0     0.1148    0.9745    0.2054       196
           1     0.9000    0.0909    0.1651       198
           2     0.2730    0.4103    0.3279       195
           3     0.0000    0.0000    0.0000       199
           4     0.3750    0.0152    0.0291       198
           5     0.0000    0.0000    0.0000       185
           6     0.8000    0.0370    0.0708       216
           7     1.0000    0.0155    0.0306       193
           8     1.0000    0.0046    0.0092       217
           9     1.0000    0.0049    0.0098       203

    accuracy                         0.1525      2000
   macro avg     0.5463    0.1553    0.0848      2000
weighted avg     0.5570    0.1525    0.0839      2000


Test Accuracy: 15.25%
Test F1 Score: 0.0848

────────────────────────────────────────────────────────────
Evaluating on test set: occlusion_25
────────────────────────────────────────────────────────────
Loading: ./data/processed\cifar10_te

Epoch 1/20: 100%|██████████| 141/141 [00:07<00:00, 19.75it/s, loss=0.4312]


Epoch 1/20 - Loss: 0.6697, Train Acc: 77.79%, Val Acc: 86.30%, Val F1: 0.8638
  ✓ New best validation F1: 0.8638


Epoch 2/20: 100%|██████████| 141/141 [00:10<00:00, 13.86it/s, loss=0.1295]


Epoch 2/20 - Loss: 0.1296, Train Acc: 96.36%, Val Acc: 88.80%, Val F1: 0.8871
  ✓ New best validation F1: 0.8871


Epoch 3/20: 100%|██████████| 141/141 [00:09<00:00, 15.39it/s, loss=0.0145]


Epoch 3/20 - Loss: 0.0320, Train Acc: 99.61%, Val Acc: 89.90%, Val F1: 0.8985
  ✓ New best validation F1: 0.8985


Epoch 4/20: 100%|██████████| 141/141 [00:08<00:00, 16.39it/s, loss=0.0235]


Epoch 4/20 - Loss: 0.0107, Train Acc: 99.93%, Val Acc: 89.50%, Val F1: 0.8946
  No improvement (1/4)


Epoch 5/20: 100%|██████████| 141/141 [00:08<00:00, 15.76it/s, loss=0.0235]


Epoch 5/20 - Loss: 0.0072, Train Acc: 99.94%, Val Acc: 90.40%, Val F1: 0.9034
  ✓ New best validation F1: 0.9034


Epoch 6/20: 100%|██████████| 141/141 [00:07<00:00, 20.05it/s, loss=0.0085]


Epoch 6/20 - Loss: 0.0037, Train Acc: 100.00%, Val Acc: 89.80%, Val F1: 0.8980
  No improvement (1/4)


Epoch 7/20: 100%|██████████| 141/141 [00:04<00:00, 32.23it/s, loss=0.0046]


Epoch 7/20 - Loss: 0.0032, Train Acc: 99.99%, Val Acc: 87.10%, Val F1: 0.8700
  No improvement (2/4)


Epoch 8/20: 100%|██████████| 141/141 [00:04<00:00, 34.66it/s, loss=0.0076]


Epoch 8/20 - Loss: 0.0132, Train Acc: 99.69%, Val Acc: 87.80%, Val F1: 0.8773
  No improvement (3/4)


Epoch 9/20: 100%|██████████| 141/141 [00:04<00:00, 33.56it/s, loss=0.0857]


Epoch 9/20 - Loss: 0.0240, Train Acc: 99.36%, Val Acc: 87.30%, Val F1: 0.8732
  No improvement (4/4)
Early stopping triggered at epoch 9

Loaded best model with Val F1: 0.9034

────────────────────────────────────────────────────────────
Evaluating on test set: original
────────────────────────────────────────────────────────────
Loading: ./data/processed\cifar10_test\original.pkl
Test samples: 2000
              precision    recall  f1-score   support

           0     0.9116    0.8418    0.8753       196
           1     0.9534    0.9293    0.9412       198
           2     0.8272    0.8103    0.8187       195
           3     0.6654    0.9095    0.7686       199
           4     0.8284    0.8535    0.8408       198
           5     0.8485    0.7568    0.8000       185
           6     0.9192    0.8426    0.8792       216
           7     0.9405    0.8187    0.8753       193
           8     0.9079    0.9539    0.9303       217
           9     0.9400    0.9261    0.9330       203

 

Epoch 1/20: 100%|██████████| 141/141 [00:04<00:00, 31.85it/s, loss=1.2198]


Epoch 1/20 - Loss: 1.2748, Train Acc: 54.68%, Val Acc: 66.70%, Val F1: 0.6619
  ✓ New best validation F1: 0.6619


Epoch 2/20: 100%|██████████| 141/141 [00:04<00:00, 31.01it/s, loss=0.5501]


Epoch 2/20 - Loss: 0.5467, Train Acc: 81.61%, Val Acc: 73.10%, Val F1: 0.7273
  ✓ New best validation F1: 0.7273


Epoch 3/20: 100%|██████████| 141/141 [00:05<00:00, 26.83it/s, loss=0.1900]


Epoch 3/20 - Loss: 0.1899, Train Acc: 95.13%, Val Acc: 74.00%, Val F1: 0.7401
  ✓ New best validation F1: 0.7401


Epoch 4/20: 100%|██████████| 141/141 [00:06<00:00, 21.32it/s, loss=0.1067]


Epoch 4/20 - Loss: 0.0523, Train Acc: 99.30%, Val Acc: 75.90%, Val F1: 0.7535
  ✓ New best validation F1: 0.7535


Epoch 5/20: 100%|██████████| 141/141 [00:09<00:00, 14.28it/s, loss=0.0210]


Epoch 5/20 - Loss: 0.0169, Train Acc: 99.92%, Val Acc: 76.70%, Val F1: 0.7624
  ✓ New best validation F1: 0.7624


Epoch 6/20: 100%|██████████| 141/141 [00:09<00:00, 15.06it/s, loss=0.0047]


Epoch 6/20 - Loss: 0.0082, Train Acc: 100.00%, Val Acc: 77.50%, Val F1: 0.7709
  ✓ New best validation F1: 0.7709


Epoch 7/20: 100%|██████████| 141/141 [00:10<00:00, 13.58it/s, loss=0.0113]


Epoch 7/20 - Loss: 0.0045, Train Acc: 100.00%, Val Acc: 77.40%, Val F1: 0.7699
  No improvement (1/4)


Epoch 8/20: 100%|██████████| 141/141 [00:09<00:00, 14.73it/s, loss=0.0062]


Epoch 8/20 - Loss: 0.0036, Train Acc: 99.98%, Val Acc: 76.90%, Val F1: 0.7646
  No improvement (2/4)


Epoch 9/20: 100%|██████████| 141/141 [00:09<00:00, 14.18it/s, loss=0.0047]


Epoch 9/20 - Loss: 0.0024, Train Acc: 100.00%, Val Acc: 77.80%, Val F1: 0.7751
  ✓ New best validation F1: 0.7751


Epoch 10/20: 100%|██████████| 141/141 [00:09<00:00, 14.14it/s, loss=0.0025]


Epoch 10/20 - Loss: 0.0020, Train Acc: 100.00%, Val Acc: 77.20%, Val F1: 0.7694
  No improvement (1/4)


Epoch 11/20: 100%|██████████| 141/141 [00:09<00:00, 15.49it/s, loss=0.0011]


Epoch 11/20 - Loss: 0.0016, Train Acc: 100.00%, Val Acc: 76.90%, Val F1: 0.7645
  No improvement (2/4)


Epoch 12/20: 100%|██████████| 141/141 [00:04<00:00, 28.73it/s, loss=0.0009]


Epoch 12/20 - Loss: 0.0013, Train Acc: 100.00%, Val Acc: 77.20%, Val F1: 0.7669
  No improvement (3/4)


Epoch 13/20: 100%|██████████| 141/141 [00:07<00:00, 18.60it/s, loss=0.0050]


Epoch 13/20 - Loss: 0.0011, Train Acc: 100.00%, Val Acc: 77.80%, Val F1: 0.7754
  ✓ New best validation F1: 0.7754


Epoch 14/20: 100%|██████████| 141/141 [00:09<00:00, 15.65it/s, loss=0.0517]


Epoch 14/20 - Loss: 0.0110, Train Acc: 99.72%, Val Acc: 69.20%, Val F1: 0.6924
  No improvement (1/4)


Epoch 15/20: 100%|██████████| 141/141 [00:10<00:00, 13.20it/s, loss=0.3770]


Epoch 15/20 - Loss: 0.4109, Train Acc: 86.22%, Val Acc: 69.70%, Val F1: 0.6921
  No improvement (2/4)


Epoch 16/20: 100%|██████████| 141/141 [00:10<00:00, 13.64it/s, loss=0.0450]


Epoch 16/20 - Loss: 0.1141, Train Acc: 96.29%, Val Acc: 75.30%, Val F1: 0.7436
  No improvement (3/4)


Epoch 17/20: 100%|██████████| 141/141 [00:09<00:00, 15.51it/s, loss=0.0057]


Epoch 17/20 - Loss: 0.0225, Train Acc: 99.46%, Val Acc: 75.20%, Val F1: 0.7509
  No improvement (4/4)
Early stopping triggered at epoch 17

Loaded best model with Val F1: 0.7754

────────────────────────────────────────────────────────────
Evaluating on test set: original
────────────────────────────────────────────────────────────
Loading: ./data/processed\cifar10_test\original.pkl
Test samples: 2000
              precision    recall  f1-score   support

           0     0.4272    0.8980    0.5789       196
           1     0.6915    0.9848    0.8125       198
           2     0.7710    0.5179    0.6196       195
           3     0.6000    0.5729    0.5861       199
           4     0.7917    0.5758    0.6667       198
           5     0.5703    0.7676    0.6544       185
           6     0.8168    0.7639    0.7895       216
           7     0.8862    0.5648    0.6899       193
           8     0.9000    0.4562    0.6055       217
           9     0.8726    0.6749    0.7611       203


In [3]:

# =====================================================
# SAVE SUMMARY
# =====================================================
df = pd.DataFrame(results)
csv_path = os.path.join(SAVE_DIR, "resnet18_results.csv")
df.to_csv(csv_path, index=False)

print(f"\n{'='*80}")
print("EXPERIMENT SUMMARY")
print(f"{'='*80}\n")
print(df.to_string(index=False))
print(f"\n✓ All experiments complete!")
print(f"✓ Summary saved to: {csv_path}")
print(f"✓ Individual reports saved to: {SAVE_DIR}/")


EXPERIMENT SUMMARY

dataset         train_type    test_type  best_val_f1  test_f1  test_accuracy
  mnist           original     original     0.991041 0.989169          98.95
  mnist           original all_combined     0.991041 0.367146          40.15
  mnist           original        noise     0.991041 0.091232          16.55
  mnist           original occlusion_25     0.991041 0.922393          92.35
  mnist           original  rotation_15     0.991041 0.975389          97.55
  mnist           original  scaling_0.8     0.991041 0.983435          98.35
  mnist    mixed_augmented     original     0.990962 0.985232          98.50
  mnist    mixed_augmented all_combined     0.990962 0.354786          39.85
  mnist    mixed_augmented        noise     0.990962 0.138574          26.45
  mnist    mixed_augmented occlusion_25     0.990962 0.891756          89.10
  mnist    mixed_augmented  rotation_15     0.990962 0.946534          94.60
  mnist    mixed_augmented  scaling_0.8     0.990962 0.