In [104]:
# Standard scientific Python imports
import matplotlib.pyplot as plt

# Import datasets, classifiers and performance metrics
from sklearn import metrics, svm
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from itertools import product
import numpy as np

from tqdm import tqdm
from sklearn.svm import SVC

In [105]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset, TensorDataset
from torchvision import datasets, transforms

In [106]:
import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

sys.path.append(parent_dir)

from utils import AddGaussianNoise, AddImpulseNoise

# Model Defintion: SmallCNN

A SGD classifer is not enough to get reliable insights on CIFAR-10 dataset so we will use a lightweight CNN. This will allow us to accurately estimate the influence of training order

In [None]:
class SmallCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128), nn.ReLU(),
            nn.Linear(128, 10)
        )
        
    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# MNIST Dataset (Vanilla Case)

## Data Loading

In [116]:
# Define transform
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))  # MNIST mean & std
])

# Load base MNIST using transform
base_train = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
base_test = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Convert datasets to tensors
X_train = torch.stack([base_train[i][0] for i in range(len(base_train))])  # (60000, 1, 28, 28)
y_train = torch.tensor([base_train[i][1] for i in range(len(base_train))]) # (60000,)

X_test = torch.stack([base_test[i][0] for i in range(len(base_test))])     # (10000, 1, 28, 28)
y_test = torch.tensor([base_test[i][1] for i in range(len(base_test))])    # (10000,)

# Optional sanity check
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# Create TensorDataset
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=1000, shuffle=False)

X_train: torch.Size([60000, 1, 28, 28]), y_train: torch.Size([60000])
X_test: torch.Size([10000, 1, 28, 28]), y_test: torch.Size([10000])


## Analysis

### (0) Base Case

#### Analysis

Let's try as the Vanilla Base Case: Train on the entire dataset using uniform random shuffling for each epoch.

In [117]:
# === Training Setup ===
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")
model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
n_epochs = 1

# === Training Loop ===
for epoch in range(n_epochs):
    model.train()
    total_loss = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * images.size(0)

    avg_loss = total_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {avg_loss:.4f}")

# === Evaluation ===
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        predicted = outputs.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {correct / total:.4f}")

Using device: mps
Epoch 1/1 - Train Loss: 0.1435
Test Accuracy: 0.9876


### (1) Curriculum Learning

#### Pre-analysis

As curriculum learning is based on giving samples in increasing difficulty level to the model, we first need to define a difficulty function. We will base ours on difference between the distance of each point to the centroid of each class.

In [118]:
def compute_MNIST_difficulty(X, y, centroids):
    dist = np.linalg.norm(X - centroids[y], axis=1)
    return dist

#### Analysis

In [119]:
# Flatten for difficulty calculation
X_train_flat = X_train.view(len(X_train), -1).numpy()

# Compute centroids
centroids = np.zeros((10, X_train_flat.shape[1]))
for i in range(10):
    centroids[i] = X_train_flat[y_train == i].mean(axis=0)

In [126]:
# Compute difficulty score
difficulties = compute_MNIST_difficulty(X_train_flat, y_train, centroids)

# Normalize difficulties to [0, 1]
difficulties = (difficulties - difficulties.min()) / (difficulties.max() - difficulties.min())

# Sort indices by difficulty
sorted_indices = np.argsort(difficulties)

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # can increase to 2–3 if needed
num_stages = 5

# Define curriculum stages as % of dataset (in increasing difficulty)
phases = np.linspace(0.1, 1.0, num_stages)

previous_n = 0  # Start index for slicing

for stage, phase in enumerate(phases):
    current_n = int(phase * len(sorted_indices))
    selected_idx = sorted_indices[previous_n:current_n]
    previous_n = current_n  # Update for next stage
    
    print(f"\nStage {stage+1}/{num_stages}: Using {len(selected_idx)} new examples")

    # Prepare current stage dataset
    X_stage = X_train[selected_idx]
    y_stage = y_train[selected_idx]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Stage 1/5: Using 6000 new examples
  Epoch 1/1 - Loss: 0.1688, Acc: 0.9647

Stage 2/5: Using 13500 new examples
  Epoch 1/1 - Loss: 0.1152, Acc: 0.9744

Stage 3/5: Using 13500 new examples
  Epoch 1/1 - Loss: 0.0707, Acc: 0.9768

Stage 4/5: Using 13500 new examples
  Epoch 1/1 - Loss: 0.0921, Acc: 0.9710

Stage 5/5: Using 13500 new examples
  Epoch 1/1 - Loss: 0.1396, Acc: 0.9559

Final Test Accuracy: 0.9440


### (2) Self-Paced Learning

In Self-Paced Learning, the model is supposed to:

• learn from easier samples first (based on current loss)

• adaptively expand its training set to include harder samples as it becomes more confident

We will use the same X_train, X_test, y_train and y_test computed in the curriculum learning phase.
To implement the SPL we will introduce a difficulty threshold to let the model choose how many samples of this difficulty it is ready to handle.

In [136]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)  # Your CNN model here
criterion = torch.nn.CrossEntropyLoss(reduction='none')  # per-sample loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 3
samples_per_epoch = len(X_train) // num_epochs

seen_mask = torch.zeros(len(X_train), dtype=torch.bool)

for epoch in range(num_epochs):
    model.eval()
    with torch.no_grad():
        unseen_indices = (~seen_mask).nonzero(as_tuple=True)[0]
        unseen_loader = DataLoader(TensorDataset(X_train[unseen_indices], y_train[unseen_indices]),
                                   batch_size=batch_size, shuffle=False)

        all_losses = []
        for images, labels in unseen_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            losses = criterion(outputs, labels)  # shape: (batch,)
            all_losses.append(losses.cpu())

        all_losses = torch.cat(all_losses)
    
    # Select top-k easiest unseen samples
    k = min(samples_per_epoch, len(unseen_indices))
    selected_in_unseen = torch.topk(-all_losses, k).indices  # negative for lowest loss
    selected_indices = unseen_indices[selected_in_unseen]

    seen_mask[selected_indices] = True

    print(f"Epoch {epoch+1}: selected {len(selected_indices)} new samples (total seen: {seen_mask.sum().item()}/{len(X_train)})")

    train_loader = DataLoader(TensorDataset(X_train[selected_indices], y_train[selected_indices]),
                              batch_size=batch_size, shuffle=True)

    # Train model
    model.train()
    total_loss, total_correct = 0, 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels).mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(labels)
        total_correct += (outputs.argmax(1) == labels).sum().item()

    acc = total_correct / len(train_loader.dataset)
    print(f"  Train Loss: {total_loss / len(train_loader.dataset):.4f}, Acc: {acc:.4f}")
    
print("Training complete.")

# Final test accuracy evaluation
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")

Epoch 1: selected 20000 new samples (total seen: 20000/60000)
  Train Loss: 0.1339, Acc: 0.9620
Epoch 2: selected 20000 new samples (total seen: 40000/60000)
  Train Loss: 0.0329, Acc: 0.9896
Epoch 3: selected 20000 new samples (total seen: 60000/60000)
  Train Loss: 0.1468, Acc: 0.9640
Training complete.

Final Test Accuracy: 0.6744


This poor accuracy can be explained because of the cumulative nature of SPL

### (3) Hard-Example Mining

Hard-Example Mining consists in feeding the model only hard examples. In our case, we will consider that a sample is difficult if its normalized difficulty is greater or equal than 0,75 (in other words the top 25%).

In [142]:
# Compute difficulty score and normalize
difficulties = compute_MNIST_difficulty(X_train_flat, y_train, centroids)
difficulties = (difficulties - difficulties.min()) / (difficulties.max() - difficulties.min())

# Select hard examples: top 25% (difficulty >= 0.75)
hard_mask = difficulties >= 0.75
hard_indices = np.where(hard_mask)[0] # we do not shuffle the indices to train on increasingly difficult samples (adapted CL idea)
print(f"Selected {len(hard_indices)} hard examples out of {len(difficulties)} total")

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs = 10

# Prepare hard-example dataset
X_hard = X_train[hard_indices]
y_hard = y_train[hard_indices]

train_dataset = TensorDataset(X_hard, y_hard)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss, total_correct = 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        total_correct += (outputs.argmax(1) == labels).sum().item()

    acc = total_correct / len(train_dataset)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Selected 337 hard examples out of 60000 total
Epoch 1/10 - Loss: 2.0978, Acc: 0.2997
Epoch 2/10 - Loss: 1.5392, Acc: 0.5549
Epoch 3/10 - Loss: 1.1558, Acc: 0.5994
Epoch 4/10 - Loss: 0.8981, Acc: 0.6914
Epoch 5/10 - Loss: 0.7529, Acc: 0.7270
Epoch 6/10 - Loss: 0.6194, Acc: 0.7626
Epoch 7/10 - Loss: 0.4944, Acc: 0.8427
Epoch 8/10 - Loss: 0.3577, Acc: 0.8902
Epoch 9/10 - Loss: 0.2801, Acc: 0.9169
Epoch 10/10 - Loss: 0.1994, Acc: 0.9347

Final Test Accuracy: 0.4076


### (4) Reverse Curriculum Learning

We are implementing **Reverse Curriculum Learning (RCL)** where the model starts learning from easier goals that are close to the target and gradually works backwards to more challenging starting states.

In [146]:
# Compute difficulty score
difficulties = compute_MNIST_difficulty(X_train_flat, y_train, centroids)

# Normalize difficulties to [0, 1]
difficulties = (difficulties - difficulties.min()) / (difficulties.max() - difficulties.min())

# Sort indices by difficulty — hardest last
sorted_indices = np.argsort(difficulties)[::-1].copy()  # Reverse order for hardest first

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1
num_stages = 5

# Define reverse curriculum stages as % of dataset (from hard to easy)
phases = np.linspace(0.1, 1.0, num_stages)  # Percentages

previous_n = 0  # Start index for slicing

for stage, phase in enumerate(phases):
    current_n = int(phase * len(sorted_indices))
    selected_idx = sorted_indices[previous_n:current_n]
    previous_n = current_n  # Update for next stage
    
    print(f"\nStage {stage+1}/{num_stages}: Using {len(selected_idx)} new hard→easy examples")

    # Prepare current stage dataset
    X_stage = X_train[selected_idx]
    y_stage = y_train[selected_idx]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")



Stage 1/5: Using 6000 new hard→easy examples
  Epoch 1/1 - Loss: 0.7508, Acc: 0.7592

Stage 2/5: Using 13500 new hard→easy examples
  Epoch 1/1 - Loss: 0.1388, Acc: 0.9576

Stage 3/5: Using 13500 new hard→easy examples
  Epoch 1/1 - Loss: 0.0603, Acc: 0.9813

Stage 4/5: Using 13500 new hard→easy examples
  Epoch 1/1 - Loss: 0.0277, Acc: 0.9913

Stage 5/5: Using 13500 new hard→easy examples
  Epoch 1/1 - Loss: 0.0046, Acc: 0.9989

Final Test Accuracy: 0.9327


### (5) Stratified Monte-Carlo Sampling

**Stratified Monte Carlo Sampling** is a variance reduction technique where the input space is divided into distinct strata (subregions), and samples are drawn from each stratum. This ensures more uniform coverage of the space compared to standard Monte Carlo sampling, leading to more accurate and stable estimates with fewer samples.

In [147]:
# Compute difficulty score
difficulties = compute_MNIST_difficulty(X_train_flat, y_train, centroids)

# Normalize difficulties to [0, 1]
difficulties = (difficulties - difficulties.min()) / (difficulties.max() - difficulties.min())

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1
num_stages = 5
samples_per_stage = int(len(X_train) / num_stages)

# Stratify dataset into bins based on difficulty
num_bins = num_stages
bin_edges = np.linspace(0, 1, num_bins + 1)
bins = [[] for _ in range(num_bins)]

for idx, score in enumerate(difficulties):
    for b in range(num_bins):
        if bin_edges[b] <= score < bin_edges[b + 1] or (b == num_bins - 1 and score == 1.0):
            bins[b].append(idx)
            break

# Shuffle each bin
for b in bins:
    np.random.shuffle(b)

# Training loop with stratified sampling
seen_indices = set()

for stage in range(num_stages):
    print(f"\nStage {stage+1}/{num_stages}: Sampling from all difficulty strata")

    stage_indices = []

    for b in bins:
        take_n = min(samples_per_stage // num_bins, len(b))
        selected = [i for i in b if i not in seen_indices][:take_n]
        seen_indices.update(selected)
        stage_indices.extend(selected)

    np.random.shuffle(stage_indices)

    X_stage = X_train[stage_indices]
    y_stage = y_train[stage_indices]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")




Stage 1/5: Sampling from all difficulty strata
  Epoch 1/1 - Loss: 0.4432, Acc: 0.8607

Stage 2/5: Sampling from all difficulty strata
  Epoch 1/1 - Loss: 0.1001, Acc: 0.9696

Stage 3/5: Sampling from all difficulty strata
  Epoch 1/1 - Loss: 0.0828, Acc: 0.9732

Stage 4/5: Sampling from all difficulty strata
  Epoch 1/1 - Loss: 0.0775, Acc: 0.9781

Stage 5/5: Sampling from all difficulty strata
  Epoch 1/1 - Loss: 0.0639, Acc: 0.9798

Final Test Accuracy: 0.9828


# MNIST Dataset with Gaussian Noise

## Data Loading

We initialize a range of increasing difficulty. 
- 0.0: no noise — easiest samples
- 0.4: very noisy — hardest samples
- 0.5+ usually makes MNIST unreadable

In [62]:
# Load base MNIST (no transform)
base_train = datasets.MNIST(root='./data', train=True, download=True)
base_test = datasets.MNIST(root='./data', train=False, download=True)

full_data = torch.cat([base_train.data, base_test.data], dim=0).float() / 255.0
full_targets = torch.cat([base_train.targets, base_test.targets], dim=0)

# Expand with noise
noise_levels = [0.0, 0.1, 0.2, 0.3, 0.4]
augmented_data, augmented_targets, noise_labels = [], [], []

for level in noise_levels:
    noise = torch.randn_like(full_data) * level
    noisy_data = full_data + noise
    noisy_data = torch.clamp(noisy_data, 0.0, 1.0)

    augmented_data.append(noisy_data)
    augmented_targets.append(full_targets)
    noise_labels.append(torch.full_like(full_targets, level))

# Combine everything
augmented_data = torch.cat(augmented_data, dim=0).unsqueeze(1)  # (N*L, 1, 28, 28)
augmented_targets = torch.cat(augmented_targets, dim=0)
noise_labels = torch.cat(noise_labels, dim=0)

# Final dataset
augmented_dataset = TensorDataset(augmented_data, augmented_targets, noise_labels)

In [63]:
# Sizes of original MNIST splits
N_train = len(base_train)  # 60000
N_test = len(base_test)    # 10000

# Augmented data shape: [num_levels * (N_train + N_test), 1, 28, 28]
samples_per_level = N_train + N_test

X_train_list, y_train_list = [], []
X_test_list, y_test_list = [], []

for i in range(len(noise_levels)):
    start = i * samples_per_level
    end = start + samples_per_level

    # Get this noise level's full data and split it
    X_level = augmented_data[start:end]
    y_level = augmented_targets[start:end]

    X_train_list.append(X_level[:N_train])
    y_train_list.append(y_level[:N_train])
    X_test_list.append(X_level[N_train:])
    y_test_list.append(y_level[N_train:])

# Final concatenated noisy train/test sets (mixed noise levels)
X_train = torch.cat(X_train_list, dim=0)
y_train = torch.cat(y_train_list, dim=0)
X_test = torch.cat(X_test_list, dim=0)
y_test = torch.cat(y_test_list, dim=0)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: torch.Size([300000, 1, 28, 28]), y_train: torch.Size([300000])
X_test: torch.Size([50000, 1, 28, 28]), y_test: torch.Size([50000])


In [64]:
noise_levels_train_list = []

for i, level in enumerate(noise_levels):
    # Number of training samples for this noise level
    num_train_samples = N_train
    
    # Create a tensor filled with the noise level index: 0, 1, 2, 3 or 4
    noise_level_tensor = torch.full((num_train_samples,), fill_value=i)
    
    noise_levels_train_list.append(noise_level_tensor)

# Concatenate to get noise_levels_train for the entire train set
noise_levels_train = torch.cat(noise_levels_train_list, dim=0)

print(f"noise_levels_train shape: {noise_levels_train.shape}")  # Should be (len(X_train),)


noise_levels_train shape: torch.Size([300000])


## Analysis

### (0) Base Case

In [87]:
# DataLoaders
batch_size = 64
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # random sampling baseline
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training setup
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss, total_correct = 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        total_correct += (outputs.argmax(1) == labels).sum().item()

    train_acc = total_correct / len(train_dataset)
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {total_loss/len(train_dataset):.4f}, Train Acc: {train_acc:.4f}")

# Evaluate
model.eval()
correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        correct += (outputs.argmax(1) == labels).sum().item()
test_acc = correct / len(test_dataset)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/1 - Train Loss: 0.1309, Train Acc: 0.9583
Test Accuracy: 0.9756


### (1) Curriculum Learning

#### (1.1) Cumulative Curriculum Learning

In [88]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # can increase to 2–3 if needed
num_stages = 5

for stage in range(num_stages):
    print(f"\nStage {stage+1}/{num_stages}: Using noise levels <= {stage}")
    
    # Select training data up to current noise level
    stage_mask = noise_levels_train <= stage
    X_stage = X_train[stage_mask]
    y_stage = y_train[stage_mask]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Stage 1/5: Using noise levels <= 0
  Epoch 1/1 - Loss: 0.1676, Acc: 0.9492

Stage 2/5: Using noise levels <= 1
  Epoch 1/1 - Loss: 0.0513, Acc: 0.9840

Stage 3/5: Using noise levels <= 2
  Epoch 1/1 - Loss: 0.0394, Acc: 0.9872

Stage 4/5: Using noise levels <= 3
  Epoch 1/1 - Loss: 0.0377, Acc: 0.9873

Stage 5/5: Using noise levels <= 4
  Epoch 1/1 - Loss: 0.0455, Acc: 0.9846

Final Test Accuracy: 0.9802


#### (1.2) Strict Curriculum Learning

In [89]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # can increase to 2–3 if needed
num_stages = 5

for stage in range(num_stages):
    print(f"\nStage {stage+1}/{num_stages}: Using noise level {stage}")
    
    # Select training data up to current noise level
    stage_mask = noise_levels_train == stage
    X_stage = X_train[stage_mask]
    y_stage = y_train[stage_mask]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Stage 1/5: Using noise level 0
  Epoch 1/1 - Loss: 0.1762, Acc: 0.9463

Stage 2/5: Using noise level 1
  Epoch 1/1 - Loss: 0.0755, Acc: 0.9764

Stage 3/5: Using noise level 2
  Epoch 1/1 - Loss: 0.0778, Acc: 0.9757

Stage 4/5: Using noise level 3
  Epoch 1/1 - Loss: 0.1071, Acc: 0.9646

Stage 5/5: Using noise level 4
  Epoch 1/1 - Loss: 0.1596, Acc: 0.9480

Final Test Accuracy: 0.9730


### (2) Self-Paced Learning

We will use the same X_train, X_test, y_train and y_test computed in the curriculum learning phase.
To implement the SPL we will introduce a difficulty threshold to let the model choose how many samples of this difficulty it is ready to handle.

Because our objective is to see the influence of order on the training, we will stop training once the model saw the entire dataset. To be fair with the other techniques where they see the data only once.

In [90]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss(reduction='none')  # Important: per-sample loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

batch_size = 64
epochs = 10
initial_lambda = 2.25  # initial difficulty threshold
lambda_increment = 2  # increase per epoch

# Wrap all training data in a dataset/loader for loss evaluation
full_train_dataset = TensorDataset(X_train, y_train)
full_train_loader = DataLoader(full_train_dataset, batch_size=batch_size, shuffle=False)

for epoch in range(epochs):
    model.eval()
    all_losses = []

    # Compute per-sample losses on the full training set
    with torch.no_grad():
        for images, labels in full_train_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            losses = criterion(outputs, labels)  # shape: (batch_size,)
            all_losses.append(losses.cpu())

    all_losses = torch.cat(all_losses)  # shape: (N,)
    print(torch.min(all_losses))
    print(torch.max(all_losses))

    # Determine current lambda threshold
    lambda_threshold = initial_lambda + epoch * lambda_increment

    # Select indices where loss <= lambda_threshold
    selected_indices = (all_losses <= lambda_threshold).nonzero(as_tuple=True)[0]

    if len(selected_indices) == 0:
        print(f"Epoch {epoch+1}: No samples selected for training (lambda={lambda_threshold:.3f}), stopping early.")
        break

    print(f"Epoch {epoch+1}: lambda={lambda_threshold:.3f}, selected {len(selected_indices)}/{len(X_train)} samples")

    # Create subset dataset and loader for training
    train_subset = TensorDataset(X_train[selected_indices], y_train[selected_indices])
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)

    # Train on selected samples
    model.train()
    total_loss, total_correct = 0, 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels).mean()  # mean loss for batch
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        total_correct += (outputs.argmax(1) == labels).sum().item()

    acc = total_correct / len(train_subset)
    print(f"  Training Loss: {total_loss/len(train_subset):.4f}, Accuracy: {acc:.4f}")

    if len(selected_indices) == len(X_train):
        print(f"  All samples were selected, stopping early.")
        break

# Evaluate on test set after training
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


tensor(2.1542)
tensor(2.4577)
Epoch 1: lambda=2.250, selected 68873/300000 samples
  Training Loss: 0.0923, Accuracy: 0.9683
tensor(-0.)
tensor(41.5899)
Epoch 2: lambda=4.250, selected 119147/300000 samples
  Training Loss: 0.0321, Accuracy: 0.9885
tensor(-0.)
tensor(53.7047)
Epoch 3: lambda=6.250, selected 120500/300000 samples
  Training Loss: 0.0246, Accuracy: 0.9915
tensor(-0.)
tensor(64.2613)
Epoch 4: lambda=8.250, selected 186858/300000 samples
  Training Loss: 0.0396, Accuracy: 0.9870
tensor(-0.)
tensor(63.6075)
Epoch 5: lambda=10.250, selected 267750/300000 samples
  Training Loss: 0.0400, Accuracy: 0.9865
tensor(-0.)
tensor(42.1693)
Epoch 6: lambda=12.250, selected 299173/300000 samples
  Training Loss: 0.0392, Accuracy: 0.9871
tensor(-0.)
tensor(29.9628)
Epoch 7: lambda=14.250, selected 299993/300000 samples
  Training Loss: 0.0261, Accuracy: 0.9912
tensor(-0.)
tensor(26.0948)
Epoch 8: lambda=16.250, selected 299995/300000 samples
  Training Loss: 0.0166, Accuracy: 0.9942
ten

An idea could be to combine the difficulty scores from the noise levels with the increments in SLP:


**Option A: Use Noise Difficulty as a Prior or Weight for Lambda Threshold**

- Adjust the SPL threshold (`λ`) for each sample by incorporating its noise difficulty:

\[
\lambda_i = \lambda_{\text{base}} + \alpha \times \text{noise\_level}_i
\]

- Samples with higher noise difficulty require a higher loss to be included, effectively entering the curriculum later.

**Option B: Use Noise Difficulty for Initial Sample Filtering**

- Start SPL training using only samples with noise difficulty below a certain threshold (e.g., noise_level ≤ 0.2).
- Gradually expand the training set to include samples with higher noise difficulty as training progresses.

**Option C: Weighted Loss or Thresholding by Noise Difficulty Quantiles**

- Group samples by their noise difficulty levels.
- Compute separate SPL loss thresholds for each noise group.
- Allow lower thresholds (easier inclusion) for low-noise groups and higher thresholds for high-noise groups.
- This respects both the *a priori* noise difficulty and the *dynamic* training loss.

### (3) Hard-Example Mining

In [91]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

batch_size = 64
epochs = 5

# Define what "hard" means: top 1 or 2 noise levels (e.g., 0.3 and 0.4)
# If noise_levels_train goes from 0 to 4 (for noise 0.0 to 0.4), we can take levels >= 3
hard_mask = noise_levels_train >= 3

# Select hard examples
X_hard = X_train[hard_mask]
y_hard = y_train[hard_mask]
print(f"Selected {len(X_hard)} hard examples out of {len(X_train)}")

# Wrap in dataset and loader
hard_dataset = TensorDataset(X_hard, y_hard)
hard_loader = DataLoader(hard_dataset, batch_size=batch_size, shuffle=True)

# Training loop on hard examples
for epoch in range(epochs):
    model.train()
    total_loss, total_correct = 0, 0

    for images, labels in hard_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        total_correct += (outputs.argmax(1) == labels).sum().item()

    acc = total_correct / len(hard_dataset)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(hard_dataset):.4f}, Accuracy: {acc:.4f}")

# Final test evaluation
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Selected 120000 hard examples out of 300000
Epoch 1/5 - Loss: 0.3404, Accuracy: 0.8916
Epoch 2/5 - Loss: 0.1473, Accuracy: 0.9531
Epoch 3/5 - Loss: 0.1132, Accuracy: 0.9633
Epoch 4/5 - Loss: 0.0923, Accuracy: 0.9703
Epoch 5/5 - Loss: 0.0761, Accuracy: 0.9746

Final Test Accuracy: 0.9751


### (4) Reverse Curriculum Learning

#### (4.1) Cumulative Reverse Curriculum Learning

In [92]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # can increase to 2–3 if needed
num_stages = 5

for stage in reversed(range(num_stages)):
    print(f"\nStage {stage+1}/{num_stages}: Using noise levels >= {stage}")
    
    # Select training data up to current noise level
    stage_mask = noise_levels_train >= stage
    X_stage = X_train[stage_mask]
    y_stage = y_train[stage_mask]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Stage 5/5: Using noise levels >= 4
  Epoch 1/1 - Loss: 0.4844, Acc: 0.8424

Stage 4/5: Using noise levels >= 3
  Epoch 1/1 - Loss: 0.1654, Acc: 0.9469

Stage 3/5: Using noise levels >= 2
  Epoch 1/1 - Loss: 0.0980, Acc: 0.9680

Stage 2/5: Using noise levels >= 1
  Epoch 1/1 - Loss: 0.0621, Acc: 0.9794

Stage 1/5: Using noise levels >= 0
  Epoch 1/1 - Loss: 0.0404, Acc: 0.9865

Final Test Accuracy: 0.9798


#### (4.2) Strict Reverse Curriculum Learning

In [93]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # can increase to 2–3 if needed
num_stages = 5

for stage in reversed(range(num_stages)):
    print(f"\nStage {stage+1}/{num_stages}: Using noise level {stage}")
    
    # Select training data up to current noise level
    stage_mask = noise_levels_train == stage
    X_stage = X_train[stage_mask]
    y_stage = y_train[stage_mask]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Stage 5/5: Using noise level 4
  Epoch 1/1 - Loss: 0.5161, Acc: 0.8323

Stage 4/5: Using noise level 3
  Epoch 1/1 - Loss: 0.1574, Acc: 0.9499

Stage 3/5: Using noise level 2
  Epoch 1/1 - Loss: 0.0897, Acc: 0.9715

Stage 2/5: Using noise level 1
  Epoch 1/1 - Loss: 0.0561, Acc: 0.9823

Stage 1/5: Using noise level 0
  Epoch 1/1 - Loss: 0.0377, Acc: 0.9885

Final Test Accuracy: 0.9396


### (5) Stratified Monte-Carlo Sampling

In [94]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # same as curriculum
num_stages = 5        # noise levels 0–4
samples_per_stage = 10000  # number of examples to sample from each difficulty level

for stage in range(num_stages):
    print(f"\nStage {stage+1}/{num_stages}: Sampling from noise level = {stage}")
    
    # Select indices for current noise level
    stage_mask = (noise_levels_train == stage).nonzero(as_tuple=True)[0]
    
    # Randomly sample without replacement
    if len(stage_mask) < samples_per_stage:
        print(f"  Warning: only {len(stage_mask)} samples available, using all.")
        selected_indices = stage_mask
    else:
        selected_indices = stage_mask[torch.randperm(len(stage_mask))[:samples_per_stage]]
    
    # Prepare subset
    X_stage = X_train[selected_indices]
    y_stage = y_train[selected_indices]
    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Train loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")



Stage 1/5: Sampling from noise level = 0
  Epoch 1/1 - Loss: 0.5726, Acc: 0.8297

Stage 2/5: Sampling from noise level = 1
  Epoch 1/1 - Loss: 0.1966, Acc: 0.9413

Stage 3/5: Sampling from noise level = 2
  Epoch 1/1 - Loss: 0.1745, Acc: 0.9460

Stage 4/5: Sampling from noise level = 3
  Epoch 1/1 - Loss: 0.1896, Acc: 0.9387

Stage 5/5: Sampling from noise level = 4
  Epoch 1/1 - Loss: 0.2754, Acc: 0.9128

Final Test Accuracy: 0.9582


# MNIST Dataset with Impulse Noise

## Data Loading

We initialize a range of increasing difficulty. 
- 0.0: no noise — easiest samples
- 0.4: very noisy — hardest samples
- 0.5+ usually makes MNIST unreadable

In [95]:
# Load base MNIST (no transform)
base_train = datasets.MNIST(root='./data', train=True, download=True)
base_test = datasets.MNIST(root='./data', train=False, download=True)

# Normalize and concatenate full dataset
full_data = torch.cat([base_train.data, base_test.data], dim=0).float() / 255.0  # [70000, 28, 28]
full_targets = torch.cat([base_train.targets, base_test.targets], dim=0)

# Define noise levels: fraction of pixels affected
noise_levels = [0.0, 0.1, 0.2, 0.3, 0.4]

augmented_data, augmented_targets, noise_labels = [], [], []

for level in noise_levels:
    noisy_data = full_data.clone()
    if level > 0:
        N, H, W = noisy_data.shape
        total_pixels = H * W

        num_noisy = int(level * total_pixels)

        for i in range(N):
            coords = torch.randperm(total_pixels)[:num_noisy]
            salt_or_pepper = torch.randint(0, 2, (num_noisy,), dtype=torch.float32)  # 0 or 1
            flat_image = noisy_data[i].view(-1)
            flat_image[coords] = salt_or_pepper  # 0 for pepper, 1 for salt

    augmented_data.append(noisy_data)
    augmented_targets.append(full_targets)
    noise_labels.append(torch.full_like(full_targets, fill_value=level))

# Stack and reshape
augmented_data = torch.cat(augmented_data, dim=0).unsqueeze(1)  # [N * L, 1, 28, 28]
augmented_targets = torch.cat(augmented_targets, dim=0)
noise_labels = torch.cat(noise_labels, dim=0)

# Original sizes
N_train = len(base_train)
N_test = len(base_test)
samples_per_level = N_train + N_test

# Split per noise level
X_train_list, y_train_list = [], []
X_test_list, y_test_list = [], []

for i in range(len(noise_levels)):
    start = i * samples_per_level
    end = start + samples_per_level

    X_level = augmented_data[start:end]
    y_level = augmented_targets[start:end]

    X_train_list.append(X_level[:N_train])
    y_train_list.append(y_level[:N_train])
    X_test_list.append(X_level[N_train:])
    y_test_list.append(y_level[N_train:])

X_train = torch.cat(X_train_list, dim=0)
y_train = torch.cat(y_train_list, dim=0)
X_test = torch.cat(X_test_list, dim=0)
y_test = torch.cat(y_test_list, dim=0)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# Create noise level labels for training set
noise_levels_train_list = []

for i, level in enumerate(noise_levels):
    noise_level_tensor = torch.full((N_train,), fill_value=i)
    noise_levels_train_list.append(noise_level_tensor)

noise_levels_train = torch.cat(noise_levels_train_list, dim=0)
print(f"noise_levels_train shape: {noise_levels_train.shape}")


X_train: torch.Size([300000, 1, 28, 28]), y_train: torch.Size([300000])
X_test: torch.Size([50000, 1, 28, 28]), y_test: torch.Size([50000])
noise_levels_train shape: torch.Size([300000])


## Analysis

### (0) Base Case

In [96]:
# DataLoaders
batch_size = 64
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # random sampling baseline
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training setup
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss, total_correct = 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        total_correct += (outputs.argmax(1) == labels).sum().item()

    train_acc = total_correct / len(train_dataset)
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {total_loss/len(train_dataset):.4f}, Train Acc: {train_acc:.4f}")

# Evaluate
model.eval()
correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        correct += (outputs.argmax(1) == labels).sum().item()
test_acc = correct / len(test_dataset)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/1 - Train Loss: 0.1343, Train Acc: 0.9570
Test Accuracy: 0.9753


### (1) Curriculum Learning

#### (1.1) Cumulative Curriculum Learning

In [97]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # can increase to 2–3 if needed
num_stages = 5

for stage in range(num_stages):
    print(f"\nStage {stage+1}/{num_stages}: Using noise levels <= {stage}")
    
    # Select training data up to current noise level
    stage_mask = noise_levels_train <= stage
    X_stage = X_train[stage_mask]
    y_stage = y_train[stage_mask]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Stage 1/5: Using noise levels <= 0
  Epoch 1/1 - Loss: 0.1861, Acc: 0.9440

Stage 2/5: Using noise levels <= 1
  Epoch 1/1 - Loss: 0.0521, Acc: 0.9836

Stage 3/5: Using noise levels <= 2
  Epoch 1/1 - Loss: 0.0376, Acc: 0.9879

Stage 4/5: Using noise levels <= 3
  Epoch 1/1 - Loss: 0.0372, Acc: 0.9876

Stage 5/5: Using noise levels <= 4
  Epoch 1/1 - Loss: 0.0451, Acc: 0.9849

Final Test Accuracy: 0.9805


#### (1.2) Strict Curriculum Learning

In [98]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # can increase to 2–3 if needed
num_stages = 5

for stage in range(num_stages):
    print(f"\nStage {stage+1}/{num_stages}: Using noise level {stage}")
    
    # Select training data up to current noise level
    stage_mask = noise_levels_train == stage
    X_stage = X_train[stage_mask]
    y_stage = y_train[stage_mask]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Stage 1/5: Using noise level 0
  Epoch 1/1 - Loss: 0.1814, Acc: 0.9456

Stage 2/5: Using noise level 1
  Epoch 1/1 - Loss: 0.0740, Acc: 0.9767

Stage 3/5: Using noise level 2
  Epoch 1/1 - Loss: 0.0755, Acc: 0.9765

Stage 4/5: Using noise level 3
  Epoch 1/1 - Loss: 0.1009, Acc: 0.9669

Stage 5/5: Using noise level 4
  Epoch 1/1 - Loss: 0.1569, Acc: 0.9482

Final Test Accuracy: 0.9761


### (2) Self-Paced Learning

We will use the same X_train, X_test, y_train and y_test computed in the curriculum learning phase.
To implement the SPL we will introduce a difficulty threshold to let the model choose how many samples of this difficulty it is ready to handle.

Because our objective is to see the influence of order on the training, we will stop training once the model saw the entire dataset. To be fair with the other techniques where they see the data only once.

In [99]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss(reduction='none')  # Important: per-sample loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

batch_size = 64
epochs = 10
initial_lambda = 2.25  # initial difficulty threshold
lambda_increment = 2  # increase per epoch

# Wrap all training data in a dataset/loader for loss evaluation
full_train_dataset = TensorDataset(X_train, y_train)
full_train_loader = DataLoader(full_train_dataset, batch_size=batch_size, shuffle=False)

for epoch in range(epochs):
    model.eval()
    all_losses = []

    # Compute per-sample losses on the full training set
    with torch.no_grad():
        for images, labels in full_train_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            losses = criterion(outputs, labels)  # shape: (batch_size,)
            all_losses.append(losses.cpu())

    all_losses = torch.cat(all_losses)  # shape: (N,)
    print(torch.min(all_losses))
    print(torch.max(all_losses))

    # Determine current lambda threshold
    lambda_threshold = initial_lambda + epoch * lambda_increment

    # Select indices where loss <= lambda_threshold
    selected_indices = (all_losses <= lambda_threshold).nonzero(as_tuple=True)[0]

    if len(selected_indices) == 0:
        print(f"Epoch {epoch+1}: No samples selected for training (lambda={lambda_threshold:.3f}), stopping early.")
        break

    print(f"Epoch {epoch+1}: lambda={lambda_threshold:.3f}, selected {len(selected_indices)}/{len(X_train)} samples")

    # Create subset dataset and loader for training
    train_subset = TensorDataset(X_train[selected_indices], y_train[selected_indices])
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)

    # Train on selected samples
    model.train()
    total_loss, total_correct = 0, 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels).mean()  # mean loss for batch
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        total_correct += (outputs.argmax(1) == labels).sum().item()

    acc = total_correct / len(train_subset)
    print(f"  Training Loss: {total_loss/len(train_subset):.4f}, Accuracy: {acc:.4f}")

    if len(selected_indices) == len(X_train):
        print(f"  All samples were selected, stopping early.")
        break

# Evaluate on test set after training
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


tensor(2.1980)
tensor(2.4034)
Epoch 1: lambda=2.250, selected 55794/300000 samples
  Training Loss: 0.0936, Accuracy: 0.9664
tensor(-0.)
tensor(36.0352)
Epoch 2: lambda=4.250, selected 91538/300000 samples
  Training Loss: 0.0187, Accuracy: 0.9934
tensor(-0.)
tensor(52.5265)
Epoch 3: lambda=6.250, selected 91584/300000 samples
  Training Loss: 0.0116, Accuracy: 0.9961
tensor(-0.)
tensor(59.6773)
Epoch 4: lambda=8.250, selected 112936/300000 samples
  Training Loss: 0.0454, Accuracy: 0.9856
tensor(-0.)
tensor(71.6176)
Epoch 5: lambda=10.250, selected 253725/300000 samples
  Training Loss: 0.0603, Accuracy: 0.9798
tensor(-0.)
tensor(33.8791)
Epoch 6: lambda=12.250, selected 299480/300000 samples
  Training Loss: 0.0492, Accuracy: 0.9835
tensor(-0.)
tensor(29.5966)
Epoch 7: lambda=14.250, selected 299988/300000 samples
  Training Loss: 0.0327, Accuracy: 0.9891
tensor(-0.)
tensor(32.8389)
Epoch 8: lambda=16.250, selected 299995/300000 samples
  Training Loss: 0.0222, Accuracy: 0.9924
tenso

An idea could be to combine the difficulty scores from the noise levels with the increments in SLP:


**Option A: Use Noise Difficulty as a Prior or Weight for Lambda Threshold**

- Adjust the SPL threshold (`λ`) for each sample by incorporating its noise difficulty:

\[
\lambda_i = \lambda_{\text{base}} + \alpha \times \text{noise\_level}_i
\]

- Samples with higher noise difficulty require a higher loss to be included, effectively entering the curriculum later.

**Option B: Use Noise Difficulty for Initial Sample Filtering**

- Start SPL training using only samples with noise difficulty below a certain threshold (e.g., noise_level ≤ 0.2).
- Gradually expand the training set to include samples with higher noise difficulty as training progresses.

**Option C: Weighted Loss or Thresholding by Noise Difficulty Quantiles**

- Group samples by their noise difficulty levels.
- Compute separate SPL loss thresholds for each noise group.
- Allow lower thresholds (easier inclusion) for low-noise groups and higher thresholds for high-noise groups.
- This respects both the *a priori* noise difficulty and the *dynamic* training loss.

### (3) Hard-Example Mining

In [100]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

batch_size = 64
epochs = 5

# Define what "hard" means: top 1 or 2 noise levels (e.g., 0.3 and 0.4)
# If noise_levels_train goes from 0 to 4 (for noise 0.0 to 0.4), we can take levels >= 3
hard_mask = noise_levels_train >= 3

# Select hard examples
X_hard = X_train[hard_mask]
y_hard = y_train[hard_mask]
print(f"Selected {len(X_hard)} hard examples out of {len(X_train)}")

# Wrap in dataset and loader
hard_dataset = TensorDataset(X_hard, y_hard)
hard_loader = DataLoader(hard_dataset, batch_size=batch_size, shuffle=True)

# Training loop on hard examples
for epoch in range(epochs):
    model.train()
    total_loss, total_correct = 0, 0

    for images, labels in hard_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * labels.size(0)
        total_correct += (outputs.argmax(1) == labels).sum().item()

    acc = total_correct / len(hard_dataset)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(hard_dataset):.4f}, Accuracy: {acc:.4f}")

# Final test evaluation
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Selected 120000 hard examples out of 300000
Epoch 1/5 - Loss: 0.3107, Accuracy: 0.8999
Epoch 2/5 - Loss: 0.1357, Accuracy: 0.9560
Epoch 3/5 - Loss: 0.1021, Accuracy: 0.9666
Epoch 4/5 - Loss: 0.0814, Accuracy: 0.9735
Epoch 5/5 - Loss: 0.0652, Accuracy: 0.9782

Final Test Accuracy: 0.9783


### (4) Reverse Curriculum Learning

#### (4.1) Cumulative Reverse Curriculum Learning

In [101]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # can increase to 2–3 if needed
num_stages = 5

for stage in reversed(range(num_stages)):
    print(f"\nStage {stage+1}/{num_stages}: Using noise levels >= {stage}")
    
    # Select training data up to current noise level
    stage_mask = noise_levels_train >= stage
    X_stage = X_train[stage_mask]
    y_stage = y_train[stage_mask]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Stage 5/5: Using noise levels >= 4
  Epoch 1/1 - Loss: 0.5953, Acc: 0.8060

Stage 4/5: Using noise levels >= 3
  Epoch 1/1 - Loss: 0.2008, Acc: 0.9357

Stage 3/5: Using noise levels >= 2
  Epoch 1/1 - Loss: 0.1235, Acc: 0.9603

Stage 2/5: Using noise levels >= 1
  Epoch 1/1 - Loss: 0.0852, Acc: 0.9719

Stage 1/5: Using noise levels >= 0
  Epoch 1/1 - Loss: 0.0614, Acc: 0.9797

Final Test Accuracy: 0.9734


#### (4.2) Strict Reverse Curriculum Learning

In [102]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # can increase to 2–3 if needed
num_stages = 5

for stage in reversed(range(num_stages)):
    print(f"\nStage {stage+1}/{num_stages}: Using noise level {stage}")
    
    # Select training data up to current noise level
    stage_mask = noise_levels_train == stage
    X_stage = X_train[stage_mask]
    y_stage = y_train[stage_mask]

    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Training loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")


Stage 5/5: Using noise level 4
  Epoch 1/1 - Loss: 0.4560, Acc: 0.8542

Stage 4/5: Using noise level 3
  Epoch 1/1 - Loss: 0.1428, Acc: 0.9552

Stage 3/5: Using noise level 2
  Epoch 1/1 - Loss: 0.0800, Acc: 0.9749

Stage 2/5: Using noise level 1
  Epoch 1/1 - Loss: 0.0504, Acc: 0.9842

Stage 1/5: Using noise level 0
  Epoch 1/1 - Loss: 0.0343, Acc: 0.9892

Final Test Accuracy: 0.9454


### (5) Stratified Monte-Carlo Sampling

In [103]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SmallCNN().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Hyperparameters
batch_size = 64
epochs_per_stage = 1  # same as curriculum
num_stages = 5        # noise levels 0–4
samples_per_stage = 10000  # number of examples to sample from each difficulty level

for stage in range(num_stages):
    print(f"\nStage {stage+1}/{num_stages}: Sampling from noise level = {stage}")
    
    # Select indices for current noise level
    stage_mask = (noise_levels_train == stage).nonzero(as_tuple=True)[0]
    
    # Randomly sample without replacement
    if len(stage_mask) < samples_per_stage:
        print(f"  Warning: only {len(stage_mask)} samples available, using all.")
        selected_indices = stage_mask
    else:
        selected_indices = stage_mask[torch.randperm(len(stage_mask))[:samples_per_stage]]
    
    # Prepare subset
    X_stage = X_train[selected_indices]
    y_stage = y_train[selected_indices]
    train_dataset = TensorDataset(X_stage, y_stage)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Train loop for this stage
    for epoch in range(epochs_per_stage):
        model.train()
        total_loss, total_correct = 0, 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * labels.size(0)
            total_correct += (outputs.argmax(1) == labels).sum().item()

        acc = total_correct / len(train_dataset)
        print(f"  Epoch {epoch+1}/{epochs_per_stage} - Loss: {total_loss/len(train_dataset):.4f}, Acc: {acc:.4f}")

# Final test accuracy
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
total_correct = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        total_correct += (outputs.argmax(1) == labels).sum().item()

test_acc = total_correct / len(test_dataset)
print(f"\nFinal Test Accuracy: {test_acc:.4f}")



Stage 1/5: Sampling from noise level = 0
  Epoch 1/1 - Loss: 0.5492, Acc: 0.8339

Stage 2/5: Sampling from noise level = 1
  Epoch 1/1 - Loss: 0.1998, Acc: 0.9399

Stage 3/5: Sampling from noise level = 2
  Epoch 1/1 - Loss: 0.1732, Acc: 0.9461

Stage 4/5: Sampling from noise level = 3
  Epoch 1/1 - Loss: 0.1909, Acc: 0.9376

Stage 5/5: Sampling from noise level = 4
  Epoch 1/1 - Loss: 0.2607, Acc: 0.9171

Final Test Accuracy: 0.9534
