In [1]:
# Standard scientific Python imports
import matplotlib.pyplot as plt

# Import datasets, classifiers and performance metrics
from sklearn import datasets, metrics, svm
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from itertools import product
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Subset

from sklearn.svm import SVC

# SmallCNN

An SGD classifier is not enough to get reliable insights on the CIFAR-10 dataset, so we will use a lightweight CNN. This will allow us to estimate the influence of training order more accurately.

In [None]:
class SmallCNN(nn.Module):
    """Lightweight two-block convolutional classifier.

    Each feature block is Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2), so the
    spatial size is halved twice. The first linear layer expects 64 * 8 * 8
    features, i.e. the network is sized for 32x32 inputs.

    Args:
        num_classes: Number of output logits. Defaults to 10.
        in_channels: Number of channels of the input images. Defaults to 3.
    """

    def __init__(self, num_classes=10, in_channels=3):
        super().__init__()
        # Layer order (and therefore state-dict keys) is kept as
        # features.{0,3} / classifier.{1,3}.
        self.features = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 8 * 8, 128), nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return the raw class logits for a batch of images `x`."""
        return self.classifier(self.features(x))

# CIFAR-10 Dataset

## Data Loading

In [None]:
# Preprocessing shared by both splits: convert each image to a tensor, then
# apply Normalize with mean 0.5 and std 0.5.
_to_tensor = transforms.ToTensor()
_normalize = transforms.Normalize((0.5,), (0.5,))
transform = transforms.Compose([_to_tensor, _normalize])

# Build the train split first, then the test split, both rooted at ../data
# and created with download=True.
cifar_train_dataset, cifar_test_dataset = (
    torchvision.datasets.CIFAR10(root='../data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

## Analysis

### (0) Base Case

As the vanilla base case, we train on the entire dataset, using uniform random shuffling at each epoch.

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")


def evaluate(model, loader):
    """Return the accuracy of `model` on `loader` as a fraction (correct / total).

    The model is switched to eval mode and run without gradient tracking; the
    predicted class is the argmax of the model output along dim 1. Batches are
    moved to the notebook-level `device`.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


vanilla_model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vanilla_model.parameters(), lr=1e-3)

train_loader = DataLoader(cifar_train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(cifar_test_dataset, batch_size=256)

num_epochs = 10
train_accs, test_accs = [], []

for epoch in range(num_epochs):
    # One pass over the (reshuffled) training set.
    vanilla_model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = vanilla_model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate after every epoch (evaluate() is defined once, above the loop,
    # instead of being re-created on each iteration).
    train_acc = evaluate(vanilla_model, train_loader)
    test_acc = evaluate(vanilla_model, test_loader)
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    print(f"Epoch {epoch+1}: Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

### (1) Curriculum Learning

As curriculum learning is based on giving samples to the model in increasing order of difficulty, we first need to define a difficulty function. We will base ours on the L2 distance (in pixel space) between each image and the centroid of its class: the farther an image is from its class centroid, the harder we consider it to be.

#### Pre-analysis

In [None]:
def compute_cifar_difficulty(dataset, num_classes=10):
    """Simple difficulty proxy: L2 distance to the class centroid (in pixel space).

    The whole dataset is loaded as a single batch, every image is flattened to a
    vector, and the centroid of a class is the mean of the flattened images
    carrying that label.

    Args:
        dataset: Dataset yielding `(image, label)` pairs that a `DataLoader` can
            collate into an image tensor and an integer label tensor.
        num_classes: Number of classes; labels must lie in `[0, num_classes)`.
            Defaults to 10.

    Returns:
        1-D float tensor with one entry per sample, in dataset order. Larger
        values mean the image is farther from its class centroid.

    Raises:
        IndexError: If a label falls outside `[0, num_classes)`.
    """
    data_loader = DataLoader(dataset, batch_size=len(dataset))
    images, labels = next(iter(data_loader))
    images = images.reshape(images.size(0), -1)  # Flatten images
    labels = labels.long()

    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(f"labels must be in [0, {num_classes})")

    difficulty = torch.empty(images.size(0))
    # Process one class at a time: the norm is vectorized over the class members
    # (no per-sample Python loop), and the temporary difference tensor is only
    # as large as a single class.
    for class_id in range(num_classes):
        in_class = labels == class_id
        if in_class.any():
            class_images = images[in_class]
            centroid = class_images.mean(dim=0)
            distances = torch.norm(class_images - centroid, dim=1)
            difficulty[in_class] = distances.to(difficulty.dtype)
    return difficulty

#### Analysis

In [None]:
# Prefer CUDA, then Apple MPS (which the other cells of this notebook use), then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


# Compute difficulty and sort
difficulty = compute_cifar_difficulty(cifar_train_dataset)
sorted_indices = torch.argsort(difficulty)  # Ascending: easiest to hardest

# Difficulty curriculum range
percentages = np.linspace(0.1, 1.0, 10)
train_accs, test_accs = [], []

for pct in percentages:
    # Keep the `pct` easiest samples; plain Python ints are handed to Subset.
    n_samples = int(pct * len(cifar_train_dataset))
    selected_indices = sorted_indices[:n_samples].tolist()
    subset = Subset(cifar_train_dataset, selected_indices)
    loader = DataLoader(subset, batch_size=64, shuffle=True)

    # Train for 1 epoch on this subset (the same model keeps training across stages)
    model.train()
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on the current training subset and on the test set
    train_acc = evaluate(DataLoader(subset, batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_dataset, batch_size=256))
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    print(f"Trained on {pct:.1%} data -> Train Acc: {train_acc:.2f}, Test Acc: {test_acc:.2f}")

# Plot results
plt.plot(percentages, train_accs, label="Train Accuracy")
plt.plot(percentages, test_accs, label="Test Accuracy")
plt.xlabel("Training Set Percentage")
plt.ylabel("Accuracy")
plt.title("Curriculum Learning on CIFAR-10 (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

### (2) Self-Paced Learning

In Self-Paced Learning, the model is supposed to:

• learn from easier samples first (based on current loss)

• adaptively expand its training set to include harder samples as it becomes more confident

In [None]:
# Mac GPU code
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple M1/M2 GPU via MPS")
else:
    device = torch.device("cpu")
    print("Using CPU")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss(reduction='none')  # Per-sample loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)

percentages = np.linspace(0.1, 1.0, 10)
batch_size = 128
train_accs, test_accs = [], []


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training with {int(pct * 100)}% easiest samples ---")

    # Score every training sample with the current model.
    model.eval()
    batch_loss_arrays = []
    loader = DataLoader(cifar_train_dataset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            batch_loss_arrays.append(criterion(outputs, labels).cpu().numpy())
    losses = np.concatenate(batch_loss_arrays)

    # The loader is not shuffled, so position i in `losses` is dataset index i;
    # argsort therefore directly yields dataset indices, lowest loss first.
    sorted_idx = np.argsort(losses)
    n_samples = int(pct * len(cifar_train_dataset))
    selected_indices = sorted_idx[:n_samples]

    # One epoch on the currently-easiest samples.
    model.train()
    subset_loader = DataLoader(Subset(cifar_train_dataset, selected_indices), batch_size=batch_size, shuffle=True)
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels).mean()  # criterion is per-sample; reduce here
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(Subset(cifar_train_dataset, selected_indices), batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_dataset, batch_size=256))

    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train acc: {train_acc:.4f} | Test acc: {test_acc:.4f}")

# Plot
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Training Set Percentage")
plt.ylabel("Accuracy")
plt.title("Self-Paced Learning on CIFAR-10 (Small CNN)")
plt.grid(True)
plt.legend()
plt.show()

### (3) Hard-Example Mining

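Hard-Example Mining consists of feeding the model only hard examples. In our case, we will consider a sample to be difficult if its normalized difficulty is greater than or equal to 0.75 (in other words, the top 25%). Note that the implementation below does not use a fixed 0.75 threshold: it ranks the samples by their current per-sample loss and sweeps the fraction of hardest samples kept for training from 10% to 100%.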

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss(reduction='none')  # needed for per-sample loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)

percentages = np.linspace(0.1, 1.0, 10)  # percent of hardest samples to train on
batch_size = 128
train_accs, test_accs = [], []


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training with top {int(pct * 100)}% hardest samples ---")

    # Score every training sample with the current model.
    model.eval()
    batch_loss_arrays = []
    loader = DataLoader(cifar_train_dataset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            batch_loss_arrays.append(criterion(outputs, labels).cpu().numpy())
    losses = np.concatenate(batch_loss_arrays)

    # The loader is not shuffled, so position i in `losses` is dataset index i;
    # sorting the negated losses yields dataset indices, highest loss first.
    sorted_idx = np.argsort(-losses)  # descending
    n_samples = int(pct * len(cifar_train_dataset))
    selected_indices = sorted_idx[:n_samples]

    # One epoch on the currently-hardest samples.
    model.train()
    subset_loader = DataLoader(Subset(cifar_train_dataset, selected_indices), batch_size=batch_size, shuffle=True)
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels).mean()  # criterion is per-sample; reduce here
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(Subset(cifar_train_dataset, selected_indices), batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_dataset, batch_size=256))
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

# plot results
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Top % Hardest Samples")
plt.ylabel("Accuracy")
plt.title("Hard Example Mining on CIFAR-10 (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

### (4) Reverse Curriculum Learning

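We are implementing **Reverse Curriculum Learning (RCL)**, where the model starts learning from easier goals that are close to the target and gradually works backwards to more challenging starting states. In the implementation below, the samples are ordered from hardest to easiest using the centroid-distance difficulty, and the training set grows from the hardest 10% to the full dataset.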

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss(reduction='none')  # per-sample loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)
batch_size = 128
percentages = np.linspace(0.1, 1.0, 10)

train_accs, test_accs = [], []


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


difficulty = compute_cifar_difficulty(cifar_train_dataset)
sorted_indices = torch.argsort(difficulty, descending=True)  # hardest first

for pct in percentages:
    print(f"\n--- Training with top {int(pct * 100)}% hardest samples ---")

    # Keep the `pct` hardest samples; plain Python ints are handed to Subset.
    n_samples = int(pct * len(cifar_train_dataset))
    selected_indices = sorted_indices[:n_samples].tolist()
    subset_loader = DataLoader(Subset(cifar_train_dataset, selected_indices), batch_size=batch_size, shuffle=True)

    # One epoch on this subset (the same model keeps training across stages).
    model.train()
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels).mean()  # criterion is per-sample; reduce here
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(Subset(cifar_train_dataset, selected_indices), batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_dataset, batch_size=256))
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

# plot
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Top % Hardest Samples")
plt.ylabel("Accuracy")
plt.title("Reverse Curriculum Learning on CIFAR-10 (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

### (5) Stratified Monte-Carlo Sampling

**Stratified Monte Carlo Sampling** is a variance reduction technique where the input space is divided into distinct strata (subregions), and samples are drawn from each stratum. This ensures more uniform coverage of the space compared to standard Monte Carlo sampling, leading to more accurate and stable estimates with fewer samples.

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

difficulty = compute_cifar_difficulty(cifar_train_dataset)
difficulty = (difficulty - difficulty.min()) / (difficulty.max() - difficulty.min())  # Normalize to [0, 1]

# Stratify into bins
num_bins = 10
bin_edges = torch.linspace(0, 1, num_bins + 1)

# bucketize(right=False) returns 0 for a value equal to the first edge (the
# normalized minimum, 0.0). Subtracting 1 would then give -1, which negative
# list indexing files into the LAST bin, so the ids are clamped into
# [0, num_bins - 1]. Done in one vectorized call instead of once per sample.
bin_ids = (torch.bucketize(difficulty, bin_edges, right=False) - 1).clamp(min=0, max=num_bins - 1)
bin_indices = [torch.nonzero(bin_ids == b).flatten().tolist() for b in range(num_bins)]

# Training loop
model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
batch_size = 128
percentages = np.linspace(0.1, 1.0, 10)
train_accs, test_accs = [], []


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training on {int(pct * 100)}% stratified samples ---")
    samples_per_bin = int(pct * len(cifar_train_dataset) / num_bins)

    # Draw (without replacement) the same quota from every difficulty bin; a bin
    # holding fewer samples than the quota contributes all of its samples.
    # NOTE(review): np.random is not seeded in this cell, so the draw differs
    # between runs.
    sampled_indices = []
    for indices in bin_indices:
        sampled = np.random.choice(indices, min(samples_per_bin, len(indices)), replace=False)
        sampled_indices.extend(sampled)

    subset_loader = DataLoader(Subset(cifar_train_dataset, sampled_indices), batch_size=batch_size, shuffle=True)

    # One epoch on the stratified sample.
    model.train()
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluation
    train_acc = evaluate(DataLoader(Subset(cifar_train_dataset, sampled_indices), batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_dataset, batch_size=256))
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

# Plot
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Training Set Percentage (Stratified)")
plt.ylabel("Accuracy")
plt.title("Stratified Monte Carlo Sampling on CIFAR-10 (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

# CIFAR-10 Dataset with Gaussian Noise

## Data Loading

In [None]:
# NOTE(review): placeholders. The Gaussian-noise train/test datasets are not
# built here. The analysis cells of this section pass these names to
# DataLoader / Subset / len(), so they must be replaced with real datasets
# before the section can run. TODO: implement the noisy datasets.
cifar_train_GN = None
cifar_test_GN = None

## Analysis

### (0) Base Case

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")


def evaluate(model, loader):
    """Return the accuracy of `model` on `loader` as a fraction (correct / total).

    The model is switched to eval mode and run without gradient tracking; the
    predicted class is the argmax of the model output along dim 1. Batches are
    moved to the notebook-level `device`.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


vanilla_model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vanilla_model.parameters(), lr=1e-3)

train_loader = DataLoader(cifar_train_GN, batch_size=128, shuffle=True)
test_loader = DataLoader(cifar_test_GN, batch_size=256)

num_epochs = 10
train_accs, test_accs = [], []

for epoch in range(num_epochs):
    # One pass over the (reshuffled) training set.
    vanilla_model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = vanilla_model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate after every epoch (evaluate() is defined once, above the loop,
    # instead of being re-created on each iteration).
    train_acc = evaluate(vanilla_model, train_loader)
    test_acc = evaluate(vanilla_model, test_loader)
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    print(f"Epoch {epoch+1}: Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

### (1) Curriculum Learning

#### Pre-analysis

In [None]:
# NOTE(review): this function is also defined earlier in this notebook; consider
# keeping a single definition.
def compute_cifar_difficulty(dataset, num_classes=10):
    """Simple difficulty proxy: L2 distance to the class centroid (in pixel space).

    The whole dataset is loaded as a single batch, every image is flattened to a
    vector, and the centroid of a class is the mean of the flattened images
    carrying that label.

    Args:
        dataset: Dataset yielding `(image, label)` pairs that a `DataLoader` can
            collate into an image tensor and an integer label tensor.
        num_classes: Number of classes; labels must lie in `[0, num_classes)`.
            Defaults to 10.

    Returns:
        1-D float tensor with one entry per sample, in dataset order. Larger
        values mean the image is farther from its class centroid.

    Raises:
        IndexError: If a label falls outside `[0, num_classes)`.
    """
    data_loader = DataLoader(dataset, batch_size=len(dataset))
    images, labels = next(iter(data_loader))
    images = images.reshape(images.size(0), -1)  # Flatten images
    labels = labels.long()

    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(f"labels must be in [0, {num_classes})")

    difficulty = torch.empty(images.size(0))
    # Process one class at a time: the norm is vectorized over the class members
    # (no per-sample Python loop), and the temporary difference tensor is only
    # as large as a single class.
    for class_id in range(num_classes):
        in_class = labels == class_id
        if in_class.any():
            class_images = images[in_class]
            centroid = class_images.mean(dim=0)
            distances = torch.norm(class_images - centroid, dim=1)
            difficulty[in_class] = distances.to(difficulty.dtype)
    return difficulty

#### Analysis

In [None]:
# Prefer CUDA, then Apple MPS (which the other cells of this notebook use), then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


# Compute difficulty and sort
difficulty = compute_cifar_difficulty(cifar_train_GN)
sorted_indices = torch.argsort(difficulty)  # Ascending: easiest to hardest

# Difficulty curriculum range
percentages = np.linspace(0.1, 1.0, 10)
train_accs, test_accs = [], []

for pct in percentages:
    # Keep the `pct` easiest samples; plain Python ints are handed to Subset.
    n_samples = int(pct * len(cifar_train_GN))
    selected_indices = sorted_indices[:n_samples].tolist()
    subset = Subset(cifar_train_GN, selected_indices)
    loader = DataLoader(subset, batch_size=64, shuffle=True)

    # Train for 1 epoch on this subset (the same model keeps training across stages)
    model.train()
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on the current training subset and on the Gaussian-noise test
    # set (previously the clean cifar_test_dataset was used here, unlike the
    # other cells of this section).
    train_acc = evaluate(DataLoader(subset, batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_GN, batch_size=256))
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    print(f"Trained on {pct:.1%} data -> Train Acc: {train_acc:.2f}, Test Acc: {test_acc:.2f}")

# Plot results
plt.plot(percentages, train_accs, label="Train Accuracy")
plt.plot(percentages, test_accs, label="Test Accuracy")
plt.xlabel("Training Set Percentage")
plt.ylabel("Accuracy")
plt.title("Curriculum Learning on CIFAR-10 with Gaussian Noise (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

### (2) Self-Paced Learning

In [None]:
# Mac GPU code
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple M1/M2 GPU via MPS")
else:
    device = torch.device("cpu")
    print("Using CPU")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss(reduction='none')  # Per-sample loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)

percentages = np.linspace(0.1, 1.0, 10)
batch_size = 128
train_accs, test_accs = [], []


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training with {int(pct * 100)}% easiest samples ---")

    # Score every training sample with the current model.
    model.eval()
    batch_loss_arrays = []
    loader = DataLoader(cifar_train_GN, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            batch_loss_arrays.append(criterion(outputs, labels).cpu().numpy())
    losses = np.concatenate(batch_loss_arrays)

    # The loader is not shuffled, so position i in `losses` is dataset index i;
    # argsort therefore directly yields dataset indices, lowest loss first.
    sorted_idx = np.argsort(losses)
    n_samples = int(pct * len(cifar_train_GN))
    selected_indices = sorted_idx[:n_samples]

    # One epoch on the currently-easiest samples.
    model.train()
    subset_loader = DataLoader(Subset(cifar_train_GN, selected_indices), batch_size=batch_size, shuffle=True)
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels).mean()  # criterion is per-sample; reduce here
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(Subset(cifar_train_GN, selected_indices), batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_GN, batch_size=256))

    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train acc: {train_acc:.4f} | Test acc: {test_acc:.4f}")

# Plot
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Training Set Percentage")
plt.ylabel("Accuracy")
plt.title("Self-Paced Learning on CIFAR-10 with Gaussian Noise (Small CNN)")
plt.grid(True)
plt.legend()
plt.show()

### (3) Hard-Example Mining

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss(reduction='none')  # needed for per-sample loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)

percentages = np.linspace(0.1, 1.0, 10)  # percent of hardest samples to train on
batch_size = 128
train_accs, test_accs = [], []


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training with top {int(pct * 100)}% hardest samples ---")

    # Score every training sample with the current model.
    model.eval()
    batch_loss_arrays = []
    loader = DataLoader(cifar_train_GN, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            batch_loss_arrays.append(criterion(outputs, labels).cpu().numpy())
    losses = np.concatenate(batch_loss_arrays)

    # The loader is not shuffled, so position i in `losses` is dataset index i;
    # sorting the negated losses yields dataset indices, highest loss first.
    sorted_idx = np.argsort(-losses)  # descending
    n_samples = int(pct * len(cifar_train_GN))
    selected_indices = sorted_idx[:n_samples]

    # One epoch on the currently-hardest samples.
    model.train()
    subset_loader = DataLoader(Subset(cifar_train_GN, selected_indices), batch_size=batch_size, shuffle=True)
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels).mean()  # criterion is per-sample; reduce here
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(Subset(cifar_train_GN, selected_indices), batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_GN, batch_size=256))
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

# plot results
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Top % Hardest Samples")
plt.ylabel("Accuracy")
plt.title("Hard Example Mining on CIFAR-10 with Gaussian Noise (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

### (4) Reverse Curriculum Learning

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss(reduction='none')  # per-sample loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)
batch_size = 128
percentages = np.linspace(0.1, 1.0, 10)

train_accs, test_accs = [], []


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


difficulty = compute_cifar_difficulty(cifar_train_GN)
sorted_indices = torch.argsort(difficulty, descending=True)  # hardest first

for pct in percentages:
    print(f"\n--- Training with top {int(pct * 100)}% hardest samples ---")

    # Keep the `pct` hardest samples; plain Python ints are handed to Subset.
    n_samples = int(pct * len(cifar_train_GN))
    selected_indices = sorted_indices[:n_samples].tolist()
    subset_loader = DataLoader(Subset(cifar_train_GN, selected_indices), batch_size=batch_size, shuffle=True)

    # One epoch on this subset (the same model keeps training across stages).
    model.train()
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels).mean()  # criterion is per-sample; reduce here
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(Subset(cifar_train_GN, selected_indices), batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_GN, batch_size=256))
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

# plot
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Top % Hardest Samples")
plt.ylabel("Accuracy")
plt.title("Reverse Curriculum Learning on CIFAR-10 with Gaussian Noise (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

### (5) Stratified Monte-Carlo Sampling

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

difficulty = compute_cifar_difficulty(cifar_train_GN)
difficulty = (difficulty - difficulty.min()) / (difficulty.max() - difficulty.min())  # Normalize to [0, 1]

# Stratify into bins
num_bins = 10
bin_edges = torch.linspace(0, 1, num_bins + 1)

# bucketize(right=False) returns 0 for a value equal to the first edge (the
# normalized minimum, 0.0). Subtracting 1 would then give -1, which negative
# list indexing files into the LAST bin, so the ids are clamped into
# [0, num_bins - 1]. Done in one vectorized call instead of once per sample.
bin_ids = (torch.bucketize(difficulty, bin_edges, right=False) - 1).clamp(min=0, max=num_bins - 1)
bin_indices = [torch.nonzero(bin_ids == b).flatten().tolist() for b in range(num_bins)]

# Training loop
model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
batch_size = 128
percentages = np.linspace(0.1, 1.0, 10)
train_accs, test_accs = [], []


def evaluate(loader):
    """Return the accuracy (correct / total) of the notebook-level `model` on `loader`.

    `model` and `device` are read at call time; the model is put in eval mode
    and run without gradient tracking.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training on {int(pct * 100)}% stratified samples ---")
    samples_per_bin = int(pct * len(cifar_train_GN) / num_bins)

    # Draw (without replacement) the same quota from every difficulty bin; a bin
    # holding fewer samples than the quota contributes all of its samples.
    # NOTE(review): np.random is not seeded in this cell, so the draw differs
    # between runs.
    sampled_indices = []
    for indices in bin_indices:
        sampled = np.random.choice(indices, min(samples_per_bin, len(indices)), replace=False)
        sampled_indices.extend(sampled)

    subset_loader = DataLoader(Subset(cifar_train_GN, sampled_indices), batch_size=batch_size, shuffle=True)

    # One epoch on the stratified sample.
    model.train()
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluation
    train_acc = evaluate(DataLoader(Subset(cifar_train_GN, sampled_indices), batch_size=256))
    test_acc = evaluate(DataLoader(cifar_test_GN, batch_size=256))
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

# plot
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Training Set Percentage (Stratified)")
plt.ylabel("Accuracy")
plt.title("Stratified Monte Carlo Sampling on CIFAR-10 with Gaussian Noise (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

# CIFAR-10 Dataset with Impulse Noise

## Data Loading

In [None]:
# NOTE(review): placeholders. The impulse-noise train/test datasets are not
# built here. The analysis cells of this section pass these names to
# DataLoader / Subset / len(), so they must be replaced with real datasets
# before the section can run. TODO: implement the noisy datasets.
cifar_train_IN = None
cifar_test_IN = None

## Analysis

### (0) Base Case

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")


def evaluate(model, loader):
    """Return the accuracy of `model` on `loader` as a fraction (correct / total).

    The model is switched to eval mode and run without gradient tracking; the
    predicted class is the argmax of the model output along dim 1. Batches are
    moved to the notebook-level `device`.
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


vanilla_model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(vanilla_model.parameters(), lr=1e-3)

train_loader = DataLoader(cifar_train_IN, batch_size=128, shuffle=True)
test_loader = DataLoader(cifar_test_IN, batch_size=256)

num_epochs = 10
train_accs, test_accs = [], []

for epoch in range(num_epochs):
    # One pass over the (reshuffled) training set.
    vanilla_model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = vanilla_model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate after every epoch (evaluate() is defined once, above the loop,
    # instead of being re-created on each iteration).
    train_acc = evaluate(vanilla_model, train_loader)
    test_acc = evaluate(vanilla_model, test_loader)
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    print(f"Epoch {epoch+1}: Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

### (1) Curriculum Learning

#### Pre-analysis

In [None]:
def compute_cifar_difficulty(dataset, num_classes=10):
    """Score each sample by its L2 distance to its class centroid in pixel space.

    The whole dataset is loaded as a single batch and every image is flattened
    to a vector. For each class the centroid is the mean of that class's
    flattened images, and a sample's difficulty is the Euclidean distance
    between its vector and the centroid of its own class.

    Args:
        dataset: Dataset yielding ``(image_tensor, label)`` pairs that the
            default DataLoader collation can stack into one batch.
        num_classes: Number of class labels to consider; labels are assumed
            to lie in ``[0, num_classes)``. Samples with a label outside that
            range keep a difficulty of 0.

    Returns:
        1-D tensor with one difficulty score per sample, in dataset order.
    """
    data_loader = DataLoader(dataset, batch_size=len(dataset))
    images, labels = next(iter(data_loader))
    images = images.flatten(start_dim=1)  # (N, C, H, W) -> (N, C*H*W)

    difficulty = torch.zeros(images.size(0), dtype=images.dtype)
    # One vectorised pass per class instead of one Python-level norm per image.
    for cls in range(num_classes):
        mask = labels == cls
        if not mask.any():
            # No samples of this class: nothing to score, and no NaN centroid.
            continue
        class_images = images[mask]
        centroid = class_images.mean(dim=0)
        difficulty[mask] = torch.norm(class_images - centroid, dim=1)
    return difficulty

#### Analysis

In [None]:
# Pick the fastest available backend: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Rank the training samples once; ascending order means easiest first.
difficulty = compute_cifar_difficulty(cifar_train_IN)
sorted_indices = torch.argsort(difficulty)

# Fractions of the (easiest-first) training set used at each curriculum stage.
percentages = np.linspace(0.1, 1.0, 10)
train_accs, test_accs = [], []

# Evaluate on the impulse-noise test split, matching the training data and
# the other experiments in this section.
test_loader_IN = DataLoader(cifar_test_IN, batch_size=256)


def evaluate(loader):
    """Return the accuracy of the module-level `model` over `loader`."""
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    n_samples = int(pct * len(cifar_train_IN))
    # Plain Python ints keep Subset indexing independent of tensor semantics.
    selected_indices = sorted_indices[:n_samples].tolist()
    subset = Subset(cifar_train_IN, selected_indices)
    loader = DataLoader(subset, batch_size=64, shuffle=True)

    # Train for 1 epoch on this subset; the same model carries over between
    # stages, so later stages build on what the easier samples taught it.
    model.train()
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(subset, batch_size=256))
    test_acc = evaluate(test_loader_IN)
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    print(f"Trained on {pct:.1%} data -> Train Acc: {train_acc:.2f}, Test Acc: {test_acc:.2f}")

# Plot results
plt.plot(percentages, train_accs, label="Train Accuracy")
plt.plot(percentages, test_accs, label="Test Accuracy")
plt.xlabel("Training Set Percentage")
plt.ylabel("Accuracy")
plt.title("Curriculum Learning on CIFAR-10 with Impulse Noise (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

### (2) Self-Paced Learning

In [None]:
# Pick the fastest available backend: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA GPU")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple M1/M2 GPU via MPS")
else:
    device = torch.device("cpu")
    print("Using CPU")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss(reduction='none')  # Per-sample loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)

percentages = np.linspace(0.1, 1.0, 10)
batch_size = 128
train_accs, test_accs = [], []

# Unshuffled pass over the full training set: position k in the concatenated
# losses is dataset index k, so argsort of the losses yields dataset indices.
scoring_loader = DataLoader(cifar_train_IN, batch_size=batch_size, shuffle=False)
test_loader_IN = DataLoader(cifar_test_IN, batch_size=256)


def evaluate(loader):
    """Return the accuracy of the module-level `model` over `loader`."""
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training with {int(pct * 100)}% easiest samples ---")

    # Score every training sample with the current model.
    model.eval()
    batch_loss_chunks = []
    with torch.no_grad():
        for images, labels in scoring_loader:
            images, labels = images.to(device), labels.to(device)
            batch_loss_chunks.append(criterion(model(images), labels).cpu())
    losses = torch.cat(batch_loss_chunks).numpy()

    # Lowest loss first: these are the samples the model currently finds easiest.
    sorted_idx = np.argsort(losses)
    n_samples = int(pct * len(cifar_train_IN))
    selected_indices = sorted_idx[:n_samples]
    selected_subset = Subset(cifar_train_IN, selected_indices)

    # One epoch on the selected subset; per-sample losses are averaged for the step.
    model.train()
    subset_loader = DataLoader(selected_subset, batch_size=batch_size, shuffle=True)
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(selected_subset, batch_size=256))
    test_acc = evaluate(test_loader_IN)

    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train acc: {train_acc:.4f} | Test acc: {test_acc:.4f}")

# Plot
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Training Set Percentage")
plt.ylabel("Accuracy")
plt.title("Self-Paced Learning on CIFAR-10 with Impulse Noise (Small CNN)")
plt.grid(True)
plt.legend()
plt.show()

### (3) Hard-Example Mining

In [None]:
# Pick the fastest available backend: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss(reduction='none')  # needed for per-sample loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)

percentages = np.linspace(0.1, 1.0, 10)  # percent of hardest samples to train on
batch_size = 128
train_accs, test_accs = [], []

# Unshuffled pass over the full training set: position k in the concatenated
# losses is dataset index k, so argsort of the losses yields dataset indices.
scoring_loader = DataLoader(cifar_train_IN, batch_size=batch_size, shuffle=False)
test_loader_IN = DataLoader(cifar_test_IN, batch_size=256)


def evaluate(loader):
    """Return the accuracy of the module-level `model` over `loader`."""
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training with top {int(pct * 100)}% hardest samples ---")

    # Score every training sample with the current model.
    model.eval()
    batch_loss_chunks = []
    with torch.no_grad():
        for images, labels in scoring_loader:
            images, labels = images.to(device), labels.to(device)
            batch_loss_chunks.append(criterion(model(images), labels).cpu())
    losses = torch.cat(batch_loss_chunks).numpy()

    # Highest loss first: these are the samples the model currently gets most wrong.
    sorted_idx = np.argsort(-losses)
    n_samples = int(pct * len(cifar_train_IN))
    selected_indices = sorted_idx[:n_samples]
    selected_subset = Subset(cifar_train_IN, selected_indices)

    # One epoch on the selected subset; per-sample losses are averaged for the step.
    model.train()
    subset_loader = DataLoader(selected_subset, batch_size=batch_size, shuffle=True)
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(selected_subset, batch_size=256))
    test_acc = evaluate(test_loader_IN)
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

# plot results
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Top % Hardest Samples")
plt.ylabel("Accuracy")
plt.title("Hard Example Mining on CIFAR-10 with Impulse Noise (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

### (4) Reverse Curriculum Learning

In [None]:
# Pick the fastest available backend: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss(reduction='none')  # per-sample loss
optimizer = optim.Adam(model.parameters(), lr=1e-3)
batch_size = 128
percentages = np.linspace(0.1, 1.0, 10)

train_accs, test_accs = [], []

# Rank the impulse-noise training samples once; descending order means the
# samples farthest from their class centroid come first.
difficulty = compute_cifar_difficulty(cifar_train_IN)
sorted_indices = torch.argsort(difficulty, descending=True)  # hardest first

test_loader_IN = DataLoader(cifar_test_IN, batch_size=256)


def evaluate(loader):
    """Return the accuracy of the module-level `model` over `loader`."""
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training with top {int(pct * 100)}% hardest samples ---")

    n_samples = int(pct * len(cifar_train_IN))
    # Plain Python ints keep Subset indexing independent of tensor semantics.
    selected_indices = sorted_indices[:n_samples].tolist()
    # Train on the same impulse-noise dataset the difficulty ranking and the
    # train-accuracy evaluation below refer to.
    selected_subset = Subset(cifar_train_IN, selected_indices)
    subset_loader = DataLoader(selected_subset, batch_size=batch_size, shuffle=True)

    model.train()
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(selected_subset, batch_size=256))
    test_acc = evaluate(test_loader_IN)
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

# plot
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Top % Hardest Samples")
plt.ylabel("Accuracy")
plt.title("Reverse Curriculum Learning on CIFAR-10 with Impulse Noise (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()

### (5) Stratified Monte-Carlo Sampling

In [None]:
# Pick the fastest available backend: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

difficulty = compute_cifar_difficulty(cifar_train_IN)
difficulty = (difficulty - difficulty.min()) / (difficulty.max() - difficulty.min())  # Normalize to [0, 1]

# Stratify into equal-width difficulty bins.
num_bins = 10
bin_edges = torch.linspace(0, 1, num_bins + 1)

# bucketize(..., right=False) maps a value equal to the first edge (the
# normalised minimum, exactly 0) to bucket 0, so subtracting 1 would give -1
# and silently file the easiest sample into the last (hardest) bin via
# negative indexing. Clamping keeps every id inside [0, num_bins - 1].
bin_ids = (torch.bucketize(difficulty, bin_edges, right=False) - 1).clamp(min=0, max=num_bins - 1)
bin_indices = [
    torch.nonzero(bin_ids == b).flatten().tolist()
    for b in range(num_bins)
]

# Training loop
model = SmallCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
batch_size = 128
percentages = np.linspace(0.1, 1.0, 10)
train_accs, test_accs = [], []

test_loader_IN = DataLoader(cifar_test_IN, batch_size=256)


def evaluate(loader):
    """Return the accuracy of the module-level `model` over `loader`."""
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total


for pct in percentages:
    print(f"\n--- Training on {int(pct * 100)}% stratified samples ---")
    # Equal quota per bin; bins holding fewer samples than the quota
    # contribute everything they have.
    samples_per_bin = int(pct * len(cifar_train_IN) / num_bins)

    sampled_indices = []
    for indices in bin_indices:
        if not indices:
            continue  # nothing to draw from an empty difficulty bin
        sampled = np.random.choice(indices, min(samples_per_bin, len(indices)), replace=False)
        sampled_indices.extend(sampled.tolist())

    sampled_subset = Subset(cifar_train_IN, sampled_indices)
    subset_loader = DataLoader(sampled_subset, batch_size=batch_size, shuffle=True)

    model.train()
    for images, labels in subset_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_acc = evaluate(DataLoader(sampled_subset, batch_size=256))
    test_acc = evaluate(test_loader_IN)
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    print(f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}")

# plot
plt.plot(percentages, train_accs, label='Train Accuracy')
plt.plot(percentages, test_accs, label='Test Accuracy')
plt.xlabel("Training Set Percentage (Stratified)")
plt.ylabel("Accuracy")
plt.title("Stratified Monte Carlo Sampling on CIFAR-10 with Impulse Noise (Small CNN)")
plt.legend()
plt.grid(True)
plt.show()