In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import copy

BATCH_SIZE = 64
LEARNING_RATE = 0.001
EPOCHS = 30
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
WEIGHT_DECAY = 1e-4        # L2 Regularization strength
L1_LAMBDA = 0.0001         # L1 Regularization strength
DROPOUT_RATE = 0.3         # Probability of killing a neuron
SMOOTHING = 0.1
PATIENCE = 6

In [3]:
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# We adding noise and some tranformation to make it work for normal photos like shaking, little blurred

In [4]:
val_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

In [5]:
train_dataset = datasets.FashionMNIST(root='./data', train=True, download=True, transform=train_transforms)
val_dataset = datasets.FashionMNIST(root='./data', train=False, download=True, transform=val_transforms)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

100%|██████████| 26.4M/26.4M [00:02<00:00, 12.9MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 203kB/s]
100%|██████████| 4.42M/4.42M [00:01<00:00, 3.79MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 14.7MB/s]


In [6]:
class RegularizedCNN(nn.Module):
    def __init__(self):
        super().__init__()

        # Feature Extractor (Convolutional)
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Flatten()
        )

        # Classifier (Linear)
        self.classifier = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512),
            nn.ReLU(),

            # Randomly zeros out 50% of neurons
            nn.Dropout(p=DROPOUT_RATE),

            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

In [7]:
model = RegularizedCNN().to(DEVICE)

criterion = nn.CrossEntropyLoss(label_smoothing=SMOOTHING)

best_val_loss = float('inf')
patience_counter = 0
best_model_weights = None

In [8]:
params_bias_and_bn = [
    param for name, param in model.named_parameters()
    if "bias" in name or "bn" in name
]

params_to_regularize = [
    param for name, param in model.named_parameters()
    if "bias" not in name and "bn" not in name
]

# We seperate the parameters to regularize as the bias should not be regularized which must be same

In [9]:
optimizer = torch.optim.AdamW([

    {"params": params_to_regularize, "weight_decay": 1e-4},

    {"params": params_bias_and_bn, "weight_decay": 0.0}
], lr=0.001)

# We apply L2 regularization to the parameters by adding weight decay to them

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=2
)

# We using Learning Scheduler for learning rate change

In [None]:
best_val_loss = float('inf')
patience_counter = 0
best_model_weights = None

print("\n--- Starting Training ---")

for epoch in range(EPOCHS):


    model.train()
    train_loss = 0

    for x, y in train_loader:
        x, y = x.to(DEVICE), y.to(DEVICE)

        optimizer.zero_grad()
        output = model(x)

        # Base Loss (Prediction Error)
        loss = criterion(output, y)

        # L1 Regularization
        # l1_norm = sum(p.abs().sum() for p in params_to_regularize)
        #we dont used L1 as it gives worst results and reduce accuracy

        # 3. Total Loss
        # total_loss = loss + (L1_LAMBDA * l1_norm)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)



    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.inference_mode():
        for x, y in val_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            output = model(x)
            loss = criterion(output, y)
            val_loss += loss.item()

            _, predicted = torch.max(output, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_acc = 100 * correct / total

    # Check if we need to drop LR based on Validation Loss
    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(avg_val_loss)
    new_lr = optimizer.param_groups[0]['lr']

    if new_lr < old_lr:
        print(f"  [Scheduler] Dropping Learning Rate: {old_lr:.6f} -> {new_lr:.6f}")

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.2f}%")


    # --- EARLY STOPPING ---
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_weights = copy.deepcopy(model.state_dict())
        patience_counter = 0
    else:
        patience_counter += 1
        print(f"  -> No improvement. Patience: {patience_counter}/{PATIENCE}")

    if patience_counter >= PATIENCE:
        print("\n[Early Stopping Triggered] Training stopped.")
        break

# Restore best model
if best_model_weights:
    model.load_state_dict(best_model_weights)
    print("Restored weights from the best epoch.")


--- Starting Training ---
Epoch 1 | Train Loss: 1.0404 | Val Loss: 0.8515 | Val Acc: 84.74%
Epoch 2 | Train Loss: 0.9060 | Val Loss: 0.8074 | Val Acc: 86.24%
Epoch 3 | Train Loss: 0.8645 | Val Loss: 0.7732 | Val Acc: 88.67%
Epoch 4 | Train Loss: 0.8406 | Val Loss: 0.7492 | Val Acc: 89.69%
Epoch 5 | Train Loss: 0.8235 | Val Loss: 0.7417 | Val Acc: 89.81%
Epoch 6 | Train Loss: 0.8059 | Val Loss: 0.7357 | Val Acc: 90.22%
Epoch 7 | Train Loss: 0.7991 | Val Loss: 0.7372 | Val Acc: 90.26%
  -> No improvement. Patience: 1/6
Epoch 8 | Train Loss: 0.7890 | Val Loss: 0.7185 | Val Acc: 91.04%
Epoch 9 | Train Loss: 0.7805 | Val Loss: 0.7200 | Val Acc: 91.13%
  -> No improvement. Patience: 1/6
Epoch 10 | Train Loss: 0.7775 | Val Loss: 0.7196 | Val Acc: 90.94%
  -> No improvement. Patience: 2/6
Epoch 11 | Train Loss: 0.7705 | Val Loss: 0.7148 | Val Acc: 91.09%
Epoch 12 | Train Loss: 0.7678 | Val Loss: 0.7135 | Val Acc: 91.07%
Epoch 13 | Train Loss: 0.7642 | Val Loss: 0.7052 | Val Acc: 91.38%
Epoch 

In [None]:
model = nn.Sequential(
    nn.Flatten(),
    nn.Dropout(p = 0.2), nn.Linear(1*28*28, 100), nn.ReLU(),
    nn.Dropout(p=0.2), nn.Linear(100, 100), nn.ReLU(),
    nn.Dropout(p=0.2), nn.Linear(100, 100), nn.ReLU(),
    nn.Dropout(p=0.2), nn.Linear(100, 10)
).to(DEVICE)

# We can apply dropout at each layer like this but it may underfit the data poorly during training

In [None]:
# Monte Carlo Dropout Inference

NUM_SAMPLES = 50  # Number of forward passes to run per image
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

images, labels = next(iter(val_loader))
images = images[:5].to(DEVICE)
true_labels = labels[:5]

model.eval()
for m in model.modules():
    if isinstance(m, nn.Dropout):
        m.train()


with torch.no_grad():

    images_repeated = images.repeat_interleave(NUM_SAMPLES, dim=0)


    logits = model(images_repeated)

    logits = logits.reshape(images.size(0), NUM_SAMPLES, 10)

    probs = torch.softmax(logits, dim=-1)

    mean_probs = probs.mean(dim=1)
    std_probs = probs.std(dim=1)

for i in range(len(images)):
    pred_idx = torch.argmax(mean_probs[i]).item()
    confidence = mean_probs[i, pred_idx].item()
    uncertainty = std_probs[i, pred_idx].item()
    true_label = class_names[true_labels[i]]
    predicted_label = class_names[pred_idx]

    print(f"\nImage {i+1}: True={true_label} | Predicted={predicted_label}")
    print(f"   Confidence (Mean Prob): {confidence:.4f}")
    print(f"   Uncertainty (Std Dev):  {uncertainty:.4f}")

    if uncertainty > 0.1:
        print("Model is unsure about this image!")
    else:
        print("Model is confident.")

# Monte carlo Dropout estimates the prediction of an image over 50 times as we mentioned by dropping different neurons at each round to check the certainity of new image

In [None]:
mean_probs.round(decimals = 2)

In [None]:
probs_std = probs.std(dim = 1)
probs_std.round(decimals = 2)

In [16]:
class MonteCarloClassifier:

  def __init__(self, model, num_samples = 50):
    self.model = model
    self.num_samples = num_samples
    self.device = next(model.parameters()).device

  def _enable_dropout(self):
    self.model.eval()   # setting all parameters to evaluation mode
    for m in self.model.modules():
      if isinstance(m, nn.dropout):
        m.train

  def predict(self, x):
    self._enable_dropout()
    x = x.to(self.device)

    with torch.no_grad():

      x_repeated = x.repeat_interleave(self.num_samples, dim = 0)

      logits = self.model(x_repeated)


      batch_size = x.size(0)
      num_classes = logits.size(1)
      logits = logits.reshape(batch_size, self.num_samples, num_classes)

      probs = torch.nn.functional.softmax(logits, dim = -1)
      mean_probs = probs.mean(dim = 1)
      uncertainity = probs.std(dim = 1)

      return mean_probs, uncertainity


# Custom MonteCarlo Dropout Function for reusability


In [18]:
def apply_max_norm(model, max_norm = 2, epsilon = 1e-8, dim=1):

  with torch.no_rad():
    for name, param in model.named_parameters():
      if 'bias' not in name:
        actual_norm = param.norm(p = 2, dim = dim, keepdim = True)
        target_norm = torch.clamp(actual_norm, 0, max_norm)
        param *= target_norm / (epsilon + actual_norm)

# Max Norm used to set the dropout based on minimum criteria if a neuron exceeds it it will be set to maximum threshold by reducing it