# DenseNet121

In [None]:
# Imports, device, hyperparameters, normalization statistics
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, SubsetRandomSampler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

# Device configuration: use the GPU when PyTorch reports one, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Hyperparameters
num_classes = 100       # number of target classes
batch_size = 128        # samples per mini-batch
learning_rate = 0.1     # initial optimizer learning rate
num_epochs = 50         # number of training epochs

# CIFAR-100 normalization: one mean / standard deviation value per channel
mean = (0.5071, 0.4865, 0.4409)
std = (0.2673, 0.2564, 0.2761)

In [None]:
# Data augmentation + normalization transforms

# Evaluation pipeline: tensor conversion followed by per-channel normalization.
val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

# Training pipeline: random 32x32 crop (after 4 pixels of padding) and random
# horizontal flip, then the same tensor conversion + normalization as above.
train_transform = transforms.Compose([
    transforms.RandomCrop(size=32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])


# Dataset download, train/val split, DataLoaders
# CIFAR-100 datasets. The training images are wrapped twice: once with the
# augmenting train_transform and once with the deterministic val_transform.
# Validation must read from the second wrapper; otherwise the held-out images
# would be randomly cropped/flipped and the validation metrics would be noisy.
train_dataset = datasets.CIFAR100('./data', train=True,  download=True,  transform=train_transform)
val_dataset   = datasets.CIFAR100('./data', train=True,  download=False, transform=val_transform)
test_dataset  = datasets.CIFAR100('./data', train=False, download=True,  transform=val_transform)

# Split train -> train/val (10% val) with a fixed-seed shuffle of the indices.
# train_dataset and val_dataset index the same images, so the two index lists
# stay disjoint across the two wrappers.
num_train = len(train_dataset)
indices   = list(range(num_train))
split     = int(np.floor(0.1 * num_train))
np.random.seed(42)
np.random.shuffle(indices)
train_idx, val_idx = indices[split:], indices[:split]

train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          sampler=SubsetRandomSampler(train_idx), num_workers=4)
# Un-augmented view of the held-out 10% of the training images
val_loader   = DataLoader(val_dataset,   batch_size=batch_size,
                          sampler=SubsetRandomSampler(val_idx), num_workers=4)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size,
                          shuffle=False,                num_workers=4)


In [None]:
# DenseNet-121 classes
class _DenseLayer(nn.Module):
    """Single dense layer: BN-ReLU-Conv(1x1) followed by BN-ReLU-Conv(3x3).

    The 1x1 convolution maps ``in_feats`` channels to
    ``bn_size * growth_rate`` channels; the 3x3 convolution (padding 1) maps
    those to ``growth_rate`` new channels. The layer returns its input with
    the new channels concatenated along dim 1.

    Args:
        in_feats: number of input channels.
        growth_rate: number of channels the layer adds.
        bn_size: multiplier giving the width of the 1x1 convolution output.
        drop_rate: dropout probability for the new channels; dropout is
            skipped unless this is greater than 0.
    """

    def __init__(self, in_feats, growth_rate, bn_size, drop_rate):
        super().__init__()
        inner_feats = bn_size * growth_rate

        # 1x1 stage
        self.norm1 = nn.BatchNorm2d(in_feats)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_feats, inner_feats, kernel_size=1, bias=False)

        # 3x3 stage
        self.norm2 = nn.BatchNorm2d(inner_feats)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(inner_feats, growth_rate,
                               kernel_size=3, padding=1, bias=False)

        self.drop_rate = drop_rate

    def forward(self, x):
        """Return ``x`` concatenated with the newly computed channels."""
        hidden = self.norm1(x)
        hidden = self.relu1(hidden)
        hidden = self.conv1(hidden)

        hidden = self.norm2(hidden)
        hidden = self.relu2(hidden)
        new_feats = self.conv2(hidden)

        if self.drop_rate > 0:
            # Only active while the module is in training mode
            new_feats = nn.functional.dropout(
                new_feats, p=self.drop_rate, training=self.training
            )
        return torch.cat((x, new_feats), dim=1)

class _DenseBlock(nn.Module):
    """Sequence of ``num_layers`` dense layers applied one after another.

    Layer ``i`` is built for ``in_feats + i * growth_rate`` input channels,
    matching the channels appended by the layers before it.

    Args:
        num_layers: number of dense layers in the block.
        in_feats: number of channels entering the block.
        bn_size: passed through to each ``_DenseLayer``.
        growth_rate: channels added by each layer.
        drop_rate: passed through to each ``_DenseLayer``.
    """

    def __init__(self, num_layers, in_feats, bn_size, growth_rate, drop_rate):
        super().__init__()
        layer_inputs = [in_feats + idx * growth_rate for idx in range(num_layers)]
        self.layers = nn.ModuleList(
            [_DenseLayer(width, growth_rate, bn_size, drop_rate)
             for width in layer_inputs]
        )

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the result."""
        feats = x
        for dense_layer in self.layers:
            feats = dense_layer(feats)
        return feats

class _Transition(nn.Module):
    """Transition between dense blocks: BN, ReLU, 1x1 conv, 2x2 average pool.

    The 1x1 convolution changes the channel count from ``in_feats`` to
    ``out_feats``; the stride-2 average pool halves height and width.

    Args:
        in_feats: number of input channels.
        out_feats: number of output channels.
    """

    def __init__(self, in_feats, out_feats):
        super().__init__()
        stages = [
            nn.BatchNorm2d(in_feats),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_feats, out_feats, kernel_size=1, bias=False),
            nn.AvgPool2d(kernel_size=2, stride=2),
        ]
        self.trans = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the transition stages to ``x``."""
        reduced = self.trans(x)
        return reduced

class DenseNet(nn.Module):
    """DenseNet image classifier built from dense blocks and transitions.

    ``features`` is an ``nn.Sequential`` holding, in order: a 7x7 stride-2
    convolution, batch norm, ReLU and a 3x3 stride-2 max pool; then one dense
    block per entry of ``block_config`` with a channel-halving transition
    after every block except the last; then a final batch norm.
    ``classifier`` is a linear layer over the globally average-pooled features.

    Note: the stem and each transition halve the spatial size, so with the
    default four blocks a 32x32 input is 8x8 after the stem and 1x1 when it
    reaches the last dense block.

    Args:
        growth_rate: channels added by each dense layer.
        block_config: number of dense layers in each block.
        init_feats: output channels of the first convolution.
        bn_size: multiplier for the 1x1 convolution width in dense layers.
        drop_rate: dropout probability inside dense layers.
        num_classes: size of the classifier output.
    """

    def __init__(self, growth_rate=32, block_config=(6,12,24,16),
                 init_feats=64, bn_size=4, drop_rate=0, num_classes=100):
        super().__init__()

        # Stem: initial convolution and pooling
        stem = [
            nn.Conv2d(3, init_feats, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(init_feats),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        ]
        self.features = nn.Sequential(*stem)

        # Dense blocks, each followed by a transition except the last one
        channels = init_feats
        last_index = len(block_config) - 1
        for idx, depth in enumerate(block_config):
            self.features.add_module(
                f'denseblock{idx+1}',
                _DenseBlock(depth, channels, bn_size, growth_rate, drop_rate),
            )
            channels += depth * growth_rate
            if idx == last_index:
                continue
            halved = channels // 2
            self.features.add_module(
                f'transition{idx+1}', _Transition(channels, halved)
            )
            channels = halved

        # Final batch norm and linear classifier
        self.features.add_module('norm_final', nn.BatchNorm2d(channels))
        self.classifier = nn.Linear(channels, num_classes)

        # Initialization: Kaiming-normal conv weights, unit/zero batch-norm
        # scale/shift, zero classifier bias
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.zeros_(module.bias)

    def forward(self, x):
        """Return class logits of shape ``(x.size(0), num_classes)``."""
        feats = self.features(x)
        activated = nn.functional.relu(feats, inplace=True)
        pooled = nn.functional.adaptive_avg_pool2d(activated, (1,1))
        flat = pooled.view(x.size(0), -1)
        return self.classifier(flat)


In [None]:
# Model, loss, optimizer, scheduler, save initial filters

model = DenseNet(num_classes=num_classes)
model = model.to(device)

criterion = nn.CrossEntropyLoss()

# SGD with momentum and weight decay
optimizer = optim.SGD(
    model.parameters(),
    lr=learning_rate,
    momentum=0.9,
    weight_decay=1e-4,
)

# Multiply the learning rate by 0.1 after every 20 scheduler steps
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

# Keep a copy of the first conv layer's weights for later visualization
initial_filters = model.features[0].weight.detach().clone()


In [None]:
# Training & validation loop
# Per-epoch histories: sample-weighted mean loss and accuracy for each split.

train_losses, train_accs = [], []
val_losses, val_accs = [], []

for epoch in range(1, num_epochs+1):
    # --- Train ---
    model.train()
    loss_sum, hits, seen = 0, 0, 0
    for images, targets in train_loader:
        images = images.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits = model(images)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size
        loss_sum += batch_loss.item() * images.size(0)
        hits += (logits.argmax(1) == targets).sum().item()
        seen += targets.size(0)

    train_losses.append(loss_sum/seen)
    train_accs.append(hits/seen)

    # --- Validate ---
    model.eval()
    val_loss_sum, val_hits, val_seen = 0, 0, 0
    with torch.no_grad():
        for images, targets in val_loader:
            images = images.to(device)
            targets = targets.to(device)
            logits = model(images)
            batch_loss = criterion(logits, targets)

            val_loss_sum += batch_loss.item() * images.size(0)
            val_hits += (logits.argmax(1) == targets).sum().item()
            val_seen += targets.size(0)

    val_losses.append(val_loss_sum/val_seen)
    val_accs.append(val_hits/val_seen)

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

    print(f'Epoch {epoch}/{num_epochs} | '
          f'Train Loss: {train_losses[-1]:.4f}, Acc: {train_accs[-1]:.4f} | '
          f' Val Loss: {val_losses[-1]:.4f}, Acc: {val_accs[-1]:.4f}')


In [None]:
# Plot training & validation metrics
# Two side-by-side panels: loss on the left, accuracy on the right.
epochs = range(1, num_epochs+1)
plt.figure(figsize=(12,5))

panels = (
    ('Loss',     (('Train Loss', train_losses), ('Val Loss', val_losses))),
    ('Accuracy', (('Train Acc',  train_accs),   ('Val Acc',  val_accs))),
)
for position, (y_label, curves) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    for curve_label, history in curves:
        plt.plot(epochs, history, label=curve_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

plt.show()


In [None]:
# Visualize first-layer filters before & after training
final_filters = model.features[0].weight.detach().clone()

# Grid layout: one row per filter, left column = before, right column = after
fig, axes = plt.subplots(2, 2, figsize=(6,6))
snapshots = (('before', initial_filters), ('after', final_filters))
for row in range(2):
    for col, (stage, filters) in enumerate(snapshots):
        # Move the channel axis last so imshow treats the filter as an image
        kernel = filters[row].cpu().permute(1,2,0).numpy()
        # Min-max rescale to [0, 1] for display
        low, high = kernel.min(), kernel.max()
        kernel = (kernel - low)/(high - low)

        panel = axes[row, col]
        panel.imshow(kernel)
        panel.set_title(f'Filter {row} {stage}')
        panel.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Test evaluation, confusion matrix & metrics table

# Collect predictions and ground-truth labels over the whole test set
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        out = model(x)
        all_preds.extend(out.argmax(1).cpu().numpy())
        all_labels.extend(y.numpy())

# Overall accuracy
acc = np.mean(np.array(all_preds) == np.array(all_labels))
print(f"Test Accuracy: {acc:.4f}")

# Confusion matrix + classification report.
# Pass the class list explicitly: without `labels=`, scikit-learn infers the
# classes from the data, so a class missing from both labels and predictions
# would shrink the matrix/report and misalign the per-class arrays below.
class_ids = list(range(num_classes))
cm     = confusion_matrix(all_labels, all_preds, labels=class_ids)
report = classification_report(all_labels, all_preds, labels=class_ids,
                               output_dict=True)
# Keep only the per-class rows (report keys are the class ids as strings),
# dropping the aggregate rows such as the averages.
df     = pd.DataFrame(report).transpose().loc[[str(c) for c in class_ids]]

# TP/FP/FN/TN per class (rows of cm are true classes, columns are predictions)
tp = np.diag(cm)
fp = cm.sum(axis=0) - tp
fn = cm.sum(axis=1) - tp
tn = cm.sum() - (tp + fp + fn)

metrics = pd.DataFrame({
    'TP':        tp,
    'FP':        fp,
    'FN':        fn,
    'TN':        tn,
    'Precision': df['precision'],
    'Recall':    df['recall'],
    'F1-Score':  df['f1-score']
})
metrics.head()  # show first few classes; drop .head() to see all
