In [None]:
# Import libraries

import copy

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.manifold import TSNE
from torchvision import transforms, datasets


In [None]:
# DAARN Architecture
# Defines the DAARN architecture: two residual branches (dynamic and steady) followed by a fully connected (FC) classification layer.
# Both branches are built from residual blocks, and the steady branch is frozen during training. This structure is well suited to continual learning scenarios.


class ResBlock(nn.Module):
    """Residual block made of two 3x3 convolutions plus a skip connection.

    The main path is conv3x3 -> BatchNorm -> ReLU -> conv3x3 -> BatchNorm. The
    block input is added back through ``self.shortcut`` and a final ReLU is
    applied to the sum. ``self.shortcut`` is an empty ``nn.Sequential``
    (identity) unless the stride or the channel count changes; in that case it
    is a strided 1x1 convolution followed by BatchNorm so both summands have
    the same shape.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
        stride: Stride of the first convolution and of the projection shortcut.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        # Main path: only the first convolution applies the stride.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Skip path: identity when the shape is preserved, 1x1 projection otherwise.
        shape_preserved = stride == 1 and in_channels == out_channels
        if shape_preserved:
            self.shortcut = nn.Sequential()
        else:
            projection = [
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            ]
            self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = self.conv1(x)
        hidden = F.relu(self.bn1(hidden))
        hidden = self.conv2(hidden)
        hidden = self.bn2(hidden)
        hidden += self.shortcut(x)
        return F.relu(hidden)

class DAARN(nn.Module):
    """Two-branch residual classifier with a trainable and a frozen branch.

    Both branches are stacks of ``ResBlock`` stages with the same layout. The
    ``dynamic_branch`` is trainable; the ``steady_branch`` has
    ``requires_grad=False`` on all parameters and is kept in evaluation mode so
    its BatchNorm running statistics do not change while training. The two
    feature maps are averaged with equal weights, globally average-pooled and
    passed to a linear classifier.

    The classifier is created in ``__init__`` (not lazily in ``forward``) so
    that it moves with ``.to(device)`` and is part of ``model.parameters()``
    when an optimizer is constructed right after the model.

    Args:
        num_classes: Number of output logits produced by the classifier.
    """

    # (out_channels, stride) for each residual stage, shared by both branches.
    _STAGES = ((16, 1), (32, 2), (64, 2))

    def __init__(self, num_classes):
        super(DAARN, self).__init__()
        self.dynamic_branch = self._make_resnet_branch()
        self.steady_branch = self._make_resnet_branch(freeze=True)
        self.num_classes = num_classes
        # After global average pooling the feature size equals the channel
        # count of the last stage, independent of the input resolution.
        self.fc = nn.Linear(self._STAGES[-1][0], num_classes)

    def _make_resnet_branch(self, freeze=False):
        """Build one branch; with ``freeze=True`` its parameters get no gradients."""
        layers = []
        in_channels = 3
        for out_channels, stride in self._STAGES:
            layers.append(ResBlock(in_channels, out_channels, stride))
            in_channels = out_channels
        branch = nn.Sequential(*layers)
        if freeze:
            for param in branch.parameters():
                param.requires_grad = False
        return branch

    def _initialize_fc(self, input_size):
        """Replace the classifier with a fresh one taking ``input_size`` features.

        Kept for backward compatibility. The new layer is placed on the same
        device as the existing parameters. An optimizer created before this
        call does not know about the new layer's parameters.
        """
        reference = next(self.parameters())
        self.fc = nn.Linear(input_size, self.num_classes).to(reference.device)

    def train(self, mode=True):
        """Set the training mode, always leaving the steady branch in eval mode.

        ``requires_grad=False`` alone does not stop BatchNorm layers from
        updating their running statistics in training mode, so the frozen
        branch is forced back to evaluation mode here.
        """
        super(DAARN, self).train(mode)
        self.steady_branch.eval()
        return self

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        dynamic_out = self.dynamic_branch(x)
        steady_out = self.steady_branch(x)
        # Fixed, equal-weight aggregation of the two branches.
        aggregated_out = 0.5 * dynamic_out + 0.5 * steady_out
        # True global average pooling: one value per channel for any input size.
        aggregated_out = F.adaptive_avg_pool2d(aggregated_out, 1)
        aggregated_out = aggregated_out.view(aggregated_out.size(0), -1)  # Flatten
        return self.fc(aggregated_out)


In [None]:
# Knowledge distillation 
# Knowledge distillation is a technique where a (typically smaller) "student" model learns from a "teacher" model by mimicking the teacher's softened output distribution.
# The teacher is typically pre-trained or more complex; in this notebook it is the model carried over from the previous training phase.

def distillation_loss(student_outputs, teacher_outputs, temperature):
    """Temperature-scaled KL divergence between teacher and student predictions.

    Both sets of logits are divided by ``temperature`` before the softmax over
    ``dim=1``. The KL divergence is reduced with ``"batchmean"`` and multiplied
    by ``temperature ** 2``.

    Args:
        student_outputs: Student logits; classes along ``dim=1``.
        teacher_outputs: Teacher logits with the same shape.
        temperature: Softening factor applied to both sets of logits.

    Returns:
        Scalar tensor holding the distillation loss.
    """
    soft_targets = F.softmax(teacher_outputs / temperature, dim=1)
    log_predictions = F.log_softmax(student_outputs / temperature, dim=1)
    divergence = F.kl_div(log_predictions, soft_targets, reduction="batchmean")
    return divergence * (temperature ** 2)


In [None]:
# Data Preparation

# Convert PIL images to tensors in [0, 1], then normalise each of the three
# RGB channels to [-1, 1]. The statistics are spelled out per channel instead
# of relying on a one-element tuple being broadcast.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

train_dataset = datasets.FakeData(transform=transform)
# FakeData generates sample ``i`` from the seed ``i + random_offset``. With the
# default offset on both datasets the "test" set would be identical to the
# training set, so accuracy would only measure memorisation. Offsetting by the
# training-set size makes the two sets disjoint.
test_dataset = datasets.FakeData(transform=transform, random_offset=len(train_dataset))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)


In [None]:
# Training and Evaluation

def train_incrementally(model, train_loader, optimizer, teacher_model=None, temperature=2.0):
    """Train ``model`` for one pass over ``train_loader``.

    The loss is the cross-entropy against the labels. When ``teacher_model`` is
    given, the distillation loss between the student and teacher logits is
    added to it.

    Args:
        model: Network being trained; it is put in training mode.
        train_loader: Iterable yielding ``(images, labels)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        teacher_model: Optional network providing distillation targets. It is
            evaluated without gradient tracking and, unless it is the very same
            object as ``model``, switched to evaluation mode. It is expected to
            live on the same device as ``model``.
        temperature: Softmax temperature forwarded to ``distillation_loss``.
    """
    model.train()
    # Use the device the model lives on instead of a notebook-level global.
    model_device = next(model.parameters()).device

    # Explicit None check: module truthiness is unreliable (an empty
    # nn.Sequential is falsy, for instance).
    use_teacher = teacher_model is not None
    if use_teacher and teacher_model is not model:
        # Targets should come from a deterministic teacher (no dropout, no
        # BatchNorm statistic updates).
        teacher_model.eval()

    for images, labels in train_loader:
        images, labels = images.to(model_device), labels.to(model_device)
        optimizer.zero_grad()

        outputs = model(images)
        loss = F.cross_entropy(outputs, labels)

        if use_teacher:
            # The teacher only supplies targets, so skip building its graph.
            with torch.no_grad():
                teacher_outputs = teacher_model(images)
            loss = loss + distillation_loss(outputs, teacher_outputs, temperature)

        loss.backward()
        optimizer.step()

def evaluate(model, test_loader):
    """Return the top-1 accuracy of ``model`` on ``test_loader`` as a percentage.

    The model is switched to evaluation mode and left in it. Batches are moved
    to the device of the model's parameters.

    Args:
        model: Network producing logits with classes along ``dim=1``.
        test_loader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        Accuracy in ``[0, 100]``; ``0.0`` when the loader yields no samples.
    """
    model.eval()
    # Use the device the model lives on instead of a notebook-level global.
    model_device = next(model.parameters()).device
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(model_device), labels.to(model_device)
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        # Avoid ZeroDivisionError on an empty loader.
        return 0.0
    return 100 * correct / total


In [None]:
#  Feature Visualization

def visualize_features(model, data_loader):
    """Show a 2-D t-SNE scatter plot of ``model.dynamic_branch`` features.

    Every batch is passed through ``model.dynamic_branch`` without gradient
    tracking. Outputs with more than two dimensions are averaged over all
    trailing (spatial) dimensions so each sample becomes one feature vector,
    because ``TSNE.fit_transform`` requires a 2-D ``(n_samples, n_features)``
    array. Points are coloured by label.

    Args:
        model: Network exposing a ``dynamic_branch`` sub-module; it is put in
            evaluation mode.
        data_loader: Iterable yielding ``(images, labels)`` batches.

    Raises:
        ValueError: If the loader yields fewer than two samples.
    """
    model.eval()
    # Use the device the model lives on instead of a notebook-level global.
    model_device = next(model.parameters()).device

    features, labels = [], []
    with torch.no_grad():
        for images, label in data_loader:
            outputs = model.dynamic_branch(images.to(model_device))
            if outputs.dim() > 2:
                # (N, C, H, W) -> (N, C): global average over spatial positions.
                outputs = outputs.flatten(2).mean(dim=2)
            features.append(outputs.cpu().numpy())
            labels.append(label.cpu().numpy())

    n_samples = sum(len(batch) for batch in features)
    if n_samples < 2:
        raise ValueError("visualize_features needs at least two samples to embed")

    features = np.concatenate(features)
    labels = np.concatenate(labels)

    # t-SNE requires perplexity < n_samples; 30 is the library default.
    perplexity = min(30.0, float(n_samples - 1))
    embedding = TSNE(n_components=2, perplexity=perplexity).fit_transform(features)

    fig, ax = plt.subplots()
    points = ax.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='viridis', s=5)
    fig.colorbar(points, ax=ax, label="class label")
    ax.set(title="t-SNE of dynamic-branch features", xlabel="t-SNE 1", ylabel="t-SNE 2")
    plt.show()


In [None]:
# Main execution

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = 10
torch.manual_seed(0)  # repeatable weight initialisation and batch order

model = DAARN(num_classes)
# DAARN may build its classifier head lazily on the first forward pass. Run one
# batch on the CPU before moving the model and creating the optimizer, so the
# head ends up on `device` and its weights are among the optimized parameters.
model.eval()
with torch.no_grad():
    warmup_images, _ = next(iter(train_loader))
    model(warmup_images)
model = model.to(device)

# Only hand trainable parameters to the optimizer; frozen ones never get gradients.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.SGD(trainable_params, lr=0.1, momentum=0.9)

teacher_model = None
for phase in range(3):  # Simulating incremental tasks
    print(f"Training Phase {phase + 1}")
    train_incrementally(model, train_loader, optimizer, teacher_model)
    accuracy = evaluate(model, test_loader)
    print(f"Phase {phase + 1} Accuracy: {accuracy:.2f}%")
    # Freeze a *copy* as the next teacher. Re-using `model` itself would make
    # the distillation term compare the network with its own output, which
    # contributes no gradient and preserves nothing from earlier phases.
    teacher_model = copy.deepcopy(model).eval()
    for param in teacher_model.parameters():
        param.requires_grad_(False)

### Output

Training Phase 1
Phase 1 Accuracy: 23.10%

Training Phase 2
Phase 2 Accuracy: 60.30%

Training Phase 3
Phase 3 Accuracy: 74.40%