In [12]:
#imports
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from models.cnn import SimpleCNN
from models.mlp import MLP
from distillation_utils import Distiller
from invariances_utils import shift_preserving_shape
import numpy as np


In [13]:
in_channels = 1
num_classes = 10
num_conv_layers = 2
temperature = 1
num_epochs = 10
batch_size = 64
lr = 0.001
TRAIN = True
device = 'cuda'
#np.random.seed(42)

In [14]:
# MNIST dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
#Obtaining CNN
cnn_path = "saved_models/model"
cnn = SimpleCNN(in_channels=in_channels, num_classes=num_classes, num_conv_layers=num_conv_layers, temperature=temperature).to('cuda:0')
if TRAIN:
    criterion_cnn = torch.nn.CrossEntropyLoss()
    optimizer_cnn = torch.optim.Adam(cnn.parameters(), lr=lr)
    # model training
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            outputs = cnn(images.to('cuda'))
            loss = criterion_cnn(outputs, labels.to('cuda'))

            optimizer_cnn.zero_grad()
            loss.backward()
            optimizer_cnn.step()

            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
    # Save the trained model
    torch.save(cnn.state_dict(), cnn_path)
    print(f"Model saved as {cnn_path}!")
if not TRAIN:
    state_dict = torch.load(cnn_path)
    cnn.load_state_dict(state_dict=state_dict)

# Testing the model
cnn.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        outputs = cnn(images.to('cuda'))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.to('cuda')).sum().item()

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}')

Epoch [1/10], Step [100/938], Loss: 0.3577
Epoch [1/10], Step [200/938], Loss: 0.2974
Epoch [1/10], Step [300/938], Loss: 0.1396
Epoch [1/10], Step [400/938], Loss: 0.2572
Epoch [1/10], Step [500/938], Loss: 0.0812
Epoch [1/10], Step [600/938], Loss: 0.1142
Epoch [1/10], Step [700/938], Loss: 0.0585
Epoch [1/10], Step [800/938], Loss: 0.0081
Epoch [1/10], Step [900/938], Loss: 0.2130
Epoch [2/10], Step [100/938], Loss: 0.0674
Epoch [2/10], Step [200/938], Loss: 0.0209
Epoch [2/10], Step [300/938], Loss: 0.0116
Epoch [2/10], Step [400/938], Loss: 0.0574
Epoch [2/10], Step [500/938], Loss: 0.0086
Epoch [2/10], Step [600/938], Loss: 0.0289
Epoch [2/10], Step [700/938], Loss: 0.0132
Epoch [2/10], Step [800/938], Loss: 0.0019
Epoch [2/10], Step [900/938], Loss: 0.0180
Epoch [3/10], Step [100/938], Loss: 0.0082
Epoch [3/10], Step [200/938], Loss: 0.0351
Epoch [3/10], Step [300/938], Loss: 0.0073
Epoch [3/10], Step [400/938], Loss: 0.0304
Epoch [3/10], Step [500/938], Loss: 0.0941
Epoch [3/10

In [16]:
#Loading undistilled MLP
if TRAIN:
    mlp = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
        hidden_layers= 4, device='cuda')
    criterion_mlp = torch.nn.CrossEntropyLoss()
    optimizer_mlp = torch.optim.Adam(mlp.parameters(), lr=lr)
    mlp.train(train_loader=train_loader, optimizer=optimizer_mlp, criterion=criterion_mlp, 
              num_epochs=5)
if not TRAIN:
    mlp = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
            hidden_layers= 4, device='cuda', from_saved_state_dict="saved_models/mlp")
mlp.eval(test_loader=test_loader)

Not using softmax
Epoch [1/5], Step [100/938], Loss: 0.5007
Epoch [1/5], Step [200/938], Loss: 0.3598
Epoch [1/5], Step [300/938], Loss: 0.3524
Epoch [1/5], Step [400/938], Loss: 0.4236
Epoch [1/5], Step [500/938], Loss: 0.1770
Epoch [1/5], Step [600/938], Loss: 0.2145
Epoch [1/5], Step [700/938], Loss: 0.1805
Epoch [1/5], Step [800/938], Loss: 0.2262
Epoch [1/5], Step [900/938], Loss: 0.1059
Epoch [2/5], Step [100/938], Loss: 0.4121
Epoch [2/5], Step [200/938], Loss: 0.2438
Epoch [2/5], Step [300/938], Loss: 0.1367
Epoch [2/5], Step [400/938], Loss: 0.2825
Epoch [2/5], Step [500/938], Loss: 0.2720
Epoch [2/5], Step [600/938], Loss: 0.1417
Epoch [2/5], Step [700/938], Loss: 0.1025
Epoch [2/5], Step [800/938], Loss: 0.2385
Epoch [2/5], Step [900/938], Loss: 0.1659
Epoch [3/5], Step [100/938], Loss: 0.2490
Epoch [3/5], Step [200/938], Loss: 0.1394
Epoch [3/5], Step [300/938], Loss: 0.1137
Epoch [3/5], Step [400/938], Loss: 0.1538
Epoch [3/5], Step [500/938], Loss: 0.1184
Epoch [3/5], Ste

In [17]:
#loading distilled MLP
mlp_student = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
          hidden_layers= 4, device='cuda')
if TRAIN:
    distiller = Distiller(student=mlp_student, teacher=cnn, device='cuda', lr=0.001)
    distiller.distill(train_loader, 5, "saved_models/")
if not TRAIN:
    distiller = Distiller(student=mlp_student, teacher=cnn, device='cuda', lr=0.001,
                        load_student_from_path = 'saved_models/distiller')
distiller.test_step(test_loader=test_loader)

Not using softmax
Epoch [1/5], Step [100/938], Student Loss : 1.0234, Total Loss: -29.8271
Epoch [1/5], Step [200/938], Student Loss : 0.5576, Total Loss: -29.9429
Epoch [1/5], Step [300/938], Student Loss : 0.7971, Total Loss: -29.9019
Epoch [1/5], Step [400/938], Student Loss : 0.4876, Total Loss: -30.0165
Epoch [1/5], Step [500/938], Student Loss : 0.5082, Total Loss: -30.0118
Epoch [1/5], Step [600/938], Student Loss : 0.4429, Total Loss: -30.0222
Epoch [1/5], Step [700/938], Student Loss : 0.5154, Total Loss: -29.9987
Epoch [1/5], Step [800/938], Student Loss : 0.5636, Total Loss: -29.9938
Epoch [1/5], Step [900/938], Student Loss : 0.5126, Total Loss: -29.9875
Epoch [2/5], Step [100/938], Student Loss : 0.3990, Total Loss: -30.0232
Epoch [2/5], Step [200/938], Student Loss : 0.4647, Total Loss: -30.0302
Epoch [2/5], Step [300/938], Student Loss : 0.4680, Total Loss: -30.0100
Epoch [2/5], Step [400/938], Student Loss : 0.4153, Total Loss: -30.0072
Epoch [2/5], Step [500/938], Stud

In [18]:
from invariances_utils import test_IM
test_IM(test_loader, mlp)

tensor(0.9033, device='cuda:0', grad_fn=<DivBackward0>)

In [19]:
test_IM(test_loader, distiller.get_student())

tensor(0.6272, device='cuda:0', grad_fn=<DivBackward0>)


Train 2 independent students with the same teacher, you compare the fidelities, if the 2 students have comparable fidelities they agree with the teacher because they generalize well

---------------
- compute agreement metrics: how much has the student learnt to predict in the same way as the teacher. 
- self distilling mlp
- distill an mlp over an mlp
- train an mlp independently on shifted data - compares with unshifted but distilled - then distilling 
----------------------------

scale: CIFAR-10 (but cumbersome)/ extended MNIST

the model actually learns invariances through the teacher -> all of these results hold


ECE,  