In [1]:
#imports
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from models.cnn import SimpleCNN
from models.mlp import MLP
from distillation_utils import Distiller
from invariances_utils import shift_preserving_shape
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from invariances_utils import test_IM


In [2]:
in_channels = 1
num_classes = 10
num_conv_layers = 2
temperature = 1
num_epochs = 10
batch_size = 64
lr = 0.001
TRAIN = False
device = 'cuda'
#np.random.seed(42)

In [3]:
# MNIST dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [4]:
#Obtaining CNN
cnn_path = "saved_models/model"
cnn = SimpleCNN(in_channels=in_channels, num_classes=num_classes, num_conv_layers=num_conv_layers, temperature=temperature).to('cuda:0')
if TRAIN:
    criterion_cnn = torch.nn.CrossEntropyLoss()
    optimizer_cnn = torch.optim.Adam(cnn.parameters(), lr=lr)
    # model training
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            outputs = cnn(images.to('cuda'))
            loss = criterion_cnn(outputs, labels.to('cuda'))

            optimizer_cnn.zero_grad()
            loss.backward()
            optimizer_cnn.step()

            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
    # Save the trained model
    torch.save(cnn.state_dict(), cnn_path)
    print(f"Model saved as {cnn_path}!")
if not TRAIN:
    state_dict = torch.load(cnn_path)
    cnn.load_state_dict(state_dict=state_dict)

# Testing the model
cnn.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        outputs = cnn(images.to('cuda'))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.to('cuda')).sum().item()

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9903


In [5]:
TRAIN = True

In [6]:
#Loading undistilled MLP
if TRAIN:
    mlp = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
        hidden_layers= 4, device='cuda')
    criterion_mlp = torch.nn.CrossEntropyLoss()
    optimizer_mlp = torch.optim.Adam(mlp.parameters(), lr=lr)
    mlp.train(train_loader=train_loader, optimizer=optimizer_mlp, criterion=criterion_mlp, 
              num_epochs=5)
if not TRAIN:
    mlp = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
            hidden_layers= 4, device='cuda', from_saved_state_dict="saved_models/mlp")
mlp.eval(test_loader=test_loader)

Not using softmax
Epoch [1/5], Step [100/938], Loss: 0.3506
Epoch [1/5], Step [200/938], Loss: 0.6254
Epoch [1/5], Step [300/938], Loss: 0.2239
Epoch [1/5], Step [400/938], Loss: 0.4882
Epoch [1/5], Step [500/938], Loss: 0.3099


KeyboardInterrupt: 

In [6]:
#loading distilled MLP
mlp_student = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
          hidden_layers= 4, device='cuda')
if TRAIN:
    distiller = Distiller(student=mlp_student, teacher=cnn, device='cuda', lr=0.001)
    distiller.distill(train_loader, 5, "saved_models/")
if not TRAIN:
    distiller = Distiller(student=mlp_student, teacher=cnn, device='cuda', lr=0.001,
                        load_student_from_path = 'saved_models/distiller')
distiller.test_step(test_loader=test_loader)

Not using softmax
cbhjhs
student output hasnan
tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan,

In [13]:
import torch.nn.functional as F
import torch.nn as nn
kl_loss = nn.KLDivLoss(reduction="batchmean")
target = torch.Tensor([[ 0.0057,  0.0097,  0.0119,  0.0058, -0.0105, -0.0131,  0.0167,  0.0061,
         0.0031, -0.0201], [ 0.0057,  0.0097,  0.0119,  0.0058, -0.0105, -0.0131,  0.0167,  0.0061,
         0.0031, -0.0201]])
input = torch.Tensor([[-17.5192,   0.3676, -10.2171,   3.7640,  -5.9923,  14.2353,  11.3412,
        -20.4319,  -3.1274, -13.5950], [-17.5192,   0.3676, -10.2171,   3.7640,  -5.9923,  14.2353,  11.3412,
        -20.4319,  -3.1274, -13.5950]])

input = F.log_softmax(input/3.5, dim=1)
target = F.softmax(target/3.5, dim=1)
output = kl_loss(input, target) **3.5
print(output)

kl_loss = nn.KLDivLoss(reduction="batchmean")
# input should be a distribution in the log space
input = F.log_softmax(torch.randn(3, 5, requires_grad=True), dim=1)
# Sample a batch of distributions. Usually this would come from the dataset
target = F.softmax(torch.rand(3, 5), dim=1)
output = kl_loss(input, target)
output

tensor(69.3868)


tensor(0.4128, grad_fn=<DivBackward0>)

In [14]:
test_IM(test_loader, mlp)

tensor(0.8799, device='cuda:0', grad_fn=<DivBackward0>)

In [7]:
test_IM(test_loader, distiller.get_student())

tensor(0.6398, device='cuda:0', grad_fn=<DivBackward0>)

---------------
- Train 2 independent students with the same teacher, you compare the fidelities, if the 2 students have comparable fidelities they agree with the teacher because they generalize well
- compute agreement metrics: how much has the student learnt to predict in the same way as the teacher. 

- self distilling mlp
- distill an mlp over an mlp 
TO DO TOMORROW:
- train an mlp independently on shifted data - compares with unshifted but distilled - then distilling 
- scale: extended MNIST CIFAR-10 (but cumbersome)
- different MLP model size
- large, heavy, regularized MLP
- uncertainties : different seeds 
- 42 101 121 240 308 random seeds 
- non shifted training set + mlp, mlp with distillation on a cnn, run those over test dataset that's shifted -> expect better performance on mlp dist on cnn
- give shifted validation set performance (validation set loss)
- test val set accuracy with alpha different
- cnn with data augmented
- capire la cosa del softmax

We need to have consistent:
- training
- distillation
- indipendent metrics (ECE, NLL, topk) -> consistent with the literature 
- fidelity metrics
----------------------------

the model actually learns invariances through the teacher -> all of these results hold


ECE,  

In [13]:
#Self distilling MLP (only from loaded data)

#Self distillation: mlp_student and mlp teacher coincide #TODO CHECK
mlp_student = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
            hidden_layers= 4, device='cuda', from_saved_state_dict="saved_models/mlp")
print(test_IM(test_loader, mlp_student))

mlp_teacher = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
            hidden_layers= 4, device='cuda', from_saved_state_dict="saved_models/mlp")
print(test_IM(test_loader, mlp_teacher))

if TRAIN:
    selfdistiller = Distiller(student=mlp_student, teacher=mlp_teacher, device='cuda', lr=0.001)
    selfdistiller.distill(train_loader, 5, "saved_models_selfdistill/")
    selfdistiller.test_step(test_loader=test_loader)

if not TRAIN:
    print("Loading params")
    selfdistiller = Distiller(student=mlp_student, teacher=mlp_teacher, device='cuda', lr=0.001,
                        load_student_from_path = 'saved_models_selfdistill/distiller')
    selfdistiller.test_step(test_loader=test_loader)


Not using softmax
tensor(0.9236, device='cuda:0', grad_fn=<DivBackward0>)
Not using softmax
tensor(0.9196, device='cuda:0', grad_fn=<DivBackward0>)
Epoch [1/5], Step [100/938], Student Loss : 0.0395, Total Loss: 0.1410
Epoch [1/5], Step [200/938], Student Loss : 0.0134, Total Loss: 0.3948
Epoch [1/5], Step [300/938], Student Loss : 0.1442, Total Loss: 0.0942
Epoch [1/5], Step [400/938], Student Loss : 0.2545, Total Loss: 0.0698
Epoch [1/5], Step [500/938], Student Loss : 0.0933, Total Loss: 0.2511
Epoch [1/5], Step [600/938], Student Loss : 0.0374, Total Loss: 0.0498
Epoch [1/5], Step [700/938], Student Loss : 0.1606, Total Loss: 0.0786
Epoch [1/5], Step [800/938], Student Loss : 0.0694, Total Loss: 0.0284
Epoch [1/5], Step [900/938], Student Loss : 0.0449, Total Loss: 0.1172
Epoch [2/5], Step [100/938], Student Loss : 0.0986, Total Loss: 0.0850
Epoch [2/5], Step [200/938], Student Loss : 0.0172, Total Loss: 0.0397
Epoch [2/5], Step [300/938], Student Loss : 0.1402, Total Loss: 0.0591


In [14]:
test_IM(test_loader, selfdistiller.get_student())

tensor(0.9278, device='cuda:0', grad_fn=<DivBackward0>)

In [15]:
#Distilling MLP from MLP
mlp_student = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
            hidden_layers= 4, device='cuda')

mlp_teacher = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
            hidden_layers= 4, device='cuda', from_saved_state_dict="saved_models/mlp")
print(test_IM(test_loader, mlp_teacher))

if TRAIN:
    mlp_mlp_distiller = Distiller(student=mlp_student, teacher=mlp_teacher, device='cuda', lr=0.001)
    mlp_mlp_distiller.distill(train_loader, 5, "saved_models_mlpfrommlp/")
    mlp_mlp_distiller.test_step(test_loader=test_loader)

if not TRAIN:
    print("Loading params")
    mlp_mlp_distiller = Distiller(student=mlp_student, teacher=mlp_teacher, device='cuda', lr=0.001,
                        load_student_from_path = 'saved_models_mlpfrommlp/distiller')
    mlp_mlp_distiller.test_step(test_loader=test_loader)

Not using softmax
Not using softmax
tensor(0.9213, device='cuda:0', grad_fn=<DivBackward0>)
Epoch [1/5], Step [100/938], Student Loss : 1.6941, Total Loss: 11.5227
Epoch [1/5], Step [200/938], Student Loss : 0.8908, Total Loss: 3.6051
Epoch [1/5], Step [300/938], Student Loss : 1.4994, Total Loss: 5.2170
Epoch [1/5], Step [400/938], Student Loss : 7.0620, Total Loss: 7.0844
Epoch [1/5], Step [500/938], Student Loss : 0.5604, Total Loss: 0.9696
Epoch [1/5], Step [600/938], Student Loss : 2.8029, Total Loss: 2.7273
Epoch [1/5], Step [700/938], Student Loss : 0.3314, Total Loss: 0.9006
Epoch [1/5], Step [800/938], Student Loss : 0.4606, Total Loss: 1.3683
Epoch [1/5], Step [900/938], Student Loss : 0.3362, Total Loss: 1.2526
Epoch [2/5], Step [100/938], Student Loss : 0.3006, Total Loss: 0.8305
Epoch [2/5], Step [200/938], Student Loss : 0.2562, Total Loss: 0.4872
Epoch [2/5], Step [300/938], Student Loss : 0.3553, Total Loss: 0.6646
Epoch [2/5], Step [400/938], Student Loss : 0.1492, Tot

In [16]:
test_IM(test_loader, mlp_mlp_distiller.get_student())

tensor(0.9638, device='cuda:0', grad_fn=<DivBackward0>)

In [18]:
# Define a custom dataset that combines MNIST and additional data
class ShiftAugmentedMNIST(Dataset):
    def __init__(self, mnist_dataset, translation_times : int = 5, max_shift : int = 7):
        self.mnist_dataset = mnist_dataset
        directions = ["u","d","l","r"]
        self.translations = []
        for i in range(len(self.mnist_dataset)):
            img, label = self.mnist_dataset[i]
            img = img.squeeze()
            for t in range(translation_times):
                sh = shift_preserving_shape(img, direction=directions[np.random.randint(0,4)],
                                            max_shift=max_shift).unsqueeze(0)
                if sh is not None:
                    self.translations.append((sh, label))

    def __getitem__(self, index):
        if index < len(self.mnist_dataset):
            return self.mnist_dataset[index]
        else:
            return self.translations[index - len(self.mnist_dataset)]

    def __len__(self):
        return len(self.mnist_dataset) + len(self.translations)

In [25]:
# MNIST dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
train_augmented_dataset = ShiftAugmentedMNIST(train_dataset)
train_augmented_loader = DataLoader(dataset=train_augmented_dataset, batch_size=batch_size, shuffle=True)

In [26]:
#Evaluating MLP trained on invariance data
if TRAIN:
    shift_invariant_mlp = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
        hidden_layers= 4, device='cuda')
    criterion_mlp = torch.nn.CrossEntropyLoss()
    optimizer_mlp = torch.optim.Adam(shift_invariant_mlp.parameters(), lr=lr)
    shift_invariant_mlp.train(train_loader=train_augmented_loader, optimizer=optimizer_mlp, criterion=criterion_mlp, 
              num_epochs=5, save_path_folder = "saved_models_shiftinvariantmlp")
if not TRAIN:
    shift_invariant_mlp = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
            hidden_layers= 4, device='cuda', from_saved_state_dict="saved_models_shiftinvariantmlp/mlp")
shift_invariant_mlp.eval(test_loader=test_loader)

Not using softmax
Epoch [1/5], Step [100/5625], Loss: 1.4332
Epoch [1/5], Step [200/5625], Loss: 1.2715
Epoch [1/5], Step [300/5625], Loss: 0.9806
Epoch [1/5], Step [400/5625], Loss: 0.6599
Epoch [1/5], Step [500/5625], Loss: 0.4850
Epoch [1/5], Step [600/5625], Loss: 0.6876
Epoch [1/5], Step [700/5625], Loss: 0.3734
Epoch [1/5], Step [800/5625], Loss: 0.4117
Epoch [1/5], Step [900/5625], Loss: 0.5502
Epoch [1/5], Step [1000/5625], Loss: 0.3068
Epoch [1/5], Step [1100/5625], Loss: 0.3532
Epoch [1/5], Step [1200/5625], Loss: 0.3167
Epoch [1/5], Step [1300/5625], Loss: 0.2602
Epoch [1/5], Step [1400/5625], Loss: 0.1548
Epoch [1/5], Step [1500/5625], Loss: 0.3497
Epoch [1/5], Step [1600/5625], Loss: 0.3821
Epoch [1/5], Step [1700/5625], Loss: 0.3323
Epoch [1/5], Step [1800/5625], Loss: 0.1482
Epoch [1/5], Step [1900/5625], Loss: 0.4713
Epoch [1/5], Step [2000/5625], Loss: 0.2970
Epoch [1/5], Step [2100/5625], Loss: 0.5642
Epoch [1/5], Step [2200/5625], Loss: 0.2986
Epoch [1/5], Step [2300

In [27]:
test_IM(test_loader, shift_invariant_mlp)

tensor(0.0822, device='cuda:0', grad_fn=<DivBackward0>)

In [9]:
#Train student model on this 
mlp_student = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
            hidden_layers= 4, device='cuda')

mlp_teacher = MLP(input_dim = 784, output_dim= num_classes, hidden_size= 2048,
            hidden_layers= 4, device='cuda', from_saved_state_dict="saved_models_shiftinvariantmlp/mlp")
print("Invariance of teacher:" + str(test_IM(test_loader, mlp_teacher)))

if TRAIN:
    shiftinvmlp_mlp_distiller = Distiller(student=mlp_student, teacher=mlp_teacher, device='cuda', lr=0.001)
    shiftinvmlp_mlp_distiller.distill(train_loader, 5, "saved_models_mlpfromshiftinvariantmlp/")
    shiftinvmlp_mlp_distiller.test_step(test_loader=test_loader)

if not TRAIN:
    print("Loading params")
    shiftinvmlp_mlp_distiller = Distiller(student=mlp_student, teacher=mlp_teacher, device='cuda', lr=0.001,
                        load_student_from_path = 'saved_models_mlpfromshiftinvariantmlp/distiller')
    shiftinvmlp_mlp_distiller.test_step(test_loader=test_loader)

Not using softmax
Not using softmax
Invariance of teacher:tensor(0.0821, device='cuda:0', grad_fn=<DivBackward0>)
Epoch [1/5], Step [100/938], Student Loss : 3.4604, Total Loss: 15.6608
Epoch [1/5], Step [200/938], Student Loss : 2.1550, Total Loss: 3.0418
Epoch [1/5], Step [300/938], Student Loss : 0.2299, Total Loss: 1.6676
Epoch [1/5], Step [400/938], Student Loss : 0.9416, Total Loss: 2.1623
Epoch [1/5], Step [500/938], Student Loss : 1.0407, Total Loss: 3.0659
Epoch [1/5], Step [600/938], Student Loss : 0.8709, Total Loss: 2.0034
Epoch [1/5], Step [700/938], Student Loss : 0.2186, Total Loss: 1.1539
Epoch [1/5], Step [800/938], Student Loss : 0.1631, Total Loss: 1.0458
Epoch [1/5], Step [900/938], Student Loss : 0.4773, Total Loss: 1.4397
Epoch [2/5], Step [100/938], Student Loss : 0.3148, Total Loss: 1.6611
Epoch [2/5], Step [200/938], Student Loss : 0.3473, Total Loss: 3.0052
Epoch [2/5], Step [300/938], Student Loss : 0.5968, Total Loss: 1.6403
Epoch [2/5], Step [400/938], Stud

In [10]:
test_IM(test_loader, shiftinvmlp_mlp_distiller.get_student())

tensor(0.9111, device='cuda:0', grad_fn=<DivBackward0>)

In [45]:
test_IM(test_loader, shiftinvmlp_mlp_distiller.get_student())

tensor(0.1400, device='cuda:0', grad_fn=<DivBackward0>)

vanilla MLP -> 0.9
CNN over MLP -> 0.6

Self-distilled MLP -> 0.5
MLP over MLP -> 0.5

distilled MLP over data augmented MLP on non augmented dataset -> 0.48

Data augmented MLP -> 0.09
(distilled MLP over data augmented MLP on augmented dataset -> 0.1)