In [None]:
# Necessary imports
!pip install torchinfo

import torch
from torch import nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
from torchinfo import summary
from tqdm import tqdm

# For reproducibility: seed the global RNG and ask cuDNN for deterministic
# kernels. The cuDNN autotuner (benchmark mode) is switched off because it
# may select a different convolution algorithm from one run to the next.
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# MNIST train/test splits: stored under dataset/, downloaded when requested,
# with every image converted to a tensor.
_mnist_common = dict(root="dataset/", transform=transforms.ToTensor(), download=True)
train_dataset = torchvision.datasets.MNIST(train=True, **_mnist_common)
test_dataset = torchvision.datasets.MNIST(train=False, **_mnist_common)

# Mini-batch loaders; only the training split is shuffled.
_batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=_batch_size, shuffle=False)

Collecting torchinfo
  Downloading torchinfo-1.6.0-py3-none-any.whl (19 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.6.0
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to dataset/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting dataset/MNIST/raw/train-images-idx3-ubyte.gz to dataset/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to dataset/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting dataset/MNIST/raw/train-labels-idx1-ubyte.gz to dataset/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to dataset/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting dataset/MNIST/raw/t10k-images-idx3-ubyte.gz to dataset/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to dataset/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting dataset/MNIST/raw/t10k-labels-idx1-ubyte.gz to dataset/MNIST/raw



In [None]:
class TeacherModel(nn.Module):
    """Larger of the two CNN classifiers; used as the distillation teacher.

    Layout: conv 3x3 (64 ch) -> ReLU -> 2x2 max-pool -> conv 3x3 (256 ch)
    -> ReLU -> 2x2 max-pool -> flatten -> linear. The linear layer expects
    256 * 7 * 7 features, i.e. a 7x7 map after the two halving pools, which
    corresponds to 28x28 inputs.
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()
        # 3x3 kernel with padding 1 and stride 1 keeps the spatial size
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=1, padding=1)
        # single pooling module, applied after each convolution in forward()
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(64, 256, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(256 * 7 * 7, num_classes)

    def forward(self, x):
        """Return raw class scores (no softmax), one row per input sample."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # collapse everything except the batch dimension
        flat = x.reshape(x.shape[0], -1)
        return self.fc1(flat)

In [None]:
class StudentModel(nn.Module):
    """Small CNN classifier that is trained to mimic the teacher.

    Same layer sequence as the teacher but far narrower: two 3x3
    convolutions with 8 and 16 output channels, each followed by ReLU and a
    2x2 max-pool, then a linear layer over 16 * 7 * 7 flattened features.
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()
        # settings shared by both convolutions (spatial size is preserved)
        same_conv = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv1 = nn.Conv2d(in_channels, 8, **same_conv)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(8, 16, **same_conv)
        self.fc1 = nn.Linear(16 * 7 * 7, num_classes)

    def forward(self, x):
        """Return raw class scores (no softmax), one row per input sample."""
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        # keep the batch dimension, flatten the rest for the linear layer
        batch = features.shape[0]
        return self.fc1(features.reshape(batch, -1))

In [None]:
def check_accuracy(loader, model, device):
    """Return the fraction of samples in `loader` that `model` classifies correctly.

    Args:
        loader: iterable yielding `(inputs, labels)` batches of tensors.
        model: module mapping a batch of inputs to per-class scores; the
            predicted class is the index of the largest score along dim 1.
        device: device each batch is moved to before the forward pass.

    Returns:
        Accuracy as a Python float; 0.0 when `loader` yields no samples.

    The model is put into eval mode for the duration of the call and is
    returned to whichever mode (train or eval) it was in on entry, even if
    the evaluation raises.
    """
    was_training = model.training
    num_correct = 0
    num_samples = 0
    model.eval()

    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device)
                y = y.to(device)

                scores = model(x)
                _, predictions = scores.max(1)
                # accumulated as a tensor so no host sync is forced per batch
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
    finally:
        # restore the caller's mode rather than forcing train mode
        model.train(was_training)

    if num_samples == 0:
        # nothing was evaluated; avoid dividing by zero
        return 0.0
    return (num_correct / num_samples).item()
  

def train_teacher(epochs):
    """Train a freshly initialised TeacherModel and return it.

    The model is fitted on the module-level `train_loader` with
    cross-entropy loss and Adam (lr=1e-4), on CUDA when available and on the
    CPU otherwise. After each epoch the mean training loss and the accuracy
    on `test_loader` are printed.

    Args:
        epochs: number of full passes over `train_loader`.

    Returns:
        The trained TeacherModel, located on the device used for training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TeacherModel().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(epochs):
        model.train()
        batch_losses = []

        progress = tqdm(
            train_loader,
            total=len(train_loader),
            position=0,
            leave=True,
            desc=f"Epoch {epoch}",
        )
        for images, labels in progress:
            images, labels = images.to(device), labels.to(device)

            # forward pass; the scalar loss is recorded for the epoch average
            batch_loss = criterion(model(images), labels)
            batch_losses.append(batch_loss.item())

            # backward pass and parameter update
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        avg_loss = sum(batch_losses) / len(batch_losses)
        acc = check_accuracy(test_loader, model, device)
        print(f"Loss:{avg_loss:.2f}\tAccuracy:{acc:.2f}")

    return model

# Bind the trained teacher to a name so the result of the training run is
# not thrown away; the bare name on the last line keeps the model summary
# as the cell's displayed value.
teacher_model = train_teacher(3)
teacher_model

Epoch 0: 100%|██████████| 1875/1875 [04:50<00:00,  6.46it/s]


Loss:0.27	Accuracy:0.97


Epoch 1: 100%|██████████| 1875/1875 [04:48<00:00,  6.50it/s]


Loss:0.07	Accuracy:0.98


Epoch 2: 100%|██████████| 1875/1875 [04:46<00:00,  6.54it/s]


Loss:0.05	Accuracy:0.99


TeacherModel(
  (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(64, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=12544, out_features=10, bias=True)
)

In [None]:
def train_step(
    teacher,
    student,
    optimizer,
    student_loss_fn,
    divergence_loss_fn,
    temp,
    alpha,
    epoch,
    device):
    """Run one knowledge-distillation epoch over the module-level `train_loader`.

    For every batch the combined loss
    `alpha * student_loss + (1 - alpha) * distillation_loss` is
    back-propagated and `optimizer` is stepped.

    Args:
        teacher: model providing the soft targets; evaluated under
            `torch.no_grad()` so it receives no gradients.
        student: model whose outputs are trained.
        optimizer: optimizer stepped after each backward pass (presumably
            built over the student's parameters - see the caller).
        student_loss_fn: callable `(student_logits, targets) -> loss`.
        divergence_loss_fn: callable `(log_probs, target_probs) -> loss`.
            It receives the student's log-probabilities and the teacher's
            probabilities, which is the convention `nn.KLDivLoss` expects.
        temp: temperature dividing both sets of logits before the softmax.
        alpha: weight of the hard-label loss; `1 - alpha` weights the
            distillation term.
        epoch: epoch index, used only in the progress-bar label.
        device: device each batch is moved to.

    Returns:
        Mean of the per-batch combined loss values for the epoch.
    """
    losses = []
    pbar = tqdm(train_loader, total=len(train_loader), position=0, leave=True, desc=f"Epoch {epoch}")
    for data, targets in pbar:
        # Get data to cuda if possible
        data = data.to(device)
        targets = targets.to(device)

        # forward: use the models passed in, not same-named globals
        with torch.no_grad():
            teacher_preds = teacher(data)

        student_preds = student(data)
        student_loss = student_loss_fn(student_preds, targets)

        # The divergence loss takes log-probabilities as input and plain
        # probabilities as target; feeding probabilities for both yields a
        # meaningless (negative) value.
        # NOTE(review): Hinton et al. additionally scale this term by
        # temp ** 2; not applied here so the alpha weighting stays as is.
        distillation_loss = divergence_loss_fn(
            F.log_softmax(student_preds / temp, dim=1),
            F.softmax(teacher_preds / temp, dim=1)
        )
        loss = alpha * student_loss + (1 - alpha) * distillation_loss
        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

    avg_loss = sum(losses) / len(losses)
    return avg_loss
  
def main(epochs, teacher, student, temp=7, alpha=0.3):
    """Distil `teacher` into `student` for `epochs` epochs.

    Both models are moved to CUDA when available (CPU otherwise). The
    student is optimised with Adam (lr=1e-4) on a mix of cross-entropy and
    batch-mean KL divergence, delegated to `train_step`. After each epoch
    the returned loss and the student's accuracy on `test_loader` are
    printed.

    Args:
        epochs: number of distillation epochs.
        teacher: model supplying the soft targets.
        student: model being trained.
        temp: softmax temperature forwarded to `train_step`.
        alpha: hard-label loss weight forwarded to `train_step`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    teacher, student = teacher.to(device), student.to(device)

    hard_label_loss = nn.CrossEntropyLoss()
    soft_label_loss = nn.KLDivLoss(reduction="batchmean")
    optimizer = torch.optim.Adam(student.parameters(), lr=1e-4)

    # teacher runs in eval mode, student in train mode
    teacher.eval()
    student.train()

    for epoch in range(epochs):
        loss = train_step(
            teacher, student, optimizer,
            hard_label_loss, soft_label_loss,
            temp, alpha, epoch, device,
        )
        acc = check_accuracy(test_loader, student, device)
        print(f"Loss:{loss:.2f}\tAccuracy:{acc:.2f}")
        
# The student is created before the teacher is trained; swapping the two
# would change which random initial weights each model draws.
student_model = StudentModel()
teacher_model = train_teacher(epochs=3)

# Distil the trained teacher into the student.
distill_settings = {"epochs": 3, "temp": 7, "alpha": 0.3}
main(teacher=teacher_model, student=student_model, **distill_settings)

Epoch 0: 100%|██████████| 1875/1875 [04:49<00:00,  6.47it/s]


Loss:0.28	Accuracy:0.97


Epoch 1: 100%|██████████| 1875/1875 [04:48<00:00,  6.50it/s]


Loss:0.08	Accuracy:0.98


Epoch 2: 100%|██████████| 1875/1875 [04:45<00:00,  6.56it/s]


Loss:0.06	Accuracy:0.99


Epoch 0: 100%|██████████| 1875/1875 [02:15<00:00, 13.84it/s]


Loss:-1.15	Accuracy:0.90


Epoch 1: 100%|██████████| 1875/1875 [02:15<00:00, 13.86it/s]


Loss:-1.34	Accuracy:0.92


Epoch 2: 100%|██████████| 1875/1875 [02:16<00:00, 13.71it/s]


Loss:-1.38	Accuracy:0.93
