In [59]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F

In [2]:
def get_cifar10_loaders(batch_size=32):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

    testset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

    return trainloader, testloader

In [3]:
class TeacherModel(nn.Module):
    def __init__(self):
        super(TeacherModel, self).__init__()
        self.model = resnet18(pretrained=False, num_classes=10)

    def forward(self, x):
        return self.model(x)

In [4]:
class StudentModel(nn.Module):
    def __init__(self):
        super(StudentModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 8 * 8, 256)
        self.fc2 = nn.Linear(256, 10)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.pool(self.conv1(x)))
        x = self.relu(self.pool(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [5]:
def train_model(model, trainloader, criterion, optimizer, device, epochs=10):
    model.train()
    for epoch in tqdm(range(epochs)):
        running_loss = 0.0
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss / len(trainloader):.4f}")

In [6]:
def evaluate_model(model, testloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Accuracy: {100 * correct / total:.2f}%")

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
trainloader, testloader = get_cifar10_loaders()

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:08<00:00, 20.7MB/s] 


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [9]:
teacher = TeacherModel().to(device)
teacher_criterion = nn.CrossEntropyLoss()
teacher_optimizer = optim.Adam(teacher.parameters(), lr=0.001)



In [10]:
print("Training Teacher Model...")
train_model(teacher, trainloader, teacher_criterion, teacher_optimizer, device, epochs=10)
evaluate_model(teacher, testloader, device)

Training Teacher Model...


 10%|█         | 1/10 [00:13<02:00, 13.43s/it]

Epoch 1/10, Loss: 1.4173


 20%|██        | 2/10 [00:26<01:45, 13.16s/it]

Epoch 2/10, Loss: 1.0160


 30%|███       | 3/10 [00:39<01:32, 13.18s/it]

Epoch 3/10, Loss: 0.8426


 40%|████      | 4/10 [00:53<01:20, 13.33s/it]

Epoch 4/10, Loss: 0.7112


 50%|█████     | 5/10 [01:06<01:06, 13.30s/it]

Epoch 5/10, Loss: 0.6066


 60%|██████    | 6/10 [01:18<00:52, 13.00s/it]

Epoch 6/10, Loss: 0.5054


 70%|███████   | 7/10 [01:32<00:39, 13.06s/it]

Epoch 7/10, Loss: 0.4159


 80%|████████  | 8/10 [01:44<00:25, 12.89s/it]

Epoch 8/10, Loss: 0.3360


 90%|█████████ | 9/10 [01:57<00:12, 12.85s/it]

Epoch 9/10, Loss: 0.2682


100%|██████████| 10/10 [02:10<00:00, 13.04s/it]

Epoch 10/10, Loss: 0.2206





Accuracy: 76.93%


In [11]:
student = StudentModel().to(device)
student_criterion = nn.CrossEntropyLoss()
student_optimizer = optim.Adam(student.parameters(), lr=0.001)

In [12]:
print("Training Student Model...")
train_model(student, trainloader, student_criterion, student_optimizer, device, epochs=10)
evaluate_model(student, testloader, device)

Training Student Model...


 10%|█         | 1/10 [00:03<00:33,  3.69s/it]

Epoch 1/10, Loss: 1.2461


 20%|██        | 2/10 [00:07<00:29,  3.67s/it]

Epoch 2/10, Loss: 0.8599


 30%|███       | 3/10 [00:11<00:25,  3.70s/it]

Epoch 3/10, Loss: 0.6841


 40%|████      | 4/10 [00:14<00:22,  3.67s/it]

Epoch 4/10, Loss: 0.5381


 50%|█████     | 5/10 [00:18<00:18,  3.72s/it]

Epoch 5/10, Loss: 0.4088


 60%|██████    | 6/10 [00:22<00:14,  3.70s/it]

Epoch 6/10, Loss: 0.2945


 70%|███████   | 7/10 [00:25<00:11,  3.69s/it]

Epoch 7/10, Loss: 0.2055


 80%|████████  | 8/10 [00:29<00:07,  3.77s/it]

Epoch 8/10, Loss: 0.1514


 90%|█████████ | 9/10 [00:33<00:03,  3.88s/it]

Epoch 9/10, Loss: 0.1208


100%|██████████| 10/10 [00:37<00:00,  3.77s/it]

Epoch 10/10, Loss: 0.0965





Accuracy: 70.80%


# ЭКСПЕРИМЕНТ 1

In [50]:
def evaluate_model(student, dataloader, criterion, device):
    student.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = student(inputs)

            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct / total
    return avg_loss, accuracy


def train_with_logits_distillation(
    teacher, student, trainloader, testloader, criterion, distillation_loss,
    optimizer, device, alpha=0.5, temperature=3.0, epochs=10, log_dir="./runs/exp1"
):
    writer = SummaryWriter(log_dir)
    teacher.eval()

    for epoch in range(epochs):
        student.train()
        running_loss = 0.0

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)

            with torch.no_grad():
                teacher_logits = teacher(inputs)

            student_logits = student(inputs)

            ce_loss = criterion(student_logits, labels)

            distill_loss = distillation_loss(
                F.log_softmax(student_logits / temperature, dim=1),
                F.softmax(teacher_logits / temperature, dim=1)
            ) * (temperature ** 2)

            loss = alpha * ce_loss + (1 - alpha) * distill_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(trainloader)

        val_loss, val_accuracy = evaluate_model(student, testloader, criterion, device)

        writer.add_scalar("Loss/Train", avg_train_loss, epoch)
        writer.add_scalar("Loss/Validation", val_loss, epoch)
        writer.add_scalar("Accuracy/Validation", val_accuracy, epoch)

        print(
            f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, "
            f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}"
        )

    writer.close()

In [51]:
teacher = TeacherModel().to(device)
student = StudentModel().to(device)

teacher_criterion = nn.CrossEntropyLoss()
teacher_optimizer = optim.Adam(teacher.parameters(), lr=0.001)
print("Training Teacher Model...")
train_model(teacher, trainloader, teacher_criterion, teacher_optimizer, device, epochs=10)



Training Teacher Model...


 10%|█         | 1/10 [00:13<02:04, 13.79s/it]

Epoch 1/10, Loss: 1.4187


 20%|██        | 2/10 [00:26<01:46, 13.35s/it]

Epoch 2/10, Loss: 1.0187


 30%|███       | 3/10 [00:39<01:30, 12.97s/it]

Epoch 3/10, Loss: 0.8352


 40%|████      | 4/10 [00:52<01:18, 13.03s/it]

Epoch 4/10, Loss: 0.7088


 50%|█████     | 5/10 [01:04<01:04, 12.82s/it]

Epoch 5/10, Loss: 0.6043


 60%|██████    | 6/10 [01:17<00:50, 12.59s/it]

Epoch 6/10, Loss: 0.4964


 70%|███████   | 7/10 [01:29<00:37, 12.61s/it]

Epoch 7/10, Loss: 0.4159


 80%|████████  | 8/10 [01:42<00:25, 12.62s/it]

Epoch 8/10, Loss: 0.3348


 90%|█████████ | 9/10 [01:54<00:12, 12.55s/it]

Epoch 9/10, Loss: 0.2731


100%|██████████| 10/10 [02:07<00:00, 12.77s/it]

Epoch 10/10, Loss: 0.2130





In [52]:
distillation_criterion = nn.KLDivLoss(reduction="batchmean")
student_criterion = nn.CrossEntropyLoss()
student_optimizer = optim.Adam(student.parameters(), lr=0.001)

print("Training Student Model with Logits Distillation...")
train_with_logits_distillation(
    teacher, student, trainloader, testloader, student_criterion, distillation_criterion, student_optimizer, device,
    alpha=0.5, temperature=3.0, epochs=10
)

Training Student Model with Logits Distillation...
Epoch 1/10, Train Loss: 3.1193, Validation Loss: 1.2127, Validation Accuracy: 0.6292
Epoch 2/10, Train Loss: 1.9827, Validation Loss: 0.9717, Validation Accuracy: 0.7031
Epoch 3/10, Train Loss: 1.5607, Validation Loss: 0.8542, Validation Accuracy: 0.7274
Epoch 4/10, Train Loss: 1.2500, Validation Loss: 0.8345, Validation Accuracy: 0.7371
Epoch 5/10, Train Loss: 1.0299, Validation Loss: 0.8844, Validation Accuracy: 0.7430
Epoch 6/10, Train Loss: 0.8537, Validation Loss: 0.8618, Validation Accuracy: 0.7552
Epoch 7/10, Train Loss: 0.7228, Validation Loss: 0.8812, Validation Accuracy: 0.7462
Epoch 8/10, Train Loss: 0.6169, Validation Loss: 0.8595, Validation Accuracy: 0.7523
Epoch 9/10, Train Loss: 0.5383, Validation Loss: 0.8912, Validation Accuracy: 0.7447
Epoch 10/10, Train Loss: 0.4817, Validation Loss: 0.8782, Validation Accuracy: 0.7521


# ЭКСПЕРИМЕНТ 2

In [53]:
def train_with_cosine_loss(teacher, student, trainloader, criterion, cosine_loss, optimizer, device, epochs=10, log_dir="./runs/exp2"):
    writer = SummaryWriter(log_dir)
    teacher.eval()

    teacher_projection = nn.Linear(512, 128).to(device)
    student_projection = nn.Linear(64, 128).to(device)

    for epoch in range(epochs):
        student.train()
        running_loss = 0.0

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)

            with torch.no_grad():
                teacher_features = teacher.model.layer4(
                    teacher.model.layer3(
                        teacher.model.layer2(
                            teacher.model.layer1(
                                teacher.model.conv1(inputs)
                            )
                        )
                    )
                )
                teacher_features = nn.AdaptiveAvgPool2d((1, 1))(teacher_features).view(inputs.size(0), -1)
                teacher_features = teacher_projection(teacher_features)

            student_features = student.relu(student.pool(student.conv2(
                student.relu(student.pool(student.conv1(inputs)))
            )))
            student_features = nn.AdaptiveAvgPool2d((1, 1))(student_features).view(inputs.size(0), -1)
            student_features = student_projection(student_features)

            cosine_targets = torch.ones(teacher_features.size(0)).to(device)
            hidden_loss = cosine_loss(student_features, teacher_features, cosine_targets)

            ce_loss = criterion(student(inputs), labels)

            loss = ce_loss + hidden_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(trainloader)

        val_loss, val_accuracy = evaluate_model(student, testloader, criterion, device)

        writer.add_scalar("Loss/Train", avg_train_loss, epoch)
        writer.add_scalar("Loss/Validation", val_loss, epoch)
        writer.add_scalar("Accuracy/Validation", val_accuracy, epoch)

        print(
            f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, "
            f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}"
        )

In [54]:
teacher = TeacherModel().to(device)
student = StudentModel().to(device)

teacher_criterion = nn.CrossEntropyLoss()
teacher_optimizer = optim.Adam(teacher.parameters(), lr=0.001)
print("Training Teacher Model...")
train_model(teacher, trainloader, teacher_criterion, teacher_optimizer, device, epochs=10)



Training Teacher Model...


 10%|█         | 1/10 [00:12<01:56, 12.98s/it]

Epoch 1/10, Loss: 1.4371


 20%|██        | 2/10 [00:25<01:42, 12.86s/it]

Epoch 2/10, Loss: 1.0266


 30%|███       | 3/10 [00:38<01:29, 12.81s/it]

Epoch 3/10, Loss: 0.8436


 40%|████      | 4/10 [00:51<01:16, 12.81s/it]

Epoch 4/10, Loss: 0.7133


 50%|█████     | 5/10 [01:05<01:05, 13.14s/it]

Epoch 5/10, Loss: 0.6078


 60%|██████    | 6/10 [01:18<00:52, 13.15s/it]

Epoch 6/10, Loss: 0.5090


 70%|███████   | 7/10 [01:30<00:39, 13.00s/it]

Epoch 7/10, Loss: 0.4174


 80%|████████  | 8/10 [01:43<00:25, 13.00s/it]

Epoch 8/10, Loss: 0.3360


 90%|█████████ | 9/10 [01:57<00:13, 13.33s/it]

Epoch 9/10, Loss: 0.2687


100%|██████████| 10/10 [02:10<00:00, 13.07s/it]

Epoch 10/10, Loss: 0.2183





In [55]:
student_criterion = nn.CrossEntropyLoss()
student_optimizer = optim.Adam(student.parameters(), lr=0.001)

print("Training Student Model with Cosine Loss...")
cosine_loss = nn.CosineEmbeddingLoss()
train_with_cosine_loss(teacher, student, trainloader, student_criterion, cosine_loss, student_optimizer, device, epochs=10)

Training Student Model with Cosine Loss...
Epoch 1/10, Train Loss: 1.9819, Validation Loss: 1.1471, Validation Accuracy: 0.5765
Epoch 2/10, Train Loss: 1.6064, Validation Loss: 0.9808, Validation Accuracy: 0.6497
Epoch 3/10, Train Loss: 1.4535, Validation Loss: 0.9071, Validation Accuracy: 0.6829
Epoch 4/10, Train Loss: 1.3475, Validation Loss: 0.8821, Validation Accuracy: 0.6967
Epoch 5/10, Train Loss: 1.2646, Validation Loss: 0.8154, Validation Accuracy: 0.7183
Epoch 6/10, Train Loss: 1.1839, Validation Loss: 0.8830, Validation Accuracy: 0.7037
Epoch 7/10, Train Loss: 1.1150, Validation Loss: 0.8512, Validation Accuracy: 0.7164
Epoch 8/10, Train Loss: 1.0492, Validation Loss: 0.9302, Validation Accuracy: 0.7024
Epoch 9/10, Train Loss: 0.9858, Validation Loss: 0.9960, Validation Accuracy: 0.7083
Epoch 10/10, Train Loss: 0.9321, Validation Loss: 0.9929, Validation Accuracy: 0.7108


# ЭКСПЕРИМЕНТ 3

In [56]:
def train_with_regressor(teacher, student, trainloader, criterion, mse_loss, optimizer, device, epochs=10, log_dir="./runs/exp3"):
    writer = SummaryWriter(log_dir)
    teacher.eval()

    regressor = nn.Conv2d(64, 512, kernel_size=1).to(device)
    regressor_optimizer = optim.Adam(regressor.parameters(), lr=0.001)

    for epoch in range(epochs):
        student.train()
        regressor.train()
        running_loss = 0.0

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)

            with torch.no_grad():
                teacher_features = teacher.model.layer4(
                    teacher.model.layer3(
                        teacher.model.layer2(
                            teacher.model.layer1(
                                teacher.model.conv1(inputs)
                            )
                        )
                    )
                )

            student_features = student.relu(student.pool(
                student.conv2(
                    student.relu(student.pool(student.conv1(inputs)))
                )
            ))

            if teacher_features.size(2) != student_features.size(2) or teacher_features.size(3) != student_features.size(3):
                teacher_features = nn.AdaptiveAvgPool2d((student_features.size(2), student_features.size(3)))(teacher_features)

            regressed_features = regressor(student_features)

            ce_loss = criterion(student(inputs), labels)

            hidden_loss = mse_loss(regressed_features, teacher_features)

            loss = ce_loss + hidden_loss

            optimizer.zero_grad()
            regressor_optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            regressor_optimizer.step()

            running_loss += loss.item()

        avg_train_loss = running_loss / len(trainloader)

        val_loss, val_accuracy = evaluate_model(student, testloader, criterion, device)

        writer.add_scalar("Loss/Train", avg_train_loss, epoch)
        writer.add_scalar("Loss/Validation", val_loss, epoch)
        writer.add_scalar("Accuracy/Validation", val_accuracy, epoch)

        print(
            f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, "
            f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}"
        )

In [57]:
teacher = TeacherModel().to(device)
student = StudentModel().to(device)

teacher_criterion = nn.CrossEntropyLoss()
teacher_optimizer = optim.Adam(teacher.parameters(), lr=0.001)
print("Training Teacher Model...")
train_model(teacher, trainloader, teacher_criterion, teacher_optimizer, device, epochs=10)



Training Teacher Model...


 10%|█         | 1/10 [00:13<01:57, 13.06s/it]

Epoch 1/10, Loss: 1.4172


 20%|██        | 2/10 [00:25<01:42, 12.76s/it]

Epoch 2/10, Loss: 1.0109


 30%|███       | 3/10 [00:38<01:29, 12.75s/it]

Epoch 3/10, Loss: 0.8339


 40%|████      | 4/10 [00:51<01:16, 12.82s/it]

Epoch 4/10, Loss: 0.6987


 50%|█████     | 5/10 [01:04<01:04, 12.81s/it]

Epoch 5/10, Loss: 0.5941


 60%|██████    | 6/10 [01:16<00:51, 12.79s/it]

Epoch 6/10, Loss: 0.4978


 70%|███████   | 7/10 [01:29<00:38, 12.90s/it]

Epoch 7/10, Loss: 0.4146


 80%|████████  | 8/10 [01:42<00:25, 12.80s/it]

Epoch 8/10, Loss: 0.3324


 90%|█████████ | 9/10 [01:55<00:12, 12.71s/it]

Epoch 9/10, Loss: 0.2644


100%|██████████| 10/10 [02:08<00:00, 12.82s/it]

Epoch 10/10, Loss: 0.2158





In [58]:
student_criterion = nn.CrossEntropyLoss()
mse_loss = nn.MSELoss()
student_optimizer = optim.Adam(student.parameters(), lr=0.001)

print("Training Student Model with Regressor...")
train_with_regressor(
    teacher, student, trainloader, student_criterion, mse_loss, student_optimizer, device, epochs=10
)

Training Student Model with Regressor...
Epoch 1/10, Train Loss: 1.3471, Validation Loss: 1.0079, Validation Accuracy: 0.6395
Epoch 2/10, Train Loss: 0.9625, Validation Loss: 0.9053, Validation Accuracy: 0.6866
Epoch 3/10, Train Loss: 0.7890, Validation Loss: 0.8150, Validation Accuracy: 0.7162
Epoch 4/10, Train Loss: 0.6452, Validation Loss: 0.8354, Validation Accuracy: 0.7239
Epoch 5/10, Train Loss: 0.5129, Validation Loss: 0.8898, Validation Accuracy: 0.7235
Epoch 6/10, Train Loss: 0.3921, Validation Loss: 1.0061, Validation Accuracy: 0.7191
Epoch 7/10, Train Loss: 0.2994, Validation Loss: 1.1459, Validation Accuracy: 0.7090
Epoch 8/10, Train Loss: 0.2378, Validation Loss: 1.2434, Validation Accuracy: 0.7164
Epoch 9/10, Train Loss: 0.1989, Validation Loss: 1.4707, Validation Accuracy: 0.7119
Epoch 10/10, Train Loss: 0.1788, Validation Loss: 1.5773, Validation Accuracy: 0.7079
