# Part 4.1

In [10]:
import numpy
import torch
from torch.utils.data import DataLoader
from torch import nn
import pandas
import torchvision
from torchvision import transforms
import torch.nn.functional as F


"""
The network is referenced from https://arxiv.org/pdf/1503.02531v1.pdf
"""




learning_rate=0.0001
torch.manual_seed(0)

temp = 3
drop_out = 0.3
beta = 0.5
batchSize = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
train_dataset = torchvision.datasets.MNIST(root="mnist_dataset/",train=True,transform=transforms.ToTensor(),download=True)
test_dataset = torchvision.datasets.MNIST(root="mnist_dataset/",train=False,transform=transforms.ToTensor(),download=True)



class TeacherModel(nn.Module):
    def __init__(self, ):
        super(TeacherModel, self).__init__()
        self.full_connect1 = nn.Linear(784, 1200)
        self.full_connect2 = nn.Linear(1200, 1200)
        self.full_connect3 = nn.Linear(1200, 10)
        self.dt = nn.Dropout(drop_out)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(-1, 784)
        x = self.full_connect1(x)
        x = self.dt(x)
        x = self.relu(x)
        x = self.full_connect2(x)
        x = self.dt(x)
        x = self.relu(x)
        x = self.full_connect3(x)
        return x  


class StudentModel(nn.Module):
    def __init__(self):
        super(StudentModel, self).__init__()
        self.full_connect1 = nn.Linear(784,15)
        self.full_connect2 = nn.Linear(15, 10)
        self.relu = nn.ReLU()
    def forward(self, x):
        x = x.view(-1, 784)
        x = self.full_connect1(x)
        x = self.relu(x)
        x = self.full_connect2(x)
        return x  

train_loader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batchSize, shuffle=False)

def getAcc(target_model):
    corr = 0
    total = 0
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(device)
            y = y.to(device)
            preds = target_model(x)
            predictions = preds.max(1).indices
            corr += (predictions == y).sum()
            total += predictions.size(0)
        acc = (torch.true_divide(corr,total)).item()
        print(acc)
        target_model.train()
    return target_model

def execute(epochs,target_model):
    for epoch in range(0,epochs):
        target_model.train()
        for data, targets in train_loader:
            data = data.to(device)
            targets = targets.to(device)
            preds = target_model(data)
            loss = criterion(preds, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        target_model.eval()
        target_model=getAcc(target_model)
        
        
    return target_model.eval()





teacher_model = TeacherModel()
teacher_model = teacher_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(teacher_model.parameters(), lr=learning_rate)
print("Teacher model:")
teacher_model=execute(5,teacher_model)

student_model = StudentModel()
student_model = student_model.to(device)
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
print("Student model:")
student_model=execute(5,student_model)

student_model = StudentModel()
student_model = student_model.to(device)
student_model.train()
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate)
soft_loss = nn.KLDivLoss(reduction="batchmean")
hard_loss = nn.CrossEntropyLoss()



print("ditillation:")
for epoch in range(0,5):
    for data, targets in train_loader:
        data = data.to(device)
        targets = targets.to(device)
        with torch.no_grad():
            teacher_preds = teacher_model(data)
        student_preds = student_model(data)
        student_loss = hard_loss(student_preds, targets)
        ditillation_loss = soft_loss(
            F.softmax(student_preds / temp, dim=1),
            F.softmax(teacher_preds / temp, dim=1)
        )
        loss = (1 - beta) * student_loss + beta * ditillation_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    student_model.eval()
    student_model=getAcc(student_model)

Teacher model:
0.9437999725341797
0.9633999466896057
0.973099946975708
0.9765999913215637
0.9776999950408936
Student model:
0.8533999919891357
0.8914999961853027
0.904699981212616
0.910099983215332
0.9138000011444092
ditillation:
0.8525999784469604
0.8885999917984009
0.8994999527931213
0.9061999917030334
0.9102999567985535


In [11]:
import numpy
import torch
from torch.utils.data import DataLoader
from torch import nn
import pandas
import torchvision
from torchvision import transforms
import torch.nn.functional as F


"""
The network is referenced from https://arxiv.org/pdf/1503.02531v1.pdf
"""




learning_rate=0.0001
torch.manual_seed(0)

temp = 5
drop_out = 0.3
beta = 0.5
batchSize = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
train_dataset = torchvision.datasets.MNIST(root="mnist_dataset/",train=True,transform=transforms.ToTensor(),download=True)
test_dataset = torchvision.datasets.MNIST(root="mnist_dataset/",train=False,transform=transforms.ToTensor(),download=True)



class TeacherModel(nn.Module):
    def __init__(self, ):
        super(TeacherModel, self).__init__()
        self.full_connect1 = nn.Linear(784, 1200)
        self.full_connect2 = nn.Linear(1200, 1200)
        self.full_connect3 = nn.Linear(1200, 10)
        self.dt = nn.Dropout(drop_out)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(-1, 784)
        x = self.full_connect1(x)
        x = self.dt(x)
        x = self.relu(x)
        x = self.full_connect2(x)
        x = self.dt(x)
        x = self.relu(x)
        x = self.full_connect3(x)
        return x  


class StudentModel(nn.Module):
    def __init__(self):
        super(StudentModel, self).__init__()
        self.full_connect1 = nn.Linear(784,15)
        self.full_connect2 = nn.Linear(15, 10)
        self.relu = nn.ReLU()
    def forward(self, x):
        x = x.view(-1, 784)
        x = self.full_connect1(x)
        x = self.relu(x)
        x = self.full_connect2(x)
        return x  

train_loader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batchSize, shuffle=False)

def getAcc(target_model):
    corr = 0
    total = 0
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(device)
            y = y.to(device)
            preds = target_model(x)
            predictions = preds.max(1).indices
            corr += (predictions == y).sum()
            total += predictions.size(0)
        acc = (torch.true_divide(corr,total)).item()
        print(acc)
        target_model.train()
    return target_model

def execute(epochs,target_model):
    for epoch in range(0,epochs):
        target_model.train()
        for data, targets in train_loader:
            data = data.to(device)
            targets = targets.to(device)
            preds = target_model(data)
            loss = criterion(preds, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        target_model.eval()
        target_model=getAcc(target_model)
        
        
    return target_model.eval()





teacher_model = TeacherModel()
teacher_model = teacher_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(teacher_model.parameters(), lr=learning_rate)
print("Teacher model:")
teacher_model=execute(5,teacher_model)

student_model = StudentModel()
student_model = student_model.to(device)
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
print("Student model:")
student_model=execute(5,student_model)

student_model = StudentModel()
student_model = student_model.to(device)
student_model.train()
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate)
soft_loss = nn.KLDivLoss(reduction="batchmean")
hard_loss = nn.CrossEntropyLoss()



print("ditillation:")
for epoch in range(0,5):
    for data, targets in train_loader:
        data = data.to(device)
        targets = targets.to(device)
        with torch.no_grad():
            teacher_preds = teacher_model(data)
        student_preds = student_model(data)
        student_loss = hard_loss(student_preds, targets)
        ditillation_loss = soft_loss(
            F.softmax(student_preds / temp, dim=1),
            F.softmax(teacher_preds / temp, dim=1)
        )
        loss = (1 - beta) * student_loss + beta * ditillation_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    student_model.eval()
    student_model=getAcc(student_model)

Teacher model:
0.9437999725341797
0.9633999466896057
0.973099946975708
0.9765999913215637
0.9776999950408936
Student model:
0.8533999919891357
0.8914999961853027
0.904699981212616
0.910099983215332
0.9138000011444092
ditillation:
0.8551999926567078
0.8913999795913696
0.902999997138977
0.9090999960899353
0.91239994764328


In [12]:
import numpy
import torch
from torch.utils.data import DataLoader
from torch import nn
import pandas
import torchvision
from torchvision import transforms
import torch.nn.functional as F


"""
The network is referenced from https://arxiv.org/pdf/1503.02531v1.pdf
"""




learning_rate=0.0001
torch.manual_seed(0)

temp = 10
drop_out = 0.3
beta = 0.5
batchSize = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
train_dataset = torchvision.datasets.MNIST(root="mnist_dataset/",train=True,transform=transforms.ToTensor(),download=True)
test_dataset = torchvision.datasets.MNIST(root="mnist_dataset/",train=False,transform=transforms.ToTensor(),download=True)



class TeacherModel(nn.Module):
    def __init__(self, ):
        super(TeacherModel, self).__init__()
        self.full_connect1 = nn.Linear(784, 1200)
        self.full_connect2 = nn.Linear(1200, 1200)
        self.full_connect3 = nn.Linear(1200, 10)
        self.dt = nn.Dropout(drop_out)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(-1, 784)
        x = self.full_connect1(x)
        x = self.dt(x)
        x = self.relu(x)
        x = self.full_connect2(x)
        x = self.dt(x)
        x = self.relu(x)
        x = self.full_connect3(x)
        return x  


class StudentModel(nn.Module):
    def __init__(self):
        super(StudentModel, self).__init__()
        self.full_connect1 = nn.Linear(784,15)
        self.full_connect2 = nn.Linear(15, 10)
        self.relu = nn.ReLU()
    def forward(self, x):
        x = x.view(-1, 784)
        x = self.full_connect1(x)
        x = self.relu(x)
        x = self.full_connect2(x)
        return x  

train_loader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batchSize, shuffle=False)

def getAcc(target_model):
    corr = 0
    total = 0
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(device)
            y = y.to(device)
            preds = target_model(x)
            predictions = preds.max(1).indices
            corr += (predictions == y).sum()
            total += predictions.size(0)
        acc = (torch.true_divide(corr,total)).item()
        print(acc)
        target_model.train()
    return target_model

def execute(epochs,target_model):
    for epoch in range(0,epochs):
        target_model.train()
        for data, targets in train_loader:
            data = data.to(device)
            targets = targets.to(device)
            preds = target_model(data)
            loss = criterion(preds, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        target_model.eval()
        target_model=getAcc(target_model)
        
        
    return target_model.eval()





teacher_model = TeacherModel()
teacher_model = teacher_model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(teacher_model.parameters(), lr=learning_rate)
print("Teacher model:")
teacher_model=execute(5,teacher_model)

student_model = StudentModel()
student_model = student_model.to(device)
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
print("Student model:")
student_model=execute(5,student_model)

student_model = StudentModel()
student_model = student_model.to(device)
student_model.train()
optimizer = torch.optim.Adam(student_model.parameters(), lr=learning_rate)
soft_loss = nn.KLDivLoss(reduction="batchmean")
hard_loss = nn.CrossEntropyLoss()



print("ditillation:")
for epoch in range(0,5):
    for data, targets in train_loader:
        data = data.to(device)
        targets = targets.to(device)
        with torch.no_grad():
            teacher_preds = teacher_model(data)
        student_preds = student_model(data)
        student_loss = hard_loss(student_preds, targets)
        ditillation_loss = soft_loss(
            F.softmax(student_preds / temp, dim=1),
            F.softmax(teacher_preds / temp, dim=1)
        )
        loss = (1 - beta) * student_loss + beta * ditillation_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    student_model.eval()
    student_model=getAcc(student_model)

Teacher model:
0.9437999725341797
0.9633999466896057
0.973099946975708
0.9765999913215637
0.9776999950408936
Student model:
0.8533999919891357
0.8914999961853027
0.904699981212616
0.910099983215332
0.9138000011444092
ditillation:
0.8562999963760376
0.8919000029563904
0.9050999879837036
0.9106000065803528
0.9161999821662903
