In [25]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import torch.optim as optim

In [2]:
from sklearn.datasets import fetch_openml

# Fetch the OpenML 'mnist_784' dataset (version 1).
# as_frame=True is requested explicitly: get_training_data() calls `.to_numpy()`
# on row slices of X, which needs a pandas DataFrame. Relying on the library's
# default would make that depend on the installed scikit-learn version.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
X, y = mnist['data'], mnist['target']

In [31]:
def get_training_data(X,Y,start,end):
    """Slice rows ``start:end`` out of the dataset and convert them to tensors.

    Args:
        X: pandas DataFrame holding one flattened 28x28 image (784 values) per row.
        Y: Sliceable sequence of labels aligned with ``X``. Every label must be
            convertible with ``int()`` (e.g. the strings ``'0'`` .. ``'9'``).
        start: Index of the first row to take (inclusive).
        end: Index one past the last row to take (exclusive).

    Returns:
        A tuple ``(images, one_hot)``:
        ``images`` has shape ``(n, 1, 28, 28)`` with every value divided by 255;
        ``one_hot`` is an int64 tensor of shape ``(n, 10)`` with a 1 in the
        column of each label (labels outside 0..9 give an all-zero row).
        ``n`` is the number of rows the slice actually produced.
    """
    pixels = torch.tensor(X[start:end].to_numpy())
    # Take the row count from the sliced data instead of computing `end - start`:
    # the two differ whenever the slice is clipped (e.g. `end` beyond the last
    # row), and the reshape would then raise.
    n_rows = pixels.shape[0]
    images = pixels.reshape(n_rows, 1, 28, 28) / 255.

    labels = torch.tensor([int(label) for label in Y[start:end]], dtype=torch.long)
    # Broadcasted comparison against 0..9 builds the whole one-hot matrix at once.
    one_hot = (labels.unsqueeze(1) == torch.arange(10)).long()
    return images, one_hot
# Rows 0..39999 are used for training; rows 40000..44999 are held out for evaluation.
X_tra, Y_tra = get_training_data(X, y, start=0, end=40000)
X_te, Y_te = get_training_data(X, y, start=40000, end=45000)

In [16]:
def yield_batches(X, Y, batch_size):
    """Generate consecutive ``(X, Y)`` mini-batches in their original order.

    Both inputs are cut along their first axis into windows of ``batch_size``
    rows; the final window is shorter when the sample count is not a multiple
    of ``batch_size``. No shuffling is performed.

    Raises:
        AssertionError: when ``X`` and ``Y`` differ in their first dimension
            (raised when the generator is first advanced).
    """
    total = X.shape[0]
    assert total == Y.shape[0], "Number of samples in X and Y must match"
    for lo in range(0, total, batch_size):
        window = slice(lo, min(lo + batch_size, total))
        yield X[window], Y[window]

In [17]:
class TestModel(nn.Module):
    """One strided convolution followed by a three-layer fully connected head.

    The network emits 12 values per sample. Layers are stored under the
    attribute names ``conv``, ``flatten``, ``lin1``, ``rel1``, ``lin2``,
    ``rel2`` and ``lin3``, which are also the prefixes of the state-dict keys.
    """

    def __init__(self):
        super().__init__()
        # 1 input channel -> 15 feature maps, 4x4 kernel, stride 2, no bias term.
        self.conv = nn.Conv2d(in_channels=1, out_channels=15, kernel_size=4, stride=2, bias=False)
        self.flatten = nn.Flatten()
        # 2535 = 15 * 13 * 13, the flattened conv output for the 28x28 images
        # this notebook feeds in.
        self.lin1 = nn.Linear(in_features=2535, out_features=50)
        self.rel1 = nn.ReLU()
        self.lin2 = nn.Linear(in_features=50, out_features=50)
        self.rel2 = nn.ReLU()
        self.lin3 = nn.Linear(in_features=50, out_features=12)

    def forward(self, x):
        """Run ``x`` through every layer in definition order and return the 12 outputs."""
        pipeline = (
            self.conv,
            self.flatten,
            self.lin1,
            self.rel1,
            self.lin2,
            self.rel2,
            self.lin3,
        )
        for stage in pipeline:
            x = stage(x)
        return x

In [21]:
# Training hyperparameters.
batch_size = 64
epochs = 100

# Build the network on the GPU, then the loss and an Adam optimizer over its parameters.
model = TestModel().to("cuda")
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [22]:
# Keep a copy of the model's weights as they are before the training loop runs.
torch.save(obj=model.state_dict(), f="test_model_ini.pth")

In [23]:
# Train `model` with cross-entropy on the first 10 of its 12 outputs.
for epoch in range(epochs):
    running_loss = 0.0  # sum of the per-batch losses seen in this epoch
    for X_batch, Y_batch in yield_batches(X_tra, Y_tra, batch_size):
        X_batch = X_batch.float().to("cuda")
        # Labels are stored one-hot; CrossEntropyLoss takes class indices.
        Y_batch = torch.argmax(Y_batch, dim=1).to("cuda")
        outputs = model(X_batch)
        # The last two outputs are deliberately left out of the loss.
        output_req = outputs[:,:10]
        loss = criterion(output_req, Y_batch)
        optimizer.zero_grad()
        loss.backward()
        if model.lin3.weight.grad is not None:
            # Keep the two extra output units untouched by the optimizer.
            # nn.Linear stores its weight as (out_features, in_features) = (12, 50),
            # so output units are ROWS. Masking `[:, -2:]` instead would zero the
            # gradients of hidden inputs 48 and 49 for every logit and freeze them.
            model.lin3.weight.grad[-2:, :] = 0
            model.lin3.bias.grad[-2:]=0
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {running_loss:.4f}")

Epoch [1/100] - Loss: 264.6514
Epoch [2/100] - Loss: 122.1808
Epoch [3/100] - Loss: 89.6590
Epoch [4/100] - Loss: 71.3767
Epoch [5/100] - Loss: 58.8128
Epoch [6/100] - Loss: 49.6518
Epoch [7/100] - Loss: 41.0388
Epoch [8/100] - Loss: 33.7831
Epoch [9/100] - Loss: 28.2459
Epoch [10/100] - Loss: 25.2118
Epoch [11/100] - Loss: 21.8750
Epoch [12/100] - Loss: 19.0317
Epoch [13/100] - Loss: 17.7618
Epoch [14/100] - Loss: 13.5577
Epoch [15/100] - Loss: 12.6343
Epoch [16/100] - Loss: 11.1807
Epoch [17/100] - Loss: 11.4242
Epoch [18/100] - Loss: 8.4456
Epoch [19/100] - Loss: 10.8184
Epoch [20/100] - Loss: 8.5223
Epoch [21/100] - Loss: 7.2663
Epoch [22/100] - Loss: 5.4245
Epoch [23/100] - Loss: 7.0159
Epoch [24/100] - Loss: 9.5378
Epoch [25/100] - Loss: 5.1806
Epoch [26/100] - Loss: 6.7615
Epoch [27/100] - Loss: 3.9575
Epoch [28/100] - Loss: 8.5547
Epoch [29/100] - Loss: 7.0165
Epoch [30/100] - Loss: 2.5104
Epoch [31/100] - Loss: 3.2902
Epoch [32/100] - Loss: 6.7730
Epoch [33/100] - Loss: 6.6403

In [24]:
# Persist the model's current weights under a separate file name.
torch.save(obj=model.state_dict(), f="test_model_overfit.pth")

In [29]:
@torch.no_grad()
def evaluate_model(model, X_test, Y_test, batch_size=64):
    """Print a per-class classification report for ``model`` on a labelled set.

    Args:
        model: Network whose first 10 outputs per sample are class scores.
        X_test: Input tensor; batches are cast to float32 before the forward pass.
        Y_test: Per-sample label rows aligned with ``X_test``; the argmax along
            dim 1 of each row is taken as the true class (one-hot labels).
        batch_size: Number of samples per forward pass.

    Returns:
        None. The report is printed to stdout.

    Notes:
        Batches are moved to the device of the model's parameters rather than a
        hard-coded one, so the function works wherever the model lives. The
        model is switched to eval mode for the duration of the call and its
        previous train/eval mode is restored afterwards.
    """
    # Follow the model's own device; a parameter-free model leaves batches in place.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    was_training = model.training
    model.eval()
    all_preds = []
    all_labels = []

    try:
        for X_batch, Y_batch in yield_batches(X_test, Y_test, batch_size):
            X_batch = X_batch.float()
            if device is not None:
                X_batch = X_batch.to(device)
            outputs = model(X_batch)
            # Only the first 10 outputs are class scores; any extra outputs are ignored.
            outputs_main = outputs[:, :10]
            preds = torch.argmax(outputs_main, dim=1)

            all_preds.extend(preds.cpu().numpy())
            # The labels only feed the CPU-side report, so they never need to
            # visit the model's device.
            all_labels.extend(torch.argmax(Y_batch, dim=1).cpu().numpy())
    finally:
        # Do not leave the caller's model silently stuck in eval mode.
        model.train(was_training)

    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds, digits=4))

In [32]:
# Score the trained model on the held-out rows.
evaluate_model(model, X_test=X_te, Y_test=Y_te)


Classification Report:
              precision    recall  f1-score   support

           0     0.9900    0.9861    0.9881       504
           1     0.9875    0.9805    0.9840       565
           2     0.9512    0.9740    0.9625       500
           3     0.9664    0.9446    0.9553       487
           4     0.9702    0.9785    0.9744       466
           5     0.9577    0.9513    0.9545       452
           6     0.9739    0.9939    0.9838       489
           7     0.9741    0.9777    0.9759       539
           8     0.9644    0.9426    0.9534       488
           9     0.9610    0.9667    0.9638       510

    accuracy                         0.9700      5000
   macro avg     0.9696    0.9696    0.9696      5000
weighted avg     0.9700    0.9700    0.9700      5000



In [33]:
# A second network with the same architecture, loaded from the weights saved
# in "test_model_ini.pth".
student_model = TestModel()
student_model.load_state_dict(state_dict=torch.load(f="test_model_ini.pth"))

<All keys matched successfully>

In [34]:
# Move the student to the GPU and score it on the held-out rows.
student_model = student_model.to(device="cuda")
evaluate_model(student_model, X_test=X_te, Y_test=Y_te)


Classification Report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       504
           1     0.0000    0.0000    0.0000       565
           2     0.3333    0.0720    0.1184       500
           3     0.0941    0.9363    0.1709       487
           4     0.0000    0.0000    0.0000       466
           5     0.0000    0.0000    0.0000       452
           6     0.0682    0.0061    0.0113       489
           7     0.0000    0.0000    0.0000       539
           8     0.0000    0.0000    0.0000       488
           9     0.0000    0.0000    0.0000       510

    accuracy                         0.0990      5000
   macro avg     0.0496    0.1014    0.0301      5000
weighted avg     0.0492    0.0990    0.0296      5000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# NOTE(review): this loop still optimises `model` (the optimizer was built from
# model.parameters()); `student_model` is only run forward and never updated.
# TODO confirm what the student is meant to be trained on.
for epoch in range(epochs):
    running_loss = 0.0  # sum of the per-batch losses seen in this epoch
    for X_batch, Y_batch in yield_batches(X_tra, Y_tra, batch_size):
        X_batch = X_batch.float().to("cuda")
        # Labels are stored one-hot; CrossEntropyLoss takes class indices.
        Y_batch = torch.argmax(Y_batch, dim=1).to("cuda")
        outputs = model(X_batch)
        # NOTE(review): `student_output` is computed but not used by the loss
        # or anywhere else in this cell.
        student_output = student_model(X_batch)
        # The last two outputs are deliberately left out of the loss.
        output_req = outputs[:,:10]
        loss = criterion(output_req, Y_batch)
        optimizer.zero_grad()
        loss.backward()
        if model.lin3.weight.grad is not None:
            # Keep the two extra output units untouched by the optimizer.
            # nn.Linear stores its weight as (out_features, in_features) = (12, 50),
            # so output units are ROWS. Masking `[:, -2:]` instead would zero the
            # gradients of hidden inputs 48 and 49 for every logit and freeze them.
            model.lin3.weight.grad[-2:, :] = 0
            model.lin3.bias.grad[-2:]=0
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {running_loss:.4f}")