In [40]:
import numpy as np
# import gzip # No longer needed for uncompressed .ubyte files

def read_mnist_images(file_path):
    # Use standard open() for uncompressed .ubyte files
    with open(file_path, 'rb') as f:
        magic_number = int.from_bytes(f.read(4), 'big')
        num_images = int.from_bytes(f.read(4), 'big')
        num_rows = int.from_bytes(f.read(4), 'big')
        num_cols = int.from_bytes(f.read(4), 'big')

        images = np.frombuffer(f.read(), dtype=np.uint8)
        images = images.reshape(num_images, num_rows, num_cols)
    return images

def read_mnist_labels(file_path):
    # Use standard open() for uncompressed .ubyte files
    with open(file_path, 'rb') as f:
        magic_number = int.from_bytes(f.read(4), 'big')
        num_labels = int.from_bytes(f.read(4), 'big')

        labels = np.frombuffer(f.read(), dtype=np.uint8)
    return labels

# Read training images and labels
train_images = read_mnist_images('train-images.idx3-ubyte')
train_labels = read_mnist_labels('train-labels.idx1-ubyte')

print(f"Shape of training images: {train_images.shape}")
print(f"Data type of training images: {train_images.dtype}")
print(f"Shape of training labels: {train_labels.shape}")
print(f"Data type of training labels: {train_labels.dtype}")

# Read test images and labels
test_images = read_mnist_images('t10k-images.idx3-ubyte')
test_labels = read_mnist_labels('t10k-labels.idx1-ubyte')

print(f"Shape of test images: {test_images.shape}")
print(f"Data type of test images: {test_images.dtype}")
print(f"Shape of test labels: {test_labels.shape}")
print(f"Data type of test labels: {test_labels.dtype}")


print(train_images[0].shape)

Shape of training images: (60000, 28, 28)
Data type of training images: uint8
Shape of training labels: (60000,)
Data type of training labels: uint8
Shape of test images: (10000, 28, 28)
Data type of test images: uint8
Shape of test labels: (10000,)
Data type of test labels: uint8
(28, 28)


In [32]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

In [42]:
class Neural_network_for_MNIST(nn.Module):

    def __init__(self):
        super(Neural_network_for_MNIST, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, input):
        # Convolution layer C1: 1 input image channel, 6 output channels,
        # 5x5 square convolution, it uses RELU activation function, and
        # outputs a Tensor with size (N, 6, 28, 28), where N is the size of the batch
        c1 = F.relu(self.conv1(input))
        # Subsampling layer S2: 2x2 grid, purely functional,
        # this layer does not have any parameter, and outputs a (N, 6, 14, 14) Tensor
        s2 = F.max_pool2d(c1, (2, 2))
        # Convolution layer C3: 6 input channels, 16 output channels,
        # 5x5 square convolution, it uses RELU activation function, and
        # outputs a (N, 16, 10, 10) Tensor
        c3 = F.relu(self.conv2(s2))
        # Subsampling layer S4: 2x2 grid, purely functional,
        # this layer does not have any parameter, and outputs a (N, 16, 5, 5) Tensor
        s4 = F.max_pool2d(c3, 2)
        # Flatten operation: purely functional, outputs a (N, 400) Tensor
        s4 = torch.flatten(s4, 1)
        # Fully connected layer F5: (N, 400) Tensor input,
        # and outputs a (N, 120) Tensor, it uses RELU activation function
        f5 = F.relu(self.fc1(s4))
        # Fully connected layer F6: (N, 120) Tensor input,
        # and outputs a (N, 84) Tensor, it uses RELU activation function
        f6 = F.relu(self.fc2(f5))
        # Fully connected layer OUTPUT: (N, 84) Tensor input, and
        # outputs a (N, 10) Tensor
        output = self.fc3(f6)
        return output


net = Neural_network_for_MNIST()
print(net)

Neural_network_for_MNIST(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [43]:
params = list(net.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

10
torch.Size([6, 1, 5, 5])


In [44]:
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

tensor([[ 0.0450,  0.0435,  0.0522,  0.0424, -0.0679,  0.0031,  0.0261, -0.0732,
         -0.0785, -0.0592]], grad_fn=<AddmmBackward0>)
tensor(0.4460, grad_fn=<MseLossBackward0>)


In [45]:
net.zero_grad()
out.backward(torch.randn(1, 10))


In [46]:
output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

tensor(1.7622, grad_fn=<MseLossBackward0>)


In [47]:
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
None
conv1.bias.grad after backward
tensor([ 0.0174, -0.0264,  0.0110, -0.0085, -0.0126,  0.0059])


In [49]:
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)

In [50]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update

For cuda (GPU) device


In [52]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# 1. Convert NumPy arrays to PyTorch tensors
train_images_tensor = torch.from_numpy(train_images.copy()).float()
train_labels_tensor = torch.from_numpy(train_labels.copy()).long()
test_images_tensor = torch.from_numpy(test_images.copy()).float()
test_labels_tensor = torch.from_numpy(test_labels.copy()).long()

# 2. Normalize the image tensors (pixel values between 0 and 1)
train_images_tensor /= 255.0
test_images_tensor /= 255.0

# 3. Reshape the normalized image tensors to add a channel dimension (N, C, H, W)
train_images_tensor = train_images_tensor.unsqueeze(1) # Adds a channel dimension at index 1
test_images_tensor = test_images_tensor.unsqueeze(1)

# 4. Create TensorDataset objects
train_dataset = TensorDataset(train_images_tensor, train_labels_tensor)
test_dataset = TensorDataset(test_images_tensor, test_labels_tensor)

# 5. Create DataLoader objects
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Shape of training images tensor: {train_images_tensor.shape}")
print(f"Shape of training labels tensor: {train_labels_tensor.shape}")
print(f"Shape of test images tensor: {test_images_tensor.shape}")
print(f"Shape of test labels tensor: {test_labels_tensor.shape}")
print(f"Number of batches in training DataLoader: {len(train_loader)}")
print(f"Number of batches in test DataLoader: {len(test_loader)}")

Shape of training images tensor: torch.Size([60000, 1, 28, 28])
Shape of training labels tensor: torch.Size([60000])
Shape of test images tensor: torch.Size([10000, 1, 28, 28])
Shape of test labels tensor: torch.Size([10000])
Number of batches in training DataLoader: 938
Number of batches in test DataLoader: 157


In [53]:
def train(model, device, train_loader, optimizer, epoch):
    model.train() # Set the model to training mode
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        # Use F.nll_loss as specified, which expects log-probabilities for output
        # Our model output is linear, so we need to apply log_softmax to it first.
        loss = F.nll_loss(F.log_softmax(output, dim=1), target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0: # Print training loss every 100 batches
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

print("Train function defined.")

Train function defined.


In [54]:
def test(model, device, test_loader):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0
    correct = 0
    with torch.no_grad():  # Disable gradient calculations
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum up batch loss
            test_loss += F.nll_loss(F.log_softmax(output, dim=1), target, reduction='sum').item()
            # Get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print(f'\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n')

print("Test function defined.")

Test function defined.


In [57]:
import torch.optim as optim

# 1. Determine the device to use for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 2. Instantiate the Neural_network_for_MNIST model and move it to the device
model = Neural_network_for_MNIST().to(device)
print("Model instantiated and moved to device.")

# 3. Define the optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01)
print("Optimizer defined.")

# 4. Set the number of training epochs
epochs = 10
print(f"Number of epochs set to: {epochs}")

# 5. Implement the training and evaluation loop
for epoch in range(1, epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

print("Training and evaluation complete.")

Using device: cpu
Model instantiated and moved to device.
Optimizer defined.
Number of epochs set to: 10

Test set: Average loss: 1.3625, Accuracy: 6416/10000 (64%)


Test set: Average loss: 0.2812, Accuracy: 9183/10000 (92%)


Test set: Average loss: 0.4911, Accuracy: 8478/10000 (85%)


Test set: Average loss: 0.1310, Accuracy: 9573/10000 (96%)


Test set: Average loss: 0.0989, Accuracy: 9703/10000 (97%)


Test set: Average loss: 0.0914, Accuracy: 9715/10000 (97%)


Test set: Average loss: 0.0777, Accuracy: 9755/10000 (98%)


Test set: Average loss: 0.0739, Accuracy: 9765/10000 (98%)


Test set: Average loss: 0.0648, Accuracy: 9791/10000 (98%)


Test set: Average loss: 0.0617, Accuracy: 9792/10000 (98%)

Training and evaluation complete.


## Final Accuracy Summary

After 10 epochs of training, the model achieved a final accuracy of **98%** on the test set, with an average loss of 0.0617. This indicates that the neural network is performing well on the MNIST digit classification task.

## Final Task

### Subtask:
Summarize the training and evaluation results, including the final accuracy achieved on the MNIST dataset.


## Summary:

### Q&A
The final accuracy achieved on the MNIST dataset after training for 10 epochs is 98%, with an average test loss of 0.0617.

### Data Analysis Key Findings
*   **Data Preparation:** MNIST images were successfully converted to PyTorch tensors, normalized to the 0-1 range, and reshaped to include a channel dimension (e.g., `torch.Size([60000, 1, 28, 28])` for training images). DataLoaders were then created with a batch size of 64, resulting in 938 training batches and 157 test batches.
*   **Model Architecture Correction:** An initial `RuntimeError` during training indicated a shape mismatch in the `Neural_network_for_MNIST` model. The first fully connected layer (`self.fc1`) was incorrectly configured to expect 400 input features, while the preceding convolutional and pooling layers produced 256 features. This was corrected by changing the input feature size for `self.fc1` from `16 * 5 * 5` to `16 * 4 * 4`.
*   **Training and Evaluation Success:** After correcting the model architecture, the network was successfully trained for 10 epochs.
*   **Final Performance:** The trained neural network achieved a final accuracy of 98% on the test set, with an average loss of 0.0617.

### Insights or Next Steps
*   The model demonstrates strong performance on the MNIST dataset, achieving a high accuracy of 98% with a relatively low loss, indicating it has effectively learned to classify handwritten digits.
*   Future steps could involve exploring hyperparameter tuning (e.g., learning rate, optimizer variants), data augmentation, or more complex network architectures to potentially achieve even higher accuracy.
