## 11767 ODML Lab 1 - jchan5

In [2]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import datasets
from torch.utils.data import Dataset
import torchvision.transforms as T
from PIL import Image
import pandas as pd
import time
import sys
import matplotlib.pyplot as plt
import numpy as np

Q3.2 Define constants and experimental setting

In [3]:
img_size = (28, 28)
num_labels = 10
learning_rate = 1e-3
batch_size = 64
num_layers = 2
hidden_size = 1024
num_epochs = 2
# print(type(img_size[0] * img_size[1]))
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using device {device}")
# print the python version 
print(f"Python version: {sys.version}")
# and pytorch version
print(f"Pytorch version: {torch.__version__}")
# operating system
print("Processor 11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz   2.30 GHz")
print("Installed RAM	32.0 GB (31.8 GB usable")
print("Windows 11 Home 64-bit operating system, x64-based processor")

Using device cpu
Python version: 3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Pytorch version: 2.2.0+cpu
Processor 11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz   2.30 GHz
Installed RAM	32.0 GB (31.8 GB usable
Windows 11 Home 64-bit operating system, x64-based processor


Define dataset

In [6]:
class CsvMNISTDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.data_frame)
    
    def __getitem__(self, idx):
        row = self.data_frame.iloc[idx]
        label = row[0] # first value is the class label
        img = row[1:].values.astype("uint8").reshape(img_size) # reshape 28x28
        img = Image.fromarray(img, mode="L") # L = 8bit greyscale
        if self.transform:
            img = self.transform(img)
        return img, label

Define dataloader and preprocess inputs to have µ=0 and σ=1

In [7]:
# Prepare data
def get_data(batch_size):
    transform_mnist = T.Compose([
        T.ToTensor(), 
        T.Resize(min(img_size[0], img_size[1]), antialias=True),  # Resize the smallest side to 256 pixels
        T.CenterCrop(img_size),  # Center crop to 256x256
        T.Normalize(mean=[0], std=[1]) # Normalize to 0 mean and 1 std
        ])
    train_data = CsvMNISTDataset(
        csv_file='./mnist_data/mnist_train.csv',
        transform=transform_mnist,
    )
    test_data = CsvMNISTDataset(
        csv_file='./mnist_data/mnist_test.csv',
        transform=transform_mnist,
    )

    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)
    
    for X, y in train_dataloader:
        print(f"Shape of X [B, C, H, W]: {X.shape}") # [batch_size, channels, dims]
        print(f"Shape of y: {y.shape} {y.dtype}")
        break
    
    return train_dataloader, test_dataloader


Define the model architecture according to

| Hyperparameter    | Value |
| --------- | --------- |
| Learning Rate	| 0.001 |
| Batch Size | 64 |
| Hidden Layers |	2 |
| Hidden Size	| 1024 |
| Epochs | 2 |

Q2.3 Count the number of FLOPs in countflops()

The NN is composed an input layer, two hidden layers, and an output layer. Since addition and multiplication count as one operation, the number of FLOPs in a FF single layer is the product of the input dimension squared times the output dimension. For example, the first layer takes an input of 784 and multiplies that with the first layer's 784 weights, doing this 1024 times. The output is of size 1024, where a bias term is added to each element. 

After those 1024 additions, the ReLU layer performs a comparison against 0 for each input to the layer and stores an output value depending on the result for a total of 2 x 1024 operations

Symbolically, a FF network of a layer size [A, B] will have A x A x B + B FLOPs and 2B FLOPs for the ReLU layer, which is represented below.


In [8]:
class MNISTNetwork(nn.Module):
    def __init__(self, hidden_layers=2, hidden_size=1024, num_labels=10):
        super(MNISTNetwork, self).__init__()
        # First layer input size must be the dimension of the image
        self.flatten = nn.Flatten()
        # Define NN layers based on the number of layers and hidden size
        flatten_size = img_size[0] * img_size[1] # int
        self.NN_layers = []
        self.NN_layers.append(flatten_size) # first element is input
        for i in range(hidden_layers):
            self.NN_layers.append(hidden_size)
        self.NN_layers.append(num_labels) # output layer size
        NN = []
        for i in range(len(self.NN_layers)-1):
            # [784, 1024] -> ReLU -> [1024, 1024] -> ReLU -> [1024, 10]
            NN.append(nn.Linear(self.NN_layers[i],self.NN_layers[i+1]))
            if i < (hidden_layers):
                NN.append(nn.ReLU())
        self.sequential = nn.Sequential(*NN)
                
    def forward(self, x):
        x = self.flatten(x)
        logits = self.sequential(x)
        return logits
    
    def countflops(self):
        # Count FLOPs per layer [784,1024,1024,10]
        # Linear layer: Ax + b
        # ReLU: b
        # dim(A)[0] * dim(A)[1] + dim(B) + dim(B)
        flop_count = 0
        for i in range(len(self.NN_layers)-1): # 0 to 2
            A = self.NN_layers[i]
            B = self.NN_layers[i+1]
            flop_count += A*A*B+B # linear layer
            if i < (len(self.NN_layers)-2): # before output layer no ReLU
                flop_count += 2*B
        print(f"FLOPs: {flop_count}")
        return flop_count

Q2.1 Define training routine and measure training and inference latencies

In [9]:
def train_one_epoch(dataloader, model, loss_fn, optimizer, cur_epoch):
    size = len(dataloader.dataset)
    batch_size = dataloader.batch_size
    model.train()
    start_time = time.time()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss = loss.item() / batch_size
        current = (batch + 1) * dataloader.batch_size # number of examples seen in this epoch
        if batch % 500 == 0:
            print(f"Train loss = {loss:>7f}  [{current:>5d}/{size:>5d}]")
    end_time = time.time()
    train_epoch_duration = end_time - start_time
    # print(f"Epoch {cur_epoch+1} training duration: {train_epoch_duration}")
    return train_epoch_duration

# Evaluate train accuracy and loss
def evaluate(dataloader, dataname, model, loss_fn, cur_epoch):
    size = len(dataloader.dataset)
    #start_time = time.time()
    model.eval()
    avg_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            avg_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    #end_time = time.time()
    avg_loss /= size
    correct /= size
    print(f"{dataname} accuracy = {(100*correct):>0.1f}%, {dataname} avg loss = {avg_loss:>8f}")

def test_evaluate(dataloader, dataname, model, loss_fn):
    size = len(dataloader.dataset)
    model.eval()
    avg_loss, correct, inference_latency, count = 0, 0, 0, 0
    # discard first few iterations
    warmup = 3
    infer_times = []
    with torch.no_grad():
        for X, y in dataloader:
            count += 1
            num_samples = len(y)
            if count > warmup:
                start_time = time.time()
            X, y = X.to(device), y.to(device)
            pred = model(X)
            avg_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            if count > warmup:
                end_time = time.time()
                inference_latency += (end_time - start_time)
                infer_times.append((end_time-start_time)/num_samples)# 1 sample
    avg_loss /= size
    correct /= size
    avg_time_per_inference = inference_latency / (size - num_samples*warmup)
    print(f"{dataname} accuracy = {(100*correct):>0.1f}%, {dataname} avg loss = {avg_loss:>8f}")
    print(f"Average time per example classification: {avg_time_per_inference:>5f} seconds")
    return infer_times, avg_time_per_inference

Training configuration

In [10]:
print(f"Using {device} device")
train_dataloader, test_dataloader = get_data(batch_size)
train_size = len(train_dataloader.dataset)
test_size = len(test_dataloader.dataset)
model = MNISTNetwork().to(device)
print(model)
loss_fn = nn.CrossEntropyLoss() # no need to softmax as CrossEntropyLoss works on raw logits
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Using cpu device
Shape of X [B, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64
MNISTNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (sequential): Sequential(
    (0): Linear(in_features=784, out_features=1024, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1024, out_features=1024, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=10, bias=True)
  )
)


  label = row[0] # first value is the class label


Q3.1 Print training progress and accuracy

The training accuracy is shown below with the run's corresponding hyperparameters.

In [11]:
# Main training 
epoch_train_times = []
epoch_test_times = []
average_inference_time = []
for t in range(num_epochs):
    print(f"\nEpoch {t+1}\n-------------------------------")
    train_duration = train_one_epoch(train_dataloader, model, loss_fn, optimizer, t)
    epoch_train_times.append(train_duration)
    evaluate(train_dataloader, "Train", model, loss_fn, t)
    epoch_test, avg = test_evaluate(test_dataloader, "Test", model, loss_fn)
    epoch_test_times.append(epoch_test)
    average_inference_time.append(avg)
# Save the model
torch.save(model.state_dict(), "MNIST_model.pth")
# print("Saved PyTorch Model State to MNIST_model.pth")

  label = row[0] # first value is the class label



Epoch 1
-------------------------------
Train loss = 0.036006  [   64/59999]
Train loss = 0.004810  [32064/59999]
Train accuracy = 95.5%, Train avg loss = 0.002348
Test accuracy = 94.8%, Test avg loss = 0.002592
Average time per example classification: 0.000028 seconds

Epoch 2
-------------------------------
Train loss = 0.001919  [   64/59999]
Train loss = 0.002819  [32064/59999]
Train accuracy = 97.5%, Train avg loss = 0.001237
Test accuracy = 96.5%, Test avg loss = 0.001795
Average time per example classification: 0.000026 seconds


In [12]:
print(f"Hyperparameters: learning_rate={learning_rate}, \n batch_size={batch_size}, num_layers={num_layers}, hidden_size={hidden_size}, num_epochs={num_epochs}")

Hyperparameters: learning_rate=0.001, 
 batch_size=64, num_layers=2, hidden_size=1024, num_epochs=2


Q3.3 Print the training time per epoch and inference latency per example

In [13]:
for t in range(num_epochs):
    print(f"Epoch {t+1} training duration: {epoch_train_times[t]:>8f}")
print(f"Training time per epoch variance {np.array(epoch_train_times).var():>8f}")

Epoch 1 training duration: 37.043280
Epoch 2 training duration: 41.538258
Training time per epoch variance 5.051205


With two epochs, the training time per epoch has a variance of ~1 seconds. Training took almost 5% longer for the second epoch, which is surprising given that a warmed up CPU should compute quicker as the intuition behind inference goes.

In [14]:
for t in range(num_epochs):
    print(f"Epoch {t+1} average inference time per sample: {average_inference_time[t]:>8f}")

test_var1 = np.array(epoch_test_times[0]).var()
test_var2 = np.array(epoch_test_times[1]).var()
test_max1 = np.array(epoch_test_times[0]).max()
test_max2 = np.array(epoch_test_times[1]).max()
test_min1 = np.array(epoch_test_times[0]).min()
test_min2 = np.array(epoch_test_times[1]).min()
print(f"Test time per sample variance for epoch 1: {test_var1} seconds")
print(f"Test time per sample variance for epoch 2: {test_var2} seconds")
print(f"Test time per sample max for epoch 1: {test_max1:>8f} seconds")
print(f"Test time per sample max for epoch 2: {test_max2:>8f} seconds")
print(f"Test time per sample min for epoch 1: {test_min1} seconds")
print(f"Test time per sample min for epoch 2: {test_min2} seconds")

Epoch 1 average inference time per sample: 0.000028
Epoch 2 average inference time per sample: 0.000026
Test time per sample variance for epoch 1: 4.173304938698072e-10 seconds
Test time per sample variance for epoch 2: 1.5022253472545984e-10 seconds
Test time per sample max for epoch 1: 0.000240 seconds
Test time per sample max for epoch 2: 0.000141 seconds
Test time per sample min for epoch 1: 1.547485589981079e-05 seconds
Test time per sample min for epoch 2: 1.5385448932647705e-05 seconds


In [15]:
xvals = np.array([1, 2, 3, 4, 5])
yvals = np.array([1, 4, 9, 16, 25])
plt.plot(xvals, yvals)

[<matplotlib.lines.Line2D at 0x196d4e23d90>]

: 

The variance for the inferences across the two epochs by default are as shown. What was suprising is that the second epoch has a larger max value for the per sample inference time. The first iteration of inference was not slower than the others.

Q3.4 Count number of trainable parameters in a function

The values line up as manually calculated as the weight matrices and the bias are the only trainable parameters in a feedforward neural network.

In [None]:
def count_parameters(model):
	total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
	print(f"Total number of model parameters: {total_params}")

count_parameters(model)

# manually computing parameters
# 784*1024 + 1024 + 1024*1024 + 1024 + 1024*10 + 10 = 1863690
NN_layers = [784, 1024, 1024, 10]
total_params = 0
for i in range(len(NN_layers)-1):
	total_params += NN_layers[i] * NN_layers[i+1] + NN_layers[i+1]
print(f'Manually computed total number of model parameters: {total_params}')

Q3.5 Count flops

In [None]:
n_flops = model.countflops()

Q3.6 Tune model hyperparameters

The three plots to produce are generated with respect to training 4 other various architectures for two epochs


In [1]:
hid_layers = [1, 2, 3]
hid_size = [512, 1024]

# log the late
for nL in hid_layers:
    for nH in hid_size:
        model = MNISTNetwork().to(device)
print(model)
loss_fn = nn.CrossEntropyLoss() # no need to softmax as CrossEntropyLoss works on raw logits
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

SyntaxError: unterminated string literal (detected at line 6) (4147344663.py, line 6)