In [13]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import numpy as np

# Seed every RNG this notebook draws from so a Restart & Run All is repeatable.
SEED = 42
np.random.seed(SEED)
# torch has its own generator, separate from NumPy's: nn.Linear weight
# initialisation and DataLoader(shuffle=True) both draw from it, so it must be
# seeded too or the training run differs on every execution.
torch.manual_seed(SEED)

In [14]:
# Build the train split first, then the test split, from the same settings:
# both are stored under ./data, downloaded if missing, and converted to
# tensors by ToTensor().
training_data, testing_data = (
  datasets.FashionMNIST(root="data",
                        train=is_train_split,
                        download=True,
                        transform=ToTensor())
  for is_train_split in (True, False)
)

In [15]:
batch_size = 64

# Training loader: reshuffle every epoch and drop the final short batch, since
# batch_size is not a factor of the number of training samples.
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True, drop_last=True)

# Evaluation loader: keep every sample and a fixed order. Dropping the last
# partial batch here would silently exclude samples from the reported metrics,
# and shuffling has no benefit when nothing is being optimised.
test_dataloader = DataLoader(testing_data, batch_size=batch_size, shuffle=False, drop_last=False)

# Each element of the dataloader iterable is one batch of features and labels.
# Peek at the first batch only (note the `break`) to show the tensor shapes.
for X, y in test_dataloader:
  print(f"Shape of X [N, C, H, W]: {X.shape}")
  print(f"Shape of y: {y.shape}")
  break 

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64])


In [16]:
# Read the image size straight from a dataset sample instead of from `X`, the
# loop variable left over from the shape-printing cell, so this cell no longer
# depends on that demo loop having run. The last two dimensions of a sample
# are (height, width), matching the [N, C, H, W] batch layout printed earlier.
image_height, image_width = training_data[0][0].shape[-2:]

In [17]:
# Pick the best available accelerator, falling back to the CPU.
# The string must be a device type torch recognises: NVIDIA GPUs are addressed
# as "cuda" ("gpu" is not a valid torch device and makes .to(device) raise).
if torch.cuda.is_available():
  device = "cuda"
elif torch.backends.mps.is_available():
  device = "mps"
else:
  device = "cpu"

print(f"Using {device} device")

Using cpu device


In [18]:
#Make a subclass that inherits properties from the nn.Module parent class
class NeuralNetwork(nn.Module):
  """Fully connected classifier: flatten -> Linear/ReLU -> Linear/ReLU -> Linear.

  Args:
    in_features: number of values per flattened input sample. When omitted it
      defaults to image_height * image_width taken from the notebook globals,
      which reproduces the original behaviour of NeuralNetwork().
    hidden_features: width of the two hidden layers (default 512).
    num_classes: number of output logits (default 10).
  """

  #Initialize your NeuralNetwork Function
  def __init__(self, in_features=None, hidden_features=512, num_classes=10): 
    #To execute the methods of __init__ function in the parent class (nn.Module), run super().__init__()
    super().__init__()

    #Only fall back to the notebook-level image size when the caller gave none,
    #so the class can also be built without those globals being defined
    if in_features is None:
      in_features = image_height * image_width

    self.flatten = nn.Flatten()
    self.linear_relu_stack = nn.Sequential(
      nn.Linear(in_features, hidden_features),
      nn.ReLU(),
      nn.Linear(hidden_features, hidden_features),
      nn.ReLU(),
      nn.Linear(hidden_features, num_classes)
    )

  def forward(self, x):
    """Flatten each sample in the batch and return the raw (un-normalised) logits."""
    x = self.flatten(x)
    logits = self.linear_relu_stack(x)
    return logits
  
# Build the network, move its parameters onto the selected device, then show
# the layer structure.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [19]:
# Cross-entropy loss on the raw logits, optimised with plain SGD.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

In [20]:
def train(dataloader, model, loss_fn, optimizer):
  """Run one training pass over `dataloader`, updating `model` in place.

  Args:
    dataloader: iterable of (X, y) batches; `dataloader.dataset` must support
      len(), which is used only for the progress print.
    model: the nn.Module being trained.
    loss_fn: callable taking (pred, y) and returning a scalar loss tensor.
    optimizer: optimizer built over the model's parameters.

  Prints the current loss every 100 batches. Returns None.
  """

  #The number of samples in the underlying dataset (for the progress print)
  size = len(dataloader.dataset)

  #Move batches to wherever the model's parameters live instead of relying on a
  #notebook-level `device` global; fall back to the CPU for a parameterless model
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  #Set model to training mode after the test function is set to eval mode
  model.train()
  for batch, (X, y) in enumerate(dataloader):
    X, y = X.to(target_device), y.to(target_device)
    #Computing batch_size prediction each batch 
    #Invokes __call__ method, inherited from nn.Module class, forward is called 
    pred = model(X)
    
    #loss would output as a form of tensor and not python numbers
    loss = loss_fn(pred, y)

    #Backprop, computing gradient via chain rule 
    loss.backward()
    optimizer.step()

    #You dont want old gradient to interfere with your new one because backward() accumulates
    optimizer.zero_grad()

    if batch % 100 == 0:
      #current is incremented by the batch size every batch iteration
      #Keep the python number under its own name rather than rebinding `loss`
      loss_value, current = loss.item(), (batch + 1) * len(X)
      
      #>7f means right-align as a floating point number in a field at least 7 wide
      print(f"loss: {loss_value:>7f} [{current:>5d}/{size:>5d}]")

In [21]:
def test(dataloader, model, loss_fn):
  size = len(dataloader.dataset)
  num_batches = len(dataloader)
  model.eval()
  test_loss, correct = 0, 0

  #Line below is to do it without keeping track of the gradient, disables gradient calculation
  #Every computation (except some), require_grad would always be set to False to reduce memory consumption
  with torch.no_grad():
    for X, y in dataloader:
      X, y = X.to(device), y.to(device)

      #Invokes __call__ method, inherited from nn.Module class, forward is called
      pred = model(X)

      #item() converts the tensor into a python number
      test_loss += loss_fn(pred, y).item()

      #Compare the ground truth to prediction and let it equal, then type cast to int and sum all the 1 to find how much is correct
      #Note: dim in pytorch is the same as axis for numpy
      #with dim=1, you go by columns, dim=0, you go by rows
      correct += (pred.argmax(dim=1) == y).type(torch.int).sum().item()

  #Avg Loss: total loss / number of batches (floor(data num / batch_size))
  test_loss /= num_batches
  
  #Accuracy: number of correct prediction / number of total data
  correct /= size
  print(f"Test Error \n Accuracy: {(100*correct):>0.1f}%, Avg Loss: {test_loss:>8f} \n")

In [22]:
def main(epochs=1):
  """Train and evaluate the notebook's model for `epochs` passes over the data.

  Args:
    epochs: number of train-then-test rounds to run. Defaults to 1, which is
      what the original hard-coded value did.

  Uses the notebook-level train_dataloader, test_dataloader, model, loss_fn
  and optimizer. Prints progress and returns None.
  """
  for t in range(epochs):
    print(f"Epoch {t+1} \n -------------------------------")

    #Train the model
    train(train_dataloader, model, loss_fn, optimizer)

    #Pass in the trained model and make prediction, find the accuracy/loss
    test(test_dataloader, model, loss_fn)
    
  print("Execution Finished")

In [23]:
# Entry point: run the train/evaluate loop when this code is executed as the
# main program (the recorded output below shows this cell did run main()).
if __name__ == "__main__":
  main()

Epoch 1 
 -------------------------------
loss: 2.285285 [   64/60000]


loss: 2.286288 [ 6464/60000]
loss: 2.263058 [12864/60000]
loss: 2.251193 [19264/60000]
loss: 2.245316 [25664/60000]
loss: 2.238750 [32064/60000]
loss: 2.214597 [38464/60000]
loss: 2.198560 [44864/60000]
loss: 2.156345 [51264/60000]
loss: 2.163560 [57664/60000]
Test Error 
 Accuracy: 39.0%, Avg Loss: 2.147731 

Execution Finished


In [24]:
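#From PyTorch Official Website
#Question: Why don't we need to feed the logits into a softmax layer?
#Is it because, even with linear/ReLU activations, the largest number would always be the highest probability?
#Answer: Yes. Softmax is monotonic, so the largest logit always maps to the highest probability and
#argmax over the raw logits picks the same class as argmax over the softmax output.
#In addition, nn.CrossEntropyLoss expects raw logits and applies log-softmax internally when computing the loss.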