<a href="https://colab.research.google.com/github/Rohan-Rajesh/Custom_MNIST_Digit_Classifier/blob/main/Custom_MNIST_Digit_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import torch
from torch import tensor
from pathlib import Path
from PIL import Image
from fastai.vision.all import *

In [10]:
# Fetch the MNIST_SAMPLE dataset through fastai's untar_data; the returned location is used
# below as the dataset root (path/'train', path/'valid').
path = untar_data(URLs.MNIST_SAMPLE)
# NOTE(review): presumably this makes fastai display paths relative to the dataset root — confirm in the fastai docs.
Path.BASE_PATH = path

In [11]:
# Sorted listings of the training image files for each digit.
threes, sevens = ((path/'train'/digit).ls().sorted() for digit in ('3', '7'))

# Open every image file and convert it to a tensor.
three_tensors = list(map(lambda img_file: tensor(Image.open(img_file)), threes))
seven_tensors = list(map(lambda img_file: tensor(Image.open(img_file)), sevens))

# Stack each list into a single tensor, cast to float and divide by 255 to normalize.
stacked_threes = torch.stack(three_tensors).float().div(255)
stacked_sevens = torch.stack(seven_tensors).float().div(255)

In [12]:
# Same loading steps as for the training images, applied to the 'valid' folder:
# open each file, convert to a tensor, stack, cast to float and divide by 255.
valid_3_tens = torch.stack(
  [tensor(Image.open(img_file)) for img_file in (path/'valid'/'3').ls()]
).float().div(255)
valid_7_tens = torch.stack(
  [tensor(Image.open(img_file)) for img_file in (path/'valid'/'7').ls()]
).float().div(255)

In [13]:
# Stack the 3s and the 7s into one training set, since a single shared set of weights has to
# classify both digits. Each 28x28 image is flattened into one row of 28*28 values.
train_x = torch.cat([stacked_threes, stacked_sevens]).view(-1, 28*28)
# Labels: 1 for every 3 and 0 for every 7. unsqueeze(1) turns the flat label vector into a
# column with one label per row of train_x.
train_y = tensor([1]*len(threes) + [0]*len(sevens)).unsqueeze(1)

# Pair every image row with its label so the pairs can be batched later.
train_dset = list(zip(train_x, train_y))

# Kept as the LAST expression of the cell: a bare expression in the middle of a cell is
# evaluated and discarded, so the shapes were never actually displayed before.
train_x.shape, train_y.shape

In [14]:
# Build the held-out set from the 'valid' images the same way as the training set:
# flatten each image to a row of 28*28 values, label 3s as 1 and 7s as 0.
test_x = torch.cat((valid_3_tens, valid_7_tens)).view(-1, 28*28)
test_y = tensor([1]*len(valid_3_tens) + [0]*len(valid_7_tens))[:, None]

# (image, label) pairs, ready to be batched.
test_dset = [(image, label) for image, label in zip(test_x, test_y)]

In [15]:
"""
initialize random weights to start.
since we only have 2 categories we can imagine this as the model randomly predicting between the 2 since it has no information about the images.
should lead to an accuracy of about ~50%

for a linear function: y = mx + b
m = weights
b = bias
"""
def init_params(size, std=1.0):
  """Create a randomly initialised parameter tensor that tracks gradients.

  Args:
    size: shape passed to torch.randn (an int or a tuple of ints).
    std: factor the standard-normal samples are multiplied by.

  Returns:
    A tensor of the requested shape with requires_grad switched on, so gradients
    can be computed for it later.
  """
  noise = torch.randn(size)
  scaled = noise * std
  return scaled.requires_grad_()

# Parameters of the linear model: one weight per pixel (a 28*28 x 1 column) and a single bias.
# The weights are drawn first and the bias second, in that order.
weights, bias = init_params((28*28, 1)), init_params(1)

In [16]:
# first model: fit the data with a plain linear function
def linear1(xb):
  """Linear model y = x @ weights + bias.

  Reads the module-level `weights` and `bias` tensors.

  Args:
    xb: batch of inputs whose last dimension matches the first dimension of `weights`.

  Returns:
    The matrix product of `xb` and `weights`, with `bias` added (broadcast).
  """
  weighted_sum = torch.matmul(xb, weights)
  return weighted_sum + bias

# Run the still-untrained model over the whole training set.
predictions = linear1(train_x)
# Bare name as the last expression so the notebook displays the raw model outputs.
predictions

tensor([[  1.5816],
        [  1.0384],
        [-12.0403],
        ...,
        [-10.5481],
        [-16.0521],
        [ -8.4310]], grad_fn=<AddBackward0>)

In [17]:
# Goal of training: push the model output above 0 for 3s (label 1) and below 0 for 7s (label 0).
# Threshold the raw outputs at 0 and compare them with the labels.
corrects = (predictions > 0.0).float().eq(train_y)
# Fraction of training images the untrained weights classify correctly. The exact value
# depends on the random initialisation and is not guaranteed to be 0.5.
corrects.float().mean().item()

0.6585995554924011

In [18]:
# Scale the weights by a tiny factor and measure accuracy again.
# no_grad keeps this in-place change out of the autograd graph.
with torch.no_grad():
  weights.mul_(1.0001)

# Recompute the predictions and the fraction classified correctly with the nudged weights.
predictions = linear1(train_x)
corrects = (predictions > 0.0).float().eq(train_y)
corrects.float().mean().item()

0.6586802005767822

In [25]:
"""
we need a loss function to actually optimize the weights.
since our final prediction is thresholded to a binary label (either 3 or 7) we can't use accuracy as the loss, since its value most probably will not change
with small changes to the weights (so it gives no useful gradient).
so we develop a different loss function from the actual predictions (squashed to a number between 0 and 1).
"""
# sigmoid squishes any value to something that's always between 0 & 1 which is what we need for calculating loss.
def sigmoid(x):
  """Logistic sigmoid, 1 / (1 + exp(-x)), applied element-wise.

  Delegates to torch.sigmoid instead of spelling the formula out: with the literal
  formula, exp(-x) overflows to inf for large negative x in float32, and the
  backward pass then produces NaN gradients.

  Args:
    x: input tensor.

  Returns:
    Tensor of the same shape as `x` with values between 0 and 1.
  """
  return torch.sigmoid(x)
def mnist_loss(predictions, targets):
  """Mean distance between the sigmoid of the predictions and the 0/1 targets.

  Args:
    predictions: raw model outputs (any real values); squashed with sigmoid here.
    targets: tensor of labels, compared against 1.

  Returns:
    Scalar tensor: the mean of (1 - p) where the target is 1 and of p elsewhere,
    with p = sigmoid(predictions). Lower is better.
  """
  probs = predictions.sigmoid()
  is_positive = targets == 1
  # torch.where selects element-wise over the whole tensor in one call, without a Python loop
  per_item = torch.where(is_positive, 1 - probs, probs)
  return per_item.mean()

In [20]:
# wrap the train and test datasets in DataLoaders so that we can iterate over them in batches of 256
# NOTE(review): no shuffle argument is passed, and train_dset is built as all 3s followed by all 7s.
# If the default is unshuffled, most training batches contain a single class — consider
# shuffle=True for train_dl (confirm the DataLoader default first).
train_dl = DataLoader(train_dset, batch_size=256)
test_dl = DataLoader(test_dset, batch_size=256)

In [21]:
# calculate the gradients of the loss for one mini-batch
def calc_grad(xb, yb, model):
  """Run `model` on a batch, compute `mnist_loss` and back-propagate it.

  Args:
    xb: batch of inputs, passed to `model`.
    yb: batch of targets, passed to `mnist_loss`.
    model: callable mapping `xb` to predictions (for example `linear1`).

  Returns:
    None. `loss.backward()` accumulates gradients into the `.grad` of the
    tensors the loss depends on.
  """
  # Call the model that was passed in; a hard-coded model here would silently
  # ignore the `model` argument.
  predictions = model(xb)
  loss = mnist_loss(predictions, yb)
  loss.backward()

In [22]:
# train for one epoch (1 pass through the entire training set)
def train_epoch(model, lr, params):
  """Run one pass of mini-batch gradient descent over the module-level `train_dl`.

  For every batch: compute gradients with `calc_grad`, step each parameter against
  its gradient, then reset the gradient to zero.

  Args:
    model: callable forwarded to `calc_grad`.
    lr: learning rate; each gradient is multiplied by it before being subtracted.
    params: iterable of tensors to update; each must have a populated `.grad`
      after `calc_grad` runs.

  Returns:
    None. The tensors in `params` are modified in place.
  """
  for xb, yb in train_dl:
    calc_grad(xb, yb, model)
    # The step itself must not be recorded by autograd. torch.no_grad() is the
    # supported way to update parameters in place (instead of going through .data).
    with torch.no_grad():
      for p in params:
        p.sub_(p.grad * lr)
        # backward() accumulates into .grad, so clear it before the next batch
        p.grad.zero_()

In [23]:
def batch_accuracy(xb, yb):
  """Fraction of a batch classified correctly.

  Args:
    xb: raw model outputs for the batch (sigmoid is applied here).
    yb: targets for the batch, compared element-wise with the thresholded outputs.

  Returns:
    Scalar float tensor: the mean of the element-wise matches.
  """
  probs = xb.sigmoid()
  # the outputs have been through sigmoid, so the decision threshold is 0.5 rather than 0
  predicted_positive = probs > 0.5
  hits = predicted_positive == yb
  return hits.float().mean()

def validate_epoch(model):
  """Accuracy of `model` over the module-level `test_dl`, rounded to 4 decimals.

  Args:
    model: callable mapping a batch of inputs to raw outputs.

  Returns:
    A Python float: the mean of the per-batch accuracies. Every batch counts
    equally in this mean, regardless of how many items it contains.
  """
  per_batch = []
  for xb, yb in test_dl:
    per_batch.append(batch_accuracy(model(xb), yb))
  mean_accuracy = torch.stack(per_batch).mean()
  return round(mean_accuracy.item(), 4)

In [27]:
# Learning rate handed to train_epoch, where each gradient is multiplied by it.
lr = 1.
# The tensors train_epoch updates in place; these are the same objects linear1 reads.
params = weights,bias
# Train for 20 epochs, printing the validation accuracy after each one on a single line.
for i in range(20):
  train_epoch(linear1, lr, params)
  print(validate_epoch(linear1), end=' ')

0.8154 0.8954 0.9213 0.936 0.9447 0.9487 0.9526 0.955 0.9589 0.9599 0.9628 0.9628 0.9628 0.9628 0.9633 0.9648 0.9653 0.9667 0.9682 0.9682 