<a href="https://colab.research.google.com/github/WizardGit/CharacterRecognition/blob/main/FinalProjectDigits.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn
from torch import optim
from torchvision import datasets, transforms
from torch.utils.data import random_split, DataLoader

In [None]:
# Define a small fully connected classifier for flattened 28x28 digit images.
# The global RNG is seeded first so the randomly initialised weights are the
# same after every Restart Kernel -> Run All.
SEED = 0
torch.manual_seed(SEED)

model = nn.Sequential(
    # 28*28 = 784 input features (one per pixel); each hidden layer has 64 units
    nn.Linear(28*28, 64),
    nn.ReLU(),
    nn.Linear(64, 64),
    nn.ReLU(),
    # 10 output units: one logit per digit class
    nn.Linear(64, 10)
)

# show the layer structure
print(model)

Sequential(
  (0): Linear(in_features=784, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=10, bias=True)
)


In [None]:
# let's get our data before training anything

In [None]:
# Load the MNIST training split (downloaded into ./data on first use);
# ToTensor converts each image to a tensor.
train_data = datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor())

# Hold out 5000 samples for validation and train on the rest. The sizes are
# derived from the dataset length so they always sum correctly, and the seeded
# generator keeps the train/validation membership identical across runs.
n_val = 5000
train, val = random_split(
    train_data,
    [len(train_data) - n_val, n_val],
    generator=torch.Generator().manual_seed(0),
)

# Training batches: reshuffled every epoch so SGD does not see the same batch
# order each time; the seeded generator keeps that shuffling reproducible.
train_loader = DataLoader(
    train,
    batch_size=32,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)
# Validation batches: order does not matter, so no shuffling.
val_loader = DataLoader(val, batch_size=32)

In [None]:
# Optimizer (plain stochastic gradient descent) and loss function.
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_function = nn.CrossEntropyLoss()


def run_epoch(loader, training):
  """Run one full pass over `loader` and return the mean per-sample loss.

  Uses the notebook-level `model`, `optimizer` and `loss_function`.

  Args:
    loader: iterable yielding (x, y) batches of images and labels.
    training: if true, the model is put in train mode and its parameters are
      updated after every batch; otherwise the model is put in eval mode and
      the pass runs without gradient tracking.

  Returns:
    The loss averaged over all samples seen (NaN if the loader is empty).
  """
  model.train(training)
  total_loss = 0.0
  n_samples = 0

  with torch.set_grad_enabled(training):
    for x, y in loader:
      # flatten every image in the batch into a single feature vector
      batch_size = x.size(0)
      x = x.view(batch_size, -1)

      # step 1: forward pass, producing one logit per class
      logit = model(x)

      # step 2: compute the objective (mean cross-entropy over the batch)
      J = loss_function(logit, y)

      if training:
        # step 3: clear gradients left over from the previous batch
        optimizer.zero_grad()
        # step 4: compute the partial derivatives of J w.r.t. the parameters
        J.backward()
        # step 5: step in the opposite direction of the gradient
        optimizer.step()

      # J is a batch mean, so weight it by the batch size; otherwise the
      # smaller final batch would count as much as a full one.
      total_loss += J.item() * batch_size
      n_samples += batch_size

  return total_loss / n_samples if n_samples else float('nan')


# training loop: each epoch is one full run through the training data,
# followed by an evaluation pass over the validation data
nepochs = 8
for epoch in range(nepochs):
  train_loss = run_epoch(train_loader, training=True)
  print(f'Epoch {epoch + 1}, train loss: {train_loss:.2f}')

  val_loss = run_epoch(val_loader, training=False)
  print(f'Epoch {epoch + 1}, validation loss: {val_loss:.2f}')

Epoch 1, train loss: 1.19
Epoch 1, validation loss: 0.46
Epoch 2, train loss: 0.39
Epoch 2, validation loss: 0.35
Epoch 3, train loss: 0.32
Epoch 3, validation loss: 0.31
Epoch 4, train loss: 0.29
Epoch 4, validation loss: 0.28
Epoch 5, train loss: 0.26
Epoch 5, validation loss: 0.25
Epoch 6, train loss: 0.23
Epoch 6, validation loss: 0.23
Epoch 7, train loss: 0.21
Epoch 7, validation loss: 0.21
Epoch 8, train loss: 0.20
Epoch 8, validation loss: 0.20
