### Imports

In [1]:
import torch
import torch.nn as nn 
import torch.optim as optim 
import torch.functional as F 
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms

### Set Device

In [2]:
device = "cpu"

if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"

In [3]:
device = torch.device(device)
device

device(type='mps')

### Hyperparameters

- we had image size `(betch size, imange number, width, height) --> (64, 1, 28, 28)`
- we are passing input as 28 (input_size) , one row at a time

- sequence length is 28 mean, number of sequences feeded to model at time is 28 rows in one go

- number of layers would be 2

- hidden layer size is 256

In [60]:
input_size = 28
sequence_length = 28
num_layers = 2
hidden_size = 256
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 2

- we ussually don't use RNNs for image data, for implementation purpose only we rae using it

### Create RNN Network

In [14]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc  = nn.Linear(hidden_size*sequence_length, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)# initalizing hidden state 1st

        # Forward pass
        out, _ = self.rnn(x, h0) # '_' has hidden state as output / every point has its own hidden state

        out = out.reshape(out.shape[0], -1)

        out = self.fc(out)

        return out

- when we write `batch_first = True` then input is `(batch_size, time_seq, features)`

- in Lienar layer we are passing `hidden_size x sequence_length` we have `28` time sequences it will concatinate all those and thats what we are going to send to linear layer, so its going to use all information from all the hiddel sattes

**Alternative option:**

    - you can also take information from last hidden state

- `out, _ = self.rnn(x, h0)` here `_` has hidden state output, every data point has its own hidden state

- `out = out.reshape(out.shape[0], -1)` keeping `out.shape[0]` as batch size, changes rest `-1` = `(1 x 28 x 28)` = `784`

### Create GRU Network

In [32]:
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc  = nn.Linear(hidden_size*sequence_length, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)# initalizing hidden state 1st

        # Forward pass
        out, _ = self.gru(x, h0) # '_' has hidden state as output / every point has its own hidden state

        out = out.reshape(out.shape[0], -1)

        out = self.fc(out)

        return out

- to use `GRU` we just have to change `RNN` unit with `GRU` they have same input and output, is just that internal structure changes

### Create LSTM Network

In [65]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc  = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # hidden state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)# initalizing hidden state 1st
        
        # cell state
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        
        # Forward pass
        out, _ = self.lstm(x, (h0, c0)) # '_' has hidden state as output / every point has its own hidden state

        out = self.fc(out[:, -1, :])

        return out

- here in `LSTM`, it has two states to maintain, `cell state` for leng term context amd `hidden state` for short term context

- we need to add once cell state in LSTM and add it in lstm layer as tuple

- we don't have to perform concatination of all the state like `RNN` and `GRU`

- we just take the last one in fully connected layer `out[:, -1, :]` -->  `[all training samples at same time , last hidden state,  all features]`, by doing this we are going to loose information , if some cases its better to take only relevaent infromation and training on it 

### Load Data

In [66]:
train_dataset = datasets.MNIST(root='dataset/', train=True, transform=transforms.ToTensor(), download=False)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, )

test_dataset = datasets.MNIST(root='dataset/', train=False, transform=transforms.ToTensor(), download=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=True)

### Initalize Networks

In [67]:
# model with RNN
model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

# model with GRU
model1 = GRU(input_size, hidden_size, num_layers, num_classes).to(device)

# model with LSTM
model2 = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)

### Loss and Optimizers

In [68]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=learning_rate)

### Train Network

In [69]:
for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):

        data = data.to(device).squeeze(1)
        target = target.to(device)

        # forward pass
        scores = model2(data)

        loss = criterion(scores, target)

        # backward

        model.zero_grad() # clear previous gradients

        loss.backward() # back propogation

        optimizer.step() # update mode weights

        total_loss += loss.item() # Accumulate loss

    average_loss = total_loss / len(train_loader) # Calculating average loss

    print(f"Epoch {epoch}: Average Loss: {average_loss}")

Epoch 0: Average Loss: 1.9105836407207986
Epoch 1: Average Loss: 1.3656939141023388


### Check accuracy on train & test see how good our model

In [26]:
def check_accuracy(loader, model):
    if loader.dataset.train:
        print("Checking accuracy on training data")
    else:
        print("Checking accuracy on testing data")
    
    num_correct = 0
    num_samples = 0
    model.eval() # put model into evaluation mode

    with torch.no_grad():
        for x, y in loader:

            x = x.to(device).squeeze(1)
            y = y.to(device)

            scores =model(x)

            _, predictions = scores.max(1)

            num_correct += (predictions == y).sum()

            num_samples += predictions.size(0)

        acc = float(num_correct) / float(num_samples) * 100

        print(f"Got {num_correct} / {num_samples} with accuracy {acc:.2f}")
    
    model.train() # putting model in training mode

### RNN Score Check

In [24]:
check_accuracy(train_loader, model)

Checking accuracy on training data
Got 58274 / 60000 with accuracy 97.12


In [27]:
check_accuracy(test_loader, model)

Checking accuracy on testing data
Got 9717 / 10000 with accuracy 97.17


### GUR Score Check

In [46]:
check_accuracy(train_loader, model1)

Checking accuracy on training data
Got 45052 / 60000 with accuracy 75.09


In [47]:
check_accuracy(test_loader, model1)

Checking accuracy on testing data
Got 7585 / 10000 with accuracy 75.85


### LSTM Score Check

In [70]:
check_accuracy(train_loader, model2)

Checking accuracy on training data
Got 32751 / 60000 with accuracy 54.58


In [71]:
check_accuracy(test_loader, model2)

Checking accuracy on testing data
Got 5424 / 10000 with accuracy 54.24
