In [6]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
#torch.nn: This module provides the building blocks to create neural networks.
#torch.nn.functional: This module contains functions for various operations such as activations (ReLU, softmax, etc.).

In [7]:
# #                        Tensor Indexing                        #
# # ============================================================= #

# batch_size = 10
# features = 25
# x = torch.rand((batch_size, features))

# # Get first examples features
# print(x[0].shape)  # shape [25], this is same as doing x[0,:]

# # Get the first feature for all examples
# print(x[:, 0].shape)  # shape [10]

# # For example: Want to access third example in the batch and the first ten features
# print(x[2, 0:10].shape)  # shape: [10]

# # For example we can use this to, assign certain elements
# x[0, 0] = 100

# # Fancy Indexing
# x = torch.arange(10)
# indices = [2, 5, 8]
# print(x[indices])  # x[indices] = [2, 5, 8]

# create fully connected network

In [8]:
class NN(nn.Module):
    """Fully connected classifier with a single ReLU hidden layer.

    Args:
        input_size: Number of features in each (already flattened) input sample.
        num_classes: Number of output scores produced per sample.
        hidden_size: Width of the hidden layer. Defaults to 50, the value that
            was previously hard-coded, so existing two-argument calls and
            existing checkpoints keep working.
    """

    def __init__(self, input_size, num_classes, hidden_size=50):
        super().__init__()
        # input features -> hidden layer
        self.fc1 = nn.Linear(input_size, hidden_size)
        # hidden layer -> one raw score (logit) per class
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, num_classes)`` scores.

        No softmax is applied; the raw outputs of ``fc2`` are returned.
        """
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
# Shape check: a random batch of 64 flattened 28x28 inputs should yield 64 rows of 10 scores.
model = NN(input_size=784, num_classes=10)
x = torch.randn(64, 784)
print(model(x).shape)

torch.Size([64, 10])


Let's break down the process of how the first fully connected (linear) layer `fc1` transforms the input tensor from shape `(64, 784)` to shape `(64, 50)`.

### Linear Layer Transformation

The linear layer `nn.Linear(input_size, 50)` performs a linear transformation on the input data. Here, `input_size` is 784, and the output size is 50. This layer can be described by the following equation:

$$\text{output} = xW^T + b$$

- $x$: Input tensor of shape `(64, 784)` (64 samples, each with 784 features).
- $W$: Weight matrix of shape `(50, 784)` (50 output features, each with 784 input features), so its transpose $W^T$ has shape `(784, 50)`.
- $b$: Bias vector of shape `(50)` (one bias term for each of the 50 output features).

### Matrix Multiplication and Bias Addition

1. **Matrix Multiplication**:
    - The input tensor `x` of shape `(64, 784)` is multiplied by `W^T`, the transpose of the weight matrix `W`; `W^T` has shape `(784, 50)`.
    - The resulting product has shape `(64, 50)` because:
      - The multiplication of a `(64, 784)` matrix with a `(784, 50)` matrix results in a `(64, 50)` matrix.
      - Each of the 64 samples is transformed from 784 features to 50 features.

2. **Bias Addition**:
    - The bias vector `b` of shape `(50)` is added to each of the 64 rows of the resulting matrix.
    - The addition of the bias does not change the shape, so the final output remains `(64, 50)`.

### Applying ReLU Activation

After the linear transformation, the ReLU activation function is applied element-wise:

```python
x = F.relu(self.fc1(x))
```

- This does not change the shape of the tensor; it remains `(64, 50)`.
- ReLU sets all negative values in the tensor to zero, keeping positive values unchanged.

### Summary

- Input to `fc1`: Shape `(64, 784)`.
- Weight matrix `W` in `fc1`: Shape `(50, 784)`.
- Bias vector `b` in `fc1`: Shape `(50)`.
- Output from `fc1` after linear transformation: Shape `(64, 50)`.
- Output from `fc1` after ReLU activation: Shape `(64, 50)`.

Thus, the first fully connected layer `fc1` correctly transforms the input tensor from shape `(64, 784)` to shape `(64, 50)`. This is how the output shape of `(64, 50)` is obtained.

# CNN

In [9]:
class CNN(nn.Module):
    """Small convolutional classifier: two conv + ReLU + max-pool stages, then a linear head.

    Both convolutions use 3x3 kernels with stride 1 and padding 1, so they keep
    the spatial size; each 2x2/stride-2 pooling halves it. The head expects
    ``16 * 7 * 7`` features, which matches 28x28 inputs (28 -> 14 -> 7).

    Args:
        in_channels: Number of channels in the input images.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, in_channels=1, num_classes=10):
        super().__init__()
        conv_kwargs = dict(kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.conv1 = nn.Conv2d(in_channels, 8, **conv_kwargs)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
        self.conv2 = nn.Conv2d(8, 16, **conv_kwargs)
        self.fc1 = nn.Linear(16 * 7 * 7, num_classes)

    def forward(self, x):
        """Return raw class scores of shape ``(batch, num_classes)``."""
        # Each stage: convolution -> ReLU -> shared max-pool layer.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Collapse channel and spatial dimensions into one feature vector per sample.
        flat = x.reshape(x.shape[0], -1)
        return self.fc1(flat)
    
# Shape check: 64 single-channel 28x28 images should produce 64 rows of 10 scores.
model2 = CNN()
x = torch.randn(64, 1, 28, 28)
print(model2(x).shape)

torch.Size([64, 10])


# saving checkpoint

In [10]:
def saving_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Write ``state`` to ``filename`` using ``torch.save``.

    Args:
        state: Object to serialize; the training loop passes a dict holding
            the model and optimizer state dicts.
        filename: Destination path of the checkpoint file.
    """
    print("saving checkpoint")
    torch.save(obj=state, f=filename)

In [11]:
def load_checkpoint(checkpoint, model=None, optimizer=None):
    """Restore model and optimizer state from a checkpoint dict.

    Args:
        checkpoint: Mapping with a ``'state_dict'`` entry (model weights) and
            an ``'optimizer'`` entry (optimizer state).
        model: Module to load the weights into. When omitted, the
            notebook-level ``model`` variable is used, which keeps the
            original one-argument call working.
        optimizer: Optimizer to load the state into. When omitted, the
            notebook-level ``optimizer`` variable is used.

    Raises:
        KeyError: If a required checkpoint entry is missing, or if an argument
            is omitted and no notebook-level variable of that name exists.
    """
    print('loading checkpoint')
    # `model` is rebound several times in this notebook, so passing the target
    # explicitly is safer; the global lookup is only a fallback.
    if model is None:
        model = globals()['model']
    if optimizer is None:
        optimizer = globals()['optimizer']
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

# set device

In [12]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device  # echo the selected device as the cell output

device(type='cpu')

# Hyperparameters

In [13]:
input_size = 28 * 28    # flattened 28x28 MNIST image -> 784 features
num_classes = 10        # number of output scores per sample
learning_rate = 1e-3    # step size passed to the optimizer
batch_size = 64         # samples per mini-batch
num_epochs = 5          # full passes over the training set
load_model = True       # when True, restore a saved checkpoint before training

# Transfer learning and fine tuning 

In [14]:
class Identity(nn.Module):
    """Pass-through module: ``forward`` returns its input unchanged.

    Used to neutralise a layer of an existing network by assigning an
    instance in its place. It has no parameters, so the inherited
    ``nn.Module`` constructor is sufficient.
    """

    def forward(self, x):
        return x

In [15]:
import torchvision
# Start from a VGG16 with pretrained weights and adapt its head to 10 classes.
model = torchvision.models.vgg16(pretrained=True)
# Replace the pooling stage with a pass-through so the conv features reach the classifier unchanged.
model.avgpool = Identity()
# Swap the whole classifier for a single linear layer.
# NOTE(review): in_features=512 presumes the conv features flatten to 512 values per
# sample (e.g. 512x1x1 for 32x32 inputs) - confirm against the intended input size.
model.classifier = nn.Linear(in_features=512, out_features=10)
model.to(device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

# Load data

In [16]:
# MNIST training split, stored under dataset/ and converted to tensors.
train_dataset = datasets.MNIST(
    root='dataset/',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# MNIST test split with the same preprocessing and batching.
test_dataset = datasets.MNIST(
    root='dataset/',
    train=False,
    transform=transforms.ToTensor(),
    download=True,
)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

The DataLoader is a PyTorch utility that wraps a dataset and provides an iterable over the dataset with automatic batching, shuffling, and parallel data loading.

```python
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
```

dataset: The dataset object to load data from.

batch_size: Number of samples per batch.

shuffle: Whether to shuffle the data at every epoch (helps to avoid overfitting by providing different mini-batches on each epoch).

The DataLoader splits the dataset into manageable batches and shuffles it if specified. It provides an efficient way to iterate over data during training and evaluation.


Why Use DataLoader for Accuracy?

When calculating accuracy, you need to evaluate the model on batches of data to handle large datasets that cannot fit into memory all at once.
 The DataLoader helps by:

Batching: Automatically splitting the dataset into smaller batches. This is memory efficient and necessary for both training and evaluation, especially for large datasets.

Shuffling: Ensuring that the data is shuffled during training to improve generalization. However, shuffling is not strictly necessary during evaluation.

Parallel Data Loading: Utilizing multiple workers to load data in parallel, which speeds up the data loading process.

# Initialize network

In [17]:
# Build the fully connected classifier and move its parameters to the selected device.
model = NN(input_size, num_classes)
model = model.to(device=device)

# Loss and optimizer

In [18]:
# Cross-entropy loss on the raw class scores; Adam updates all parameters of `model`.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# load

In [19]:
# Resume from a previous run only when a checkpoint file actually exists, so the
# notebook still runs top-to-bottom on a fresh machine with load_model=True.
checkpoint_file = 'my_checkpoint.pth.tar'
if load_model:
    if os.path.isfile(checkpoint_file):
        # map_location remaps stored tensors to the current device, e.g. a
        # checkpoint written on a GPU machine being loaded on a CPU-only one.
        load_checkpoint(torch.load(checkpoint_file, map_location=device))
    else:
        print(f'no checkpoint found at {checkpoint_file}; training from scratch')

loading checkpoint


# Train network

In [20]:
for epoch in range(num_epochs):
    # Snapshot model and optimizer state once, right before the epoch with index 2 starts.
    if epoch == 2:
        checkpoint = dict(
            state_dict=model.state_dict(),
            optimizer=optimizer.state_dict(),
        )
        saving_checkpoint(checkpoint)

    for batch_index, (data, targets) in enumerate(train_loader):
        # Move the batch to the compute device.
        data = data.to(device=device)
        targets = targets.to(device=device)

        # MNIST batches arrive as (batch, 1, 28, 28): one grayscale channel of 28x28 pixels.
        # The fully connected model needs one flat vector per sample, so unroll
        # everything after the batch dimension (1*28*28 = 784 features).
        data = data.reshape(data.shape[0], -1)

        # Forward pass: class scores, then the loss against the labels.
        scores = model(data)
        loss = criterion(scores, targets)

        # Backward pass: clear gradients from the previous step, then backpropagate.
        optimizer.zero_grad()
        loss.backward()

        # Parameter update (Adam step).
        optimizer.step()


saving checkpoint


The following PyTorch snippet is the core of the accuracy check used further down: for each batch it counts how many predictions are correct. Here is a detailed explanation of each line:

```python
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
```

### Line-by-line Breakdown

1. **`_, predictions = scores.max(1)`**:
   - `scores`: This is a tensor containing the raw output scores from the model for each class, before applying any activation function like softmax. Its shape is likely `[batch_size, num_classes]`.
   - `scores.max(1)`: This function computes the maximum value of each row in the `scores` tensor along the dimension specified (dimension 1 here, which represents the class dimension). It returns two values:
     - The first value (`_`), which represents the maximum values themselves. This isn't used in your code, hence the underscore (`_`) to ignore it.
     - The second value (`predictions`), which contains the indices of the maximum values (i.e., the predicted class labels for each sample in the batch). `predictions` will be a tensor of shape `[batch_size]`, where each element is an integer representing the predicted class for the corresponding sample.

2. **`num_correct += (predictions == y).sum()`**:
   - `predictions == y`: This performs an element-wise comparison between `predictions` and `y`, where `y` is the ground truth labels tensor. The result is a tensor of the same shape as `predictions`, containing `True` where the prediction matches the ground truth, and `False` otherwise.
   - `(predictions == y).sum()`: This converts the boolean tensor to an integer tensor (with `True` as 1 and `False` as 0) and sums the elements. The result is the number of correct predictions in the current batch.
   - `num_correct += ...`: This updates the `num_correct` counter by adding the number of correct predictions from the current batch to the running total.

3. **`num_samples += predictions.size(0)`**:
   - `predictions.size(0)`: This returns the size of the 0th dimension of the `predictions` tensor, which is the batch size (i.e., the number of samples in the current batch).
   - `num_samples += ...`: This updates the `num_samples` counter by adding the batch size to the running total, keeping track of the total number of samples processed so far.

### Putting It All Together
This code snippet is part of a loop that iterates over batches of data to evaluate a model's accuracy. Here's a summary of its purpose:
- **`scores.max(1)`** finds the predicted class for each sample in the batch.
- **`num_correct += (predictions == y).sum()`** counts how many predictions were correct in the current batch and updates the total count of correct predictions.
- **`num_samples += predictions.size(0)`** updates the total count of samples processed so far.

By the end of the loop, `num_correct` will hold the total number of correct predictions across all batches, and `num_samples` will hold the total number of samples processed. These can be used to calculate the overall accuracy of the model as:

```python
accuracy = num_correct / num_samples
```

# Check accuracy on the training and test sets to see how good our model is

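model.eval(): Sets the model to evaluation mode. This is important because some layers behave differently during training and evaluation: dropout is disabled, and batch normalization uses its stored running statistics instead of the statistics of the current batch.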

torch.no_grad(): Context manager that disables gradient calculation. This reduces memory consumption and speeds up computations, as gradients are not needed during evaluation.

In [21]:
def accuracy(loader, model, device=None):
    """Print and return the classification accuracy of ``model`` over ``loader``.

    Each batch is flattened to ``(batch, -1)`` before being passed to the
    model, and the predicted class is the index of the largest score.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches. If it has a
            ``dataset`` with a boolean ``train`` attribute, that attribute
            selects the "training"/"testing" message; otherwise a generic
            message is printed.
        model: Module mapping flattened inputs to per-class scores.
        device: Device to move each batch to. When omitted, the device of the
            model's first parameter is used, so inputs always match the model.
            For a parameter-free model, batches are left where they are.

    Returns:
        Fraction of correctly classified samples in ``[0, 1]``; ``0.0`` when
        the loader yields no samples.
    """
    is_train_split = getattr(getattr(loader, "dataset", None), "train", None)
    if is_train_split is None:
        print("Checking accuracy")
    elif is_train_split:
        print("Checking accuracy on training data")
    else:
        print("Checking accuracy on testing data")

    if device is None:
        first_param = next(model.parameters(), None)
        device = None if first_param is None else first_param.device

    num_correct = 0
    num_samples = 0
    # Remember the current mode so evaluation does not leave the model in a
    # different mode than the caller had it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                if device is not None:
                    x = x.to(device=device)
                    y = y.to(device=device)
                x = x.reshape(x.shape[0], -1)
                scores = model(x)
                _, predictions = scores.max(1)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
    finally:
        model.train(was_training)

    # The running count is a 0-dim tensor after the first batch; make it a plain int.
    num_correct = int(num_correct)
    fraction = num_correct / num_samples if num_samples else 0.0
    print(f'Got {num_correct} / {num_samples} with accuracy {fraction*100:.2f} ')
    return fraction
    
# Report accuracy on the training split first, then on the held-out test split.
for loader in (train_loader, test_loader):
    accuracy(loader, model)

Checking accuracy on training data
Got 59100 / 60000 with accuracy 98.50 
Checking accuracy on testing data
Got 9702 / 10000 with accuracy 97.02 


The accuracy function:

-> Switches the model to evaluation mode.

-> Iterates over batches of data from the provided DataLoader.

-> Moves the data and labels to the appropriate device.

-> Reshapes the data to the required input format.

-> Computes the model's predictions.

-> Counts the number of correct predictions.

-> Computes and prints the overall accuracy.

-> Switches the model back to training mode.

-> This function is useful for monitoring the performance of your model on both the training and testing datasets.
