# **How a PyTorch Model class Works?**

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        # layers are defined here

    def forward(self,x):
        # define the data flow here
        return x

# **How Image Data Enters a CNN**
#### (batch_size, channels, height, width)
- batch_size -> images are fed in batches
- channels -> channels of image like for rgb image is 3 channels like red, blue, & green and B/W image is 1 channel ranging from 0 to 255
### Examples:

- MNIST ‚Üí (N, 1, 28, 28)

- CIFAR-10 ‚Üí (N, 3, 32, 32)

‚ö†Ô∏è PyTorch is channels-first (unlike TensorFlow).

# **Understanding Conv2D**

#### nn.Conv2D(in_channels, out_channels, kernel_size, stride, padding)
#### in_channels is the number of channels from an image and out_channels is the number of filters used, basically how much feature map will be created
- it slides a kernel over the image
- extracts local patterns
- produces feature maps

**Output size formula (must memorize)**
Output = (W ‚àí K + 2P) / S + 1

Where:

W = input width/height

K = kernel size

P = padding

S = stride

In [3]:
# example for MNIST:
# Conv2D(1, 16, kernel_size = 3, stride = 1, padding = 1)
# here img is b/w so in_channel = 1
# out_channels = 16 because we want 16 filters/feature maps
# stride = 1 and padding = 1


# **Relu application**
- sets negative values to zero
- adds non-linearity
- Does NOT change shape i.e., shape before relu = shape after relu

In [None]:
# x = F.relu(x)

# **Max Pooling (Here info is reduced)**
- takes max value from a 2 x 2 window
- keeps strongest activation
- reduces spatial resolution

In [4]:
# nn.MaxPool2d(kernel_size = 2, stride = 2)

Example:

Before: (N, 16, 28, 28)<br>
After : (N, 16, 14, 14)

Why this matters

- Reduces computation

- Adds translation invariance

- Yes, loses spatial detail ‚Äî intentionally

**This answers:**

**Why max-pool reduces spatial info?**<br>
Because it discards exact pixel locations and keeps only strongest responses.

# **Basic CNN Structure - Shape By Shape**

### Architecture
- Conv ‚Üí ReLU ‚Üí MaxPool
- Conv ‚Üí ReLU ‚Üí MaxPool
- Flatten ‚Üí Linear ‚Üí Output

## **Without stride**

In [7]:
class BasicCNN(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(1, 16, kernel_size = 3, padding = 1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size = 3, padding = 1)

        self.pool = nn.MaxPool2d(2,2)

        self.fc1 = nn.Linear(32 * 7 * 7, 10) # It expects inputs as (flatten) 32*7*7 -> (1568,10) and outputs to 10 different classes

    def forward(self,x):
        x = self.pool(self.conv1(x))
        x = self.pool(self.conv2(x))

        x = torch.flatten(x, start_dim = 1)

        x = self.fc1(x)

        return x

## **Shape Flow**
#### **Input**
- (N, 1, 28, 28)

#### **After Conv1 + Pool**
- Conv1 ‚Üí (N, 16, 28, 28)
- Pool  ‚Üí (N, 16, 14, 14)

#### **After Conv2 + Pool**
- Conv2 ‚Üí (N, 32, 14, 14)
- Pool  ‚Üí (N, 32, 7, 7)

#### **Flatten**
- (N, 32 * 7 * 7) = (N, 1568)

#### **Linear**
- (N, 10)

# **Training on MNIST**

In [None]:
from torchvision import datasets,transforms
from torch.utils.data import DataLoader

transform = transforms.Compose([
    transforms.ToTensor(),
])

train_dataset = datasets.MNIST(
    root = "./data-MNIST",
    train = True,
    download = True,
    transform = transform
)
test_dataset = datasets.MNIST(
    './data-MNIST',
    train = False,
    download = True,
    transform = transform
)

train_loader = DataLoader(
    train_dataset,
    batch_size = 64,
    shuffle = True
)
test_loader = DataLoader(
    test_dataset,
    batch_size = 64,
    shuffle = True
)

In [12]:
device = 'gpu' if torch.cuda.is_available() else 'cpu'

model = BasicCNN().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [13]:
# -------- training loop --------
epochs = 5

for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for images, labels  in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # 1. Forward
        output = model(images)
        loss = criterion(output, labels)

        # 2.Backward
        optimizer.zero_grad()
        loss.backward()

        # 3. Update
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {running_loss/len(train_loader):.4f}") 

Epoch 1, Loss: 0.2262
Epoch 2, Loss: 0.0724
Epoch 3, Loss: 0.0572
Epoch 4, Loss: 0.0482
Epoch 5, Loss: 0.0411


In [16]:
# accuracy calculation (PyTorch)

correct = 0
total = 0

model.eval()
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)          # (N, 10)
        _, preds = torch.max(outputs, 1) # predicted class

        total += labels.size(0)
        correct += (preds == labels).sum().item()

accuracy = 100 * correct / total
print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 98.48%


# **With Stride**
### **What stride convolution does**

- Downsamples spatial size AND

- Learns which information to keep

- Uses trainable filters

## **Why modern CNNs prefer strided convolution?**

### üîπ Reason 1: Learnable downsampling
**Stride conv decides:**
- Which edges matter
- Which textures to preserve
- What to discard

**Pooling doesn‚Äôt ask questions ‚Äî it just drops data.**

### üîπ Reason 2: Better gradient flow

Pooling creates hard selection (only max survives).

**Stride conv:**
- Keeps richer gradients
- Improves stability

**Plays better with BatchNorm & residual connections**

### üîπ Reason 3: Cleaner architectures
### üîπ Reason 4: Works better with residual blocks

In [25]:
class BasicCNN1(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 3, stride=2, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU()
        )

        self.fc1 = nn.Linear(32 * 7 * 7, 10) # It expects inputs as (flatten) 32*7*7 -> (1568,10) and outputs to 10 different classes

    def forward(self,x):
        x = self.conv1(x)
        x = self.conv2(x)

        x = torch.flatten(x, start_dim = 1)

        x = self.fc1(x)

        return x

In [26]:
model1 = BasicCNN1().to(device)
optimizer = torch.optim.Adam(model1.parameters(), lr=0.001)

# -------- training loop --------
epochs = 5

for epoch in range(epochs):
    model1.train()
    running_loss = 0.0

    for images, labels  in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # 1. Forward
        output = model1(images)
        loss = criterion(output, labels)

        # 2.Backward
        optimizer.zero_grad()
        loss.backward()

        # 3. Update
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {running_loss/len(train_loader):.4f}") 

Epoch 1, Loss: 0.1790
Epoch 2, Loss: 0.0640
Epoch 3, Loss: 0.0475
Epoch 4, Loss: 0.0375
Epoch 5, Loss: 0.0312


In [27]:
# accuracy calculation (PyTorch)

correct = 0
total = 0

model1.eval()
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model1(images)          # (N, 10)
        _, preds = torch.max(outputs, 1) # predicted class

        total += labels.size(0)
        correct += (preds == labels).sum().item()

accuracy = 100 * correct / total
print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 98.38%


# **Basic CNN for CIFAR-10**

In [13]:
class CIFARCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size = 3, padding = 1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size = 3, padding = 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, kernel_size =3, padding =1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        self.classifier = nn.Linear(128 * 4 * 4, 10)

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, start_dim = 1)
        x = self.classifier(x)
        return x



In [8]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# CIFAR-10 normalization (standard values)
transform_train = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.4914, 0.4822, 0.4465),
        std=(0.2470, 0.2435, 0.2616)
    )
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.4914, 0.4822, 0.4465),
        std=(0.2470, 0.2435, 0.2616)
    )
])

train_dataset = datasets.CIFAR10(
    root="./data-CIFAR",
    train=True,
    download=True,
    transform=transform_train
)

test_dataset = datasets.CIFAR10(
    root="./data-CIFAR",
    train=False,
    download=True,
    transform=transform_test
)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=128, shuffle=False)


In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CIFARCNN().to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [15]:
epochs = 10

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        # forward
        outputs = model(images) # get the output
        loss = criterion(outputs, labels) # calculate the loss

        # backward
        optimizer.zero_grad() # set the gradient to zero
        loss.backward() # calculate the backward prop
        optimizer.step() # update the weights

        running_loss += loss.item() # add the loss

        _, preds = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()

    train_acc = 100 * correct / total
    print(f"Epoch [{epoch+1}/{epochs}] "
          f"Loss: {running_loss/len(train_loader):.4f} "
          f"Train Acc: {train_acc:.2f}%")


Epoch [1/10] Loss: 1.1631 Train Acc: 58.78%
Epoch [2/10] Loss: 0.8231 Train Acc: 71.56%
Epoch [3/10] Loss: 0.6995 Train Acc: 75.89%
Epoch [4/10] Loss: 0.6112 Train Acc: 78.85%
Epoch [5/10] Loss: 0.5406 Train Acc: 81.47%
Epoch [6/10] Loss: 0.4760 Train Acc: 83.70%
Epoch [7/10] Loss: 0.4223 Train Acc: 85.58%
Epoch [8/10] Loss: 0.3679 Train Acc: 87.49%
Epoch [9/10] Loss: 0.3214 Train Acc: 89.19%
Epoch [10/10] Loss: 0.2856 Train Acc: 90.37%


In [17]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        _, preds = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (preds == labels).sum().item()

test_acc = 100 * correct / total
print(f"Test Accuracy: {test_acc:.2f}%")

Test Accuracy: 74.38%
