<a href="https://colab.research.google.com/github/Redcoder815/Deep_Learning_PyTorch/blob/main/15CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn

The Cross-Correlation Operation

In [None]:
def corr2d(X, K):
    """Compute the 2D cross-correlation of input ``X`` with kernel ``K``.

    The kernel is placed at every position where it fits entirely inside
    ``X`` (no padding, one step at a time). Each output entry is the sum of
    the element-wise product between the kernel and the window beneath it.

    Args:
        X: 2D input tensor.
        K: 2D kernel tensor.

    Returns:
        A tensor of shape ``(X.shape[0] - K.shape[0] + 1,
        X.shape[1] - K.shape[1] + 1)``.
    """
    k_rows, k_cols = K.shape
    out_rows = X.shape[0] - k_rows + 1
    out_cols = X.shape[1] - k_cols + 1
    Y = torch.zeros((out_rows, out_cols))
    for r in range(out_rows):
        for c in range(out_cols):
            # Window of X currently covered by the kernel
            window = X[r:r + k_rows, c:c + k_cols]
            Y[r, c] = torch.sum(window * K)
    return Y

In [None]:
X = torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])
K = torch.tensor([[0.0, 1.0], [2.0, 3.0]])
corr2d(X, K)

tensor([[19., 25.],
        [37., 43.]])

Convolutional Layers

In [None]:
class Conv2D(nn.Module):
    """Single-channel 2D convolutional layer built on ``corr2d``.

    The layer owns a learnable kernel (``weight``) and a learnable scalar
    offset (``bias``) that is added to every entry of the output.
    """

    def __init__(self, kernel_size):
        """Create the layer's parameters.

        Args:
            kernel_size: Shape of the kernel, passed straight to
                ``torch.rand``.
        """
        super().__init__()
        # The kernel starts from random values; the bias starts at zero.
        initial_kernel = torch.rand(kernel_size)
        initial_bias = torch.zeros(1)
        self.weight = nn.Parameter(initial_kernel)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, x):
        """Cross-correlate ``x`` with the kernel and add the bias."""
        correlated = corr2d(x, self.weight)
        return correlated + self.bias

In [None]:
X = torch.ones((6, 8))
X[:, 2:6] = 0
X

tensor([[1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.],
        [1., 1., 0., 0., 0., 0., 1., 1.]])

In [None]:
K = torch.tensor([[1.0, -1.0]])

In [None]:
Y = corr2d(X, K)
Y

tensor([[ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0., -1.,  0.]])

In [None]:
corr2d(X.t(), K)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

Learning a Kernel

In [None]:
# Construct a two-dimensional convolutional layer with 1 output channel and a
# kernel of shape (1, 2). For the sake of simplicity, we ignore the bias here
conv2d = nn.LazyConv2d(1, kernel_size=(1, 2), bias=False)

# The two-dimensional convolutional layer uses four-dimensional input and
# output in the format of (example, channel, height, width), where the batch
# size (number of examples in the batch) and the number of channels are both 1
X = X.reshape((1, 1, 6, 8))
Y = Y.reshape((1, 1, 6, 7))
lr = 3e-2  # Learning rate

for i in range(10):
    Y_hat = conv2d(X)
    # Element-wise squared error; its sum is the scalar loss that is
    # back-propagated and reported (computed once per iteration)
    l = (Y_hat - Y) ** 2
    loss = l.sum()
    conv2d.zero_grad()
    loss.backward()
    # Update the kernel with a plain gradient-descent step. The in-place
    # update runs under no_grad so autograd does not record it, rather than
    # going through the discouraged `.data` attribute
    with torch.no_grad():
        conv2d.weight -= lr * conv2d.weight.grad
    if (i + 1) % 2 == 0:
        print(f'epoch {i + 1}, loss {loss:.3f}')

epoch 2, loss 12.942
epoch 4, loss 2.189
epoch 6, loss 0.374
epoch 8, loss 0.066
epoch 10, loss 0.012


In [None]:
conv2d.weight.data.reshape((1, 2))

tensor([[ 0.9767, -0.9860]])

Padding

In [None]:
# Helper for applying a convolutional layer to a single 2D tensor: it adds the
# leading batch and channel axes the layer expects, runs the layer, and then
# removes those two axes from the result again
def comp_conv2d(conv2d, X):
    # Prepend two singleton axes: one example in the batch, one channel
    batched = X.reshape((1, 1, *X.shape))
    out = conv2d(batched)
    # Keep only the trailing axes, dropping examples and channels
    spatial_shape = out.shape[2:]
    return out.reshape(spatial_shape)

# 1 row and column is padded on either side, so a total of 2 rows or columns
# are added
conv2d = nn.LazyConv2d(1, kernel_size=3, padding=1)
X = torch.rand(size=(8, 8))
comp_conv2d(conv2d, X).shape

torch.Size([8, 8])

In [None]:
# We use a convolution kernel with height 5 and width 3. The padding on either
# side of the height and width are 2 and 1, respectively
conv2d = nn.LazyConv2d(1, kernel_size=(5, 3), padding=(2, 1))
comp_conv2d(conv2d, X).shape

torch.Size([8, 8])

Let's break down how the output shape `(8, 8)` is obtained. The key factors are the input shape, kernel size, and padding. The formula to calculate the output dimensions for a convolutional layer (assuming a stride of 1, which is the default for `nn.Conv2d` when not specified) is:

Output Height = `(Input Height - Kernel Height + 2 * Padding Height) + 1`
Output Width = `(Input Width - Kernel Width + 2 * Padding Width) + 1`

In the code block, you have:
*   **Input shape (X):** `(8, 8)` (so, `Input Height = 8`, `Input Width = 8`)
*   **Kernel size:** `(5, 3)` (so, `Kernel Height = 5`, `Kernel Width = 3`)
*   **Padding:** `(2, 1)` (so, `Padding Height = 2`, `Padding Width = 1`)

Now, let's plug these values into the formulas:

**For the Output Height:**
`Output Height = (8 - 5 + 2 * 2) + 1`
`Output Height = (8 - 5 + 4) + 1`
`Output Height = (3 + 4) + 1`
`Output Height = 7 + 1 = 8`

**For the Output Width:**
`Output Width = (8 - 3 + 2 * 1) + 1`
`Output Width = (8 - 3 + 2) + 1`
`Output Width = (5 + 2) + 1`
`Output Width = 7 + 1 = 8`

Therefore, the resulting output shape is `(8, 8)`, which matches the output `torch.Size([8, 8])` you observed. The padding effectively compensates for the reduction in size caused by the kernel, allowing the output to maintain the same dimensions as the input.

Stride

In [None]:
conv2d = nn.LazyConv2d(1, kernel_size=3, padding=1, stride=2)
comp_conv2d(conv2d, X).shape

torch.Size([4, 4])

In [None]:
conv2d = nn.LazyConv2d(1, kernel_size=(3, 5), padding=(0, 1), stride=(3, 4))
comp_conv2d(conv2d, X).shape

torch.Size([2, 2])

Multiple Input Channels

Let's break down how the output shape `(2, 2)` is obtained for the last code block. This time, in addition to input shape, kernel size, and padding, the **stride** plays a crucial role. The general formulas to calculate the output dimensions for a convolutional layer are:

Output Height = `floor((Input Height - Kernel Height + 2 * Padding Height) / Stride Height) + 1`
Output Width = `floor((Input Width - Kernel Width + 2 * Padding Width) / Stride Width) + 1`

In the code block, you have:
*   **Input shape (X):** `(8, 8)` (so, `Input Height = 8`, `Input Width = 8`)
*   **Kernel size:** `(3, 5)` (so, `Kernel Height = 3`, `Kernel Width = 5`)
*   **Padding:** `(0, 1)` (so, `Padding Height = 0`, `Padding Width = 1`)
*   **Stride:** `(3, 4)` (so, `Stride Height = 3`, `Stride Width = 4`)

Now, let's plug these values into the formulas:

**For the Output Height:**
`Output Height = floor((8 - 3 + 2 * 0) / 3) + 1`
`Output Height = floor((5 + 0) / 3) + 1`
`Output Height = floor(5 / 3) + 1`
`Output Height = floor(1.666...) + 1`
`Output Height = 1 + 1 = 2`

**For the Output Width:**
`Output Width = floor((8 - 5 + 2 * 1) / 4) + 1`
`Output Width = floor((3 + 2) / 4) + 1`
`Output Width = floor(5 / 4) + 1`
`Output Width = floor(1.25) + 1`
`Output Width = 1 + 1 = 2`

Therefore, the resulting output shape is `(2, 2)`, which matches the output `torch.Size([2, 2])` you observed. The stride significantly reduces the output dimensions by controlling how many pixels the kernel skips during its movement across the input.

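The following worked example illustrates how the `corr2d_multi_in` function (defined in the code cell after this explanation) processes a concrete two-channel input. It uses its own illustrative tensors, `X_multi_in` and `K_multi_in`; note that the second-channel kernel here differs from the `K` used in the code cell further below: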

1.  **`X_multi_in` (Input with 2 Channels):**
    *   It defines a `torch.Tensor` named `X_multi_in` with a shape of `(2, 3, 3)`. This represents an input with 2 channels, where each channel is a `3x3` matrix. Think of it like a simplified 2-channel image where each channel holds different feature information.
    *   `Channel 1`: `[[0., 1., 2.], [3., 4., 5.], [6., 7., 8.]]`
    *   `Channel 2`: `[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]]`

2.  **`K_multi_in` (Kernel with 2 Input Channels):**
    *   It defines a `torch.Tensor` named `K_multi_in` with a shape of `(2, 2, 2)`. This is a convolution kernel designed to operate on inputs with 2 channels, where each channel has a `2x2` kernel.
    *   `Kernel for Channel 1`: `[[0., 1.], [2., 3.]]`
    *   `Kernel for Channel 2`: `[[1., 0.], [3., 2.]]`

3.  **`Y_multi_in = corr2d_multi_in(X_multi_in, K_multi_in)`:**
    *   This line calls the `corr2d_multi_in` function with the multi-channel input `X_multi_in` and multi-channel kernel `K_multi_in`.
    *   Internally, `corr2d_multi_in` does the following:
        *   It performs `corr2d(X_multi_in[0], K_multi_in[0])` (cross-correlation for the first channel).
        *   It performs `corr2d(X_multi_in[1], K_multi_in[1])` (cross-correlation for the second channel).
        *   It then **sums** the results of these two individual cross-correlations element-wise to produce a single output feature map.

4.  **Output Calculation Breakdown (as the function does it):**
    *   For Channel 1 (`X_multi_in[0]` and `K_multi_in[0]`):
        `corr2d(torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]]), torch.tensor([[0.0, 1.0], [2.0, 3.0]]))`
        This would yield `torch.tensor([[19., 25.], [37., 43.]])` (as seen in earlier examples).

    *   For Channel 2 (`X_multi_in[1]` and `K_multi_in[1]`):
        `corr2d(torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]), torch.tensor([[1.0, 0.0], [3.0, 2.0]]))`
        Let's calculate this manually:
        *   Top-left: `(1*1 + 2*0 + 4*3 + 5*2) = 1 + 0 + 12 + 10 = 23`
        *   Top-right: `(2*1 + 3*0 + 5*3 + 6*2) = 2 + 0 + 15 + 12 = 29`
        *   Bottom-left: `(4*1 + 5*0 + 7*3 + 8*2) = 4 + 0 + 21 + 16 = 41`
        *   Bottom-right: `(5*1 + 6*0 + 8*3 + 9*2) = 5 + 0 + 24 + 18 = 47`
        So, `torch.tensor([[23., 29.], [41., 47.]])`

    *   **Summing the results:**
        `[[19., 25.], [37., 43.]] + [[23., 29.], [41., 47.]] = [[19+23, 25+29], [37+41, 43+47]] = [[42., 54.], [78., 90.]]`

    The output `Y_multi_in` will therefore be `torch.tensor([[42., 54.], [78., 90.]])`, representing the aggregated feature map across all input channels.

In [None]:
def corr2d_multi_in(X, K):
    """Cross-correlate a multi-channel input with a multi-channel kernel.

    ``X`` and ``K`` are walked in lockstep along their 0th dimension
    (channel). Each channel of ``X`` is cross-correlated with the matching
    slice of ``K`` and the per-channel results are added together.
    """
    total = 0
    for channel, kernel in zip(X, K):
        total = total + corr2d(channel, kernel)
    return total

Let's explain the result of the last executed code block with `X` and `K` from that block:

**Input `X`:**
```
torch.tensor([
    [[0.0, 1.0, 2.0],
     [3.0, 4.0, 5.0],
     [6.0, 7.0, 8.0]],   # Channel 1

    [[1.0, 2.0, 3.0],
     [4.0, 5.0, 6.0],
     [7.0, 8.0, 9.0]]])  # Channel 2
```

**Kernel `K`:**
```
torch.tensor([
    [[0.0, 1.0],
     [2.0, 3.0]],   # Kernel for Channel 1

    [[1.0, 2.0],
     [3.0, 4.0]]])  # Kernel for Channel 2
```

The `corr2d_multi_in(X, K)` function performs two individual `corr2d` operations (one for each channel) and then sums their outputs.

### **Step 1: Cross-correlation for Channel 1**

`X_ch1 = [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]]`
`K_ch1 = [[0.0, 1.0], [2.0, 3.0]]`

Applying `corr2d(X_ch1, K_ch1)` (as we've seen in previous examples):

*   **Element (0,0):** `(0*0 + 1*1 + 3*2 + 4*3) = 0 + 1 + 6 + 12 = 19`
*   **Element (0,1):** `(1*0 + 2*1 + 4*2 + 5*3) = 0 + 2 + 8 + 15 = 25`
*   **Element (1,0):** `(3*0 + 4*1 + 6*2 + 7*3) = 0 + 4 + 12 + 21 = 37`
*   **Element (1,1):** `(4*0 + 5*1 + 7*2 + 8*3) = 0 + 5 + 14 + 24 = 43`

**Result for Channel 1:**
```
torch.tensor([[19., 25.],
              [37., 43.]])
```

### **Step 2: Cross-correlation for Channel 2**

`X_ch2 = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]`
`K_ch2 = [[1.0, 2.0], [3.0, 4.0]]`

Applying `corr2d(X_ch2, K_ch2)`:

*   **Element (0,0):** `(1*1 + 2*2 + 4*3 + 5*4) = 1 + 4 + 12 + 20 = 37`
*   **Element (0,1):** `(2*1 + 3*2 + 5*3 + 6*4) = 2 + 6 + 15 + 24 = 47`
*   **Element (1,0):** `(4*1 + 5*2 + 7*3 + 8*4) = 4 + 10 + 21 + 32 = 67`
*   **Element (1,1):** `(5*1 + 6*2 + 8*3 + 9*4) = 5 + 12 + 24 + 36 = 77`

**Result for Channel 2:**
```
torch.tensor([[37., 47.],
              [67., 77.]])
```

### **Step 3: Summing the Results of All Channels**

Finally, `corr2d_multi_in` sums the results from each channel element-wise:

```
[[19., 25.],   +   [[37., 47.],   =   [[19+37, 25+47],
 [37., 43.]]       [67., 77.]]       [37+67, 43+77]]

= [[56.,  72.],
   [104., 120.]]
```

This matches the output you observed: `tensor([[ 56.,  72.], [104., 120.]])`.

In [None]:
X = torch.tensor([[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]],
               [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]])
K = torch.tensor([[[0.0, 1.0], [2.0, 3.0]], [[1.0, 2.0], [3.0, 4.0]]])

corr2d_multi_in(X, K)

tensor([[ 56.,  72.],
        [104., 120.]])

Multiple Output Channels

Let's re-explain how the `corr2d_multi_in_out(X, K)` function works, focusing on the input and kernel shapes and what happens at each step:

**Function Definition:**
```python
def corr2d_multi_in_out(X, K):
    return torch.stack([corr2d_multi_in(X, k) for k in K], 0)
```

**Assumptions about `X` and `K`:**

*   **Input `X`:** This is your input feature map(s). It has multiple input channels.
    *   Its shape is `(num_input_channels, height, width)`.
    *   For our example, `X` has shape `(2, 3, 3)`.

*   **Kernel `K` (for `corr2d_multi_in_out`):** This kernel is designed to produce multiple output channels.
    *   Its shape is `(num_output_channels, num_input_channels, kernel_height, kernel_width)`.
    *   For our example, after `K = torch.stack((K, K + 1, K + 2), 0)`, this `K` has shape `(3, 2, 2, 2)`.

**Step-by-Step Breakdown:**

1.  **`for k in K`:** The core of this function is a list comprehension `[corr2d_multi_in(X, k) for k in K]`. This loop iterates through the *first dimension* of the kernel `K`.

    *   In our example, `K` has `num_output_channels = 3` along its first dimension. So, the loop will run 3 times.

    *   **Iteration 1:** `k` becomes `K[0]`.
        *   `K[0]` is a 3D tensor with shape `(num_input_channels, kernel_height, kernel_width)`, which is `(2, 2, 2)` in our case.
        *   This `K[0]` represents the set of kernels that will produce the *first output channel*.

    *   **Iteration 2:** `k` becomes `K[1]`.
        *   `K[1]` also has shape `(2, 2, 2)`.
        *   This `K[1]` represents the set of kernels that will produce the *second output channel*.

    *   **Iteration 3:** `k` becomes `K[2]`.
        *   `K[2]` also has shape `(2, 2, 2)`.
        *   This `K[2]` represents the set of kernels that will produce the *third output channel*.

2.  **`corr2d_multi_in(X, k)` (inside the loop):** In each iteration, `corr2d_multi_in` is called with the original multi-channel input `X` and the current `k` (which is a kernel set for *one* output channel).

    *   Recall that `corr2d_multi_in(X, k)` does the following:
        *   It takes `X` (e.g., `(2, 3, 3)`) and `k` (e.g., `(2, 2, 2)`).
        *   It performs `corr2d(X[0], k[0])` (cross-correlation for input channel 0).
        *   It performs `corr2d(X[1], k[1])` (cross-correlation for input channel 1).
        *   It then **sums these results** element-wise.
    *   The output of `corr2d_multi_in(X, k)` is a single 2D tensor (e.g., `(2, 2)`), representing *one* output feature map (one output channel).

    *   So, in our example:
        *   `corr2d_multi_in(X, K[0])` produces the **first output feature map** (shape `(2, 2)`).
        *   `corr2d_multi_in(X, K[1])` produces the **second output feature map** (shape `(2, 2)`).
        *   `corr2d_multi_in(X, K[2])` produces the **third output feature map** (shape `(2, 2)`).

3.  **`torch.stack([...], 0)` (after the loop):** The list comprehension generates a list containing these three 2D output feature maps.

    *   `torch.stack` then takes this list of 2D tensors `[output_map_0, output_map_1, output_map_2]` and stacks them along a new dimension at index `0`.

    *   This combines them into a single 3D tensor where the new first dimension corresponds to the `num_output_channels`.

    *   The final output shape will be `(num_output_channels, output_height, output_width)`.
    *   In our example, the final output shape is `(3, 2, 2)`.

**In summary:** `corr2d_multi_in_out` processes the input `X` with *each complete set of kernels* (each `k` from the first dimension of the main `K`) independently. Each of these independent processing steps, handled by `corr2d_multi_in`, generates one output channel. Finally, all these generated output channels are stacked together to form the multi-channel output.

In [None]:
def corr2d_multi_in_out(X, K):
    """Cross-correlate ``X`` with every kernel set in ``K``.

    Each slice of ``K`` along its 0th dimension is passed, together with the
    full input ``X``, to ``corr2d_multi_in``. The resulting feature maps are
    stacked along a new leading dimension, one entry per slice of ``K``.
    """
    feature_maps = []
    for kernel_set in K:
        feature_maps.append(corr2d_multi_in(X, kernel_set))
    return torch.stack(feature_maps, dim=0)

Let's re-explain the line `K = torch.stack((K, K + 1, K + 2), 0)`.

Imagine you have a single `K` which is like a recipe for a certain flavor (output feature).

Initially, `K` has a shape like `(num_input_channels, kernel_height, kernel_width)`. Let's say, in our example, `K.shape` was `(2, 2, 2)`.

This means:
*   It takes `2` input channels.
*   For each input channel, it has a `2x2` kernel.

Now, let's look at `torch.stack((K, K + 1, K + 2), 0)`:

1.  **`K`**: This is your original recipe (kernel set).

2.  **`K + 1`**: This creates a *new* recipe. It's similar to the original `K`, but every number in it has been increased by 1. So, it will produce a *different* flavor or detect *different* features.

3.  **`K + 2`**: This creates *another* new recipe, where every number from the original `K` has been increased by 2. This is yet another unique recipe/flavor.

4.  **`torch.stack( (recipe1, recipe2, recipe3), dim=0)`**: The `torch.stack` function takes these individual recipes (tensors) and bundles them together into a *new* dimension. The `dim=0` means it puts them one on top of the other, creating a new "outermost" dimension.

**Visualizing the dimensions:**

*   Original `K` shape: `(2, 2, 2)` (e.g., `[InputChannel, Height, Width]`). This can be thought of as a single "output kernel set" that produces one output feature map.

*   When you do `torch.stack((K, K + 1, K + 2), 0)`, you are essentially saying:
    *   "Here's my first set of kernels (`K`) to produce **Output Channel 1**."
    *   "Here's my second set of kernels (`K + 1`) to produce **Output Channel 2**."
    *   "Here's my third set of kernels (`K + 2`) to produce **Output Channel 3**."

    And you stack these three sets along a new 0-th dimension.

*   The new `K` will have a shape like: `(3, 2, 2, 2)`.
    *   The `3` at the beginning signifies that this new `K` is now a collection of 3 *different* kernel sets, each designed to produce a separate *output channel*.
    *   The `2, 2, 2` that follows for each of the 3 sets is still `[InputChannel, Height, Width]` for that particular output channel's kernel set.

So, this operation effectively transforms a kernel designed for a single output channel (given multiple input channels) into a kernel designed to produce *multiple* output channels (still considering multiple input channels for each output). Each of the 3 "stacked" `K`s will be used by `corr2d_multi_in_out` to generate one of the three output feature maps.

The line `K = torch.stack((K, K + 1, K + 2), 0)` is designed to create a new multi-dimensional kernel that can produce multiple output channels.

Let's break it down:

1.  **Original `K`:** Before this line, `K` was a tensor representing a kernel structure with multiple *input* channels. Its shape was `(num_input_channels, kernel_height, kernel_width)`.
    *   For example, in the previous code block, `K` was `torch.tensor([[[0.0, 1.0], [2.0, 3.0]], [[1.0, 2.0], [3.0, 4.0]]])`, which has a shape of `(2, 2, 2)`. This means it had 2 input channels, each with a 2x2 kernel.

2.  **`K + 1` and `K + 2`:** These operations perform element-wise addition to the original `K` tensor. They create new kernel tensors that are essentially shifted versions of the original `K`.
    *   `K + 1` results in a new kernel tensor where every element of the original `K` has been increased by 1.
    *   `K + 2` results in a new kernel tensor where every element of the original `K` has been increased by 2.

3.  **`torch.stack((K, K + 1, K + 2), 0)`:** The `torch.stack` function takes a sequence of tensors and concatenates them along a *new* dimension. The `0` indicates that this new dimension should be inserted at the very beginning (index 0) of the tensor's shape.
    *   You are stacking three tensors: the original `K`, `K + 1`, and `K + 2`.
    *   Each of these three tensors has the shape of the original `K` (e.g., `(2, 2, 2)`).
    *   By stacking them along dimension 0, you are essentially creating 3 *output* channels. Each of these output channels will be generated by its own distinct set of kernels (the original `K`, `K+1`, or `K+2`).

**Resulting `K` Shape:**

If the original `K` had a shape of `(C_in, H_k, W_k)` (e.g., `(2, 2, 2)`), after this `stack` operation, the new `K` will have a shape of `(3, C_in, H_k, W_k)` (e.g., `(3, 2, 2, 2)`).

*   The first dimension (`3`) now represents the number of *output* channels.
*   The second dimension (`C_in`) represents the number of *input* channels that each output kernel operates on.
*   The remaining dimensions (`H_k`, `W_k`) are the height and width of each individual kernel within the output channel.

This new `K` is then used by `corr2d_multi_in_out` to produce three separate output feature maps, each corresponding to one of the stacked kernel sets.

In [None]:
K = torch.stack((K, K + 1, K + 2), 0)
K.shape

torch.Size([3, 2, 2, 2])

In [None]:
corr2d_multi_in_out(X, K)

tensor([[[ 56.,  72.],
         [104., 120.]],

        [[ 76., 100.],
         [148., 172.]],

        [[ 96., 128.],
         [192., 224.]]])

1x1 Convolutional Layer

In [None]:
def corr2d_multi_in_out_1x1(X, K):
    """Apply a 1x1 multi-input, multi-output kernel via matrix multiplication.

    Args:
        X: Input tensor with three dimensions, unpacked as
            ``(c_i, h, w)``.
        K: Kernel tensor whose 0th dimension gives the number of outputs
            ``c_o`` and which holds exactly ``c_o * c_i`` elements.

    Returns:
        A tensor of shape ``(c_o, h, w)``.
    """
    in_channels, height, width = X.shape
    out_channels = K.shape[0]
    # One column per spatial position, one row per input channel
    pixels = X.reshape((in_channels, height * width))
    weights = K.reshape((out_channels, in_channels))
    # Matrix multiplication in the fully connected layer
    flat_out = weights @ pixels
    return flat_out.reshape((out_channels, height, width))

Let's explain the `corr2d_multi_in_out_1x1` function:

This function is a special case that implements a **1x1 convolutional layer** (a building block popularized by the Network in Network architecture). A 1x1 convolution is equivalent to a fully connected layer applied independently at each pixel location, mixing the values of all input channels at that location. This function shows how this can be computed efficiently using matrix multiplication.

**Function Definition:**
```python
def corr2d_multi_in_out_1x1(X, K):
    c_i, h, w = X.shape
    c_o = K.shape[0]
    X = X.reshape((c_i, h * w))
    K = K.reshape((c_o, c_i))
    # Matrix multiplication in the fully connected layer
    Y = torch.matmul(K, X)
    return Y.reshape((c_o, h, w))
```

**Assumptions about `X` and `K`:**

*   **Input `X`:** This is your input tensor with multiple input channels. Its shape is `(num_input_channels, height, width)`, which is `(c_i, h, w)` in the code.
    *   Example: `X` with shape `(3, 3, 3)` means `c_i=3`, `h=3`, `w=3`.

*   **Kernel `K`:** This is your 1x1 convolution kernel. Its shape is `(num_output_channels, num_input_channels, 1, 1)`, which is `(c_o, c_i, 1, 1)` in the code.
    *   Example: `K` with shape `(2, 3, 1, 1)` means `c_o=2`, `c_i=3`.

**Step-by-Step Breakdown:**

1.  **Extract Dimensions:**
    *   `c_i, h, w = X.shape`: Gets the input channels, height, and width of `X`.
    *   `c_o = K.shape[0]`: Gets the number of output channels from the first dimension of `K`.

2.  **Reshape Input `X`:**
    *   `X = X.reshape((c_i, h * w))`: The input `X` is reshaped from `(c_i, h, w)` to `(c_i, h * w)`. This flattens the spatial dimensions (height and width) into a single dimension. Each column in this new `X` represents all input channel values for a single pixel location.
        *   Example: `X` `(3, 3, 3)` becomes `(3, 9)`. This means 3 input channels, and 9 pixel locations. Each column `[:, j]` contains the 3 input channel values for pixel `j`.

3.  **Reshape Kernel `K`:**
    *   `K = K.reshape((c_o, c_i))`: The kernel `K` is reshaped from `(c_o, c_i, 1, 1)` to `(c_o, c_i)`. Since it's a 1x1 kernel, the `1, 1` dimensions are redundant for the matrix multiplication, and it effectively becomes a weight matrix where rows are output channels and columns are input channels.
        *   Example: `K` `(2, 3, 1, 1)` becomes `(2, 3)`. This means 2 output channels, and for each, 3 weights corresponding to the 3 input channels.

4.  **Matrix Multiplication:**
    *   `Y = torch.matmul(K, X)`: This is the core operation. It performs a matrix multiplication between the reshaped `K` and `X`.
        *   `K` has shape `(c_o, c_i)`.
        *   `X` has shape `(c_i, h * w)`.
        *   The result `Y` will have shape `(c_o, h * w)`.
    *   Conceptually, for each pixel location `j` (column in `X`), the `c_i` input channel values are multiplied by the `(c_o, c_i)` weight matrix `K`. This produces `c_o` output channel values for that single pixel location `j`. This is exactly what a 1x1 convolution does: it applies a linear transformation across the channel dimension independently for each spatial location.

5.  **Reshape Output `Y`:**
    *   `return Y.reshape((c_o, h, w))`: The result `Y` is reshaped back from `(c_o, h * w)` to `(c_o, h, w)`. This restores the spatial dimensions, giving you the final output feature map with `c_o` output channels.
        *   Example: `Y` `(2, 9)` becomes `(2, 3, 3)`.

**In Summary:** The `corr2d_multi_in_out_1x1` function efficiently implements a 1x1 convolution by transforming the input and kernel into 2D matrices, performing a single matrix multiplication, and then reshaping the result back into a multi-channel image-like format. This highlights the close relationship between 1x1 convolutions and fully connected layers.

The shape of X is (3, 3, 3). This means it has 3 input channels, a height of 3, and a width of 3.
The shape of K is (2, 3, 1, 1). This means it is designed to produce 2 output channels, processes 3 input channels, and each individual kernel has a height of 1 and a width of 1 (a 1x1 convolution).

In [None]:
# Draw a random input of shape (3, 3, 3) and a random kernel of shape
# (2, 3, 1, 1) from a normal distribution with mean 0 and standard deviation 1
X = torch.normal(0, 1, (3, 3, 3))
K = torch.normal(0, 1, (2, 3, 1, 1))
# Compute the result two ways: with the matrix-multiplication shortcut and
# with the general multi-input/multi-output cross-correlation
Y1 = corr2d_multi_in_out_1x1(X, K)
Y2 = corr2d_multi_in_out(X, K)
# The summed absolute difference between the two results must stay below 1e-6
assert float(torch.abs(Y1 - Y2).sum()) < 1e-6

Maximum and Average pooling

In [None]:
def pool2d(X, pool_size, mode='max'):
    """Apply 2D max or average pooling to a 2D tensor.

    The pooling window is placed at every position where it fits entirely
    inside ``X`` (no padding, one step at a time), and each output entry is
    the maximum or the mean of the window beneath it.

    Args:
        X: 2D input tensor.
        pool_size: Pair ``(p_h, p_w)`` giving the window height and width.
        mode: ``'max'`` for maximum pooling or ``'avg'`` for average pooling.

    Returns:
        A tensor of shape ``(X.shape[0] - p_h + 1, X.shape[1] - p_w + 1)``.

    Raises:
        ValueError: If ``mode`` is neither ``'max'`` nor ``'avg'``.
    """
    # Resolve the reduction once, up front, so an unsupported mode fails
    # loudly instead of silently returning an all-zero tensor
    if mode == 'max':
        reduce_window = torch.max
    elif mode == 'avg':
        reduce_window = torch.mean
    else:
        raise ValueError(f"mode must be 'max' or 'avg', got {mode!r}")
    p_h, p_w = pool_size
    Y = torch.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = reduce_window(X[i: i + p_h, j: j + p_w])
    return Y

Let's break down the line `Y[i, j] = X[i: i + p_h, j: j + p_w].max()`:

This line is responsible for performing the "max pooling" operation. It calculates the maximum value within a sliding window of the input `X` and assigns that maximum value to a corresponding position in the output `Y`.

Let's use the example from your last executed code block:
`X = torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])`
`pool_size = (p_h, p_w) = (2, 2)`

And let's consider the first iteration where `i=0` and `j=0`.

1.  **`X[i: i + p_h, j: j + p_w]`**: This is a tensor slicing operation.
    *   `i: i + p_h` becomes `0: 0 + 2`, which is `0:2`. This selects rows from index 0 up to (but not including) index 2.
    *   `j: j + p_w` becomes `0: 0 + 2`, which is `0:2`. This selects columns from index 0 up to (but not including) index 2.

    So, `X[0:2, 0:2]` selects the top-left `2x2` sub-region (or "window") from `X`:
    ```
    [[0.0, 1.0],
     [3.0, 4.0]]
    ```

2.  **`.max()`**: This method is called on the selected sub-tensor. It returns the single maximum value found within that window.
    *   For the window `[[0.0, 1.0], [3.0, 4.0]]`, the maximum value is `4.0`.

3.  **`Y[i, j] = ...`**: This assigns the result of the `.max()` operation to the corresponding `(i, j)` position in the output tensor `Y`.
    *   So, `Y[0, 0]` will be assigned the value `4.0`.

### Let's look at the next step (`i=0, j=1`):

1.  **`X[0: 0 + 2, 1: 1 + 2]`** becomes `X[0:2, 1:3]`. This selects the next `2x2` window:
    ```
    [[1.0, 2.0],
     [4.0, 5.0]]
    ```

2.  **`.max()`**: The maximum value in this window `[[1.0, 2.0], [4.0, 5.0]]` is `5.0`.

3.  **`Y[0, 1] = ...`**: So, `Y[0, 1]` will be assigned the value `5.0`.

This process continues for all possible `(i, j)` positions in the output tensor `Y`, effectively sliding the `2x2` window across `X` and picking the maximum value within each window.

In [None]:
X = torch.tensor([[0.0, 1.0, 2.0], [3.0, 4.0, 5.0], [6.0, 7.0, 8.0]])
pool2d(X, (2, 2))

tensor([[4., 5.],
        [7., 8.]])

In [None]:
pool2d(X, (2, 2), 'avg')

tensor([[2., 3.],
        [5., 6.]])

In [None]:
X = torch.arange(16, dtype=torch.float32).reshape((1, 1, 4, 4))
X

tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]]]])

In [None]:
pool2d = nn.MaxPool2d(3)
# Pooling has no model parameters, hence it needs no initialization
pool2d(X)

tensor([[[[10.]]]])

In [None]:
pool2d = nn.MaxPool2d(3, padding=1, stride=2)
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]]]])

Let's explain how the output `tensor([[[ 5.,  7.], [13., 15.]]])` is obtained from the last executed code block with `nn.MaxPool2d`.

First, let's recall the input `X` and the `nn.MaxPool2d` parameters:

*   **Input `X` (shape `(1, 1, 4, 4)`):**
    ```
    [[[[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.],
       [12., 13., 14., 15.]]]]
    ```

*   **Pooling Parameters:**
    *   `kernel_size=3`: The pooling window will be `3x3`.
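    *   `padding=1`: One row/column of implicit padding is added on every side of the input. `nn.MaxPool2d` pads with negative infinity (not zeros), so a padded cell can never win the maximum; because every entry of `X` here is non-negative, drawing the padding as `0` in the illustrations below yields the same results.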
    *   `stride=2`: The pooling window moves 2 steps horizontally and 2 steps vertically.

### **Step 1: Apply Padding to Input `X`**

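With `padding=1`, the `4x4` input `X` becomes a `6x6` padded input. The border cells are drawn as `0` below for readability; `nn.MaxPool2d` actually pads with negative infinity, which gives the same maxima here because every entry of `X` is non-negative: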

```
Padded X:
0  0  0  0  0  0
0  0  1  2  3  0
0  4  5  6  7  0
0  8  9 10 11  0
0 12 13 14 15  0
0  0  0  0  0  0
```

### **Step 2: Calculate Output Dimensions**

The output dimensions can be calculated using the formula:
`Output Size = floor((Input Size + 2 * Padding - Kernel Size) / Stride) + 1`

For our `4x4` input and `3x3` kernel with `padding=1` and `stride=2`:
*   **Height:** `floor((4 + 2*1 - 3) / 2) + 1 = floor(3 / 2) + 1 = floor(1.5) + 1 = 1 + 1 = 2`
*   **Width:** `floor((4 + 2*1 - 3) / 2) + 1 = floor(3 / 2) + 1 = floor(1.5) + 1 = 1 + 1 = 2`

So, the output feature map will be `2x2`.

### **Step 3: Sliding the `3x3` Window and Taking the Maximum**

Now, let's see how each element of the `2x2` output is derived from the padded input `X`.

1.  **Output Element (0, 0): Value `5`**
    *   The `3x3` window starts at `(0,0)` of the padded input.
    *   The window covers: `X_padded[0:3, 0:3]`
    ```
    0  0  0
    0  0  1
    0  4  5
    ```
    *   The maximum value in this window is `5`.

2.  **Output Element (0, 1): Value `7`**
    *   Due to `stride=2`, the window moves 2 steps to the right, starting at `(0,2)` of the padded input.
    *   The window covers: `X_padded[0:3, 2:5]`
    ```
    0  0  0
    1  2  3
    5  6  7
    ```
    *   The maximum value in this window is `7`.

3.  **Output Element (1, 0): Value `13`**
    *   Due to `stride=2`, the window moves 2 steps down and back to column `0`, starting at `(2,0)` of the padded input.
    *   The window covers: `X_padded[2:5, 0:3]`
    ```
    0  4  5
    0  8  9
    0 12 13
    ```
    *   The maximum value in this window is `13`.

4.  **Output Element (1, 1): Value `15`**
    *   Due to `stride=2`, the window moves 2 steps down and 2 steps right, starting at `(2,2)` of the padded input.
    *   The window covers: `X_padded[2:5, 2:5]`
    ```
    5  6  7
    9 10 11
    13 14 15
    ```
    *   The maximum value in this window is `15`.

This process results in the output tensor: `[[[ 5.,  7.], [13., 15.]]]`

In [None]:
pool2d = nn.MaxPool2d((2, 3), stride=(2, 3), padding=(0, 1))
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]]]])

Let's explain how the output `tensor([[[ 5.,  7.], [13., 15.]]])` is obtained from the last executed code block with `nn.MaxPool2d`.

First, let's recall the input `X` and the `nn.MaxPool2d` parameters:

*   **Input `X` (shape `(1, 1, 4, 4)`):**
    ```
    [[[[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.],
       [12., 13., 14., 15.]]]]
    ```

*   **Pooling Parameters:**
    *   `kernel_size=(2, 3)`: The pooling window will be `2` rows high and `3` columns wide.
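    *   `padding=(0, 1)`: No padding on height (0 rows), and one column of implicit padding is added on each side of the width. `nn.MaxPool2d` pads with negative infinity (not zeros); because every entry of `X` here is non-negative, drawing the padding as `0` in the illustrations below yields the same results.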
    *   `stride=(2, 3)`: The pooling window moves `2` steps vertically and `3` steps horizontally.

### **Step 1: Apply Padding to Input `X`**

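With `padding=(0, 1)`, the `4x4` input `X` becomes a `4x6` padded input, with one extra column on the left and one on the right. The padded cells are drawn as `0` below for readability; `nn.MaxPool2d` actually pads with negative infinity, which gives the same maxima here because every entry of `X` is non-negative: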

```
Padded X:
0  0.  1.  2.  3.  0
0  4.  5.  6.  7.  0
0  8.  9. 10. 11.  0
0 12. 13. 14. 15.  0
```

### **Step 2: Calculate Output Dimensions**

The output dimensions can be calculated using the formula:
`Output Size = floor((Input Size + 2 * Padding - Kernel Size) / Stride) + 1`

For our `4x4` input `X`, `kernel_size=(2, 3)`, `padding=(0, 1)`, and `stride=(2, 3)`:

*   **Height:** `floor((Input Height + 2 * Padding Height - Kernel Height) / Stride Height) + 1`
    `floor((4 + 2 * 0 - 2) / 2) + 1 = floor((4 - 2) / 2) + 1 = floor(2 / 2) + 1 = 1 + 1 = 2`

*   **Width:** `floor((Input Width + 2 * Padding Width - Kernel Width) / Stride Width) + 1`
    `floor((4 + 2 * 1 - 3) / 3) + 1 = floor((4 + 2 - 3) / 3) + 1 = floor(3 / 3) + 1 = 1 + 1 = 2`

So, the output feature map will be `2x2`.

### **Step 3: Sliding the `2x3` Window and Taking the Maximum**

Now, let's see how each element of the `2x2` output is derived from the padded input `X`.

1.  **Output Element (0, 0): Value `5`**
    *   The `2x3` window starts at `(0,0)` of the padded input.
    *   The window covers: `X_padded[0:2, 0:3]`
    ```
    0.  0.  1.
    0.  4.  5.
    ```
    *   The maximum value in this window is `5.`

2.  **Output Element (0, 1): Value `7`**
    *   Due to `stride=(2, 3)`, the window stays on the same rows and moves `3` columns to the right, starting at `(0,3)` of the padded input. Padded column `0` is the left padding and the original data occupies padded columns `1`–`4`, so this window spans padded columns `3`–`5`: the last two data columns plus the right padding column.
    *   The window covers: `X_padded[0:2, 3:6]`
    ```
    2.  3.  0
    6.  7.  0
    ```
    *   The maximum value in this window is `7.`

3.  **Output Element (1, 0): Value `13`**
    *   Due to `stride=(2, 3)`, the window moves `2` steps down and back to column `0`, starting at `(2,0)` of the padded input.
    *   The window covers: `X_padded[2:4, 0:3]`
    ```
    0.  8.  9.
    0. 12. 13.
    ```
    *   The maximum value in this window is `13.`

4.  **Output Element (1, 1): Value `15`**
    *   Due to `stride=(2, 3)`, the window moves `2` steps down and `3` steps to the right, starting at `(2,3)` of the padded input.
    *   The window covers: `X_padded[2:4, 3:6]`
    ```
    10. 11.  0
    14. 15.  0
    ```
    *   The maximum value in this window is `15.`

This process results in the output tensor: `[[[ 5.,  7.], [13., 15.]]]`

Multiple Channels

In [None]:
X = torch.cat((X, X + 1), 1)
X

tensor([[[[ 0.,  1.,  2.,  3.],
          [ 4.,  5.,  6.,  7.],
          [ 8.,  9., 10., 11.],
          [12., 13., 14., 15.]],

         [[ 1.,  2.,  3.,  4.],
          [ 5.,  6.,  7.,  8.],
          [ 9., 10., 11., 12.],
          [13., 14., 15., 16.]]]])

In [None]:
pool2d = nn.MaxPool2d(3, padding=1, stride=2)
pool2d(X)

tensor([[[[ 5.,  7.],
          [13., 15.]],

         [[ 6.,  8.],
          [14., 16.]]]])

Convolutional Neural Network

In [None]:
def init_cnn(module):
    """Initialize weights for CNNs.

    Applies Xavier-uniform initialization to the weight of fully connected
    (``nn.Linear``) and 2-D convolutional (``nn.Conv2d``) layers, including
    their subclasses. Every other module is left untouched. Meant to be used
    as ``model.apply(init_cnn)``.

    Lazy layers (e.g. ``nn.LazyLinear``) whose weight has not been
    materialized yet are skipped because the weight shape is still unknown;
    run one forward pass through the model first so they can be initialized.

    Args:
        module: The ``nn.Module`` to initialize in place.
    """
    if not isinstance(module, (nn.Linear, nn.Conv2d)):
        return
    # Lazy layers subclass nn.Linear / nn.Conv2d, but their weight has no
    # shape until the first forward pass, so it cannot be initialized yet.
    if isinstance(module.weight, nn.UninitializedParameter):
        return
    nn.init.xavier_uniform_(module.weight)

class LeNet(nn.Module):
    """The LeNet-5 model.

    Two convolution / sigmoid / average-pooling stages followed by three
    fully connected layers. All parametrized layers are lazy, so their input
    channel and feature counts are inferred on the first forward pass.

    Args:
        lr: Learning rate. Stored as ``self.lr`` so callers can read it back;
            the network itself does not use it.
        num_classes: Number of output units of the final linear layer.
    """
    def __init__(self, lr=0.1, num_classes=10):
        super().__init__()
        # Keep the constructor arguments on the instance instead of
        # discarding them.
        self.lr = lr
        self.num_classes = num_classes
        self.net = nn.Sequential(
            nn.LazyConv2d(6, kernel_size=5, padding=2), nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.LazyConv2d(16, kernel_size=5), nn.Sigmoid(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.LazyLinear(120), nn.Sigmoid(),
            nn.LazyLinear(84), nn.Sigmoid(),
            nn.LazyLinear(num_classes))

    def forward(self, X):
        """Return the output of the layer stack applied to ``X``."""
        return self.net(X)

    def layer_summary(self, X_shape):
        """Print the output shape produced by every layer.

        A random tensor of shape ``X_shape`` is pushed through the layers one
        at a time. Because this is a real forward pass, it also materializes
        any lazy layer that has not been initialized yet.

        Args:
            X_shape: Shape of the probe input, e.g. ``(1, 1, 28, 28)``.
        """
        X = torch.randn(*X_shape)
        # Only shapes are inspected, so no autograd graph is needed.
        with torch.no_grad():
            for layer in self.net:
                X = layer(X)
                print(layer.__class__.__name__, 'output shape:\t', X.shape)

# Build a LeNet and print the output shape of each layer for a probe input
# of shape (1, 1, 28, 28).
model = LeNet()
model.layer_summary(X_shape=(1, 1, 28, 28))

Conv2d output shape:	 torch.Size([1, 6, 28, 28])
Sigmoid output shape:	 torch.Size([1, 6, 28, 28])
AvgPool2d output shape:	 torch.Size([1, 6, 14, 14])
Conv2d output shape:	 torch.Size([1, 16, 10, 10])
Sigmoid output shape:	 torch.Size([1, 16, 10, 10])
AvgPool2d output shape:	 torch.Size([1, 16, 5, 5])
Flatten output shape:	 torch.Size([1, 400])
Linear output shape:	 torch.Size([1, 120])
Sigmoid output shape:	 torch.Size([1, 120])
Linear output shape:	 torch.Size([1, 84])
Sigmoid output shape:	 torch.Size([1, 84])
Linear output shape:	 torch.Size([1, 10])


In [None]:
import torchvision
from torchvision import transforms
from torch.utils import data

# Preprocessing: convert each image to a tensor, then normalize its single
# channel with mean 0.1307 and std 0.3081.
# NOTE(review): these are the statistics usually quoted for MNIST rather than
# FashionMNIST -- confirm they are the intended values.
trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.1307], std=[0.3081]),
])

# Mini-batch size shared by both loaders.
batch_size = 256

# FashionMNIST training and test splits, stored under ./data and downloaded
# on demand.
mnist_train = torchvision.datasets.FashionMNIST(
    root="./data", train=True, download=True, transform=trans
)
mnist_test = torchvision.datasets.FashionMNIST(
    root="./data", train=False, download=True, transform=trans
)

# Only the training data is shuffled; both loaders use 4 worker processes.
train_iter = data.DataLoader(
    mnist_train, batch_size=batch_size, shuffle=True, num_workers=4
)
test_iter = data.DataLoader(
    mnist_test, batch_size=batch_size, shuffle=False, num_workers=4
)



In [None]:
# Build the network and materialize its lazy layers with one dry-run forward
# pass on a dummy 1x1x28x28 input. Until that happens the layers are still
# Lazy* modules with shapeless weights, so `model.apply(init_cnn)` would not
# initialize anything.
model = LeNet(num_classes=10)
with torch.no_grad():
    model(torch.zeros(1, 1, 28, 28))
model.apply(init_cnn)
# reduction='none' keeps one loss value per sample; the training loop sums
# them itself.
loss = nn.CrossEntropyLoss(reduction='none')
# Created after the dry run, once every parameter has a concrete shape.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [None]:
def train_batch(model, X, y, loss):
    """Run the forward pass for one batch and evaluate its loss.

    No backward pass or parameter update happens here.

    Returns:
        The pair ``(model(X), loss(model(X), y))``.
    """
    predictions = model(X)
    return predictions, loss(predictions, y)

def accuracy(y_hat, y):
    """Count how many predictions in ``y_hat`` match the labels ``y``.

    When ``y_hat`` has more than one dimension and more than one column, it
    is first reduced to class indices with an arg-max over dimension 1.

    Returns:
        The number of matches as a Python float (a count, not a ratio).
    """
    is_score_matrix = y_hat.dim() > 1 and y_hat.shape[1] > 1
    predicted = y_hat.argmax(dim=1) if is_score_matrix else y_hat
    # Cast to the label dtype so the element-wise comparison is like-for-like.
    matches = predicted.to(y.dtype) == y
    return float(matches.to(torch.float32).sum())

In [None]:
import torch # Import torch if not already imported in this scope

def train_epoch(model, train_iter, loss, optimizer):
    """Train ``model`` for one pass over ``train_iter``.

    For every batch: forward pass via ``train_batch``, clear the gradients,
    back-propagate the sum of the loss tensor, and take one optimizer step.

    Returns:
        Tuple ``(avg_loss, avg_accuracy)``: the accumulated summed loss and
        the accumulated count of correct predictions, each divided by the
        number of labels seen during the epoch.
    """
    model.train()
    loss_sum, num_correct, num_seen = 0, 0, 0

    for X, y in train_iter:
        y_hat, l = train_batch(model, X, y, loss)
        batch_loss = l.sum()

        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # .item() turns the 0-d loss tensor into a plain Python number.
        loss_sum += batch_loss.item()
        num_correct += accuracy(y_hat, y)
        num_seen += y.numel()

    return loss_sum / num_seen, num_correct / num_seen

In [None]:
def evaluate_accuracy(model, data_iter):
    """Return the fraction of labels in ``data_iter`` that ``model`` gets right.

    The model is switched to evaluation mode (and left in it), and gradient
    tracking is disabled while the batches are processed.
    """
    model.eval()
    num_correct, num_seen = 0, 0

    with torch.no_grad():
        for X, y in data_iter:
            num_correct += accuracy(model(X), y)
            num_seen += y.numel()

    return num_correct / num_seen

In [None]:
# Train for 10 epochs, printing the (average loss, accuracy) tuple returned
# for each epoch.
for epoch in range(10):
    epoch_metrics = train_epoch(model, train_iter, loss, optimizer)
    print(epoch_metrics)

(2.31896084874471, 0.1012)
(1.5896739673614502, 0.3494833333333333)
(0.8661880306243896, 0.6575666666666666)
(0.6667089829762777, 0.7402166666666666)
(0.5766110443115234, 0.7755833333333333)
(0.4986470890045166, 0.8130166666666667)
(0.4615550043106079, 0.8281166666666666)
(0.431716061147054, 0.8409333333333333)
(0.4122020723660787, 0.84845)
(0.38354802646636965, 0.8576333333333334)


In [None]:
# Fraction of correctly classified samples on the test split.
evaluate_accuracy(model, test_iter)

0.8602