# PART 1: Implementing a convolutional layer

In [162]:
# imports
import numpy as np
import torch
import torch.nn.functional as F

Start with a non-vectorized implementation. The convolution works as follows: given an input tensor x with dimensions
(batch_size, input_channels, input_width, input_height), an amount of padding, a number of output channels, kernel size, and a stride, we produce an output tensor with dimensions (batch_size, output_channels, output_width, output_height)

Question 1: Write a pseudo-code for how you would implement this with a set of nested
for loops. The convolution is defined by a set of weights/parameters which we will learn.
How do you represent these weights?

### Question 1: Pseudocode for convolution (non-vectorized)
- Weights are represented as a 4D tensor with dimensions (output_channels, input_channels, kernel_width, kernel_height)

In [None]:
#'weights' is a 4D tensor with dimensions :
#(output_channels, input_channels, kernel_height,kernel_width)

Function convolution(input_tensor, weights, bias, padding, stride):
    '''
    Convolve the input tensor with the given filter weights 
    input_tensor: 4D tensor with dimensions 
    (batch_size, input_channels,
    input_height, input_width)
    weights: 4D tensor (output_channels, 
    input_channels, 
    kernel_height, kernel_width)
    bias: 1D tensor (output_channels)
    padding: int
    stride: int
    '''
    
    # Extract input dimensions
    batch_size, \
    input_channels, \
    input_height, input_width = dimensions(input_tensor)

    # Calculate dimensions of the output tensor
    output_height, output_width = compute_output_size((input_height, input_width), \
    (kernel_height, kernel_width), stride, padding)
    output_channels = dimensions(weights)[0]

    # Initialize the output tensor with zeros
    output_tensor = zeros(batch_size, output_channels, output_height, output_width)

    # Apply padding to the input tensor
    padded_input = apply_padding(input_tensor, padding)

    # Loop over every example in the batch
    For b in range(batch_size):
        # Loop over every output channel
        For oc in range(output_channels):
            # Loop over the output spatial dimensions
            For oh in range(output_height):
                For ow in range(output_width):
                    # Initialize a variable to store the convolved value
                    convolved_value = 0
                    
                    # Iterate over each input channel
                    For ic in range(input_channels):
                        # Iterate over the kernel's spatial dimensions
                        For kh in range(kernel_height):
                            For kw in range(kernel_width):
                                # Calculate the indices on the padded input
                                i = oh * stride + kh
                                j = ow * stride + kw
                                
                                # Accumulate the weighted sum for the convolution
                                convolved_value += padded_input[b, ic, i, j] *
                                                   weights[oc, ic, kh, kw]

                    # Store the output 
                    output_tensor[b, oc, oh, ow] = convolved_value 

    Return output_tensor

Function apply_padding(input_tensor, padding):
    # Apply zero-padding to the input tensor
    # ...

Function compute_output_size(input_size, kernel_size, stride, padding):
    # Calculate the height and width of the output tensor
    # ...

### Question 2

Q2.1.Function to compute the output size

In [190]:
def compute_output_size(input_tensor, filters, stride, padding):
    """
    Computes the output size of a convolutional operation for a batch of images.

    :param input_tensor: 4D tensor with shape (batch_size, input_channels, input_height, input_width)
    :param filters: 4D tensor with shape (output_channels, input_channels, filter_height, filter_width)
    :param stride: Integer representing the stride of the convolution.
    :param padding: Integer representing the amount of padding added to the input tensor.

    :return: Tuple (batch_size, output_channels, output_width, output_height) representing the dimensions of the output tensor.
    """
    
    # Extract batch size and input channels from input tensor, and output channels from filters
    batch_size, input_channels = input_tensor.shape[0], input_tensor.shape[1]
    output_channels = filters.shape[0]

    # Extract the spatial dimensions of the input tensor
    input_height, input_width = input_tensor.shape[2], input_tensor.shape[3]

    # Extract the spatial dimensions of the filters
    filter_height, filter_width = filters.shape[2], filters.shape[3]

    # Compute the output spatial dimensions
    output_height = ((input_height - filter_height + 2 * padding) // stride) + 1
    output_width = ((input_width - filter_width + 2 * padding) // stride) + 1
    
    # Return the dimensions of the output tensor
    return (batch_size, output_channels, output_width, output_height)

# Example usage:
# Assuming input_tensor and filters are numpy arrays with the correct 4D shapes
# and stride and padding are integers.

filters = np.random.rand(8, 3, 3, 3) # 8 filters, 3 channels, 3x3 kernel
input_tensor = np.random.rand(100, 4, 5, 5) # 100 images, 3 channels, 32x32 input
stride = 1
padding = 1
output_size = compute_output_size(input_tensor, filters, stride, padding)

# Actual output size from PyTorch's convolution operation
inputs = torch.randn(100, 3, 32, 32)
filters = torch.randn(8, 3, 3, 3) 
actual_output_size = F.conv2d(inputs, filters, padding=padding, stride=stride).shape

# Expected output size from our custom compute_output_size function
expected_output_size = compute_output_size(inputs, filters, stride, padding)
print(expected_output_size)
print(actual_output_size)
# Convert PyTorch shape to a tuple for comparison
actual_output_size = tuple(actual_output_size)

# Assert that the sizes match
assert actual_output_size == expected_output_size, f"Expected {expected_output_size}, got {actual_output_size}"
print("The output size is correct.")

(100, 8, 32, 32)
torch.Size([100, 8, 32, 32])
The output size is correct.


Q2.2.vectorized implementation 

1. Extract all patches from the input
2. Flatten these patches (with all channels) into vectors, arranged as the rows of a
matrix X.
3. Multiply this matrix by a weight matrix Y = XW.
4. Reshape the matrix Y, so that its rows become the pixels of the output tensor.

In [200]:
import torch
import torch.nn.functional as F

def vectorized_convolution(input_tensor, weight, stride=1, padding=1):
    # Step 1: Extract all patches from the input tensor
    # The size of the weight determines the kernel_size
    kernel_size = (weight.shape[2], weight.shape[3])
    patches = F.unfold(
        input_tensor, 
        kernel_size=kernel_size, 
        stride=stride, 
        padding=padding
    )
    print(f'Patches (X): {patches.shape}')
    
    # Step 2: Flatten these patches
    # 'unfold' output has shape (batch_size, C * kernel_height * kernel_width, L)
    X = patches.transpose(1, 2).reshape(
        -1, 
        weight.shape[1] * kernel_size[0] * kernel_size[1]
    )
    print(f'Flattened patches (X): {X.shape}')
    
    # Reshape weights to match the flattened patches
    W = weight.reshape(weight.shape[0], -1).t()
    print(f'Reshaped Kernel (Y): {W.shape}')
    
    # Step 3: Multiply by the weight matrix
    Y = torch.matmul(X, W)
    print(f'Output from Matrix multiplication (Y): {Y.shape}')
    
    # Step 4: Reshape the matrix Y so that its rows become the pixels of the output tensor
    # The output shape will have the following dimensions (batch_size, out_channels, out_height, out_width)
    output_height = (input_tensor.shape[2] + 2 * padding - kernel_size[0]) // stride + 1
    output_width = (input_tensor.shape[3] + 2 * padding - kernel_size[1]) // stride + 1
    output = Y.reshape(
        input_tensor.shape[0], 
        output_height, 
        output_width, 
        -1
    ).permute(0, 3, 1, 2)
    print(f'Recovered output shape (Y): {output.shape}')
    
    return output

# Example usage
batch_size = 100
in_channels = 3
height, width = 32, 32
out_channels = 8
kernel_height, kernel_width = 3, 3

input_tensor = torch.randn(batch_size, in_channels, height, width)
weight = torch.randn(out_channels, in_channels, kernel_height, kernel_width)

output_tensor = vectorized_convolution(input_tensor, weight, stride=1, padding=1)

# Check that output size is correct
assert output_tensor.shape == (batch_size, out_channels, height, width), "The output size does not match the expected size."

print("The output size matches the expected size.")

Patches (X): torch.Size([100, 27, 1024])
Flattened patches (X): torch.Size([102400, 27])
Reshaped Kernel (Y): torch.Size([27, 8])
Output from Matrix multiplication (Y): torch.Size([102400, 8])
Recovered output shape (Y): torch.Size([100, 8, 32, 32])
The output size matches the expected size.


### Question 3

Q3.1. Unfold pseudocode

In [None]:
function unfold(input_tensor, kernel_size, stride, padding):
    """
    Extracts sliding local blocks from a batched input tensor.

    Parameters:
    input_tensor: A 4D tensor of shape (batch_size, channels, height, width).
    kernel_size: A tuple (kH, kW) representing the height and width of the kernel.
    stride: A tuple (sH, sW) representing the vertical and horizontal strides.
    padding: A tuple (pH, pW) representing the padding added to the height and width.

    Returns:
    A 3D tensor where each patch is flattened into a row vector.
    """

    batch_size, channels, height, width = get_shape(input_tensor)
    kH, kW = kernel_size
    sH, sW = stride
    pH, pW = padding

    # Apply padding to the input tensor
    padded_tensor = apply_padding(input_tensor, pH, pW)

    # Calculate the output dimensions
    output_height = (height + 2 * pH - kH) // sH + 1
    output_width = (width + 2 * pW - kW) // sW + 1

    # Initialize an empty list to store the patches
    patches = []

    # Loop over every example in the batch
    for i in range(batch_size):
        # Loop over the output spatial dimensions
        for h in range(output_height):
            for w in range(output_width):
                # Calculate the starting and ending indices of the patch
                start_h = h * sH
                start_w = w * sW
                end_h = start_h + kH
                end_w = start_w + kW

                # Extract the patch and flatten it
                patch = flattened_patch(padded_tensor[i, :, start_h:end_h, start_w:end_w])
                patches.append(patch)

    # Reshape the list of patches into a 3D tensor 
    # batch_size, input_channels * kernel_height * kernel_width, number_of_patches
    output_tensor = reshape_into_3D_tensor(patches, batch_size, output_height, output_width)

    return output_tensor

Q3.2.Pytorch Module implementation

In [201]:
import torch
import torch.nn.functional as F
from torch import nn

class Conv2D(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=(3, 3), stride=1, padding=1):
        super(Conv2D, self).__init__()
        self.kernel = torch.randn(out_channels, in_channels * kernel_size[0] * kernel_size[1])
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.out_channels = out_channels
        self.in_channels = in_channels
        
    def forward(self, input_batch):
        b, c, h, w = input_batch.size()

        # Get all input patches
        patches = F.unfold(input_batch, self.kernel_size, padding=self.padding, stride=self.stride)
        # print(patches.shape)

        # Flatten patches into row vectors and treat both b and p as batch dimensions
        # Reshape patches to a (b*p, k) tensor for batched matrix multiplication
        X = patches.transpose(1, 2).reshape(-1, c * self.kernel_size[0] * self.kernel_size[1])
        #print(X.shape)
        
        # Matrix multiplication with the kernel matrix
        Y = X.matmul(self.kernel.t())
        #print(Y.shape)

        # Calculate expected output dimensions
        expected_h, expected_w = ((h + 2 * self.padding - self.kernel_size[0]) // self.stride + 1,
                                  (w + 2 * self.padding - self.kernel_size[1]) // self.stride + 1)

        # Reshape Y back from (b*p, k) to ouput tensor dimensions
        output = Y.reshape(b, expected_h, expected_w, self.out_channels).permute(0, 3, 1, 2)
        #print(output.shape)

        # Assert the output dimensions
        assert output.shape[1:] == (self.out_channels, expected_h, expected_w), "Output dimensions do not match expected dimensions"

        return output

# Example usage
in_channels = 3
out_channels = 8
conv = Conv2D(in_channels, out_channels)
input_batch = torch.randn(100, in_channels, 32, 32)
output_batch = conv(input_batch)

torch.Size([100, 27, 1024])
torch.Size([102400, 27])
torch.Size([102400, 8])
torch.Size([100, 8, 32, 32])


Q3.3. Pytorch Function implementation (see Q4 for backward)

In [300]:
from torch.autograd import Function

class Conv2DFunction(Function):
    
    @staticmethod
    def forward(ctx, input_batch, kernel, stride=1, padding=1):
        ctx.save_for_backward(input_batch, kernel)
        
        ctx.stride = stride
        ctx.padding = padding
        
        b, c, h, w = input_batch.size()
        out_channels, in_channels, kh, kw = kernel.shape
        
        # Get all input patches
        patches = F.unfold(input_batch, (kh, kw), padding= padding, stride=stride)
        
        # Flatten patches into row vectors
        X = patches.transpose(1, 2).reshape(-1, c * kh * kw)
        
        # Matrix multiplication with the kernel matrix
        Y = X.matmul(kernel.view(kernel.size(0), -1).t())
        print(Y.shape)
        
        # Calculate expected output dimensions
        expected_h, expected_w = ((h + 2 * padding - kh) // stride + 1,
                                  (w + 2 * padding - kw) // stride + 1)
        
        # Reshape Y to match the expected output dimensions
        output = Y.reshape(b, -1, expected_h, expected_w).permute(0, 1, 2, 3)
        
        # Assert that the output dimensions match the expected dimensions
        assert output.shape[2] == expected_h and output.shape[3] == expected_w, "Output dimensions do not match expected dimensions"
        
        return output
    
# Define the number of input and output channels
input_batch = torch.randn(100, 3, 32, 32)
kernel = torch.randn(out_channels, in_channels, 3, 3)
output = Conv2DFunction.apply(input_batch, kernel)

print(output.shape)

torch.Size([102400, 8])
torch.Size([100, 8, 32, 32])


- Q.4 Gradient of the loss w.r.t. weights 
- Q.5. Gradient of the loss w.r.t. input

In [348]:
import numpy as np
import torch

# Define dimensions and create random tensors for input_batch and grad_output
n, c_in, h, w = 100, 3, 5, 5  # Example dimensions
c_out, k_h, k_w = 2, 3, 3  # Kernel dimensions

# Create random Tensors for input_batch, grad_output, and kernel
input_batch = torch.randn(n, c_in, h, w, requires_grad=True)
grad_output = torch.randn(n, c_out, h, w)  # Assuming h_out and w_out are same as h and w for simplicity
kernel = torch.randn(c_out, c_in, k_h, k_w, requires_grad=True)

# Perform batch matrix multiplication to get the gradient with respect to the kernel weights.
# Reshape grad_output as if it's the result of an im2col operation (n, c_out, h_out*w_out)
grad_output_reshaped = grad_output.view(n, c_out, -1)
print(grad_output_reshaped.shape)

# Reshape input_batch to (n, h*w, c_in) to perform batch matrix multiplication with grad_output_reshaped
input_batch_unfolded = F.unfold(input_batch, (k_h, k_w), stride=1, padding=1)
print(input_batch_unfolded.shape)

# Perform the batch matrix multiplication
grad_w_mul = torch.bmm(grad_output_reshaped, input_batch_unfolded.transpose(1, 2))
print(grad_w_mul.shape)

# Sum over the batch dimension to aggregate the gradients from each example
grad_w = grad_w_mul.sum(dim=0)

# We need to swap axes since the result of bmm doesn't match the (c_out, c_in, k_h, k_w) layout yet
# grad_w = grad_w_summed.transpose(0, 1).contiguous()
# print(grad_w.shape)
grad_w = grad_w.view(c_out, c_in, k_h, k_w)
print(grad_w.shape)

torch.Size([100, 2, 25])
torch.Size([100, 27, 25])
torch.Size([100, 2, 27])
torch.Size([2, 3, 3, 3])


In [None]:
# Reshape grad_output to match Y'
grad_output_reshaped = grad_output.reshape(grad_output.size(0), grad_output.size(1), -1)
print(f'Reshaped output : {grad_output_reshaped.shape}')
        
# Prepare input_batch for matrix multiplication with grad_output
# Reshape input_batch to match the dimensions for matrix multiplication
input_reshaped = input_batch.reshape(input_batch.size(0),-1,input_batch.size(2) * input_batch.size(3))   
print(f'Reshaped input: {input_reshaped.shape}')
print((input_reshaped.transpose(1, 2)).sum(0).shape)
        
        #print(f'Before bmm {input_reshaped.transpose(1, 2).sum(0).reshape_as(kernel)}:')
        # Matrix multiplication to compute the gradient w.r.t. kernel weights
        # Transposing input_reshaped for proper matrix multiplication orientation
        #kernel_grad = grad_output_reshaped.bmm(input_reshaped.transpose(1, 2)).sum(0).reshape(kernel)
        #print(f'Shape kernel : {kernel_grad.shape}')


        # Calculate the necessary shape for input_batch reshaping
        # The shape of input_reshaped should facilitate the matrix multiplication with grad_output_reshaped
        #input_reshaped = input_batch.reshape(input_batch.size(0), input_batch.size(1), -1)

        # Perform batch matrix multiplication
        # Note: Adjust the transpose dimensions if necessary based on the actual multiplication requirements
        #kernel_grad = torch.bmm(grad_output_reshaped, input_reshaped.transpose(1, 2))
        
        #print(kernel_grad.shape)

        # Sum over the batch and reshape to match the kernel shape
        #kernel_grad = kernel_grad.sum(0).reshape_as(kernel)

In [318]:
class Conv2DFunction(Function):
    
    @staticmethod
    def forward(ctx, input_batch, kernel, stride=1, padding=1):
        # store objects for the backward
        ctx.save_for_backward(input_batch, kernel)
        
        ctx.stride = stride
        ctx.padding = padding
        
        b, c, h, w = input_batch.size()
        out_channels, in_channels, kh, kw = kernel.shape
        
        # Get all input patches
        patches = F.unfold(input_batch, (kh, kw), padding= padding, stride=stride)
        
        # Flatten patches into row vectors
        X = patches.transpose(1, 2).reshape(-1, c * kh * kw)
        
        # Matrix multiplication with the kernel matrix
        Y = X.matmul(kernel.view(kernel.size(0), -1).t())
        
        # Calculate expected output dimensions
        expected_h, expected_w = ((h + 2 * padding - kh) // stride + 1,
                                  (w + 2 * padding - kw) // stride + 1)
        
        # Reshape Y to match the expected output dimensions
        output = Y.reshape(b, -1, expected_h, expected_w).permute(0, 1, 2, 3)
        
        # Assert that the output dimensions match the expected dimensions
        assert output.shape[2] == expected_h and output.shape[3] == expected_w, "Output dimensions do not match expected dimensions"
        
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # Retrieve input and kernel from context saved in the forward pass
        input_batch, kernel = ctx.saved_tensors
        
        # Retrieve stride and padding from context
        stride, padding = ctx.stride, ctx.padding
        out_channels, in_channels, kh, kw = kernel.shape
        
        # (Q4) : Calculate gradient w.r.t. kernel using matrix multiplication
        # We start with grad_output, which is the gradient of the loss with respect to the layer's output. 
        # we need to consider how the kernel interacts with the input. 
        # Since the forward pass involves multiplying the kernel with the input, 
        # the backward pass involves multiplying the gradient of the output with the input.
        # gradients with respect to the kernel for each of these patches are summed up. 
        # This summation is equivalent to the batch matrix multiplication between the grad_output and the input_batch
        
        # unfold input
        input_batch_unfolded = F.unfold(input_batch, (k_h, k_w), stride=1)

        # Reshape grad_output to match Y'
        # grad_output to shape ready for bmm: (n, c_out, h*w)
        grad_output_reshaped = grad_output.view(n, c_out, -1)

        # Perform the batch matrix multiplication
        # grad_w_mul shape: (n, c_out, c_in * k_h * k_w)
        grad_w_mul = torch.bmm(grad_output_reshaped, input_batch_unfolded.transpose(1, 2))

        # Sum over the batch dimension to aggregate the gradients from each example
        # grad_w shape: (c_out, c_in * k_h * k_w)
        grad_w = grad_w_mul.sum(dim=0)

        # Reshape the summed gradient to the shape of the kernel weights
        # grad_w shape: (c_out, c_in, k_h, k_w)
        grad_w = grad_w.view(c_out, c_in, k_h, k_w)
        
        
        # (Q5): Compute gradient w.r.t. input (grad_input)
        # We start with grad_output, which is the gradient of the loss with respect to the output 
        # Need to perform a 'reverse' convolution operation. 
        # This is achieved using a transposed convolution. 
        # In transposed convolution, we slide the kernel over the grad_output, 
        # much like in the forward pass, but in a way that reconstructs the gradient with respect to the input
        # For this, we need to 'unfold' the grad_output
        # Since the kernel overlaps with multiple regions of the input during the forward pass, 
        # during the backward pass, the gradients from these overlapping regions are summed up in the input gradient.  
        # Then we use the 'fold' operation to map it back to input space
        
             
        # Reshape and transpose the kernel
        #kernel_transposed = kernel.reshape(kernel.size(0), -1).transpose(0, 1)
        #print(f'kernel transposed : {kernel_transposed.shape}')
        
        # Reshape grad_output as grad_output_reshaped to match Y'
        #grad_output_unfolded = F.unfold(grad_output, (kh, kw), padding=padding, stride=stride)
        
        #grad_output_reshaped = grad_output.reshape(grad_output.size(0),
        #                                           grad_output.size(1), 
        #                                           -1)
        #print(f'grad_output  unfolded : {grad_output_unfolded.shape}')
        
        #grad_output_reshaped = grad_output_unfolded.transpose(1, 2).reshape(-1, in_channels * kh * kw)
        #print(f'grad_output reshaped : {grad_output_unfolded.shape}')
        
        # Matrix multiplication with the transposed kernel to get the gradient w.r.t. U
        #grad_U = grad_output_reshaped.matmul(kernel_transposed)
        #print(f'Gradient w.r.t. U shape : {grad_U.shape}')

        # Fold the gradient w.r.t. U back to the shape of the input_batch to get grad_input
        #input_batch_grad = F.fold(grad_U, 
        #                        (h, w), 
        #                        kernel_size=(kh, kw), 
        #                        padding=padding, 
        #                        stride=stride)
        #print(f'Input batch gradient shape : {input_batch_grad.shape}')
        
        # Compute gradient w.r.t. input (grad_input)
        # Reshape grad_output to the shape of 'Y' in the forward pass
        grad_Y = grad_output.permute(0, 2, 3, 1).reshape(-1, kernel.size[0])
        print(grad_Y.shape)
        # Perform the transpose convolution operation using the original kernel, which has been flipped
        grad_input_padded = grad_Y.matmul(kernel.reshape(kernel.size[0], -1))
        print(grad_input_padded.shape)
        # Fold back to the original input dimensions
        grad_input_padded = grad_input_padded.view(input_batch.size(0), input_batch.size(2) + 2 * padding - 2, input_batch.size(3) + 2 * padding - 2, -1)
        grad_input = F.fold(grad_input_padded, (input_batch.size(2), input_batch.size(3)), (1, 1))
        print(grad_input.shape)

        
        return None, None, None, None

Check implementation

In [293]:
# Define the input batch and the kernel with compatible dimensions
# Define the input batch and kernel with dimensions
in_channels, out_channels, kernel_size = 3, 8, (3, 3)
input_batch = torch.randn(100, in_channels, 32, 32, requires_grad=True)
kernel = torch.randn(out_channels, in_channels, *kernel_size, requires_grad=True)

# Forward pass
output = Conv2DFunction.apply(input_batch, kernel)
#print(output.shape)

# Fake a gradient output by creating a tensor of ones with the same shape as the output
grad_output = torch.ones_like(output)
#print(grad_output.shape)

# Get the gradients of the input and kernel by calling backward on the output
output.backward(grad_output)

# Print out the gradients
print('Gradient with respect to the input batch:')
print(input_batch.grad)
print('Gradient with respect to the kernel:')
print(kernel.grad)



# Backward pass
#output.backward(output, grad_output)

#print('Input gradient shape:', input_grad.shape)
#print('Kernel gradient shape:', kernel_grad.shape)

# Verify the gradients by comparing with the PyTorch's autograd
# output.backward(grad_output)

#print('Autograd input gradient shape:', input_batch.grad.shape)
#print('Autograd kernel gradient shape:', kernel.grad.shape)

# Check if the gradients match to a reasonable degree of precision
#assert torch.allclose(input_batch.grad, input_grad, atol=1e-5), "Input gradients do not match"
#assert torch.allclose(kernel.grad, kernel_grad, atol=1e-5), "Kernel gradients do not match"

#print("Gradients computed correctly!")

Reshaped output : torch.Size([100, 8, 1024])
Reshaped input: torch.Size([100, 3, 1024])
torch.Size([1024, 3])


TypeError: 'builtin_function_or_method' object is not subscriptable

For comparison with autograd already defined function

In [316]:
import torch
import torch.nn.functional as F

class Conv2DFunc(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward
    passes which operate on Tensors.
    """
    @staticmethod
    def forward(ctx, input_batch, kernel, stride=1, padding=1):
        """
        In the forward pass we receive a Tensor containing the input
        and return a Tensor containing the output. ctx is a context
        object that can be used to stash information for backward
        computation. You can cache arbitrary objects for use in the
        backward pass using the ctx.save_for_backward method.
        """
        # store objects and parameters for the backward
        ctx.save_for_backward(input_batch, kernel)
        ctx.stride = stride
        ctx.padding = padding
        
        # Perform the convolution operation
        output_batch = F.conv2d(input_batch, kernel, stride=stride, padding=padding)
        
        return output_batch

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the
        gradient of the loss with respect to the output, and we need
        to compute the gradient of the loss with respect to the
        input.
        """
        # retrieve stored objects
        input_batch, kernel = ctx.saved_tensors
        stride = ctx.stride
        padding = ctx.padding
        
        # Compute gradients with respect to the input and kernel
        input_batch_grad = F.grad.conv2d_input(input_batch.shape, kernel, grad_output, stride=stride, padding=padding)
        kernel_grad = F.grad.conv2d_weight(input_batch, kernel.shape, grad_output, stride=stride, padding=padding)
        
        # Return the gradients, with None for the stride and padding
        return input_batch_grad, kernel_grad, None, None

# Define the input batch and kernel with dimensions
in_channels, out_channels, kernel_size = 3, 8, (3, 3)
input_batch = torch.randn(16, in_channels, 32, 32, requires_grad=True)
kernel = torch.randn(out_channels, in_channels, *kernel_size, requires_grad=True)

# Apply the custom convolution function
output_batch = Conv2DFunc.apply(input_batch, kernel)

# Pretend we have some gradient coming back during backpropagation
grad_output = torch.randn_like(output_batch)

# Get the gradients of the input and kernel by calling backward on the output
output_batch.backward(grad_output)

# Print out the gradients
print('Gradient with respect to the input batch:')
print(input_batch.grad.shape)
print('Gradient with respect to the kernel:')
print(kernel.grad.shape)

Gradient with respect to the input batch:
torch.Size([16, 3, 32, 32])
Gradient with respect to the kernel:
torch.Size([8, 3, 3, 3])


In [217]:
import torch
import torch.nn.functional as F
from torch.autograd import Function

class Conv2DFunction(Function):
    
    @staticmethod
    def forward(ctx, input_batch, kernel, stride=1, padding=1):
        ctx.save_for_backward(input_batch, kernel)
        
        ctx.stride = stride
        ctx.padding = padding

        output_batch = F.conv2d(input_batch, kernel, stride=stride, padding=padding)
        return output_batch

    @staticmethod
    def backward(ctx, grad_output):
        input_batch, kernel = ctx.saved_tensors
        stride = ctx.stride
        padding = ctx.padding

        input_batch_grad = kernel_grad = None

        if ctx.needs_input_grad[0]:
            input_batch_grad = F.grad.conv2d_input(input_batch.shape, kernel, grad_output, stride, padding)
        if ctx.needs_input_grad[1]:
            kernel_grad = F.grad.conv2d_weight(input_batch, kernel.shape, grad_output, stride, padding)
        
        return input_batch_grad, kernel_grad, None, None

# Define the number of input and output channels
in_channels = 3
out_channels = 8

input_batch = torch.randn(100, in_channels, 32, 32, requires_grad=True)
kernel = torch.randn(out_channels, in_channels, 3, 3, requires_grad=True)

# Forward pass
output_batch = Conv2DFunction.apply(input_batch, kernel)

# Fake a gradient output by creating a tensor of ones with the same shape as the output
grad_output = torch.ones_like(output_batch)

# Backward pass
input_batch_grad, kernel_grad = Conv2DFunction.apply(input_batch, kernel).backward(grad_output)

print('Input gradient shape:', input_batch_grad.shape)
print('Kernel gradient shape:', kernel_grad.shape)
print('Input gradient:', input_batch_grad)
print('Kernel gradient:', kernel_grad)


TypeError: cannot unpack non-iterable NoneType object

In [218]:
import torch
import torch.nn.functional as F
from torch.autograd import Function

class Conv2DFunction(Function):
    
    @staticmethod
    def forward(ctx, input_batch, kernel, stride=1, padding=1):
        ctx.save_for_backward(input_batch, kernel)
        ctx.stride = stride
        ctx.padding = padding

        output_batch = F.conv2d(input_batch, kernel, stride=stride, padding=padding)
        return output_batch

    @staticmethod
    def backward(ctx, grad_output):
        input_batch, kernel = ctx.saved_tensors
        stride = ctx.stride
        padding = ctx.padding

        input_batch_grad = kernel_grad = None

        if ctx.needs_input_grad[0]:
            input_batch_grad = F.conv2d_input(input_batch.shape, kernel, grad_output, stride=stride, padding=padding)
        if ctx.needs_input_grad[1]:
            kernel_grad = F.conv2d_weight(input_batch, kernel.shape, grad_output, stride=stride, padding=padding)
        
        return input_batch_grad, kernel_grad, None, None

# Define the number of input and output channels
in_channels = 3
out_channels = 8

input_batch = torch.randn(100, in_channels, 32, 32, requires_grad=True)
kernel = torch.randn(out_channels, in_channels, 3, 3, requires_grad=True)

# Forward pass
output_batch = Conv2DFunction.apply(input_batch, kernel)

# Fake a gradient output by creating a tensor of ones with the same shape as the output
grad_output = torch.ones_like(output_batch)

# Backward pass
input_batch_grad, kernel_grad = Conv2DFunction.apply(input_batch, kernel).backward(grad_output)

print('Input gradient shape:', input_batch_grad.shape)
print('Kernel gradient shape:', kernel_grad.shape)
print('Input gradient:', input_batch_grad)
print('Kernel gradient:', kernel_grad)


AttributeError: module 'torch.nn.functional' has no attribute 'conv2d_input'

End part 1