In [None]:
import numpy as np
import torch
from sklearn.datasets import load_sample_images


# Stack scikit-learn's bundled sample images into a single NumPy array,
# then convert it to a float32 tensor and divide by 255
# (presumably rescaling 8-bit pixel values to the [0, 1] range).
image_stack = np.stack(load_sample_images()["images"])
sample_images = torch.tensor(image_stack, dtype=torch.float32) / 255

In [None]:
sample_images.shape
# (batch, height, width, channels): 2 images, height 427, width 640, and 3 channels (red, green, blue)

torch.Size([2, 427, 640, 3])

In [None]:
# PyTorch layers expect the channel dimension before height and width,
# so reorder (batch, height, width, channels) -> (batch, channels, height, width)
sample_images_permuted = torch.permute(sample_images, (0, 3, 1, 2))
sample_images_permuted.shape

torch.Size([2, 3, 427, 640])

In [None]:
import torchvision
import torchvision.transforms.v2 as T


# CenterCrop cuts every image down to its central 70 x 120 (height x width) region,
# so every image that passes through the transform ends up with the same spatial size
center_crop = T.CenterCrop(size=(70, 120))
cropped_images = center_crop(sample_images_permuted)
cropped_images.shape

torch.Size([2, 3, 70, 120])

In [None]:
# Now pass the cropped images through a 2D convolutional layer

import torch.nn as nn

# Seed the RNG first so the layer's randomly initialized parameters are reproducible
torch.manual_seed(42)

# 3 input channels -> 32 output feature maps, using a 7 x 7 kernel
conv_layer = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=(7, 7))
fmaps = conv_layer(cropped_images)

In [None]:
fmaps.shape

# Conv2d slides a 7x7 kernel (the receptive field) over the image, so each output value is computed from a 7x7 patch of pixels
# The 3 input channels are turned into 32 feature maps, each of which can respond to a different feature
# No padding is used here, so each spatial dimension shrinks by kernel_size - 1 = 6: 70 -> 64 and 120 -> 114

torch.Size([2, 32, 64, 114])

In [None]:
# With padding="same" the borders are padded so that the output keeps the input's
# height and width (70 x 120), letting the kernel be centered on every pixel
conv_layer2 = nn.Conv2d(3, 32, kernel_size=7, padding="same")
fmaps2 = conv_layer2(cropped_images)
fmaps2.shape

torch.Size([2, 32, 70, 120])

In [None]:
conv_layer2.weight.shape
# Weight layout: (out_channels, in_channels, kernel_height, kernel_width)

torch.Size([32, 3, 7, 7])

In [None]:
conv_layer2.bias.shape

# One bias term per output feature map (32 in total): every position within a
# feature map of a convolutional layer shares the same kernel weights and bias

torch.Size([32])

In [None]:
maxpool = nn.MaxPool2d(kernel_size = 2)

# Pooling layers shrink the feature maps produced by a convolutional layer, keeping only a summary of each small neighborhood
# Here we use max pooling, which keeps only the largest value inside each kernel window (2x2 in this case)

In [None]:
import torch.nn.functional as F

class DepthPool(torch.nn.Module):
  """Max pooling along the channel (depth) dimension of a 4D tensor.

  The input is expected to have shape (batch, channels, height, width).
  Pooling windows run over consecutive channels at each spatial position,
  so height and width are preserved while the channel count shrinks.
  """

  def __init__(self, kernel_size, stride = None, padding = 0):
    """Store the pooling hyperparameters.

    Args:
      kernel_size: number of consecutive channels in each pooling window.
      stride: step between windows along the channel axis; defaults to
        kernel_size when None.
      padding: padding along the channel axis, forwarded to F.max_pool1d.
    """
    super().__init__()

    self.kernel_size = kernel_size
    self.stride = stride if stride is not None else kernel_size
    self.padding = padding

  def forward(self, inputs):
    """Apply channel-wise max pooling.

    Args:
      inputs: tensor of shape (batch, channels, height, width); it does not
        need to be contiguous.

    Returns:
      Tensor of shape (batch, pooled_channels, height, width).
    """
    batch, channels, height, width = inputs.shape

    # reshape instead of view: view raises a RuntimeError when height and width
    # cannot be merged without a copy (e.g. a sliced/cropped tensor), whereas
    # reshape falls back to copying only in that case.
    z = inputs.reshape(batch, channels, height * width)
    # Move channels to the last axis so max_pool1d pools across them.
    z = z.permute(0, 2, 1)
    z = F.max_pool1d(z, kernel_size = self.kernel_size, stride = self.stride, padding = self.padding)
    # Back to (batch, pooled_channels, height * width).
    z = z.permute(0, 2, 1)

    # Split the merged spatial axis back into height and width.
    return z.reshape(batch, z.shape[1], height, width)

# DepthPool merges height and width into a single dimension and swaps it with the channel dimension, so that max pooling runs across channels instead of across spatial positions
# Finally, it swaps the dimensions back and restores the (batch, channels, height, width) layout, now with fewer channels


In [None]:
# Global average pooling collapses each feature map (channel) into a single value, its mean,
# which discards most of the spatial information; it is typically used near the end of a network.
# Here it is applied directly to the cropped images, giving one mean value per color channel per image.
global_avg_pool = nn.AdaptiveAvgPool2d(1)
output_pool = global_avg_pool(cropped_images)
output_pool

tensor([[[[0.6434]],

         [[0.5972]],

         [[0.5825]]],


        [[[0.7631]],

         [[0.2601]],

         [[0.1085]]]])