In [1]:
import torch

from IPython.display import display, HTML
# Inject a CSS rule that sets elements with class "container" to 95% width,
# so the notebook cells use almost the full browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
class Linear:
    """
    Fully connected (affine) layer: out = x @ weight (+ bias).

    Args:
    - n_in (int): Size of the last dimension of the input.
    - n_out (int): Size of the last dimension of the output.
    - bias (bool, optional): If truthy, a zero-initialised bias vector is added. Default is True.

    Attributes:
    - weight (torch.Tensor): Weight matrix of shape (n_in, n_out), drawn from a standard
      normal distribution and divided by sqrt(n_in).
    - bias (torch.Tensor or None): Bias vector of shape (n_out), or None when disabled.
    - out (torch.Tensor): Result of the most recent forward pass (set by __call__).

    Methods:
    - __call__(self, x): Returns x @ weight, plus bias when present.
    - parameters(self): Returns [weight] or [weight, bias].
    """
    def __init__(self, n_in, n_out, bias = True):
        # Draw the weights first so the random stream is consumed in a fixed order.
        scale = n_in ** 0.5
        self.weight = torch.randn((n_in, n_out)) / scale
        self.bias = None
        if bias:
            self.bias = torch.zeros(n_out)

    def __call__(self, x):
        self.out = torch.matmul(x, self.weight)
        if self.bias is None:
            return self.out
        # In-place add on the freshly created matmul result (broadcast over leading dims).
        self.out += self.bias
        return self.out

    def parameters(self):
        params = [self.weight]
        if self.bias is not None:
            params.append(self.bias)
        return params
    
class Flatten:
    """
    Layer that collapses every dimension after the first into a single one.

    Args:
    - x (torch.Tensor): Input tensor, e.g. of shape (m, n_c, n_h, n_w).

    Attributes:
    - out (torch.Tensor): Result of the most recent forward pass (set by __call__).

    Methods:
    - __call__(self, x): Reshapes x to (x.shape[0], -1) and returns it.
    - parameters(self): Returns an empty list (nothing to learn).
    """
    def __call__(self, x):
        batch_size = x.shape[0]
        # -1 lets reshape infer the product of the remaining dimensions.
        flattened = x.reshape(batch_size, -1)
        self.out = flattened
        return flattened

    def parameters(self):
        return list()
    
class ZeroPad:
    """
    Pad the height and width of every image in a batch with zeros.

    Args:
    - pad (int): Number of zero rows/columns added on each side of the height and width dimensions.
    - x (torch.Tensor): Input tensor of shape (m, n_c, n_h, n_w), representing a batch of m images.

    Attributes:
    - padding (int): The padding amount passed to the constructor.

    Methods:
    - __init__(self, pad): Stores the padding amount.
    - __call__(self, x): Returns a zero-padded copy of x.
    - parameters(self): Returns an empty list since there are no learnable parameters.

    Returns:
    - X_pad (torch.Tensor): Padded images of shape (m, n_c, n_h + 2 * pad, n_w + 2 * pad),
      with the same dtype and device as x.
    """
    def __init__(self, pad):
        self.padding = pad

    def __call__(self, x):
        m, n_c, n_h, n_w = x.size()
        pad = self.padding

        # Allocate with the input's dtype and device: a bare torch.zeros(...) would
        # silently cast e.g. float64 inputs to the default dtype and always live on the CPU.
        X_pad = torch.zeros((m, n_c, n_h + 2 * pad, n_w + 2 * pad), dtype=x.dtype, device=x.device)

        # Copy the images into the centre of the zero canvas.
        X_pad[:, :, pad : pad + n_h, pad : pad + n_w] = x

        return X_pad

    def parameters(self):
        return []
    

class Convolution2D:
    """
    Convolutional neural layer (no padding; cross-correlation of each filter with the input).

    Args:
    - n_c_prev (int): Number of input channels.
    - n_c (int): Number of output channels (filters).
    - f (int): Kernel size (filter size).
    - stride (int, optional): Stride for the convolution operation. Default is 1.
    - bias (bool, optional): Whether to include bias terms. Default is True.
    - X (torch.Tensor): Input tensor of shape (m, n_c_prev, n_h_prev, n_w_prev).

    Attributes:
    - n_c_prev (int): Number of input channels.
    - n_c (int): Number of output channels (filters).
    - f (int): Kernel size (filter size).
    - stride (int): Stride for the convolution operation.
    - weight (torch.Tensor): Learnable weight tensor of shape (n_c, n_c_prev, f, f).
    - bias (torch.Tensor or None): Learnable bias tensor of shape (n_c, 1, 1, 1) if bias is True, otherwise None.

    Methods:
    - __init__(self, n_c_prev, n_c, f, stride=1, bias=True): Initializes the weights and (optionally) the bias.
    - __call__(self, X): Performs convolution on the input tensor X.
    - parameters(self): Returns a list of learnable parameters (weight and bias if applicable).

    Returns:
    - Z (torch.Tensor): Convolved output of shape (m, n_c, (n_h_prev - f) // stride + 1, (n_w_prev - f) // stride + 1),
      allocated with the dtype and device of the weight tensor.

    Raises:
    - ValueError: If the input height or width is smaller than the kernel size.
    """
    def __init__(self, n_c_prev, n_c, f, stride = 1, bias = True):
        self.n_c_prev = n_c_prev
        self.n_c = n_c
        self.f = f
        self.stride = stride
        # Standard-normal weights scaled by 1 / sqrt(n_c_prev).
        # NOTE(review): Kaiming fan-in for a conv filter would be n_c_prev * f * f; the scale is
        # left as-is so that initial values stay reproducible -- confirm the intended scaling.
        self.weight = torch.randn((n_c, n_c_prev, f, f)) / n_c_prev**.5
        self.bias = torch.zeros((n_c, 1, 1, 1)) if bias else None

    def __call__(self, X):

        # Get the input shape.
        m, _, n_h_prev, n_w_prev = X.size()

        if n_h_prev < self.f or n_w_prev < self.f:
            raise ValueError(
                f"input spatial size ({n_h_prev}, {n_w_prev}) is smaller than the kernel size {self.f}"
            )

        # Compute the height and width dimensions of the output (integer floor division).
        n_h = (n_h_prev - self.f) // self.stride + 1
        n_w = (n_w_prev - self.f) // self.stride + 1

        # Initialise the output tensor alongside the weights (same dtype and device).
        Z = torch.zeros((m, self.n_c, n_h, n_w), dtype=self.weight.dtype, device=self.weight.device)

        # View the bias as (1, n_c) so it broadcasts over the batch; None when the layer has no bias.
        bias = None if self.bias is None else self.bias.reshape(1, -1)

        # Loop over vertical axis of the output volume.
        for h in range(n_h):
            # Find the vertical start and end of the current "slice".
            vert_start = h * self.stride
            vert_end = vert_start + self.f
            # Loop over horizontal axis of the output volume.
            for w in range(n_w):
                # Find the horizontal start and end of the current "slice".
                horiz_start = w * self.stride
                horiz_end = horiz_start + self.f
                # Window for every example in the batch at once: (m, n_c_prev, f, f).
                X_slice = X[:, :, vert_start : vert_end, horiz_start : horiz_end]
                # Broadcast against all filters -> (m, n_c, n_c_prev, f, f), then reduce each
                # (n_c_prev, f, f) product volume to one output neuron -> (m, n_c).
                z = torch.sum(X_slice.unsqueeze(1) * self.weight, dim=(2, 3, 4))
                if bias is not None:
                    z = z + bias
                Z[:, :, h, w] = z
        return Z

    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])
    
    
class Pooling:
    """
    Pooling neural layer. Each channel is pooled independently over f x f spatial windows.

    Args:
    - f (int): Kernel size (pooling window size).
    - stride (int, optional): Stride for the pooling operation. Default is 1.
    - mode (str, optional): Pooling mode, either "max" or "average". Default is "max".
    - X (torch.Tensor): Input tensor of shape (m, n_c, n_h_prev, n_w_prev).

    Attributes:
    - f (int): Kernel size (pooling window size).
    - stride (int): Stride for the pooling operation.
    - mode (str): Pooling mode, "max" or "average".

    Methods:
    - __init__(self, f, stride=1, mode="max"): Initializes the Pooling layer.
    - __call__(self, X): Applies pooling to the input tensor X.
    - parameters(self): Returns an empty list since there are no learnable parameters.

    Returns:
    - A (torch.Tensor): Pooled output of shape (m, n_c, (n_h_prev - f) // stride + 1, (n_w_prev - f) // stride + 1),
      with the same dtype and device as X.

    Raises:
    - AssertionError: If mode is neither "max" nor "average" (checked in __init__).
    - ValueError: If the input height or width is smaller than the pooling window.
    """
    def __init__(self, f, stride = 1, mode = "max"):
        self.f = f
        self.stride = stride
        self.mode = mode
        assert mode == "max" or mode == "average", 'mode must be either "max" or "average".'

    def __call__(self, X):

        # Get the input shape.
        m, n_c, n_h_prev, n_w_prev = X.size()

        if n_h_prev < self.f or n_w_prev < self.f:
            raise ValueError(
                f"input spatial size ({n_h_prev}, {n_w_prev}) is smaller than the pooling window {self.f}"
            )

        # Compute the height and width dimensions of the output (integer floor division).
        n_h = (n_h_prev - self.f) // self.stride + 1
        n_w = (n_w_prev - self.f) // self.stride + 1

        # Initialise the output tensor with the input's dtype and device.
        A = torch.zeros((m, n_c, n_h, n_w), dtype=X.dtype, device=X.device)

        # Loop over vertical axis of the output volume.
        for h in range(n_h):
            # Find the vertical start and end of the current "slice".
            vert_start = h * self.stride
            vert_end = vert_start + self.f
            # Loop over horizontal axis of the output volume.
            for w in range(n_w):
                # Find the horizontal start and end of the current "slice".
                horiz_start = w * self.stride
                horiz_end = horiz_start + self.f
                # Window for every example and every channel at once: (m, n_c, f, f).
                X_slice = X[:, :, vert_start : vert_end, horiz_start : horiz_end]
                # Reduce over the two spatial dimensions only, so each channel is pooled
                # on its own rather than mixing values from all channels.
                if self.mode == "max":
                    A[:, :, h, w] = torch.amax(X_slice, dim=(2, 3))
                elif self.mode == "average":
                    A[:, :, h, w] = torch.mean(X_slice, dim=(2, 3))
        return A

    def parameters(self):
        return []
    
class ReLU:
    """
    Rectified Linear Unit (ReLU) activation layer: out = max(x, 0) element-wise.

    Attributes:
    - out (torch.Tensor): Result of the most recent forward pass (set by __call__).

    Methods:
    - __call__(self, x): Applies ReLU activation to the input tensor x.
    - parameters(self): Returns an empty list since there are no learnable parameters.
    """
    def __call__(self, x):
        # torch.relu rather than `x * (x > 0).float()`: multiplying by a 0/1 mask
        # turns -inf into NaN (-inf * 0), whereas ReLU(-inf) should be 0.
        self.out = torch.relu(x)
        return self.out

    def parameters(self):
        return []

class Sigmoid:
    """
    Logistic sigmoid activation layer: out = 1 / (1 + exp(-x)) element-wise.

    Attributes:
    - out (torch.Tensor): Result of the most recent forward pass (set by __call__).

    Methods:
    - __call__(self, x): Applies the sigmoid function to the input tensor x.
    - parameters(self): Returns an empty list (nothing to learn).
    """
    def __call__(self, x):
        denominator = 1 + torch.exp(-x)
        activated = 1 / denominator
        self.out = activated
        return activated

    def parameters(self):
        return list()


class Sequential:
    """
    Container for organizing and applying a sequence of neural network layers.

    Args:
    - layers (iterable): Layer instances to be added to the sequential model, in application order.
      Each layer must be callable and expose a parameters() method.

    Methods:
    - __init__(self, layers): Initializes the Sequential model with a copy of the given layers.
    - __repr__(self): Returns a string representation of the Sequential model, including the number of layers and parameters.
    - __call__(self, x, verbose=False): Applies each layer in sequence to the input tensor x.
    - add(self, layer): Adds a new layer to the Sequential model.
    - parameters(self): Returns a list of all learnable parameters in the model.

    Attributes:
    - layers (list): List of layers in the Sequential model.
    - out (torch.Tensor): Output tensor after applying all layers in the sequence (set by __call__).
    """
    def __init__(self, layers):
        # Copy into a fresh list: add() must not mutate the caller's list, and this
        # also lets callers pass a tuple or any other iterable of layers.
        self.layers = list(layers)

    def __repr__(self):
        n_params = sum(p.numel() for p in self.parameters())
        return f'Sequential model, number of layers: {len(self.layers)}, number of parameters: {n_params}'

    def __call__(self, x, verbose = False):
        for layer in self.layers:
            x = layer(x)
            if verbose:
                # Print the output shape of each layer to trace how the dimensions evolve.
                print(x.size())
        self.out = x
        return self.out

    def add(self, layer):
        self.layers.append(layer)

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

In [3]:
# Seed the global RNG so the randomly initialised weights are reproducible.
torch.random.manual_seed(1)

# Kernel sizes and strides of the four windowed layers (conv, pool, conv, pool).
f1, f2, f3, f4 = 4, 8, 2, 4
stride1, stride2, stride3, stride4 = 1, 8, 1, 6

# Channel counts: input image, first convolution, second convolution.
n_c0, n_c1, n_c2 = 3, 8, 16

# Zero padding applied before each windowed layer, derived from its kernel size.
pad1, pad2, pad3, pad4 = [(kernel - 1) // 2 for kernel in (f1, f2, f3, f4)]

# Batch size, input height/width and number of outputs of the final linear layer.
m, n_h0, n_w0, n_out = 1, 64, 64, 6

# Layers are created in application order, so the random weights are drawn
# in the order: first convolution, second convolution, linear layer.
model = Sequential([
    ZeroPad(pad1),
    Convolution2D(n_c0, n_c1, f1, stride1),
    ReLU(),
    ZeroPad(pad2),
    Pooling(f = f2, stride = stride2),
    ZeroPad(pad3),
    Convolution2D(n_c1, n_c2, f3, stride3),
    ReLU(),
    ZeroPad(pad4),
    Pooling(f = f4, stride = stride4),
    Flatten(),
    Linear(n_in = n_c2, n_out = n_out),
    Sigmoid(),
])

# Mark every weight and bias tensor as requiring gradients.
for p in model.parameters():
    p.requires_grad = True

print(model)
model.layers

Sequential model, number of layers: 13, number of parameters: 1022


[<__main__.ZeroPad at 0x7f93683beee0>,
 <__main__.Convolution2D at 0x7f9310323e20>,
 <__main__.ReLU at 0x7f93103232e0>,
 <__main__.ZeroPad at 0x7f9310323340>,
 <__main__.Pooling at 0x7f9310323610>,
 <__main__.ZeroPad at 0x7f9310323520>,
 <__main__.Convolution2D at 0x7f9310323a90>,
 <__main__.ReLU at 0x7f9310323d60>,
 <__main__.ZeroPad at 0x7f9310323d30>,
 <__main__.Pooling at 0x7f93103237f0>,
 <__main__.Flatten at 0x7f9310323280>,
 <__main__.Linear at 0x7f93103234f0>,
 <__main__.Sigmoid at 0x7f9310323400>]

In [4]:
# Random batch of m images with n_c0 channels and spatial size n_h0 x n_w0.
X = torch.rand((m, n_c0, n_h0, n_w0))
print(X.shape)
# Forward pass; verbose mode prints the output shape after every layer.
out = model(X, True)

torch.Size([1, 3, 64, 64])
torch.Size([1, 3, 66, 66])
torch.Size([1, 8, 63, 63])
torch.Size([1, 8, 63, 63])
torch.Size([1, 8, 69, 69])
torch.Size([1, 8, 8, 8])
torch.Size([1, 8, 8, 8])
torch.Size([1, 16, 7, 7])
torch.Size([1, 16, 7, 7])
torch.Size([1, 16, 9, 9])
torch.Size([1, 16, 1, 1])
torch.Size([1, 16])
torch.Size([1, 6])
torch.Size([1, 6])
