In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math


<h1>Chapter 2: Logistic regression / MLPs </h1>

<h5>Logistic regression - Cross entropy loss </h5>
memory complexity of eqn : O(N * d^2)

In [2]:
def cross_entropy_loss( y_true, y_pred ):

    epsilon = 1e-15  # small constant to avoid division by zero
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)  # clip predictions to avoid log(0)
    
    # Calculate cross-entropy loss
    loss = - np.sum(y_true * np.log(y_pred))
    return loss





<h5> Multi Layer Perceptron</h5>


In [8]:
class MLP:
    def __init__(self, input_size, hidden_sizes, output_size, activation='relu'):
        self.num_layers = 1 + len(hidden_sizes) + 1  # Input layer + hidden layers + output layer
        self.sizes = [input_size] + hidden_sizes + [output_size]
        self.weights = [None] * (self.num_layers - 1)
        self.biases = [None] * (self.num_layers - 1)
        self.set_activation(activation)
        
    def set_weights(self, layer_idx, weights):
        self.weights[layer_idx] = weights
        
    def set_biases(self, layer_idx, biases):
        self.biases[layer_idx] = biases
    
    def set_activation(self, activation):
        self.activation_name = activation
        if activation == 'prelu':
            self.activation = self.prelu
        elif activation == 'gelu':
            self.activation = self.gelu
        elif activation == 'relu':
            self.activation = self.relu
        elif activation == 'tanh':
            self.activation = self.tanh
        elif activation == 'linear':
            self.activation = self.linear
        elif activation == 'elu':
            self.activation = self.elu
        elif activation == 'swish':
            self.activation = self.swish
        else:
            raise ValueError(f"Activation function '{activation}' not supported.")
    
    def prelu(self, x, alpha=0.01):
        return np.where(x > 0, x, alpha * x)
    
    def gelu(self, x):
        return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
    
    def relu(self, x):
        return np.maximum(0, x)
    
    def tanh(self, x):
        return np.tanh(x)
    
    def linear(self, x):
        return x
    
    def elu(self, x, alpha=1.0):
        return np.where(x > 0, x, alpha * (np.exp(x) - 1))
    
    def swish(self, x):
        return x * self.sigmoid(x)
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
     # ... (other activation functions and their derivatives)

    def forward(self, x):
        self.activations = [x]
        self.z_values = []
        a = x
        
        for i in range(self.num_layers - 1):
            z = np.dot(a, self.weights[i]) + self.biases[i]
            self.z_values.append(z)
            
            if i in self.skip_layers and i > 0:
                a = self.activation(z) + self.activations[i - 1]  # Adding skip connection
            else:
                a = self.activation(z)
            
            self.activations.append(a)
        
        return a
    def backward(self, x, y, learning_rate):
        output = self.activations[-1]
        loss = np.mean((output - y)**2)
        
        delta = (output - y) * self.activation_derivative(output)
        self.deltas = [delta]
        
        for i in range(self.num_layers - 2, 0, -1):
            delta = np.dot(delta, self.weights[i].T) * self.activation_derivative(self.activations[i])
            self.deltas.insert(0, delta)
        
        for i in range(self.num_layers - 1):
            self.weights[i] -= learning_rate * np.dot(self.activations[i].T, self.deltas[i])
            self.biases[i] -= learning_rate * self.deltas[i]
        
        return loss
    
    def activation_derivative(self, x):
        if self.activation_name == 'prelu':
            return np.where(x > 0, 1, 0.01)
        elif self.activation_name == 'gelu':
            return 0.5 * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
        elif self.activation_name == 'relu':
            return np.where(x > 0, 1, 0)
        elif self.activation_name == 'tanh':
            return 1 - np.square(np.tanh(x))
        elif self.activation_name == 'linear':
            return np.ones_like(x)
        elif self.activation_name == 'elu':
            return np.where(x > 0, 1, self.activation(x) + 1)
        elif self.activation_name == 'swish':
            return self.sigmoid(x) * (1 + x * (1 - self.sigmoid(x)))
        else:
            raise ValueError(f"Activation function '{self.activation_name}' not supported.")

# Example usage
input_size = 2
hidden_sizes = [4]
output_size = 1
learning_rate = 0.1
activation_function = 'relu'

mlp = MLP(input_size, hidden_sizes, output_size, activation_function)

# Set custom weights and biases for each layer
custom_weights = [np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]]),
                  np.array([[0.2, 0.3, 0.4, 0.5]])]
custom_biases = [np.array([[0.1, 0.2, 0.3, 0.4]]),
                 np.array([[0.2]])]

for i in range(len(custom_weights)):
    mlp.set_weights(i, custom_weights[i])
    mlp.set_biases(i, custom_biases[i])

[array([[0.1, 0.2],
       [0.3, 0.4],
       [0.5, 0.6],
       [0.7, 0.8]]), array([[0.2, 0.3, 0.4, 0.5]])]


ValueError: shapes (1,2) and (4,2) not aligned: 2 (dim 1) != 4 (dim 0)

<h3><span style="color:yellow;">Attention Calculator</span> </h3>

In [23]:

import numpy as np

# Given values (example values)
wq = np.array([[0.0], [2.0]])
wk = np.array([[2.0], [0.0]])
wv = np.array([[0.5], [0.5]])
wo = np.array([[0.5, 0.5]])

x = np.array([[1, 1], [0, 0], [1, 1]])

# Calculating self-attention
q = np.dot(x, wq)  # Query
k = np.dot(x, wk)  # Key
v = np.dot(x, wv)  # Value

# Compute attention scores (unnormalized)
attention_scores = np.dot(q, k.T)  # Shape: (sequence_length, sequence_length)
attention_scores /= np.sqrt(wq.shape[1])  # Normalizing by the square root of the dimension

# Apply softmax to get attention weights
attention_weights = np.exp(attention_scores) / np.sum(np.exp(attention_scores), axis=1, keepdims=True)

# Calculate the context vector using attention weights and values
context_vector = np.dot(attention_weights, v)

# Calculate the final output using the output weights
output = np.dot(context_vector, wo)

print("Input:")
print(x)
print("\nQuery:")
print(q)
print("\nKey:")
print(k)
print("\nValue:")
print(v)
print("\nAttention Scores:")
print(np.round(attention_scores,2))
print("\nAttention Weights:")
print( np.round(attention_weights,2))
print("\nContext Vector:")
print(np.round(context_vector,2))
print("\nOutput:")
print(np.round(output,2))


Input:
[[1 1]
 [0 0]
 [1 1]]

Query:
[[2.]
 [0.]
 [2.]]

Key:
[[2.]
 [0.]
 [2.]]

Value:
[[1.]
 [0.]
 [1.]]

Attention Scores:
[[4. 0. 4.]
 [0. 0. 0.]
 [4. 0. 4.]]

Attention Weights:
[[0.5  0.01 0.5 ]
 [0.33 0.33 0.33]
 [0.5  0.01 0.5 ]]

Context Vector:
[[0.99]
 [0.67]
 [0.99]]

Output:
[[0.5  0.5 ]
 [0.33 0.33]
 [0.5  0.5 ]]


<h1> Parameter Number Calculator </h1> 

<h3><span style="color:yellow;">MLP params</span> </h3>

In [25]:
def mlp_params(layer_sizes):
    params = 0
    for i in range(len(layer_sizes) - 1):
        params += (layer_sizes[i] * layer_sizes[i+1]) + layer_sizes[i+1]
    return params

layer_sizes = [ 5, 20]
print(mlp_params(layer_sizes))

120


<h3> <span style="color:yellow;">Batch Norm params</span></h3>

In [4]:
def batch_norm_params(input_dimension, network_type):
    # Function to calculate number of trainable parameters in batch normalization layer
    # Input: input_dimension: number of input channels
    #        network_type: CNN or Fully Connected
    # Output: number of trainable parameters in batch normalization layer
    if network_type == 'CNN':
        return 2*input_dimension
    elif network_type == 'FC':
        return 2*input_dimension

input_dimension = 10 # Number of input channels to bn layer
network_type = 'CNN'
print("Number of trainable parameters for", network_type, ":", batch_norm_params(input_dimension, network_type))

Number of trainable parameters for CNN : 20


<h3> <span style="color:yellow;">Layer Norm params</span></h3>

In [6]:
def layer_norm_params(input_dimension, spatial_dimension=None, network_type='FC'):
    """
    Function to calculate number of trainable parameters in layer normalization layer
    
    Inputs:
    - input_dimension: number of input channels or neurons
    - spatial_dimension: tuple (height, width) for spatial dimensions in CNN. Not required for FC.
    - network_type: 'CNN' or 'FC'
    
    Output: number of trainable parameters in layer normalization layer
    """
    
    if network_type == 'CNN':
        if spatial_dimension is None:
            raise ValueError("For CNN, spatial_dimension (height, width) must be provided")
        height, width = spatial_dimension
        return 2 * input_dimension * height * width
    elif network_type == 'FC':
        return 2 * input_dimension

input_dimension = 10  # Number of input channels to ln layer
spatial_dimension = (3, 3)  # Assuming a spatial size of 32x32 for CNN
network_type = 'CNN'
print("Number of trainable parameters for", network_type, ":", layer_norm_params(input_dimension, spatial_dimension, network_type))


Number of trainable parameters for CNN : 180


<h3> <span style="color:yellow;">Recurrent network params</span></h3>

In [10]:
def rnn_params(input_dim, hidden_dim, cell_type='RNN'):
    """
    Function to calculate number of trainable parameters in RNN, LSTM, or GRU cell
    
    Inputs:
    - input_dim: Dimensionality of input data
    - hidden_dim: Dimensionality of hidden state
    - cell_type: 'RNN', 'LSTM', or 'GRU'
    
    Output: number of trainable parameters in the specified cell
    """
    
    if cell_type == 'RNN':
        return input_dim * hidden_dim + hidden_dim**2 + hidden_dim
    elif cell_type == 'LSTM':
        return 4 * (input_dim * hidden_dim + hidden_dim**2 + hidden_dim)
    elif cell_type == 'GRU':
        return 3 * (input_dim * hidden_dim + hidden_dim**2 + hidden_dim)
    else:
        raise ValueError("Invalid cell_type. Choose from 'RNN', 'LSTM', or 'GRU'.")

input_dim = 64 # Example input dimension
hidden_dim = 256 # Example hidden dimension
cell_type = 'LSTM'
print("Number of trainable parameters for", cell_type, ":", rnn_params(input_dim, hidden_dim, cell_type))



Number of trainable parameters for LSTM : 328704


In [21]:
import numpy as np
def maintain_rf(filter_A_dims, stride_A, dilation_A, filter_B_dims, stride_B, dilation_B):
    # Ensure filter dimensions are of the same length for A and B
    assert len(filter_A_dims) == len(filter_B_dims), "Dimension mismatch"
    
    n_filters = []
    
    for dim_A, dim_B in zip(filter_A_dims, filter_B_dims):
        RF_A = ((dim_A - 1) * dilation_A) + 1
        
        n = 0
        RF_B = 1
        while RF_B < RF_A:
            RF_B = ((dim_B - 1) * dilation_B) + 1 + (RF_B - 1) * stride_B
            n += 1
        n_filters.append(n)

    return n_filters

def compare_parameters(input_channels, output_channels, filter_A_dims, filter_B_dims, n_filters):
    params_A = input_channels * output_channels * filter_A_dims[0] * filter_A_dims[1]
    params_B = input_channels * output_channels * filter_B_dims[0] * filter_B_dims[1] * n_filters[0] * n_filters[1]
    return params_A, params_B

# Test functions
print(maintain_rf([5, 7], 1, 1, [3, 1], 1, 1))  # Example: Replacing 5x7 filter with 3x1 filters
input_channels = 3
output_channels = 64
filter_A_dims = [5, 7]
filter_B_dims = [3, 1]
n_filters = maintain_rf(filter_A_dims, 1, 1, filter_B_dims, 1, 1)
print(compare_parameters(input_channels, output_channels, filter_A_dims, filter_B_dims, n_filters))


KeyboardInterrupt: 