In [7]:
import numpy as np

class DenseLayer:
    def __init__(self, input_dim, output_dim, gradient_clip_value=None):
        """
        Initialize a dense (fully connected) layer with given input and output dimensions.
        
        Parameters:
        - input_dim: int, number of input features
        - output_dim: int, number of output units
        - gradient_clip_value: float or None, maximum absolute value for gradients (if clipping is desired)
        """
        # Xavier/Glorot initialization for weights
        limit = np.sqrt(6 / (input_dim + output_dim))
        self.weights = np.random.uniform(-limit, limit, (input_dim, output_dim))
        self.biases = np.zeros((1, output_dim))
        
        # Placeholder for gradients
        self.d_weights = None
        self.d_biases = None
        
        # Gradient clipping threshold
        self.gradient_clip_value = gradient_clip_value

    def forward(self, X):
        """
        Perform the forward pass through the dense layer.
        
        Parameters:
        - X: ndarray, input data of shape (batch_size, input_dim)
        
        Returns:
        - output: ndarray, result of the layer transformation, shape (batch_size, output_dim)
        """
        self.input = X
        self.output = np.dot(X, self.weights) + self.biases
        return self.output

    def backward(self, d_out):
        """
        Compute the gradients of weights and biases with respect to the loss.
        
        Parameters:
        - d_out: ndarray, gradient of the loss with respect to the output of this layer, 
        shape (batch_size, output_dim)
        
        Returns:
        - d_input: ndarray, gradient of the loss with respect to the input of this layer, shape (batch_size, input_dim)
        """
        # Gradient of the loss with respect to weights and biases
        self.d_weights = np.dot(self.input.T, d_out)
        self.d_biases = np.sum(d_out, axis=0, keepdims=True)
        
        # Gradient of the loss with respect to the input of this layer
        d_input = np.dot(d_out, self.weights.T)
        
        # Clip gradients if gradient clipping is enabled
        if self.gradient_clip_value is not None:
            np.clip(self.d_weights, -self.gradient_clip_value, self.gradient_clip_value, out=self.d_weights)
            np.clip(self.d_biases, -self.gradient_clip_value, self.gradient_clip_value, out=self.d_biases)
        
        return d_input




In [8]:
class AdamOptimizer:
    def __init__(self, layer, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.layer = layer
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

        # Initialize moment estimates
        if hasattr(layer, 'weights'):
            # For DenseLayer
            self.m_weights = np.zeros_like(layer.weights)
            self.v_weights = np.zeros_like(layer.weights)
            self.m_biases = np.zeros_like(layer.biases)
            self.v_biases = np.zeros_like(layer.biases)
        elif hasattr(layer, 'gamma'):
            # For BatchNormalization
            self.m_gamma = np.zeros_like(layer.gamma)
            self.v_gamma = np.zeros_like(layer.gamma)
            self.m_beta = np.zeros_like(layer.beta)
            self.v_beta = np.zeros_like(layer.beta)

        # Time step for bias correction
        self.t = 1

    def update(self):
        if hasattr(self.layer, 'weights'):
            # Update DenseLayer weights and biases
            self._update_dense()
        elif hasattr(self.layer, 'gamma'):
            # Update BatchNormalization gamma and beta
            self._update_batch_norm()

        # Increment time step
        self.t += 1

    def _update_dense(self):
        # Adam update for weights
        self.m_weights = self.beta1 * self.m_weights + (1 - self.beta1) * self.layer.d_weights
        self.v_weights = self.beta2 * self.v_weights + (1 - self.beta2) * (self.layer.d_weights ** 2)
        m_weights_corr = self.m_weights / (1 - self.beta1 ** self.t)
        v_weights_corr = self.v_weights / (1 - self.beta2 ** self.t)
        self.layer.weights -= self.learning_rate * m_weights_corr / (np.sqrt(v_weights_corr) + self.epsilon)
        
        # Adam update for biases
        self.m_biases = self.beta1 * self.m_biases + (1 - self.beta1) * self.layer.d_biases
        self.v_biases = self.beta2 * self.v_biases + (1 - self.beta2) * (self.layer.d_biases ** 2)
        m_biases_corr = self.m_biases / (1 - self.beta1 ** self.t)
        v_biases_corr = self.v_biases / (1 - self.beta2 ** self.t)
        self.layer.biases -= self.learning_rate * m_biases_corr / (np.sqrt(v_biases_corr) + self.epsilon)

    def _update_batch_norm(self):
        # Adam update for gamma
        self.m_gamma = self.beta1 * self.m_gamma + (1 - self.beta1) * self.layer.d_gamma
        self.v_gamma = self.beta2 * self.v_gamma + (1 - self.beta2) * (self.layer.d_gamma ** 2)
        m_gamma_corr = self.m_gamma / (1 - self.beta1 ** self.t)
        v_gamma_corr = self.v_gamma / (1 - self.beta2 ** self.t)
        self.layer.gamma -= self.learning_rate * m_gamma_corr / (np.sqrt(v_gamma_corr) + self.epsilon)
        
        # Adam update for beta
        self.m_beta = self.beta1 * self.m_beta + (1 - self.beta1) * self.layer.d_beta
        self.v_beta = self.beta2 * self.v_beta + (1 - self.beta2) * (self.layer.d_beta ** 2)
        m_beta_corr = self.m_beta / (1 - self.beta1 ** self.t)
        v_beta_corr = self.v_beta / (1 - self.beta2 ** self.t)
        self.layer.beta -= self.learning_rate * m_beta_corr / (np.sqrt(v_beta_corr) + self.epsilon)


# Batch Normalization

In [9]:
class BatchNormalization:
    def __init__(self, dim, momentum=0.9, epsilon=1e-5):
        """
        Initialize the Batch Normalization layer.
        
        Parameters:
        - dim: int, number of features in the input
        - momentum: float, momentum for moving average of mean and variance
        - epsilon: float, small constant to prevent division by zero
        """
        self.momentum = momentum
        self.epsilon = epsilon
        self.gamma = np.ones((1, dim))  # Scale parameter
        self.beta = np.zeros((1, dim))  # Shift parameter
        self.running_mean = np.zeros((1, dim))
        self.running_var = np.ones((1, dim))
        self.training = True

    def forward(self, X):
        """
        Forward pass for Batch Normalization.
        
        Parameters:
        - X: ndarray, input data of shape (batch_size, dim)
        
        Returns:
        - out: ndarray, normalized and scaled output
        """
        if self.training:
            # Compute mean and variance for the batch
            batch_mean = np.mean(X, axis=0, keepdims=True)
            batch_var = np.var(X, axis=0, keepdims=True)
            
            # Normalize
            self.X_centered = X - batch_mean
            self.stddev_inv = 1.0 / np.sqrt(batch_var + self.epsilon)
            X_norm = self.X_centered * self.stddev_inv
            
            # Update running mean and variance for inference
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * batch_mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * batch_var
        else:
            # Use running mean and variance for inference
            X_norm = (X - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
        
        # Scale and shift
        out = self.gamma * X_norm + self.beta
        self.X_norm = X_norm  # Store for backward pass
        return out

    def backward(self, d_out, learning_rate):
        """
        Backward pass for Batch Normalization.
        
        Parameters:
        - d_out: ndarray, gradient of the loss with respect to the output of this layer
        - learning_rate: float, learning rate for parameter updates
        
        Returns:
        - d_input: ndarray, gradient of the loss with respect to the input of this layer
        """
        # Gradients with respect to gamma and beta
        self.d_gamma = np.sum(d_out * self.X_norm, axis=0, keepdims=True)
        self.d_beta = np.sum(d_out, axis=0, keepdims=True)
        
        # Gradient with respect to normalized input
        d_X_norm = d_out * self.gamma
        
        # Gradient with respect to variance
        d_var = np.sum(d_X_norm * self.X_centered, axis=0, keepdims=True) * -0.5 * self.stddev_inv**3
        
        # Gradient with respect to mean
        d_mean = np.sum(d_X_norm * -self.stddev_inv, axis=0, keepdims=True) + d_var * np.mean(-2.0 * self.X_centered, axis=0, keepdims=True)
        
        # Gradient with respect to input
        d_input = (d_X_norm * self.stddev_inv) + (d_var * 2 * self.X_centered / d_out.shape[0]) + (d_mean / d_out.shape[0])
        
        # Update parameters
        self.gamma -= learning_rate * self.d_gamma
        self.beta -= learning_rate * self.d_beta
        
        return d_input

    def set_training_mode(self, mode=True):
        """
        Set the layer in training or evaluation mode.
        
        Parameters:
        - mode: bool, True for training mode, False for evaluation mode
        """
        self.training = mode


# Activation: ReLU

In [10]:
class ReLU:
    def __init__(self):
        # Placeholder to store input for the backward pass
        self.input = None
        self.d_input = None

    def forward(self, X):
        """
        Perform the forward pass of the ReLU activation.
        
        Parameters:
        - X: ndarray, input data of shape (batch_size, input_dim)
        
        Returns:
        - output: ndarray, output after ReLU activation
        """
        self.input = X
        # Apply ReLU activation: ReLU(x) = max(0, x)
        return np.maximum(0, X)

    def backward(self, d_out):
        """
        Perform the backward pass of the ReLU activation.
        
        Parameters:
        - d_out: ndarray, gradient of the loss with respect to the output of the ReLU
        
        Returns:
        - d_input: ndarray, gradient of the loss with respect to the input of the ReLU
        """
        # The derivative of ReLU is 1 where input > 0 and 0 where input <= 0
        self.d_input = d_out * (self.input > 0)  # Element-wise multiplication with mask
        return self.d_input


# Regularization: Dropout

In [11]:
class Dropout:
    def __init__(self, dropout_rate=0.5):
        self.dropout_rate = dropout_rate
        self.training = True

    def set_training_mode(self, mode=True):
        """
        Set the dropout layer to training mode (drop units) or testing mode (no dropout).
        
        Parameters:
        - mode: bool, True for training mode, False for testing mode.
        """
        self.training = mode

    def forward(self, X):
        """
        Apply dropout during the forward pass.
        
        Parameters:
        - X: ndarray, input data
        
        Returns:
        - output: ndarray, with dropout applied if in training mode
        """
        if self.training:
            mask = (np.random.rand(*X.shape) > self.dropout_rate).astype(np.float32)
            self.mask = mask / (1 - self.dropout_rate)
            return X * self.mask
        else:
            return X

    def backward(self, d_out):
        """
        Backward pass for dropout, scaled by the mask created in the forward pass.
        
        Parameters:
        - d_out: ndarray, gradient of loss w.r.t dropout layer output
        
        Returns:
        - d_input: ndarray, gradient of loss w.r.t dropout layer input
        """
        return d_out * self.mask if self.training else d_out


In [12]:
import pickle
import numpy as np
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Define transformations and load the dataset
transform = transforms.ToTensor()

# Load training and test data
trainset = datasets.FashionMNIST(root='~/.pytorch/F_MNIST_data/', download=True, train=True, transform=transform)
testset = datasets.FashionMNIST(root='~/.pytorch/F_MNIST_data/', download=True, train=False, transform=transform)

# Split trainset into training and validation sets (e.g., 90% train, 10% validation)
train_size = int(0.9 * len(trainset))
val_size = len(trainset) - train_size
trainset, valset = torch.utils.data.random_split(trainset, [train_size, val_size])

# Set up data loaders
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=False)
testloader = DataLoader(testset, batch_size=64, shuffle=False)

# Initialize model layers and optimizers
input_dim = 28 * 28  # FashionMNIST images are 28x28
hidden_dim = 128     # Hidden layer dimension
output_dim = 10      # 10 classes for classification
learning_rate = 0.0001

# Define the layers
dense_layer1 = DenseLayer(input_dim=input_dim, output_dim=hidden_dim, gradient_clip_value=1.0)
batch_norm1 = BatchNormalization(dim=hidden_dim)
relu1 = ReLU()
dropout1 = Dropout(dropout_rate=0.5)

dense_layer2 = DenseLayer(input_dim=hidden_dim, output_dim=hidden_dim, gradient_clip_value=1.0)
batch_norm2 = BatchNormalization(dim=hidden_dim)
relu2 = ReLU()
dropout2 = Dropout(dropout_rate=0.5)

dense_layer3 = DenseLayer(input_dim=hidden_dim, output_dim=output_dim, gradient_clip_value=1.0)
batch_norm3 = BatchNormalization(dim=output_dim)
dropout3 = Dropout(dropout_rate=0.5)

# Define the optimizers for each layer
optimizer_dense1 = AdamOptimizer(dense_layer1, learning_rate=learning_rate)
optimizer_bn1 = AdamOptimizer(batch_norm1, learning_rate=learning_rate)

optimizer_dense2 = AdamOptimizer(dense_layer2, learning_rate=learning_rate)
optimizer_bn2 = AdamOptimizer(batch_norm2, learning_rate=learning_rate)

optimizer_dense3 = AdamOptimizer(dense_layer3, learning_rate=learning_rate)
optimizer_bn3 = AdamOptimizer(batch_norm3, learning_rate=learning_rate)

# Define a function to save the weights (without bias)
def save_weights():
    weights = {
        'dense_layer1_weights': dense_layer1.weights,
        'dense_layer2_weights': dense_layer2.weights,
        'dense_layer3_weights': dense_layer3.weights
    }
    
    # Save the weights to a pickle file
    with open('model_weights.pkl', 'wb') as f:
        pickle.dump(weights, f)
    print("Weights saved successfully.")

# Define a function to load the weights (without bias)
def load_weights():
    with open('model_weights.pkl', 'rb') as f:
        weights = pickle.load(f)
    
    # Load the weights into the model layers
    dense_layer1.weights = weights['dense_layer1_weights']
    dense_layer2.weights = weights['dense_layer2_weights']
    dense_layer3.weights = weights['dense_layer3_weights']
    
    print("Weights loaded successfully.")

# Training function (with some modification for saving weights)
def train(epochs):
    for epoch in range(epochs):
        # Training logic here...

        # After training, save the weights to reduce the model size
        if epoch == epochs - 1:  # Save weights after the last epoch
            save_weights()

# Train the model
train(epochs=100)

# Load the model weights for testing
load_weights()

# Testing function
def test():
    correct = 0
    total = 0

    batch_norm1.set_training_mode(False)
    dropout1.set_training_mode(False)
    batch_norm2.set_training_mode(False)
    dropout2.set_training_mode(False)
    batch_norm3.set_training_mode(False)
    dropout3.set_training_mode(False)

    with torch.no_grad():
        for images, labels in testloader:
            images = images.view(images.shape[0], -1).numpy()

            # Forward pass using the loaded weights
            outputs_dense1 = dense_layer1.forward(images)
            outputs_bn1 = batch_norm1.forward(outputs_dense1)
            outputs_relu1 = relu1.forward(outputs_bn1)
            outputs_dropout1 = dropout1.forward(outputs_relu1)

            outputs_dense2 = dense_layer2.forward(outputs_dropout1)
            outputs_bn2 = batch_norm2.forward(outputs_dense2)
            outputs_relu2 = relu2.forward(outputs_bn2)
            outputs_dropout2 = dropout2.forward(outputs_relu2)

            outputs_dense3 = dense_layer3.forward(outputs_dropout2)
            outputs_bn3 = batch_norm3.forward(outputs_dense3)
            outputs_dropout3 = dropout3.forward(outputs_bn3)

            predicted = np.argmax(outputs_dropout3, axis=1)
            correct += np.sum(predicted == labels.numpy())
            total += labels.size(0)

    print(f"Final Test Accuracy: {100 * correct / total:.2f}%")

# Evaluate the model on the test set
test()


Weights saved successfully.
Weights loaded successfully.
Final Test Accuracy: 10.05%
