In [None]:
# pip install numpy matplotlib tensorflow tensorflow pickle-mixin scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np

class NeuralNetwork:
    def __init__(self, N, layer_sizes, lr, activation, weight_init, epochs, batch_size):
        self.N = N  # Number of layers
        self.layer_sizes = layer_sizes  # List of neurons in each layer
        self.lr = lr  # Learning rate
        self.activation = activation  # Activation function
        self.weight_init = weight_init  # Weight initialization method
        self.epochs = epochs  # Number of epochs
        self.batch_size = batch_size  # Batch size

        # Initialize weights and biases
        self.weights, self.biases = self.initialize_weights()

    def initialize_weights(self):
        weights = []
        biases = []

        for i in range(self.N - 1):
            if self.weight_init == "xavier":
                weight = np.random.randn(self.layer_sizes[i], self.layer_sizes[i + 1]) * np.sqrt(1 / self.layer_sizes[i])
            elif self.weight_init == "he":
                weight = np.random.randn(self.layer_sizes[i], self.layer_sizes[i + 1]) * np.sqrt(2 / self.layer_sizes[i])
            else:  # default random initialization
                weight = np.random.randn(self.layer_sizes[i], self.layer_sizes[i + 1]) * 0.01
            bias = np.zeros((1, self.layer_sizes[i + 1]))

            weights.append(weight)
            biases.append(bias)

        return weights, biases

    def activation_function(self, z):
        if self.activation == "relu":
            return np.maximum(0, z)
        elif self.activation == "tanh":
            return np.tanh(z)
        elif self.activation == "sigmoid":
            return 1 / (1 + np.exp(-z))
        else:
            raise ValueError("Unsupported activation function")

    def activation_derivative(self, z):
        if self.activation == "relu":
            return np.where(z > 0, 1, 0)
        elif self.activation == "tanh":
            return 1 - np.tanh(z) ** 2
        elif self.activation == "sigmoid":
            sig = self.activation_function(z)
            return sig * (1 - sig)
        else:
            raise ValueError("Unsupported activation function")

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def forward(self, X):
        activations = [X]
        zs = []

        for i in range(self.N - 2):
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            zs.append(z)
            activation = self.activation_function(z)
            activations.append(activation)

        # Output layer with softmax for probabilities
        z = np.dot(activations[-1], self.weights[-1]) + self.biases[-1]
        zs.append(z)
        activation = self.softmax(z)
        activations.append(activation)

        return activations, zs

    def backward(self, X, Y, activations, zs):
        grads_w = [None] * (self.N - 1)
        grads_b = [None] * (self.N - 1)

        # Output layer error
        delta = activations[-1] - Y  # Assuming Y is one-hot encoded
        grads_w[-1] = np.dot(activations[-2].T, delta) / X.shape[0]
        grads_b[-1] = np.sum(delta, axis=0, keepdims=True) / X.shape[0]

        # Backpropagation through hidden layers
        for i in range(self.N - 3, -1, -1):
            delta = np.dot(delta, self.weights[i + 1].T) * self.activation_derivative(zs[i])
            grads_w[i] = np.dot(activations[i].T, delta) / X.shape[0]
            grads_b[i] = np.sum(delta, axis=0, keepdims=True) / X.shape[0]

        return grads_w, grads_b

    def update_parameters(self, grads_w, grads_b):
        for i in range(self.N - 1):
            self.weights[i] -= self.lr * grads_w[i]
            self.biases[i] -= self.lr * grads_b[i]

    def fit(self, X, Y):
        for epoch in range(self.epochs):
            indices = np.arange(X.shape[0])
            np.random.shuffle(indices)

            for start in range(0, X.shape[0], self.batch_size):
                end = start + self.batch_size
                batch_indices = indices[start:end]
                X_batch, Y_batch = X[batch_indices], Y[batch_indices]

                activations, zs = self.forward(X_batch)
                grads_w, grads_b = self.backward(X_batch, Y_batch, activations, zs)
                self.update_parameters(grads_w, grads_b)

            # Calculate and print loss for the current epoch
            # Use Y_batch for calculating loss for each batch, or calculate it after each epoch on the entire dataset
            # Here, I calculate loss only for the last batch of the epoch for simplicity.
            if (epoch + 1) % 10 == 0 or epoch == 0:
                batch_loss = -np.mean(np.sum(Y_batch * np.log(activations[-1] + 1e-8), axis=1))
                print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {batch_loss:.4f}")

    def predict(self, X):
        activations, _ = self.forward(X)
        return np.argmax(activations[-1], axis=1)

    def predict_proba(self, X):
        activations, _ = self.forward(X)
        return activations[-1]

    def score(self, X, Y):
        predictions = self.predict(X)
        labels = np.argmax(Y, axis=1)  # Assuming Y is one-hot encoded
        accuracy = np.mean(predictions == labels)
        return accuracy

In [3]:
import numpy as np

class ActivationFunctions:
    
    @staticmethod
    def sigmoid(z):
        """
        Sigmoid activation function.
        """
        return 1 / (1 + np.exp(-z))

    @staticmethod
    def sigmoid_derivative(z):
        """
        Derivative of the sigmoid function.
        """
        sig = ActivationFunctions.sigmoid(z)
        return sig * (1 - sig)

    @staticmethod
    def tanh(z):
        """
        Tanh activation function.
        """
        return np.tanh(z)

    @staticmethod
    def tanh_derivative(z):
        """
        Derivative of the tanh function.
        """
        return 1 - np.tanh(z) ** 2

    @staticmethod
    def relu(z):
        """
        ReLU activation function.
        """
        return np.maximum(0, z)

    @staticmethod
    def relu_derivative(z):
        """
        Derivative of the ReLU function.
        """
        return np.where(z > 0, 1, 0)

    @staticmethod
    def leaky_relu(z, alpha=0.01):
        """
        Leaky ReLU activation function with a small slope for negative inputs.
        """
        return np.where(z > 0, z, alpha * z)

    @staticmethod
    def leaky_relu_derivative(z, alpha=0.01):
        """
        Derivative of the Leaky ReLU function.
        """
        return np.where(z > 0, 1, alpha)

    @staticmethod
    def softmax(z):
        """
        Softmax activation function for the output layer.
        """
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # Stability improvement
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

In [4]:
import numpy as np

class WeightInitializer:
    
    @staticmethod
    def zero_init(shape):
        """
        Initializes weights to zero.
        """
        return np.zeros(shape)
    
    @staticmethod
    def random_init(shape, scale=0.01):
        """
        Initializes weights randomly within a uniform distribution.
        """
        return np.random.uniform(-scale, scale, shape)
    
    @staticmethod
    def normal_init(shape, scale=1.0):
        """
        Initializes weights using a normal distribution with mean 0 and standard deviation scale.
        """
        return np.random.normal(0, scale, shape)

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.datasets import mnist
from sklearn.model_selection import train_test_split

# Configuration dictionary
configurations = {
    "num_layers": 5,
    "layer_sizes": [784, 256, 128, 64, 10],
    "learning_rate": 2e-3,
    "epochs": 200,
    "batch_size": 128,
    "activation_functions": ["sigmoid", "tanh", "relu", "leaky_relu"],
    "weight_initializations": ["zero_init", "random_init", "normal_init"]
}

# Load and preprocess the MNIST dataset
(X, y), (X_test, y_test) = mnist.load_data()
X = X.reshape(-1, 784) / 255.0
X_test = X_test.reshape(-1, 784) / 255.0
Y = np.eye(10)[y]
Y_test = np.eye(10)[y_test]

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=42)

# Store training and validation losses for plotting
training_losses = {}
validation_losses = {}

for activation in configurations["activation_functions"]:
    for weight_init in configurations["weight_initializations"]:
        
        print(f"Training with activation: {activation} and weight initialization: {weight_init}")
        
        model = NeuralNetwork(
            N=configurations["num_layers"],
            layer_sizes=configurations["layer_sizes"],
            lr=configurations["learning_rate"],
            activation=activation,
            weight_init=weight_init,
            epochs=configurations["epochs"],
            batch_size=configurations["batch_size"]
        )

        train_loss_history = []
        val_loss_history = []
        
        for epoch in range(configurations["epochs"]):
            train_loss = model.fit(X_train, Y_train)
            val_predictions = model.predict_proba(X_val)
            val_loss = -np.mean(np.sum(Y_val * np.log(val_predictions + 1e-8), axis=1))
            train_loss_history.append(train_loss)
            val_loss_history.append(val_loss)
            print(f"Epoch {epoch + 1}/{configurations['epochs']} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")
        
        training_losses[(activation, weight_init)] = train_loss_history
        validation_losses[(activation, weight_init)] = val_loss_history
        
        model_filename = f"model_{activation}_{weight_init}.pkl"
        with open(model_filename, "wb") as f:
            pickle.dump(model, f)
        
# Plot training and validation loss for each configuration
for (activation, weight_init), train_loss_history in training_losses.items():
    val_loss_history = validation_losses[(activation, weight_init)]
    plt.figure(figsize=(10, 5))
    plt.plot(train_loss_history, label="Train Loss")
    plt.plot(val_loss_history, label="Validation Loss")
    plt.title(f"Activation: {activation}, Weight Init: {weight_init}")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

2024-11-09 22:19:44.967176: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-09 22:19:44.970922: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-09 22:19:44.979486: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731190784.992317 1456917 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731190784.996081 1456917 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-09 22:19:45.011791: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

Training with activation: sigmoid and weight initialization: zero_init
Epoch 1/200, Loss: 2.3003
Epoch 10/200, Loss: 2.3031
Epoch 20/200, Loss: 2.2962
Epoch 30/200, Loss: 2.2959
Epoch 40/200, Loss: 2.3015
Epoch 50/200, Loss: 2.2921
Epoch 60/200, Loss: 2.3023
Epoch 70/200, Loss: 2.2967
Epoch 80/200, Loss: 2.2933
Epoch 90/200, Loss: 2.2965
Epoch 100/200, Loss: 2.2947
Epoch 110/200, Loss: 2.3074
Epoch 120/200, Loss: 2.3075
Epoch 130/200, Loss: 2.2942
Epoch 140/200, Loss: 2.3000
Epoch 150/200, Loss: 2.2982
Epoch 160/200, Loss: 2.2974
Epoch 170/200, Loss: 2.3047
Epoch 180/200, Loss: 2.3048
Epoch 190/200, Loss: 2.2992
Epoch 200/200, Loss: 2.2983


TypeError: unsupported format string passed to NoneType.__format__