In [None]:
pip install numpy matplotlib tensorflow tensorflow pickle-mixin scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np

class ActivationFunctions:
    
    @staticmethod
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

    @staticmethod
    def sigmoid_derivative(z):
        sig = ActivationFunctions.sigmoid(z)
        return sig * (1 - sig)

    @staticmethod
    def tanh(z):
        return np.tanh(z)

    @staticmethod
    def tanh_derivative(z):
        return 1 - np.tanh(z) ** 2

    @staticmethod
    def relu(z):
        return np.maximum(0, z)

    @staticmethod
    def relu_derivative(z):
        return np.where(z > 0, 1, 0)

    @staticmethod
    def leaky_relu(z, alpha=0.01):
        return np.where(z > 0, z, alpha * z)

    @staticmethod
    def leaky_relu_derivative(z, alpha=0.01):
        return np.where(z > 0, 1, alpha)

    @staticmethod
    def softmax(z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)


class WeightInitializer:
    
    @staticmethod
    def zero_init(shape):
        return np.zeros(shape)
    
    @staticmethod
    def random_init(shape, scale=0.01):
        return np.random.uniform(-scale, scale, shape)
    
    @staticmethod
    def normal_init(shape, scale=1.0):
        return np.random.normal(0, scale, shape)


class NeuralNetwork:
    def __init__(self, N, layer_sizes, lr, activation, weight_init, epochs, batch_size):
        self.N = N  # Number of layers
        self.layer_sizes = layer_sizes  # List of neurons in each layer
        self.lr = lr  # Learning rate
        self.activation = activation  # Activation function
        self.weight_init = weight_init  # Weight initialization method
        self.epochs = epochs  # Number of epochs
        self.batch_size = batch_size  # Batch size

        # Initialize weights and biases
        self.weights, self.biases = self.initialize_weights()

    def initialize_weights(self):
        weights = []
        biases = []

        for i in range(self.N - 1):
            if self.weight_init == "xavier":
                weight = np.random.randn(self.layer_sizes[i], self.layer_sizes[i + 1]) * np.sqrt(1 / self.layer_sizes[i])
            elif self.weight_init == "he":
                weight = np.random.randn(self.layer_sizes[i], self.layer_sizes[i + 1]) * np.sqrt(2 / self.layer_sizes[i])
            elif self.weight_init == "zero_init":
                weight = WeightInitializer.zero_init((self.layer_sizes[i], self.layer_sizes[i + 1]))
            elif self.weight_init == "random_init":
                weight = WeightInitializer.random_init((self.layer_sizes[i], self.layer_sizes[i + 1]))
            elif self.weight_init == "normal_init":
                weight = WeightInitializer.normal_init((self.layer_sizes[i], self.layer_sizes[i + 1]))
            else:
                raise ValueError("Unknown weight initialization method")

            bias = np.zeros((1, self.layer_sizes[i + 1]))
            weights.append(weight)
            biases.append(bias)

        return weights, biases

    def activation_function(self, z):
        if self.activation == "relu":
            return ActivationFunctions.relu(z)
        elif self.activation == "tanh":
            return ActivationFunctions.tanh(z)
        elif self.activation == "sigmoid":
            return ActivationFunctions.sigmoid(z)
        elif self.activation == "leaky_relu":
            return ActivationFunctions.leaky_relu(z)
        else:
            raise ValueError("Unsupported activation function")

    def activation_derivative(self, z):
        if self.activation == "relu":
            return ActivationFunctions.relu_derivative(z)
        elif self.activation == "tanh":
            return ActivationFunctions.tanh_derivative(z)
        elif self.activation == "sigmoid":
            return ActivationFunctions.sigmoid_derivative(z)
        elif self.activation == "leaky_relu":
            return ActivationFunctions.leaky_relu_derivative(z)
        else:
            raise ValueError("Unsupported activation function")

    def softmax(self, z):
        return ActivationFunctions.softmax(z)

    def forward(self, X):
        activations = [X]
        zs = []

        for i in range(self.N - 2):
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            zs.append(z)
            activation = self.activation_function(z)
            activations.append(activation)

        z = np.dot(activations[-1], self.weights[-1]) + self.biases[-1]
        zs.append(z)
        activation = self.softmax(z)
        activations.append(activation)

        return activations, zs

    def backward(self, X, Y, activations, zs):
        grads_w = [None] * (self.N - 1)
        grads_b = [None] * (self.N - 1)

        delta = activations[-1] - Y  # Assuming Y is one-hot encoded
        grads_w[-1] = np.dot(activations[-2].T, delta) / X.shape[0]
        grads_b[-1] = np.sum(delta, axis=0, keepdims=True) / X.shape[0]

        for i in range(self.N - 3, -1, -1):
            delta = np.dot(delta, self.weights[i + 1].T) * self.activation_derivative(zs[i])
            grads_w[i] = np.dot(activations[i].T, delta) / X.shape[0]
            grads_b[i] = np.sum(delta, axis=0, keepdims=True) / X.shape[0]

        return grads_w, grads_b

    def update_parameters(self, grads_w, grads_b):
        for i in range(self.N - 1):
            self.weights[i] -= self.lr * grads_w[i]
            self.biases[i] -= self.lr * grads_b[i]

    def fit(self, X, Y):
        for epoch in range(self.epochs):
            indices = np.arange(X.shape[0])
            np.random.shuffle(indices)

            for start in range(0, X.shape[0], self.batch_size):
                end = start + self.batch_size
                batch_indices = indices[start:end]
                X_batch, Y_batch = X[batch_indices], Y[batch_indices]

                activations, zs = self.forward(X_batch)
                grads_w, grads_b = self.backward(X_batch, Y_batch, activations, zs)
                self.update_parameters(grads_w, grads_b)

            full_activations, _ = self.forward(X)
            epoch_loss = -np.mean(np.sum(Y * np.log(full_activations[-1] + 1e-8), axis=1))
            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss:.4f}")
    
        return epoch_loss

    def predict(self, X):
        activations, _ = self.forward(X)
        return np.argmax(activations[-1], axis=1)

    def predict_proba(self, X):
        activations, _ = self.forward(X)
        return activations[-1]

    def score(self, X, Y):
        predictions = self.predict(X)
        labels = np.argmax(Y, axis=1)
        accuracy = np.mean(predictions == labels)
        return accuracy

In [3]:
# import numpy as np

# class ActivationFunctions:
    
#     @staticmethod
#     def sigmoid(z):
#         """
#         Sigmoid activation function.
#         """
#         return 1 / (1 + np.exp(-z))

#     @staticmethod
#     def sigmoid_derivative(z):
#         """
#         Derivative of the sigmoid function.
#         """
#         sig = ActivationFunctions.sigmoid(z)
#         return sig * (1 - sig)

#     @staticmethod
#     def tanh(z):
#         """
#         Tanh activation function.
#         """
#         return np.tanh(z)

#     @staticmethod
#     def tanh_derivative(z):
#         """
#         Derivative of the tanh function.
#         """
#         return 1 - np.tanh(z) ** 2

#     @staticmethod
#     def relu(z):
#         """
#         ReLU activation function.
#         """
#         return np.maximum(0, z)

#     @staticmethod
#     def relu_derivative(z):
#         """
#         Derivative of the ReLU function.
#         """
#         return np.where(z > 0, 1, 0)

#     @staticmethod
#     def leaky_relu(z, alpha=0.01):
#         """
#         Leaky ReLU activation function with a small slope for negative inputs.
#         """
#         return np.where(z > 0, z, alpha * z)

#     @staticmethod
#     def leaky_relu_derivative(z, alpha=0.01):
#         """
#         Derivative of the Leaky ReLU function.
#         """
#         return np.where(z > 0, 1, alpha)

#     @staticmethod
#     def softmax(z):
#         """
#         Softmax activation function for the output layer.
#         """
#         exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # Stability improvement
#         return exp_z / np.sum(exp_z, axis=1, keepdims=True)

In [4]:
# import numpy as np

# class WeightInitializer:
    
#     @staticmethod
#     def zero_init(shape):
#         """
#         Initializes weights to zero.
#         """
#         return np.zeros(shape)
    
#     @staticmethod
#     def random_init(shape, scale=0.01):
#         """
#         Initializes weights randomly within a uniform distribution.
#         """
#         return np.random.uniform(-scale, scale, shape)
    
#     @staticmethod
#     def normal_init(shape, scale=1.0):
#         """
#         Initializes weights using a normal distribution with mean 0 and standard deviation scale.
#         """
#         return np.random.normal(0, scale, shape)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.datasets import mnist
from sklearn.model_selection import train_test_split

# Configuration dictionary
configurations = {
    "num_layers": 5,
    "layer_sizes": [784, 256, 128, 64, 10],
    "learning_rate": 2e-3,
    "epochs": 200,
    "batch_size": 128,
    "activation_functions": ["sigmoid", "tanh", "relu", "leaky_relu"],
    "weight_initializations": ["zero_init", "random_init", "normal_init"]
}

# Load and preprocess the MNIST dataset
(X, y), (X_test, y_test) = mnist.load_data()
X = X.reshape(-1, 784) / 255.0
X_test = X_test.reshape(-1, 784) / 255.0
Y = np.eye(10)[y]
Y_test = np.eye(10)[y_test]

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=42)

# Store training and validation losses for plotting
training_losses = {}
validation_losses = {}

for activation in configurations["activation_functions"]:
    for weight_init in configurations["weight_initializations"]:
        
        print(f"Training with activation: {activation} and weight initialization: {weight_init}")
        
        model = NeuralNetwork(
            N=configurations["num_layers"],
            layer_sizes=configurations["layer_sizes"],
            lr=configurations["learning_rate"],
            activation=activation,
            weight_init=weight_init,
            epochs=configurations["epochs"],
            batch_size=configurations["batch_size"]
        )

        train_loss_history = []
        val_loss_history = []
        
        for epoch in range(configurations["epochs"]):
            train_loss = model.fit(X_train, Y_train)
            val_predictions = model.predict_proba(X_val)
            val_loss = -np.mean(np.sum(Y_val * np.log(val_predictions + 1e-8), axis=1))
            train_loss_history.append(train_loss)
            val_loss_history.append(val_loss)
            print(f"Epoch {epoch + 1}/{configurations['epochs']} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")
        
        training_losses[(activation, weight_init)] = train_loss_history
        validation_losses[(activation, weight_init)] = val_loss_history
        
        model_filename = f"model_{activation}_{weight_init}.pkl"
        with open(model_filename, "wb") as f:
            pickle.dump(model, f)
        
# Plot training and validation loss for each configuration
for (activation, weight_init), train_loss_history in training_losses.items():
    val_loss_history = validation_losses[(activation, weight_init)]
    plt.figure(figsize=(10, 5))
    plt.plot(train_loss_history, label="Train Loss")
    plt.plot(val_loss_history, label="Validation Loss")
    plt.title(f"Activation: {activation}, Weight Init: {weight_init}")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step
Training with activation: sigmoid and weight initialization: zero_init
Epoch 1/200, Loss: 2.3012
Epoch 2/200, Loss: 2.3011
Epoch 3/200, Loss: 2.3011
Epoch 4/200, Loss: 2.3011
Epoch 5/200, Loss: 2.3011
Epoch 6/200, Loss: 2.3011
Epoch 7/200, Loss: 2.3011
Epoch 8/200, Loss: 2.3011
Epoch 9/200, Loss: 2.3011
Epoch 10/200, Loss: 2.3011
Epoch 11/200, Loss: 2.3011
Epoch 12/200, Loss: 2.3011
Epoch 13/200, Loss: 2.3011
Epoch 14/200, Loss: 2.3011
Epoch 15/200, Loss: 2.3011
Epoch 16/200, Loss: 2.3011
Epoch 17/200, Loss: 2.3011
Epoch 18/200, Loss: 2.3011
Epoch 19/200, Loss: 2.3011
Epoch 20/200, Loss: 2.3011
Epoch 21/200, Loss: 2.3011
Epoch 22/200, Loss: 2.3011
Epoch 23/200, Loss: 2.3011
Epoch 24/200, Loss: 2.3011
Epoch 25/200, Loss: 2.3011
Epoch 26/200, Loss: 2.3011
Epoch 27/200, Loss: 2.3011
Epoch 28/200, Loss: 2.3011
Epoch