In [1]:
import numpy as np

# Activation Functions and Derivatives
def relu(z):
    """Rectified linear unit, applied element-wise.

    Args:
        z: Pre-activation values (array or scalar).

    Returns:
        The element-wise maximum of 0 and ``z``.
    """
    rectified = np.maximum(0, z)
    return rectified
def relu_derivative(z):
    """Derivative of ReLU with respect to its input.

    Args:
        z: NumPy array of pre-activation values.

    Returns:
        A float array that is 1.0 where ``z > 0`` and 0.0 elsewhere
        (so the derivative at exactly 0 is taken to be 0).
    """
    is_positive = z > 0
    return is_positive.astype(float)

def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``, evaluated without overflow.

    The textbook formula computes ``exp(-z)``, which overflows (and emits a
    RuntimeWarning) for large negative ``z``. This version only ever
    exponentiates ``-|z|``, whose result lies in ``(0, 1]``.

    Args:
        z: Pre-activation values; a scalar or any array-like.

    Returns:
        Sigmoid of ``z`` with the same shape as the input. Scalar input
        yields a NumPy scalar rather than a 0-d array.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # exp(-|z|) <= 1, so it cannot overflow
    # z >= 0: 1 / (1 + e^{-z});  z < 0: e^{z} / (1 + e^{z}) -- the same function.
    out = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    return out[()]  # unwrap a 0-d result to a scalar; n-d arrays are unaffected
def sigmoid_derivative(z):
    """Derivative of the logistic sigmoid with respect to its input.

    Uses the identity ``sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z))``.

    Args:
        z: Pre-activation values.

    Returns:
        The element-wise derivative, same shape as ``sigmoid(z)``.
    """
    activation = sigmoid(z)
    return activation * (1 - activation)

def tanh(z):
    """Hyperbolic tangent activation, applied element-wise.

    Args:
        z: Pre-activation values.

    Returns:
        ``np.tanh(z)``.
    """
    squashed = np.tanh(z)
    return squashed
def tanh_derivative(z):
    """Derivative of tanh with respect to its input.

    Uses the identity ``tanh'(z) = 1 - tanh(z)^2``.

    Args:
        z: Pre-activation values.

    Returns:
        The element-wise derivative.
    """
    t = np.tanh(z)
    return 1 - t ** 2

def softmax(z):  # Stable softmax
    """Softmax along axis 1, shifted for numerical stability.

    The maximum along axis 1 is subtracted before exponentiating, so the
    largest exponent is exp(0) = 1 and ``np.exp`` cannot overflow. The
    shift cancels in the ratio and does not change the result.

    Args:
        z: Array of scores with at least two dimensions; normalisation is
            performed along axis 1.

    Returns:
        Array of the same shape whose entries along axis 1 sum to 1.
    """
    shifted = z - np.max(z, axis=1, keepdims=True)
    weights = np.exp(shifted)
    normaliser = np.sum(weights, axis=1, keepdims=True)
    return weights / normaliser

# Loss Functions
def binary_cross_entropy(y_true, y_pred):
    """Mean binary cross-entropy between targets and predicted probabilities.

    Predictions are clipped to ``[1e-9, 1 - 1e-9]`` so neither logarithm
    can receive 0.

    Args:
        y_true: Target values (0 or 1 for a standard binary problem).
        y_pred: Predicted probabilities; must broadcast against ``y_true``.

    Returns:
        The loss averaged over every element of the broadcast result.
    """
    eps = 1e-9
    p = np.clip(y_pred, eps, 1 - eps)
    positive_term = y_true * np.log(p)
    negative_term = (1 - y_true) * np.log(1 - p)
    return -np.mean(positive_term + negative_term)

def cross_entropy(y_true, y_pred):  # y_true is one-hot
    """Mean categorical cross-entropy over a batch.

    Predictions are clipped from below at ``1e-9`` so ``log`` never sees 0.

    Args:
        y_true: Target array, one row per sample (one-hot encoded).
        y_pred: Predicted class probabilities, same shape as ``y_true``.

    Returns:
        ``-sum(y_true * log(y_pred))`` taken along axis 1, averaged over
        the remaining entries.
    """
    log_probs = np.log(np.clip(y_pred, 1e-9, 1))
    per_sample = np.sum(y_true * log_probs, axis=1)
    return -np.mean(per_sample)

# Activation registry: maps an activation name to (function, derivative).
activation_functions = dict(
    relu=(relu, relu_derivative),
    sigmoid=(sigmoid, sigmoid_derivative),
    tanh=(tanh, tanh_derivative),
    # No derivative entry: NeuralNetwork.backward computes the output-layer
    # gradient for softmax directly instead of looking one up here.
    softmax=(softmax, None),
)

# Neural Network Class
class NeuralNetwork:
    """Fully connected feed-forward network trained by full-batch gradient descent.

    Parameters live in ``self.params`` as ``W1..WL`` with shape
    ``(layers[i-1], layers[i])`` and ``b1..bL`` with shape ``(1, layers[i])``.
    Batches are row-major: each layer computes ``A_prev @ W + b``.

    Args:
        layers: Layer sizes ``[input, hidden1, ..., output]``.
        activations: Names of the hidden-layer activations (one per hidden
            layer), looked up in ``activation_functions``.
        output_activation: Name of the last layer's activation. ``backward``
            supports only ``"softmax"`` and ``"sigmoid"``.
        loss: ``"cross_entropy"`` selects ``cross_entropy``; any other value
            selects ``binary_cross_entropy``.
        lr: Gradient-descent step size.
        seed: Seed for the weight-initialisation RNG. The default (42) yields
            the same initial weights as seeding NumPy's global RNG with 42;
            ``None`` gives a non-deterministic initialisation.
    """

    def __init__(self, layers, activations, output_activation="softmax", loss="cross_entropy", lr=0.01, seed=42):
        self.layers = layers  # list of sizes [input, hidden1, ..., output]
        self.activations = activations
        self.output_activation = output_activation
        self.loss_function = cross_entropy if loss == "cross_entropy" else binary_cross_entropy
        self.lr = lr
        self.seed = seed
        self.params = {}
        self.cache = {}  # filled by forward(); read by backward()
        self._init_weights()

    def _init_weights(self):
        """Initialise weights to small Gaussian values and biases to zero.

        A private ``RandomState`` is used so that constructing a network does
        not reseed NumPy's global RNG as a side effect.
        """
        rng = np.random.RandomState(self.seed)
        for i in range(len(self.layers) - 1):
            fan_in, fan_out = self.layers[i], self.layers[i + 1]
            self.params[f"W{i+1}"] = rng.randn(fan_in, fan_out) * 0.01
            self.params[f"b{i+1}"] = np.zeros((1, fan_out))

    def forward(self, X):
        """Run a forward pass and cache every layer's ``Z`` and ``A``.

        Args:
            X: Input batch with one sample per row.

        Returns:
            The activation of the last layer.
        """
        self.cache = {"A0": X}
        last = len(self.layers) - 1
        for i in range(1, len(self.layers)):
            W = self.params[f"W{i}"]
            b = self.params[f"b{i}"]
            Z = np.dot(self.cache[f"A{i-1}"], W) + b
            # Hidden layers use their configured activation; the last layer
            # always uses output_activation.
            act_name = self.activations[i-1] if i != last else self.output_activation
            act_func = activation_functions[act_name][0]
            self.cache[f"Z{i}"] = Z
            self.cache[f"A{i}"] = act_func(Z)
        return self.cache[f"A{last}"]

    def backward(self, Y):
        """Backpropagate from the cached forward pass and update parameters in place.

        Args:
            Y: Targets with the same shape as the network output.

        Raises:
            RuntimeError: If no forward pass has populated the cache.
            ValueError: If ``output_activation`` is not ``"softmax"`` or
                ``"sigmoid"``, or a hidden activation has no derivative.
        """
        L = len(self.layers) - 1
        if f"A{L}" not in self.cache:
            raise RuntimeError("backward() called before forward(): no cached activations")
        if self.output_activation not in ("softmax", "sigmoid"):
            raise ValueError(
                f"backward() supports output_activation 'softmax' or 'sigmoid', "
                f"got {self.output_activation!r}"
            )

        m = Y.shape[0]
        # For both supported output activations the output-layer gradient
        # is taken as (prediction - target).
        dZ = self.cache[f"A{L}"] - Y

        grads = {}
        for i in reversed(range(1, L + 1)):
            A_prev = self.cache[f"A{i-1}"]
            grads[f"dW{i}"] = np.dot(A_prev.T, dZ) / m
            grads[f"db{i}"] = np.sum(dZ, axis=0, keepdims=True) / m

            if i > 1:
                act_name = self.activations[i-2]
                act_deriv = activation_functions[act_name][1]
                if act_deriv is None:
                    raise ValueError(
                        f"activation {act_name!r} has no derivative and cannot be used in a hidden layer"
                    )
                dA_prev = np.dot(dZ, self.params[f"W{i}"].T)
                dZ = dA_prev * act_deriv(self.cache[f"Z{i-1}"])

        # Gradient descent update (only after every gradient was computed,
        # so a failure above never leaves the parameters half-updated).
        for i in range(1, L + 1):
            self.params[f"W{i}"] -= self.lr * grads[f"dW{i}"]
            self.params[f"b{i}"] -= self.lr * grads[f"db{i}"]

    def compute_loss(self, Y, Y_hat):
        """Evaluate the configured loss function on targets ``Y`` and predictions ``Y_hat``."""
        return self.loss_function(Y, Y_hat)

    def train(self, X, Y, epochs=1000, verbose=True):
        """Train on the full batch ``(X, Y)`` for ``epochs`` iterations.

        When ``verbose`` is true the loss is printed every 100 epochs; the
        printed value is computed before that epoch's parameter update.
        """
        for epoch in range(epochs):
            Y_hat = self.forward(X)
            loss = self.compute_loss(Y, Y_hat)
            self.backward(Y)
            if verbose and (epoch + 1) % 100 == 0:
                print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")

    def predict(self, X):
        """Return hard predictions for ``X``.

        With a softmax output this is the arg-max class index per row;
        otherwise the output is thresholded at 0.5 and returned as ints.
        Runs ``forward``, so the activation cache is overwritten.
        """
        probs = self.forward(X)
        if self.output_activation == "softmax":
            return np.argmax(probs, axis=1)
        else:
            return (probs > 0.5).astype(int)


In [2]:
# A single training sample with three features and a positive binary label.
X = np.array([[0.5, 0.2, 0.1]])
Y = np.array([[1]])

# 3 inputs -> 4 ReLU hidden units -> 1 sigmoid output, trained with
# binary cross-entropy.
nn = NeuralNetwork(
    layers=[3, 4, 1],
    activations=["relu"],
    output_activation="sigmoid",
    loss="binary_cross_entropy",
    lr=0.1,
)
nn.train(X, Y, epochs=500)
print("Prediction:", nn.predict(X))


Epoch 100, Loss: 0.1075
Epoch 200, Loss: 0.0536
Epoch 300, Loss: 0.0352
Epoch 400, Loss: 0.0260
Epoch 500, Loss: 0.0205
Prediction: [[1]]
