# Setting Up Activation Function 

In [1]:
import numpy as np

def layer(inputs, weights, bias, activation):
    z = np.dot(inputs, weights) + bias
    if activation == "sigmoid":
        return 1 / (1 + np.exp(-z))
    elif activation == "relu":
        return np.maximum(0, z)
    elif activation == 'softmax':
        exp_x = np.exp(z - np.max(z, axis=1, keepdims=True))  # for numerical stability
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)
    else:
        return z  # Linear (no activation)

# Example
inputs = np.array([2, 3])
weights = np.array([0.5, 1.2])
bias = -1

output = layer(inputs, weights, bias, "relu")
print("Neuron Output:", output)  # Output: 3.6

Neuron Output: 3.5999999999999996


# Building Single Neuron

In [2]:
# Example: Negative weighted sum
inputs = np.array([-1, -2])
weights = np.array([0.5, 1.2])
bias = 0.1
output2 = layer(inputs, weights, bias, "sigmoid")  # Output: 0.0 (ReLU kills negative z)

# Exmple: Sigmoid with z=0
output1 = layer(np.array([[2.0, -1.0]]), weights, bias=0, activation="relu")  # Output: 0.5
print("Neuron Output:", output2)

Neuron Output: 0.057324175898868755


# Building A Layer of Neuron 

In [3]:
X = np.array([[2.0, -1.0, 3.0,-5]])

weights = np.random.rand(4, 8) # shape (2, 3)
bias1 = np.random.rand(1,8) 
output3 = layer(X, weights = weights, bias=bias1, activation="sigmoid")  
print("Neuron Output:", output3)

Neuron Output: [[0.02662472 0.82093516 0.33889673 0.7970821  0.94727045 0.27951157
  0.74238858 0.04885937]]


In [4]:
X = np.array([[2.0, -1.0, 3.0,2.2]])
input_size = X.shape[1]     # 3
num_neurons = 4

w1 = np.random.randn(input_size, num_neurons)  # shape: (3, 5)
b1 = np.random.randn(1, num_neurons)              # shape: (1, 5)
out5 = layer(X, weights = w1, bias = b1 , activation="relu")


inp_size_2 = out5.shape[1]
w2 = np.random.randn(inp_size_2, 2)
b2 = np.random.randn(1, 2)              # shape: (1, 5)
out6 = layer(out5, w2, b2, activation="sigmoid") 
print(out6)

[[0.16570231 0.99531169]]


In [5]:
input_size

4

# Building Multi Layer Network

In [6]:
X = np.array([[2.0, -1.0, 3.0,2.2]])
input_size = X.shape[1]     # 3
num_neurons = 5

# Layer 1
w1 = np.random.randn(input_size, num_neurons)     
b1 = np.random.randn(1, num_neurons)              
out1 = layer(X, weights=w1, bias=b1, activation="relu")

#LAyer 2
input_size_2 = out1.shape[1]  
w2 = np.random.randn(input_size, num_neurons)     
b2 = np.random.randn(1, num_neurons)             
out2 = layer(X, weights=w2, bias=b2, activation="relu")

# Layer 3
input_size_3 = out1.shape[1]                      
w3 = np.random.randn(input_size_3, 3)   
b3 = np.random.randn(1, 3)              
out3 = layer(out1, weights=w3, bias=b3, activation="softmax")

print("Final Output :", out3)

Final Output : [[0.16758159 0.00117735 0.83124106]]


# Setting Up Iris Dataset

In [7]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Load data
iris = load_iris()
X = iris.data  
y = iris.target.reshape(-1, 1)

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One-hot encode labels
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(y)  

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (120, 4)
Test shape: (30, 4)


# Passing Our Data Through our 3 Layer Model

In [9]:
in_s = int(X_train.shape[1])
w1 = np.random.randn(in_s, 10) 
b1 = np.zeros((1, 10))
out1 = layer(X_train,w1,b1,"relu")

in_s2 = out1.shape[1]
w2 = np.random.randn(in_s2, 8)   
b2 = np.zeros((1, 8))           
out2 = layer(out1, weights=w2, bias=b2, activation="relu")

in_s3 = out2.shape[1]
w3 = np.random.randn(in_s3, 3)   
b3 = np.zeros((1, 3))           
y_pred = layer(out2, weights=w3, bias=b3, activation="softmax")


# Loss Function

In [10]:
def cross_entropy_loss(predictions, y_true):
    # Add epsilon for numerical stability
    eps = 1e-15
    predictions = np.clip(predictions, eps, 1 - eps)
    loss = -np.mean(np.sum(y_true * np.log(predictions + 1e-9), axis=1))
    return loss


In [11]:
cross_entropy_loss(y_train,y_pred)

13.117765631472084

In [12]:
y_pred.shape

(120, 3)

# Derivative Function

In [13]:
def relu_derivative(z):
    return np.where(z > 0, 1, 0)  # Return 1 if z > 0, else 0


def softmax(z):
    exp_x = np.exp(z - np.max(z, axis=1, keepdims=True))  # for numerical stability
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def softmax_derivative(z, y_true):
    softmax_output = softmax(z)

    return softmax_output - y_true


# Full Model with Backpropagation

In [19]:
input_size = X_train.shape[1]  # 4
hidden_size = 10
output_size = 3  # 3 classes in Iris

# Weights & Biases
w1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
w2 = np.random.randn(hidden_size, hidden_size)
b2 = np.zeros((1, hidden_size))
w3 = np.random.randn(hidden_size, output_size)
b3 = np.zeros((1, output_size))

# ---- Training ----
lr = 0.01
epochs = 500

for epoch in range(epochs):
    # Forward Pass
    z1 = layer(X_train,w1,b1, activation = 'None')
    out1 = layer(X_train,w1,b1,activation ="relu")

    z2 = layer(out1, w2,b2,activation = 'None')
    out2 = layer(out1, w2, b2, activation = "relu")

    z3 = layer(out2, w3,b3,activation = 'None')
    out3 = layer(out2, w3,b3, activation = 'softmax')
    

    # Loss
    loss = cross_entropy_loss(out3, y_train)

    # Backward Pass
    dz3 = softmax_derivative(z3, y_train)  # (batch, 3)
    dw3 = np.dot(out2.T, dz3)
    db3 = np.sum(dz3, axis=0, keepdims=True)

    dz2 = np.dot(dz3, w3.T) * relu_derivative(z2)
    dw2 = np.dot(out1.T, dz2)
    db2 = np.sum(dz2, axis=0, keepdims=True)

    dz1 = np.dot(dz2, w2.T) * relu_derivative(z1)
    dw1 = np.dot(X_train.T, dz1)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    # Update weights
    w3 -= lr * dw3
    b3 -= lr * db3
    w2 -= lr * dw2
    b2 -= lr * db2
    w1 -= lr * dw1
    b1 -= lr * db1
    
    if epoch == 10:
        print("\n--- Gradient Check on w1[0,0] ---")
        #gradient_check(X_train[:5], y_train[:5], w1, b1, w3, b3, dw1)
        #gradient_check(X_train, y_train, w1, b1, w2, b2, w3, b3, dw1)
        gradient_check(X_train, y_train, w1, b1, w2, b2, w3, b3, dw1,(0, 0))



    if epoch % 50 == 0:
        print(f"Epoch {epoch} - Loss: {loss:.4f}")
        
# ---- Accuracy on Test Set ----
def accuracy(X, y_true):
    z1 = np.dot(X, w1) + b1
    a1 = np.maximum(0, z1)

    z2 =  np.dot(a1, w2) + b2
    a2 = np.maximum(0, z2)

    z3 = np.dot(a2, w3) + b3
    probs = softmax(z3)

    predictions = np.argmax(probs, axis=1)
    labels = np.argmax(y_true, axis=1)

    return np.mean(predictions == labels)

acc = accuracy(X_test, y_test)
print(f"\nTest Accuracy: {acc * 100:.2f}%")

Epoch 0 - Loss: 2.5629

--- Gradient Check on w1[0,0] ---
--- Gradient Check on w1[0,0] ---
Analytical: 0.00000007 | Numerical: 0.00000000 | Diff: 0.00000007
Epoch 50 - Loss: 0.0539
Epoch 100 - Loss: 0.0494
Epoch 150 - Loss: 0.0457
Epoch 200 - Loss: 0.0323
Epoch 250 - Loss: 0.0265
Epoch 300 - Loss: 0.0220
Epoch 350 - Loss: 0.0187
Epoch 400 - Loss: 0.0156
Epoch 450 - Loss: 0.0141

Test Accuracy: 100.00%


In [20]:
def gradient_check(X, y, w1, b1, w2, b2, w3, b3, dw1, index):
    epsilon = 1e-5
    i, j = index
    
    original_val = w1[i, j]

    w1[i, j] = original_val + epsilon
    out1_plus = layer(X, w1, b1, 'relu')
    out2_plus = layer(out1_plus, w2, b2, 'relu')
    out3_plus = layer(out2_plus, w3, b3, 'softmax')
    loss_plus = cross_entropy_loss(out3_plus, y)

    w1[i, j] = original_val - epsilon
    out1_minus = layer(X, w1, b1, 'relu')
    out2_minus = layer(out1_minus, w2, b2, 'relu')
    out3_minus = layer(out2_minus, w3, b3, 'softmax')
    loss_minus = cross_entropy_loss(out3_minus, y)

    # Reset w1
    w1[i, j] = original_val

    # Numerical Gradient
    num_grad = (loss_plus - loss_minus) / (2 * epsilon)
    ana_grad = dw1[i, j]

    print(f"--- Gradient Check on w1[{i},{j}] ---")
    print(f"Analytical: {ana_grad:.8f} | Numerical: {num_grad:.8f} | Diff: {abs(ana_grad - num_grad):.8f}")


# Neural Network with L2 Regularization

In [21]:
def cross_entropy_loss_with_regularization(y_pred, y_true, model_weights, lambda_reg):
    m = y_true.shape[0]
    loss = -np.sum(y_true * np.log(y_pred + 1e-9)) / m
    
    # L2 Regularization Term
    l2_reg = lambda_reg * np.sum(np.square(model_weights))  # Sum of squares of all weights
    
    total_loss = loss + l2_reg  # Total loss with regularization
    return total_loss


In [33]:
input_size = X_train.shape[1]  # 4
hidden_size = 10
output_size = 3  # 3 classes in Iris

# Weights & Biases
w1 = np.random.randn(input_size, hidden_size)
b1 = np.zeros((1, hidden_size))
w2 = np.random.randn(hidden_size, hidden_size)
b2 = np.zeros((1, hidden_size))
w3 = np.random.randn(hidden_size, output_size)
b3 = np.zeros((1, output_size))

# Hyperparameter for regularization
lambda_reg = 0.3# You can tune this value
epochs = 500

# Training loop
for epoch in range(epochs):
    # Forward Pass
    z1 = layer(X_train, w1, b1, activation='None')
    out1 = layer(X_train, w1, b1, activation="relu")

    z2 = layer(out1, w2, b2, activation='None')
    out2 = layer(out1, w2, b2, activation="relu")

    z3 = layer(out2, w3, b3, activation='None')
    out3 = layer(out2, w3, b3, activation='softmax')

    # Loss Calculation with Regularization
    loss = cross_entropy_loss_with_regularization(out3, y_train, np.concatenate([w1.flatten(), w2.flatten(), w3.flatten()]), lambda_reg)

    # Backward Pass (with L2 gradient update)
    dz3 = softmax_derivative(z3, y_train)
    dw3 = np.dot(out2.T, dz3)
    db3 = np.sum(dz3, axis=0, keepdims=True)

    dz2 = np.dot(dz3, w3.T) * relu_derivative(z2)
    dw2 = np.dot(out1.T, dz2)
    db2 = np.sum(dz2, axis=0, keepdims=True)

    dz1 = np.dot(dz2, w2.T) * relu_derivative(z1)
    dw1 = np.dot(X_train.T, dz1)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    # L2 Regularization on Gradients
    dw1 += 2 * lambda_reg * w1
    dw2 += 2 * lambda_reg * w2
    dw3 += 2 * lambda_reg * w3

    # Update weights
    w3 -= lr * dw3
    b3 -= lr * db3
    w2 -= lr * dw2
    b2 -= lr * db2
    w1 -= lr * dw1
    b1 -= lr * db1

    if epoch % 50 == 0:
        print(f"Epoch {epoch} - Loss: {loss:.4f}")


Epoch 0 - Loss: 46.9595
Epoch 50 - Loss: 38.7112
Epoch 100 - Loss: 24.1473
Epoch 150 - Loss: 15.9221
Epoch 200 - Loss: 11.9210
Epoch 250 - Loss: 9.8292
Epoch 300 - Loss: 8.6446
Epoch 350 - Loss: 7.8743
Epoch 400 - Loss: 7.3548
Epoch 450 - Loss: 7.0950


In [34]:
acc = accuracy(X_test, y_test)
print(f"\nTest Accuracy: {acc * 100:.2f}%")


Test Accuracy: 96.67%
