In [1]:
import numpy as np
import pandas as pd

## **Building a 2-Layered Neural Network From Scratch (NumPy Version)**

In [207]:
# xavier_init for tanh,sigmoid
def xavier_init(fan_in,fan_out):
  return np.random.randn(fan_in,fan_out) * np.sqrt(1.0/fan_in)

In [208]:
# he_init for relu
def he_init(fan_in,fan_out):
  return np.random.randn(fan_in,fan_out) * np.sqrt(2.0/fan_in)

In [2]:
# Weight Initialization
def init_params(input_dim,hidden_dim,output_dim):
  W1 = np.random.randn(input_dim,hidden_dim)*0.01 # here 2x2 matrix of random values of weight and multiplied by 0.01 to make it small value
  b1 = np.zeros((1,hidden_dim)) # bias can be zero
  W2 = np.random.randn(hidden_dim,output_dim)*0.01 # here also 4x4 matrix of hidden layer by output layer and made small
  b2 = np.zeros((1,output_dim)) # bias can be zero
  return W1,b1,W2,b2

In [210]:
# he Weight Initialization
def he_init_params(input_dim,hidden_dim,output_dim):
  W1 = he_init(input_dim,hidden_dim) # here 2x2 matrix of random values of weight and multiplied by 0.01 to make it small value
  b1 = np.zeros((1,hidden_dim)) # bias can be zero
  W2 = he_init(hidden_dim,output_dim) # here also 4x4 matrix of hidden layer by output layer and made small
  b2 = np.zeros((1,output_dim)) # bias can be zero
  return W1,b1,W2,b2

In [209]:
# Activation Functions
def relu(x):
  return np.maximum(0,x)  # returns x if greater than 0

def relu_deriv(x):
    return (x > 0).astype(float) # converts the bool into float like if x > 0 => True then float(True) is 1 same like derivative of relu

def tanh(x):
   return (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))

def tanh_deriv(x):
   a = np.tanh(x)
   return (1 - a ** 2)

def sigmoid(x):
   return 1.0/(1 + np.exp(-x))

def sigmoid_deriv(s):
   return s * (1 - s)

def softmax(z):
  exp = np.exp(z - np.max(z, axis = 1, keepdims = True))
  return exp/np.sum(exp, axis = 1, keepdims = True)

In [4]:
# Forward pass
def forward(x, w1, b1, w2, b2):
  z1 = x @ w1 + b1 # calculation of node 1
  a1 = relu(z1) # output of node 1
  z2 = a1 @ w2 + b2 # calculation of node 2
  a2 = softmax(z2) # output of node 2 predictions (in probabilites)
  cache = (x,z1,a1,z2,a2)
  return a2,cache

In [5]:
# cross entropy loss function
def cross_entropy(y_pred, y_true):
  m = y_true.shape[0] # m means the number of samples in the y_true
  return - np.sum(y_true * np.log(y_pred + 1e-9))/ m # here y_true is multiplied with the predicted values so that only the true predicted values survive 

# for example if true label is class 0 [1,0,0] and y_pred is [0.7,0.2,0.1] where y_pred is confident about the class 0 so after y_true * np.log(y_pred) gives [-0.51,0,0]

In [6]:
# backpropagation

def backward(cache,w2,y_true):
 x,z1,a1,z2,a2 = cache
 m = x.shape[0]

 dz2 = a2 - y_true # calculate the loss of the output neuron

 dw2 = (a1.T @ dz2)/m # calculate the contribution and error by w2 for the output z2 with the output of hidden layer a1
 db2 = np.sum(dz2,axis = 0,keepdims = True)/m # the bias b2 contributes to z2 directly so we just sum up all the erros of z2 as dz2 and divide by the number of samples

 da1 = dz2 @ w2.T  # How much did each hidden neuron (A1) contribute to the output error (dZ2)? where W2 tells us how strongly each hidden neuron influences the output.
 dz1 = da1 * relu_deriv(z1) # how much is the hidden layers errors flowing through the neurons as da1 along with the applied relu as relu_deriv

 dw1 = (x.T @ dz1) / m # calculate the contribution and error by w1 for the output z1 with the input x
 db1 = np.sum(dz1, axis=0, keepdims=True) / m # the bias b1 contributes to z1 directly so we just sum up all the errors of z1 as dz1 and divide by the number of samples


 return dw1, db1, dw2, db2



In [227]:
def dropout_forward(a,p):
  mask = (np.random.randn(*a.shape) < p)
  out = a * mask/p
  return out,mask

def dropout_backward(dout,mask,p):
  da = dout * mask / p
  return da

In [7]:
# update rule

def update(w1,b1,w2,b2,grads,lr):
  dw1,db1,dw2,db2 = grads
  w1 -= lr * dw1
  b1 -= lr * db1
  w2 -= lr * dw2
  b2 -= lr * db2

  return w1,b1,w2,b2

In [229]:
def l2_update(W, dW, lr, lam):
    # lam = Î» (L2 coefficient)
    W -= lr * (dW + lam * W)
    return W

def l1_update(W, dW, lr, lam):
    W -= lr * (dW + lam * np.sign(W))
    return W

def momentum_update(W, dW, v, lr, beta):
    # v: velocity, same shape as W
    v[:] = beta * v + (1.0 - beta) * dW
    W -= lr * v
    return W, v


In [230]:
def adam_update(W, dW, m, v, t, lr,
                beta1=0.9, beta2=0.999, eps=1e-8):
    # m, v: first and second moment, same shape as W
    # t: time step (int) starting from 1

    m[:] = beta1 * m + (1.0 - beta1) * dW
    v[:] = beta2 * v + (1.0 - beta2) * (dW ** 2)

    m_hat = m / (1.0 - beta1 ** t)
    v_hat = v / (1.0 - beta2 ** t)

    W -= lr * m_hat / (np.sqrt(v_hat) + eps)
    return W, m, v

In [221]:
# Training Loop

def train(X,y,input_dim,hidden_dim,output_dim,lr = 0.1, epochs = 100):
  w1 , b1, w2, b2 = init_params(input_dim,hidden_dim,output_dim) # initialize parameters

  for epoch in range(epochs):
   y_pred, cache = forward(X, w1, b1, w2, b2) # Forward prop
   loss = cross_entropy(y_pred,y) # calculate loss
   grads = backward(cache,w2,y)
   w1,b1,w2,b2 = update(w1,b1,w2,b2,grads,lr)
   if epoch % 10 == 0:
     print(f'Epoch {epoch+10} Loss:{loss:.4f}')

  return w1,b1,w2,b2


## XOR Dataset

In [212]:
X_xor = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])

In [213]:
y_labels = np.array([0,1,1,0])

In [214]:
y_labels.shape

(4,)

In [215]:
def to_one_hot(y, num_classes=2):
    m = y.shape[0]
    oh = np.zeros((m, num_classes))
    oh[np.arange(m), y] = 1
    return oh

y_xor = to_one_hot(y_labels, num_classes=2)  # shape: (4, 2)
y_xor

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [226]:
w1,b1,w2,b2 = train(X_xor,y_xor,2,4,2,lr = 0.5,epochs = 1000)

y_pred_xor, _ = forward(X_xor,w1,b1,w2,b2)
print("Probabilities:\n", y_pred_xor)
print("Predicted classes:", np.argmax(y_pred_xor, axis=1))
print("True classes:", np.argmax(y_xor, axis=1))


Epoch 10 Loss:0.6931
Epoch 20 Loss:0.6930
Epoch 30 Loss:0.6927
Epoch 40 Loss:0.6918
Epoch 50 Loss:0.6889
Epoch 60 Loss:0.6799
Epoch 70 Loss:0.6552
Epoch 80 Loss:0.6050
Epoch 90 Loss:0.5502
Epoch 100 Loss:0.5155
Epoch 110 Loss:0.4986
Epoch 120 Loss:0.4900
Epoch 130 Loss:0.4760
Epoch 140 Loss:0.4423
Epoch 150 Loss:0.3425
Epoch 160 Loss:0.2137
Epoch 170 Loss:0.1219
Epoch 180 Loss:0.0860
Epoch 190 Loss:0.0614
Epoch 200 Loss:0.0501
Epoch 210 Loss:0.0403
Epoch 220 Loss:0.0344
Epoch 230 Loss:0.0293
Epoch 240 Loss:0.0261
Epoch 250 Loss:0.0231
Epoch 260 Loss:0.0207
Epoch 270 Loss:0.0186
Epoch 280 Loss:0.0170
Epoch 290 Loss:0.0157
Epoch 300 Loss:0.0145
Epoch 310 Loss:0.0135
Epoch 320 Loss:0.0126
Epoch 330 Loss:0.0119
Epoch 340 Loss:0.0111
Epoch 350 Loss:0.0105
Epoch 360 Loss:0.0099
Epoch 370 Loss:0.0094
Epoch 380 Loss:0.0089
Epoch 390 Loss:0.0086
Epoch 400 Loss:0.0082
Epoch 410 Loss:0.0078
Epoch 420 Loss:0.0075
Epoch 430 Loss:0.0071
Epoch 440 Loss:0.0069
Epoch 450 Loss:0.0066
Epoch 460 Loss:0.00

## Circle Dataset

In [181]:
from sklearn.datasets import make_circles

X_circle,y_circle = make_circles(n_samples = 1000,factor = .5, noise = 0.03,random_state = 4)

In [182]:
X_circle.shape

(1000, 2)

In [183]:
y_circle = to_one_hot(y_circle,num_classes = 2)

In [184]:
w1,b1,w2,b2 = train(X_circle,y_circle,2,4,2,lr = 0.5,epochs = 1000)


Epoch 10 Loss:0.6931
Epoch 20 Loss:0.6931
Epoch 30 Loss:0.6931
Epoch 40 Loss:0.6930
Epoch 50 Loss:0.6929
Epoch 60 Loss:0.6927
Epoch 70 Loss:0.6924
Epoch 80 Loss:0.6919
Epoch 90 Loss:0.6910
Epoch 100 Loss:0.6896
Epoch 110 Loss:0.6872
Epoch 120 Loss:0.6835
Epoch 130 Loss:0.6780
Epoch 140 Loss:0.6702
Epoch 150 Loss:0.6593
Epoch 160 Loss:0.6444
Epoch 170 Loss:0.6233
Epoch 180 Loss:0.5939
Epoch 190 Loss:0.5540
Epoch 200 Loss:0.5032
Epoch 210 Loss:0.4448
Epoch 220 Loss:0.3861
Epoch 230 Loss:0.3325
Epoch 240 Loss:0.2867
Epoch 250 Loss:0.2487
Epoch 260 Loss:0.2178
Epoch 270 Loss:0.1925
Epoch 280 Loss:0.1718
Epoch 290 Loss:0.1547
Epoch 300 Loss:0.1403
Epoch 310 Loss:0.1282
Epoch 320 Loss:0.1179
Epoch 330 Loss:0.1090
Epoch 340 Loss:0.1013
Epoch 350 Loss:0.0945
Epoch 360 Loss:0.0886
Epoch 370 Loss:0.0833
Epoch 380 Loss:0.0786
Epoch 390 Loss:0.0744
Epoch 400 Loss:0.0706
Epoch 410 Loss:0.0671
Epoch 420 Loss:0.0639
Epoch 430 Loss:0.0611
Epoch 440 Loss:0.0584
Epoch 450 Loss:0.0560
Epoch 460 Loss:0.05

In [185]:
y_pred_circle, _ = forward(X_circle,w1,b1,w2,b2)
print("Probabilities:\n", y_pred_circle)
print("Predicted classes:", np.argmax(y_pred_circle, axis=1))
print("True classes:", np.argmax(y_circle, axis=1))

Probabilities:
 [[0.03013057 0.96986943]
 [0.01246535 0.98753465]
 [0.00231727 0.99768273]
 ...
 [0.98324737 0.01675263]
 [0.99733314 0.00266686]
 [0.97706495 0.02293505]]
Predicted classes: [1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 1 1 1 0 1 1 1 0 0 0 0 1 0 1 1 1 0 1 0 0 0 1
 1 0 1 1 0 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 1 1 1 1 0 0 1 1 1 0 1 0 1 0 0 1 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1
 0 1 0 1 1 0 1 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1 1 1 0
 1 1 0 0 1 1 1 1 0 1 0 0 0 1 1 1 1 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 1 1
 0 1 0 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0
 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 0 1 1 1 1 0 0 0
 1 1 1 0 0 1 0 1 0 0 0 1 1 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0
 0 1 1 0 1 1 0 0 1 1 0 1 1 1 0 0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 1
 1 0 0 1 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 1 0
 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 1 0 0 

In [187]:
predicted_class = np.argmax(y_pred_circle, axis = 1)
true_class = np.argmax(y_circle, axis = 1)

In [188]:
print(predicted_class == true_class)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  T

## MNIST Dataset

In [201]:
from tensorflow.keras.datasets import mnist
(x_train,y_train) , (x_test,y_test) = mnist.load_data()

In [202]:
x = x_test.reshape(x_test.shape[0],-1)
x.shape

(10000, 784)

In [203]:
y = to_one_hot(y_test,num_classes = 10)
y.shape

(10000, 10)

In [204]:
w1,b1,w2,b2 = train(x,y,784,20,10,lr = 0.9,epochs = 1000)


Epoch 10 Loss:2.6503
Epoch 20 Loss:2.3210
Epoch 30 Loss:2.3048
Epoch 40 Loss:2.3015
Epoch 50 Loss:2.3010
Epoch 60 Loss:2.3009
Epoch 70 Loss:2.3009
Epoch 80 Loss:2.3008
Epoch 90 Loss:2.3008
Epoch 100 Loss:2.3008
Epoch 110 Loss:2.3008
Epoch 120 Loss:2.3008
Epoch 130 Loss:2.3008
Epoch 140 Loss:2.3008
Epoch 150 Loss:2.3008
Epoch 160 Loss:2.3008
Epoch 170 Loss:2.3008
Epoch 180 Loss:2.3008
Epoch 190 Loss:2.3008
Epoch 200 Loss:2.3008
Epoch 210 Loss:2.3008
Epoch 220 Loss:2.3008
Epoch 230 Loss:2.3008
Epoch 240 Loss:2.3008
Epoch 250 Loss:2.3008
Epoch 260 Loss:2.3008
Epoch 270 Loss:2.3008
Epoch 280 Loss:2.3008
Epoch 290 Loss:2.3008
Epoch 300 Loss:2.3008
Epoch 310 Loss:2.3008
Epoch 320 Loss:2.3008
Epoch 330 Loss:2.3008
Epoch 340 Loss:2.3008
Epoch 350 Loss:2.3008
Epoch 360 Loss:2.3008
Epoch 370 Loss:2.3008
Epoch 380 Loss:2.3008
Epoch 390 Loss:2.3008
Epoch 400 Loss:2.3008
Epoch 410 Loss:2.3008
Epoch 420 Loss:2.3008
Epoch 430 Loss:2.3008
Epoch 440 Loss:2.3008
Epoch 450 Loss:2.3008
Epoch 460 Loss:2.30

In [199]:
y_pred, _ = forward(x,w1,b1,w2,b2)
print("Probabilities:\n", y)
print("Predicted classes:", np.argmax(y_pred, axis=1))
print("True classes:", np.argmax(y, axis=1))

Probabilities:
 [[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Predicted classes: [1 1 1 ... 1 1 1]
True classes: [7 2 1 ... 4 5 6]


In [200]:
print(predicted_class == true_class)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  T