# Constructing a Neural Network from Scratch 

### A sample notebook that shows how to build a neural network from scratch, i.e. without using deep-learning libraries such as TensorFlow or Keras.

For this purpose I'm using a real-life dataset.

Hope you like the work! 😊

## Importing Libs

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import math

In [None]:
# Load the surgical binary-classification CSV into a DataFrame.
dataset = pd.read_csv('../input/surgical-dataset-binary-classification/Surgical-deepnet.csv')
# Replace every missing value in the frame with 0.
dataset = dataset.fillna(0)

In [None]:
# Preview the first rows of the loaded frame.
dataset.head()

In [None]:
# Summarise the columns: dtypes and non-null counts.
dataset.info()

### No null values found 👀 (any missing values were already replaced with 0 right after loading)

## Detailed info of the dataset

In [None]:
# Build an automated profiling report for the whole frame.
# NOTE(review): pandas_profiling appears to have been renamed to ydata-profiling
# upstream -- confirm before upgrading the environment.
import pandas_profiling
pandas_profiling.ProfileReport(dataset)

## This is just a demonstration of building a neural network, so we won't go into EDA in depth.


# Let's get started with our network

In [None]:
# Features: every column except the last one, as a NumPy array.
X = dataset.iloc[:,:-1].values
# Target: the last column only; the `-1:` slice keeps it 2-D, one row per sample.
y = dataset.iloc[:,-1:].values


## Train test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training rows only, then reuse the same fitted scaler
# for the test rows so no test information is used when fitting.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Scaled data helps when training a neural network: it generally speeds up gradient descent!

# 1. Initializing Parameters

## He Initialization


In [None]:
def initialize_params(layer_dims):
    """
    Build He-initialized weights and zero biases for a fully connected network.

    Arguments:
    layer_dims -- python list with the number of units in each layer, input layer first.

    Returns:
    params -- python dictionary with keys "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)
    """
    # Re-seed NumPy's global generator so every call produces the same weights.
    np.random.seed(42)

    params = {}
    for idx in range(1, len(layer_dims)):
        fan_in, fan_out = layer_dims[idx - 1], layer_dims[idx]
        # He scaling: standard normal draws multiplied by sqrt(2 / fan_in).
        he_scale = (2 / fan_in) ** 0.5
        params[f"W{idx}"] = np.random.randn(fan_out, fan_in) * he_scale
        params[f"b{idx}"] = np.zeros((fan_out, 1))

    return params

# 2. Forward Propagation

We are using a model whose last layer uses a sigmoid activation, while all other layers use ReLU as their activation function.

In [None]:
def Linear_forward(A, W, b):
    """
    Compute the affine (pre-activation) step of one layer.

    Arguments:
    A -- activations fed into the layer (the input data for the first layer)
    W -- weight matrix of the layer
    b -- bias of the layer, added to the product W.A

    Returns:
    Z -- the pre-activation np.dot(W, A) + b
    cache -- the tuple (A, W, b), kept for the backward pass
    """
    pre_activation = np.dot(W, A) + b
    return pre_activation, (A, W, b)

In [None]:
def Activation_forward(A,W,b,Activation):
    """
    Run one layer forward: the affine step followed by the chosen non-linearity.

    Arguments:
    A -- activations from the previous layer (the input data for the first layer)
    W -- weight matrix of the layer
    b -- bias vector of the layer
    Activation -- name of the non-linearity, either "relu" or "sigmoid"

    Returns:
    A -- output of the activation function
    cache -- tuple (linear_cache, Z): the cache returned by Linear_forward and the
             pre-activation Z, both needed by the backward pass

    Raises:
    ValueError -- if Activation is neither "relu" nor "sigmoid"
    """
    # Validate up front: an unknown name used to skip both branches and then
    # crash with an unrelated UnboundLocalError when the cache was assembled.
    if Activation not in ("relu", "sigmoid"):
        raise ValueError(
            f"Unsupported activation {Activation!r}; expected 'relu' or 'sigmoid'"
        )

    Z, linear_cache = Linear_forward(A, W, b)
    if Activation == "relu":
        A = np.maximum(0, Z)
    else:
        A = 1 / (1 + np.exp(-Z))

    cache = (linear_cache, Z)
    return A, cache

In [None]:
def forward_prop(X, params):
    """
    Forward pass through the whole network: ReLU on every layer but the last,
    sigmoid on the last one.

    Arguments:
    X -- input data fed to the first layer
    params -- python dictionary holding "W1", "b1", ..., "WL", "bL"

    Returns:
    AL -- output of the final sigmoid layer
    caches -- list with one cache per layer, in layer order, as returned by
              Activation_forward
    """
    num_layers = len(params) // 2
    caches = []
    activations = X

    # Hidden layers 1 .. L-1 use ReLU.
    for layer in range(1, num_layers):
        activations, layer_cache = Activation_forward(
            activations, params[f"W{layer}"], params[f"b{layer}"], "relu"
        )
        caches.append(layer_cache)

    # Output layer L uses a sigmoid.
    AL, layer_cache = Activation_forward(
        activations, params[f"W{num_layers}"], params[f"b{num_layers}"], "sigmoid"
    )
    caches.append(layer_cache)

    return AL, caches

In [None]:
def cost(AL,Y) :
    """
    Binary cross-entropy cost, averaged over the examples of the batch.

    Arguments:
    AL -- predicted probabilities, same shape as Y
    Y -- true labels (0 or 1), of shape (1, number of examples)

    Returns:
    cost -- the mean cross-entropy, squeezed to a 0-d array
    """
    m = Y.shape[1]
    # Keep probabilities strictly inside (0, 1): a saturated sigmoid output of
    # exactly 0 or 1 would otherwise hit log(0) and turn the cost into nan/inf.
    eps = 1e-15
    AL = np.clip(AL, eps, 1 - eps)
    loss = -np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL)) / m
    return np.squeeze(loss)

# 3. Backward Propagation

In [None]:
def linear_backward(dZ, cache):
    """
    Backward pass through the affine part of one layer.

    Arguments:
    dZ -- gradient of the cost with respect to the layer's pre-activation
    cache -- tuple (A_prev, W, b) stored during the forward pass

    Returns:
    dA_prev -- gradient with respect to the previous layer's activations
    dW -- gradient with respect to W, averaged over the columns of A_prev
    db -- gradient with respect to b, averaged over the columns of A_prev
    """
    A_prev, W, _ = cache
    batch = A_prev.shape[1]

    grad_W = np.dot(dZ, A_prev.T) / batch
    grad_b = np.sum(dZ, axis=1, keepdims=True) / batch
    grad_A_prev = np.dot(W.T, dZ)

    return grad_A_prev, grad_W, grad_b

In [None]:
def activation_backward(dA,cache,activation):
    """
    Backward pass through one layer: activation derivative, then the affine part.

    Arguments:
    dA -- gradient of the cost with respect to this layer's activations
    cache -- tuple (linear_cache, Z) stored by the forward pass for this layer
    activation -- name of the layer's non-linearity, either "relu" or "sigmoid"

    Returns:
    dA_prev -- gradient with respect to the previous layer's activations
    dW -- gradient with respect to the layer's weights
    db -- gradient with respect to the layer's bias

    Raises:
    ValueError -- if activation is neither "relu" nor "sigmoid"
    """
    linear_cache, Z = cache

    if activation == "relu":
        # The ReLU derivative is 1 where Z > 0 and 0 elsewhere.
        dZ = dA * (Z > 0).astype(int)
    elif activation == "sigmoid":
        # Evaluate the sigmoid once and reuse it for s * (1 - s).
        sig = 1 / (1 + np.exp(-Z))
        dZ = np.multiply(dA, sig * (1 - sig))
    else:
        # An unknown name used to fall through and fail with UnboundLocalError.
        raise ValueError(
            f"Unsupported activation {activation!r}; expected 'relu' or 'sigmoid'"
        )

    return linear_backward(dZ, linear_cache)

In [None]:
def backward_prop(AL,Y,caches):
    """
    Backpropagate through the whole network (sigmoid output, ReLU hidden layers).

    Arguments:
    AL -- output of the forward pass (predicted probabilities)
    Y -- true labels; reshaped to AL's shape before use
    caches -- list of per-layer caches produced by the forward pass, in layer order

    Returns:
    grads -- python dictionary with "dA0" ... "dAL", "dW1" ... "dWL" and
             "db1" ... "dbL"
    """
    L = len(caches)
    Y = Y.reshape(AL.shape)
    grads = {}

    # Derivative of the cross-entropy cost with respect to AL. AL is clipped away
    # from exactly 0 and 1 so the divisions cannot produce inf, which would
    # become nan once multiplied by a zero sigmoid derivative.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)
    grads["dA" + str(L)] = -(np.divide(Y, AL_safe) - np.divide(1 - Y, 1 - AL_safe))

    # Output layer (sigmoid).
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = activation_backward(grads["dA" + str(L)], current_cache, 'sigmoid')

    # Hidden layers (ReLU), walked from the last hidden layer back to the first.
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = activation_backward(grads["dA" + str(l+1)], current_cache, 'relu')
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads
        

# 4. Making Mini Batches

In [None]:
def random_mini_batches(X, Y, mini_batch_size, seed=None):
    """
    Creates a list of random minibatches from (X, Y)

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector, of shape (1, number of examples)
    mini_batch_size -- size of the mini-batches, positive integer
    seed -- optional integer. When given, the shuffle comes from a private
            generator seeded with it, so the call is repeatable and leaves NumPy's
            global random state untouched. When None (default), the shuffle is drawn
            from NumPy's global random state, so successive calls (one per epoch)
            produce different orderings; seed np.random beforehand for a
            reproducible run.

    Returns:
    mini_batches -- list of synchronous (mini_batch_X, mini_batch_Y); the last
                    mini-batch is smaller when the number of examples is not a
                    multiple of mini_batch_size

    Raises:
    ValueError -- if mini_batch_size is not positive
    """
    if mini_batch_size <= 0:
        raise ValueError("mini_batch_size must be a positive integer")

    m = X.shape[1]                  # number of training examples

    # Re-seeding the global generator with a constant on every call (the previous
    # behaviour) made each epoch see exactly the same "random" ordering.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Step 1: Shuffle (X, Y) with one shared permutation so columns stay paired.
    permutation = rng.permutation(m)
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((1, m))

    # Step 2: Partition. Stepping by mini_batch_size also yields the shorter
    # trailing batch, because slicing past the end simply truncates.
    return [
        (
            shuffled_X[:, start:start + mini_batch_size],
            shuffled_Y[:, start:start + mini_batch_size],
        )
        for start in range(0, m, mini_batch_size)
    ]

# 5. Gradient descent with ADAM optimization

In [None]:
def initialize_adam(parameters) :
    """
    Create the zero-filled moment accumulators used by the Adam update.

    Arguments:
    parameters -- python dictionary holding "W1", "b1", ..., "WL", "bL"

    Returns:
    v -- dictionary with keys "dW1", "db1", ..., zero arrays shaped like the
         matching parameter (moving average of the gradients)
    s -- dictionary with the same keys and shapes (moving average of the squared
         gradients)
    """
    num_layers = len(parameters) // 2
    v, s = {}, {}

    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            shape = parameters[f"{prefix}{layer}"].shape
            v[f"d{prefix}{layer}"] = np.zeros(shape)
            s[f"d{prefix}{layer}"] = np.zeros(shape)

    return v, s

In [None]:
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01, beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8):
    """
    Apply one Adam step to every weight matrix and bias vector.

    Arguments:
    parameters -- python dictionary holding "W1", "b1", ..., "WL", "bL"
    grads -- python dictionary holding "dW1", "db1", ..., "dWL", "dbL"
    v -- moving average of the gradients, keyed like grads
    s -- moving average of the squared gradients, keyed like grads
    t -- step counter used for the bias correction
    learning_rate -- step size
    beta1 -- decay rate of the first-moment average
    beta2 -- decay rate of the second-moment average
    epsilon -- small constant added to the denominator of the update

    Returns:
    parameters -- the same dictionary, with updated entries
    v -- the same dictionary, with updated first-moment averages
    s -- the same dictionary, with updated second-moment averages
    """
    num_layers = len(parameters) // 2

    # The bias-correction denominators depend only on t, so compute them once.
    first_moment_fix = 1 - beta1 ** t
    second_moment_fix = 1 - beta2 ** t

    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            param_key = prefix + str(layer)
            grad_key = "d" + param_key
            gradient = grads[grad_key]

            # Exponential moving averages of the gradient and its square.
            v[grad_key] = beta1 * v[grad_key] + (1 - beta1) * gradient
            s[grad_key] = beta2 * s[grad_key] + (1 - beta2) * gradient ** 2

            # Bias-corrected estimates.
            v_hat = v[grad_key] / first_moment_fix
            s_hat = s[grad_key] / second_moment_fix

            parameters[param_key] = parameters[param_key] - learning_rate * (
                v_hat / ((s_hat ** 0.5) + epsilon)
            )

    return parameters, v, s

# 6. Model code (Combining Everything)

In [None]:
def model(X, Y, layers_dims, optimizer, learning_rate = 0.0007, mini_batch_size = 16, beta = 0.9,beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8, num_epochs = 700, print_cost = True):
    """
    Train the network with mini-batch gradient descent using the Adam update.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector, of shape (1, number of examples)
    layers_dims -- python list with the size of each layer, input layer first
    optimizer -- accepted for interface compatibility but not consulted; the Adam
                 update is always used
    learning_rate -- step size passed to the Adam update
    mini_batch_size -- number of examples per mini-batch
    beta -- accepted for interface compatibility but not used
    beta1 -- decay rate of Adam's first-moment average
    beta2 -- decay rate of Adam's second-moment average
    epsilon -- small constant in the denominator of the Adam update
    num_epochs -- number of passes over the training set
    print_cost -- True to print the cost every 50 epochs

    Returns:
    params -- python dictionary with the trained "W1", "b1", ..., "WL", "bL"
    """
    costs = []                       # epoch costs sampled every 50 epochs
    t = 0                            # Adam step counter, incremented per mini-batch
    m = X.shape[1]                   # number of training examples

    params = initialize_params(layers_dims)
    v, s = initialize_adam(params)

    for i in range(num_epochs):
        minibatches = random_mini_batches(X, Y, mini_batch_size)
        cost_total = 0
        for minibatch_X, minibatch_Y in minibatches:
            AL, caches = forward_prop(minibatch_X, params)

            # cost() already returns the mean over the mini-batch, so weight it by
            # the batch size; dividing the sum by m below then gives the true
            # per-example average for the epoch.
            cost_total += cost(AL, minibatch_Y) * minibatch_X.shape[1]

            grads = backward_prop(AL, minibatch_Y, caches)
            t = t + 1
            params, v, s = update_parameters_with_adam(params, grads, v, s, t, learning_rate, beta1, beta2, epsilon)

        cost_avg = cost_total / m

        # Always record the curve so the plot is populated even when printing is off.
        if i % 50 == 0:
            costs.append(cost_avg)
            if print_cost:
                print ("Cost after epoch %i: %f" %(i, cost_avg))

    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('epochs (per 50)')    # costs are sampled every 50 epochs
    plt.title("Learning rate = " + str(learning_rate))
    plt.show()

    return params

# 7. Making Predictions

In [None]:
def predict(X,y,parameters):
    """
    Classify X with the trained network and print the accuracy against y.

    Arguments:
    X -- input data, passed unchanged to forward_prop
    y -- true labels; only the first row y[0] is compared with the predictions
    parameters -- python dictionary of trained weights and biases used by forward_prop

    Returns:
    predictions -- integer array of 0/1 labels with the shape of the network output
    """
    from sklearn.metrics import accuracy_score

    probabilities, _ = forward_prop(X, parameters)
    # Threshold the network output at 0.5 to obtain hard class labels.
    predictions = (probabilities > 0.5).astype(int)

    # accuracy_score is declared as (y_true, y_pred): pass the labels first.
    print(accuracy_score(y[0], predictions[0]))
    return predictions

    

# 8. Training our model

In [None]:
# Network shape: one input unit per feature column, three hidden layers of 4 units,
# and a single output unit.
layers_dims = [X.shape[1],4,4,4,1]
# model() reads the number of examples from X.shape[1], hence the transposes.
params = model(X_train.T, y_train.T ,layers_dims, optimizer = "adam",learning_rate=7e-4)

# 9. Making Predictions

In [None]:
# predict() prints the accuracy itself; the labels it returns are kept for later use.
print("Train Accuracy:")
predictions_train = predict(X_train.T, y_train.T, params)
print("Test Accuracy:")
predictions_test = predict(X_test.T, y_test.T, params)

### We can further improve performance by cleaning the data and tuning hyperparameters such as the learning rate, layers_dims (number of layers and hidden units), mini_batch_size, etc.

### If you spot any bug I left in this rather messy code, please let me know in the comments. I'm still a beginner 😅

# Don't forget to upvote 😉