Neural netowrk funcs

Layers: an Integer value representing the total number of hidden layers in the network
(input and output layers are extra)

● Nodes: an integer array of size [0,..,Layers+1] containing the dimensions of the neural
network. Nodes[0] shall represent the input size (typically, 50), Nodes[Layers+1]

shall represent the number of output nodes (typically, 1). All other values Nodes[i]
represent the number of nodes in hidden layer i.
● NNodes: a possible alternative to the Nodes parameter for situations where you want
each hidden layer of the neural network to be of the same size. In this case, the size of
the output layer is assumed to be 1, and the size of the input layer can be inferred from
the dataset.
● Activations: an array of size [0,..,Layers+1] (for the sake of compatibility) in which
Activations[0] and Activations[Layers+1] are not used, while all other
Activations[i] values are labels indicating the activation function used in layer i.
This allows you to build neural networks with different activation functions in each layer.
● ActivationFn: a possible alternative to Activations when all hidden layers of your neural
network use the same activation function.

params: layers: # of hidden layers, 
	nodes: if int: number of nodes for each hidden layer (when creating hidden layer, add 1 extraa node for bias term)
	       if arr: number elems in arr should be same as layers, each num will be number of nodes for the hidden layer
	activation: apply to all hidden layers
	learning rate:
	batch size:


when initializing, need to generate random weights for all the layers

x: num_obs x num_features

w1+b: num_features(x) + 1 x width + 1 (of h1)

h1 = num_obs x width+1(of h2)

w2+b: width+1(of h1) x width+1 (of h2)
*****

h2 = num_obs x width+1(of h2) (of h2)

w3+b = width+1(of h2) x 1(we only want it output a single value per observation)


y = num_obs x 1

In [2]:
import numpy as np
import pandas as pd

In [596]:
class NeuralNetwork:
    
    def __init__(self, layers, nodes, activation, loss, learning_rate, batch_size):
        
        # number of hidden layers we are creating
        self.num_layers = layers
        
        # number of nodes at each layer
        self.nodes = self._create_nodes(nodes,layers)
        
        # activation function
        self.sigma = activation
                
        # loss function
        self.loss = loss
        
        # learning rate for stochastic gradient descent
        self.lr = learning_rate
        
        # batch size for stochastic gradient descent
        self.batch_size = batch_size

        # after fit, will contain a dictionary of weights for the layers
        self.weight_dict = None
        
        # store values for backwards propagation
        self.stored = None

        
    def _create_nodes(self, nodes, layers):
    
        if isinstance(nodes, int):
            nodes = nodes+1
            return [nodes]*(layers)
        
        else:
            return [i+1 for i in nodes]
        
    def _create_weights(self, num_features):
        
        # for 3 hidden layers
        
        #0 between input layer and hidden layer 1: 
        #1 hidden layer 1 to hidden layer 2: width x width
        #2 hidden layer 2 to hidden layer 3: width x width
        #3 hidden layer 3 to y :width x 1
        
        #so self.layers[0] will always be numfeatures x width(of first hidden layer)
        # self.layers[num_layers] = widthx1
        
        self.weight_dict = {
            0: np.random.rand(num_features, self.nodes[0])
        }
        
        for i in range(1,self.num_layers):
            self.weight_dict[i] = np.random.rand(self.nodes[i-1], self.nodes[i])
            
        self.weight_dict[self.num_layers] = np.random.rand(self.nodes[self.num_layers-1], 1)
        # b x n -> b x 1
        # h1 x n X nodes
        
    def _forward_propagation(self, X, y):
        # need to figure out what weights to save and how to backprop
        
        sigma_h_x = X
        self.stored = {}
        self.stored[-1] = [None, sigma_h_x]
        
        for i in range(0,self.num_layers):
            h_x = sigma_h_x @ self.weight_dict[i] 
            sigma_h_x = self.sigma(h_x) 
            self.stored[i] = [h_x, sigma_h_x]
        
        z_sigma_h_x = sigma_h_x @ self.weight_dict[self.num_layers]
        self.stored[self.num_layers] = [z_sigma_h_x]
        
        return np.sum(self.loss(z_sigma_h_x, y))/self.batch_size #summation of num_obs x 1 array divide by batchsize
        """
        2 hidden layer example
        weight -1:[None, x]
        weight 0: [h1(x), sigma(h1(x))] #X to h1
        weight 1: [h2(sigma(h1(x))), sigma(h2(sigma(h1(x))))] #h1 to h2 [bxn], [bxn]
        weight 2: [z(sigma(h2(sigma(h1(x)))))] #h2 to y [nx1]
        """
    def _backward_propagation(self, X,y):
        
        num_layers = self.num_layers
        expected = self.stored[self.num_layers][0] #z(sigma(h2(sigma(h1(x)))))
        
        # dL/dz(z(sigma(h2(sigma(h1(x))))))
        J = self.loss(expected, y, derivative = True)
        
        old_weights = self.weight_dict[num_layers] 
        self.weight_dict[num_layers] = self.weight_dict[num_layers] - self.lr*(self.stored[num_layers-1][1].T @ J)
        # dz/dsigma(sigma(h2(sigma(h1(x)))))
        
        for i in range(self.num_layers-1,-1,-1):
            
            J = J * self.sigma(self.stored[i][0], derivative = True) # activation layer derivative
            
            old_weights = self.weight_dict[i] 
            self.weight_dict[i] = self.weight_dict[i] - self.lr*(self.stored[i-1][1].T @ J) # weight update
            
            J = J @ old_weights.T  # dense layer derivative
            
            

    def _stochastic_gradient_descent(self, X, y, num_iterations, print_iter):
        # how to integrate forward and back vias sgd
        
        avg_err_arr = [np.nan]*num_iterations
        
        indices = np.random.choice(X.shape[0], self.batch_size, replace=False)
        x_batch = X[indices]
        y_batch =y[indices]
        
        avg_err_arr[0] = self._forward_propagation(x_batch,y_batch)
        print(self.weight_dict)
        for i in range(1,num_iterations):

            self._backward_propagation(x_batch,y_batch)
            indices = np.random.choice(X.shape[0], self.batch_size, replace=False)
            x_batch = X[indices]
            y_batch =y[indices]
        
            avg_err_arr[i] = self._forward_propagation(x_batch,y_batch)
            
        print(self.weight_dict)
        return avg_err_arr
        
    
    def fit(self, X,y, num_iterations = 1000, print_iter = False):
        
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
            
        if isinstance(y, pd.Series):
            y = y.to_numpy()
        
        # add intercept to X
        X = np.hstack([np.ones(len(X))[:, np.newaxis], X])  
        y = np.array([y]).T # makes it easier to calculate loss
        
        # generate the random weights for every layer in neural network
        self._create_weights(X.shape[1])
        
        
        return self._stochastic_gradient_descent(X,y,num_iterations, print_iter)
    
    # forward propagation without saved vals for speed
    def predict(self, X):
        
        sigma_h_x = np.hstack([np.ones(len(X))[:, np.newaxis], X])  
        
        for i in range(0,self.num_layers):
            h_x = sigma_h_x @ self.weight_dict[i]
            sigma_h_x = self.sigma(h_x) 
        
        z_sigma_h_x = sigma_h_x @ self.weight_dict[self.num_layers]
        return z_sigma_h_x.T[0]
        

In [603]:
def L2_loss(x,y, derivative = False):
    # assumes y is nx1 vector (n rows, 1 col)
    if derivative:
        return x-y
    else:
        return 0.5*((x-y)**2)
        
def ReLU(X, derivative = False):
    if derivative:
        # faster than np.where
        return np.greater(X, 0).astype(int)
    return np.maximum(X,0)

In [604]:
import sklearn.datasets
data = sklearn.datasets.load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df["target"] = df["sepal length (cm)"]
df = df.drop("sepal length (cm)", axis = 1)
df.head()

Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm),target
0,3.5,1.4,0.2,5.1
1,3.0,1.4,0.2,4.9
2,3.2,1.3,0.2,4.7
3,3.1,1.5,0.2,4.6
4,3.6,1.4,0.2,5.0


In [605]:
df.target.mean()

5.843333333333335

In [610]:
layers = 1
nodes = 3
activation = ReLU
loss = L2_loss
learning_rate = 0.000001
batch_size = df.shape[0]


test_nn = NeuralNetwork(layers, nodes, activation, loss, learning_rate, batch_size)


errs = test_nn.fit(df.drop("target",axis=1),df["target"], num_iterations = 10000)
errs[9999]

{0: array([[0.53897071, 0.4893282 , 0.98013523, 0.72556696],
       [0.77881227, 0.63475744, 0.11329743, 0.4285618 ],
       [0.55041842, 0.07367494, 0.28825486, 0.17790043],
       [0.37596473, 0.81812006, 0.89347364, 0.23256879]]), 1: array([[0.24154859],
       [0.60031668],
       [0.99296506],
       [0.92170884]])}
{0: array([[ 0.55119245,  0.50154994,  0.99235697,  0.7377887 ],
       [ 0.76529072,  0.62123589,  0.09977588,  0.41504025],
       [ 0.38452862, -0.09221485,  0.12236507,  0.01201063],
       [ 0.19439878,  0.63655411,  0.71190769,  0.05100283]]), 1: array([[0.10829957],
       [0.46721531],
       [0.82363489],
       [0.88330904]])}


0.10231730979540928

In [613]:
preds

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [615]:
samp_size = 50
samp = df.sample(n=samp_size)


preds = test_nn.predict(samp.drop("target",axis=1))
mse = ((preds -samp.target)**2).sum()/samp_size
preds, mse

(array([4.49053441, 5.1134576 , 7.16064343, 5.06767709, 4.6659136 ,
        5.99321768, 4.6659136 , 5.92741863, 5.47521131, 5.75006732,
        4.57272634, 5.32811235, 5.88723591, 4.66771278, 7.47678957,
        7.0968438 , 5.3582342 , 5.06587791, 5.37265495, 6.54908877,
        4.72611486, 6.70030492, 4.57092716, 7.05646083, 6.27457864,
        5.74087116, 5.34903805, 4.75910086, 7.48978433, 5.54584663,
        5.78142708, 6.23979347, 6.34774736, 4.4777399 , 5.67887072,
        5.33624354, 6.06621346, 4.99648049, 5.53541257, 6.47734604,
        4.98368598, 7.45299972, 4.91248939, 5.29152799, 6.30196685,
        5.77728258, 6.04999353, 6.53989262, 5.13724744, 5.55089828]),
 0.19111385378760207)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [440]:
# test on iris versicolor tomorrow
e


test_nn._create_weights(X.shape[1])

In [None]:
test_nn._forward_propagation(X,y)

In [None]:
for i in range(0,20):
    test_nn._backward_propagation(X,y)
    test_nn._forward_propagation(X,y)
    




In [447]:
X_test= np.array([
    [1],
    [2],
    [3]
])


test_nn.predict(X_test)

array([[1.02339419],
       [1.7458805 ],
       [2.46836682]])

In [423]:
"""

#         Jw1 = self.stored[self.num_layers-2][1].T @ J #dh2/dw1(sigma(h1(x)))
#         J =  J @ self.sigma(self.stored[self.num_layers-2][1], derivative = True) #dh2/dsigma(sigma(h1(x)))
#         J =  J @ self.stored[self.num_layers-2][0].T  #dsigma/dh1(h1(x))
        

"""

'\n\n#         Jw1 = self.stored[self.num_layers-2][1].T @ J #dh2/dw1(sigma(h1(x)))\n#         J =  J @ self.sigma(self.stored[self.num_layers-2][1], derivative = True) #dh2/dsigma(sigma(h1(x)))\n#         J =  J @ self.stored[self.num_layers-2][0].T  #dsigma/dh1(h1(x))\n        \n\n'

In [None]:
        
        ################################################################################
        
        #dsigma/dh2(h2(sigma(h1(x))))) # i dont understand
        J = self.sigma(J, derivative = True)
        # J = self.sigma(self.stored[num_layers-1][0], derivative)
        
        #updating the weights
        #dh2/w1
        #dh2/sigma(sigma(h2(sigma(h1(x)))))
        old_weights = self.weight_dict[self.num_layers-1] 
        self.weight_dict[num_layers-1] = self.weight_dict[num_layers-1] -self.lr*(self.stored[num_layers-2][1].T @ J)
        
        #updating the weights
        #dh2/dsigma(sigma(h2(sigma(h1(x)))))
        J = J @ old_weights.T
        
        # network: X -w0-sigma-> h1 -w1-sigma-> h2 -w2-> y
        
        #1. J = loss_deriv(z(sigma(h2(sigma(h1(x)))))) 
        #2. Update weight w2 = w2 - learning_rate*(sigma(h2(sigma(h1(x)))) @ J)
        #3  Dense layer derivative on z AKAJ =J @ oldweights.T
        
        #4. Activation Layer derivative of J
        #5. Update weight w1 = w1 - learning_rate*(sigma(h1(x))) @ J)
        # dense layer derivative on z AKA J @ oldweights.T
        
        #6 Activation Layer derivative of J
        #7. Update weight w0 = w0 - learning_rate*(x) @ J)
        
        
        
        for i in range(self.num_layers,0,-1):
            
            
            
            
            
            
            
            
            
            
            
            old_weights = self.weight_dict[i]
            self.weight_dict[i] = self.weight_dict[i] -self.lr*(self.stored[i-1][1].T @ J) #dz/dw | dh/dw
            print(self.stored[i-1][1].T.shape)
            print(J.shape)
            J =  self.stored[i-1][1].T @ J #dz/dsigma  | dh/dsigma
            print("\n")
            print(J.shape)
            print(self.sigma(self.stored[i-1][0], derivative = True).shape)
            J =  J @ self.sigma(self.stored[i-1][0], derivative = True) #dsigma/dh
            

        #final weight update
        self.weight_dict[0] = X.T @ J #dh1/dw0(x)        
        ################################################################################
        
        #dsigma/dh2(h2(sigma(h1(x))))) # i dont understand
        J = self.sigma(J, derivative = True)
        # J = self.sigma(self.stored[num_layers-1][0], derivative)
        
        #updating the weights
        #dh2/w1
        #dh2/sigma(sigma(h2(sigma(h1(x)))))
        old_weights = self.weight_dict[self.num_layers-1] 
        self.weight_dict[num_layers-1] = self.weight_dict[num_layers-1] -self.lr*(self.stored[num_layers-2][1].T @ J)
        
        #updating the weights
        #dh2/dsigma(sigma(h2(sigma(h1(x)))))
        J = J @ old_weights.T
        
        # network: X -w0-sigma-> h1 -w1-sigma-> h2 -w2-> y
        
        #1. J = loss_deriv(z(sigma(h2(sigma(h1(x))))))
        #2. Update weight w2 = w2 - learning_rate*(sigma(h2(sigma(h1(x)))) @ J)
        #3  Dense layer derivative on z AKAJ =J @ oldweights.T
        
        #4. Activation Layer derivative of J
        #5. Update weight w1 = w1 - learning_rate*(sigma(h1(x))) @ J)
        # dense layer derivative on z AKA J @ oldweights.T
        
        #
        
        
        
        
        for i in range(self.num_layers,0,-1):
            
            
            
            
            
            
            
            
            
            
            
            old_weights = self.weight_dict[i]
            self.weight_dict[i] = self.weight_dict[i] -self.lr*(self.stored[i-1][1].T @ J) #dz/dw | dh/dw
            print(self.stored[i-1][1].T.shape)
            print(J.shape)
            J =  self.stored[i-1][1].T @ J #dz/dsigma  | dh/dsigma
            print("\n")
            print(J.shape)
            print(self.sigma(self.stored[i-1][0], derivative = True).shape)
            J =  J @ self.sigma(self.stored[i-1][0], derivative = True) #dsigma/dh
            

        #final weight update
        self.weight_dict[0] = X.T @ J #dh1/dw0(x)