```
Input: (Hours Slept, Hours Studied)
Output: (Test Score)
```

In [1]:
import numpy as np
from scipy import optimize
from matplotlib import pyplot as plt

In [2]:
# One row per student: (hours slept, hours studied).
X = np.array([[3, 5], [5, 1], [10, 2]], dtype=float)
# Test score for the student in the matching row of X.
y = np.array([[75], [82], [93]], dtype=float)

### Scaling data

In [3]:
# Divide every input column by that column's own maximum.
X = X / X.max(axis=0)
# Max test score is 100, so this maps the targets into [0, 1].
y = y / 100

In [4]:
class Neural_Network(object):
    """Feed-forward network with one hidden layer and sigmoid activations.

    Both weight matrices are initialised from a standard normal distribution.
    ``forward`` caches its intermediates (``z2``, ``a2``, ``z3``) on the
    instance; ``costFunctionPrime`` reads them back for backpropagation.
    """

    def __init__(self, inputLayerSize: int = 2, hiddenLayerSize: int = 3, outputLayerSize: int = 1):
        # Define Hyperparameters
        self.inputLayerSize = inputLayerSize
        self.outputLayerSize = outputLayerSize
        self.hiddenLayerSize = hiddenLayerSize

        # Weights (parameters)
        # W1: (inputLayerSize, hiddenLayerSize), W2: (hiddenLayerSize, outputLayerSize)
        self.W1 = np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
        self.W2 = np.random.randn(self.hiddenLayerSize, self.outputLayerSize)

    def forward(self, X: np.ndarray) -> np.ndarray:
        '''
        Propagate inputs through the network.

        z2 = X.W1 where X is input matrix and W1 is weight matrix from input to hidden layer
        a2 = f(z2) where f is sigmoid activation function
        z3 = a2.W2 where a2 is hidden layer output matrix and W2 is weight matrix from hidden to output layer
        yHat = f(z3) where f is sigmoid activation function

        The intermediates z2, a2 and z3 are stored on the instance.

        Parameters
        ----------
        X : ndarray
            Input data, one sample per row with inputLayerSize columns.

        Returns
        -------
        out : ndarray
            Predicted data, one row per sample with outputLayerSize columns.
        '''
        self.z2 = np.dot(X, self.W1)        # input -> hidden pre-activation
        self.a2 = self.sigmoid(self.z2)     # hidden activation
        self.z3 = np.dot(self.a2, self.W2)  # hidden -> output pre-activation
        yHat = self.sigmoid(self.z3)        # output activation
        return yHat

    def sigmoid(self, z):
        '''
        Apply sigmoid activation function to scalar, vector, or matrix.
        '''
        return 1 / (1 + np.exp(-z))

    def sigmoidPrime(self, z):
        '''
        Derivative of the sigmoid function evaluated at z.

        The derivative is an even function of z, so it is evaluated at -|z|:
        exp(-|z|) can only underflow to 0, never overflow. The direct form
        exp(-z) / (1 + exp(-z))**2 turns into inf / inf = nan for strongly
        negative z.
        '''
        e = np.exp(-np.abs(z))
        return e / ((1 + e) ** 2)

    def costFunction(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Computes cost for given X,y using weights already stored in class.

        The cost is half the sum of squared errors, summed over the samples
        (axis 0). The prediction is also stored in ``self.yHat``.

        Parameters
        ----------
        X : ndarray
            Input data.
        y : ndarray
            Target data.

        Returns
        -------
        out : ndarray
            Cost of the predicted data with respect to true target data.
            For 2-D targets this has one entry per output column, i.e. shape
            (1,) for a single-output network.

        """
        self.yHat = self.forward(X)
        # np.sum(axis=0) gives the same result as the builtin sum over rows
        # without iterating the array in Python.
        J = 0.5 * np.sum((y - self.yHat) ** 2, axis=0)
        return J

    def costFunctionPrime(self, X: np.ndarray, y: np.ndarray):
        """
        Computes derivative with respect to W1 and W2 for a given X and y

        Parameters
        ----------
        X : ndarray
            Input data.
        y : ndarray
            Target data.

        Returns
        -------
        out : Tuple[ndarray, ndarray]
            Derivative of cost function with respect to W1 and W2 respectively.
        """
        # Refresh z2, a2, z3 for the current weights before backpropagating.
        self.yHat = self.forward(X)

        # Error signal at the output layer.
        delta3 = np.multiply(-(y - self.yHat), self.sigmoidPrime(self.z3))
        dJdW2 = np.dot(self.a2.T, delta3)

        # Error signal propagated back to the hidden layer.
        delta2 = np.dot(delta3, self.W2.T) * self.sigmoidPrime(self.z2)
        dJdW1 = np.dot(X.T, delta2)

        return dJdW1, dJdW2

    # Helper Functions for interacting with other classes:
    def getParams(self) -> np.ndarray:
        """Return W1 and W2 unrolled into a single 1-D vector (W1 first)."""
        params = np.concatenate((self.W1.ravel(), self.W2.ravel()))
        return params

    def setParams(self, params) -> None:
        """Set W1 and W2 from a single parameter vector laid out as in getParams."""
        W1_start = 0
        W1_end = self.hiddenLayerSize * self.inputLayerSize
        self.W1 = np.reshape(params[W1_start:W1_end], (self.inputLayerSize, self.hiddenLayerSize))
        W2_end = W1_end + self.hiddenLayerSize * self.outputLayerSize
        self.W2 = np.reshape(params[W1_end:W2_end], (self.hiddenLayerSize, self.outputLayerSize))

    def computeGradients(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Return dJ/dW1 and dJ/dW2 unrolled into one vector, ordered as in getParams."""
        dJdW1, dJdW2 = self.costFunctionPrime(X, y)
        return np.concatenate((dJdW1.ravel(), dJdW2.ravel()))

In [5]:
class Trainer(object):
    """Train a network with plain batch gradient descent.

    The network object must expose ``W1``, ``W2``, ``inputLayerSize``,
    ``hiddenLayerSize``, ``outputLayerSize``, ``costFunction(X, y)`` and
    ``computeGradients(X, y)``.
    """

    def __init__(self, N):
        # Make Local reference to network:
        self.N = N

    def train(self, X, y, lr: float = 0.06, maxiter: int = 300000,
              tol: float = 1e-5, print_every: int = 1000):
        """
        Run gradient descent until the loss drops to ``tol`` or ``maxiter`` steps are taken.

        The network's W1 and W2 are updated in place. The loss returned by
        ``costFunction`` after every step is appended to ``self.J``.

        Parameters
        ----------
        X : ndarray
            Input data.
        y : ndarray
            Target data.
        lr : float
            Learning rate (step size) applied to the gradient.
        maxiter : int
            Maximum number of gradient steps.
        tol : float
            Training stops once the total loss is no longer above this value.
        print_every : int
            Print the loss every this many iterations; 0 disables progress output.
        """
        #Make an internal variable for the callback function:
        self.X = X
        self.y = y

        #Make empty list to store costs:
        self.J = []

        net = self.N
        # The layout of the unrolled gradient vector is fixed for the whole
        # run, so work it out once instead of on every iteration.
        w1_shape = (net.inputLayerSize, net.hiddenLayerSize)
        w2_shape = (net.hiddenLayerSize, net.outputLayerSize)
        n_w1 = net.inputLayerSize * net.hiddenLayerSize

        loss = net.costFunction(X, y)
        # costFunction returns an array; collapse it to one Python float so the
        # stopping test and the %f formatting never depend on implicit
        # array -> scalar conversion (and also work with several outputs).
        total = float(np.sum(loss))
        k = 0

        while (total > tol) and (k < maxiter):
            grad = net.computeGradients(X, y)
            net.W1 -= lr * grad[:n_w1].reshape(w1_shape)
            net.W2 -= lr * grad[n_w1:].reshape(w2_shape)
            loss = net.costFunction(X, y)
            total = float(np.sum(loss))
            self.J.append(loss)
            k += 1
            if print_every and k % print_every == 0:
                print("Iteration %d: loss = %f" % (k, total))

        if total <= tol:
            print("Operation terminated successfully.")
        else:
            # Stopped on the iteration cap (or a NaN loss), not on convergence.
            print("Warning: Maximum number of iterations has been exceeded.")
        print("         Iterations: %d" % k)
        print("         Final loss function: %f" % total)

In [6]:
# Build a network with the default layer sizes, wrap it in a trainer,
# and fit it to the scaled data.
NN = Neural_Network()
T = Trainer(N=NN)
T.train(X, y)

Iteration 1000: loss = 0.005854
Iteration 2000: loss = 0.003374
Iteration 3000: loss = 0.002033
Iteration 4000: loss = 0.001353
Iteration 5000: loss = 0.001015
Iteration 6000: loss = 0.000843
Iteration 7000: loss = 0.000748
Iteration 8000: loss = 0.000689
Iteration 9000: loss = 0.000647
Iteration 10000: loss = 0.000614
Iteration 11000: loss = 0.000586
Iteration 12000: loss = 0.000560
Iteration 13000: loss = 0.000537
Iteration 14000: loss = 0.000516
Iteration 15000: loss = 0.000496
Iteration 16000: loss = 0.000477
Iteration 17000: loss = 0.000460
Iteration 18000: loss = 0.000443
Iteration 19000: loss = 0.000428
Iteration 20000: loss = 0.000413
Iteration 21000: loss = 0.000399
Iteration 22000: loss = 0.000385
Iteration 23000: loss = 0.000373
Iteration 24000: loss = 0.000361
Iteration 25000: loss = 0.000349
Iteration 26000: loss = 0.000339
Iteration 27000: loss = 0.000328
Iteration 28000: loss = 0.000318
Iteration 29000: loss = 0.000309
Iteration 30000: loss = 0.000300
Iteration 31000: lo

### Predicted Train values vs Actual Train values

In [7]:
# Show the trained network's predictions for the training inputs next to the scaled targets.
NN.forward(X), y

(array([[0.74998366],
        [0.82211746],
        [0.92606096]]),
 array([[0.75],
        [0.82],
        [0.93]]))