# Multilayer Perceptron

Our goal here is to implement a two-layer neural network for binary classification, train it using gradient descent and use it to classify the Iris dataset.
Our model is
$$
\hat{y} = \sigma \left ( w^\top \sigma \left ( V x \right ) \right)
$$
where we have $M$ hidden units and $D$ input features -- that is $w \in \mathbb{R}^{M}$, and $V \in \mathbb{R}^{M \times D}$. For simplicity here we do not include a bias parameter for each layer. Key to our implementation is the gradient calculation. We follow the notation used in the slides here.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
import warnings
warnings.filterwarnings('ignore')

In [6]:
def logistic(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), applied elementwise."""
    return 1 / (1 + np.exp(-z))


class MLP:
    """Two-layer perceptron for binary classification, without bias terms.

    The forward pass is ``yh = logistic(logistic(x @ v) @ w)`` where, for D
    input features and M hidden units, ``v`` has shape (D, M) and ``w`` has
    shape (M,). Samples are stored as rows of ``x``, so ``v`` is the transpose
    of the matrix V in the column-vector form ``sigma(w^T sigma(V x))``.
    """

    def __init__(self, M=64):
        """Create an untrained model with ``M`` hidden units."""
        self.M = M

    def fit(self, x, y, optimizer):
        """Train the network and store the result in ``self.parameters``.

        Parameters
        ----------
        x : array of shape (N, D)
            One sample per row.
        y : array of shape (N,)
            Binary targets (0 or 1).
        optimizer : object
            Must provide ``run(gradient_fn, x, y, params)`` and return the
            optimized parameter list in the same ``[v, w]`` order.

        Returns
        -------
        self
        """
        _, D = x.shape  # also rejects inputs that are not 2-D

        def gradient(x, y, parameters):
            """Return ``[dL/dv, dL/dw]`` of the mean cross-entropy loss."""
            v, w = parameters
            n = x.shape[0]  # taken from the batch so mini-batches also work
            z = logistic(np.dot(x, v))    # hidden activations, (n, M)
            yh = logistic(np.dot(z, w))   # predicted probabilities, (n,)
            # dL/dyh * dyh/du collapses to (yh - y); using the closed form
            # avoids dividing by yh or (1 - yh) when the output saturates.
            dLdu = yh - y                                # (n,)
            dLdw = np.dot(z.T, dLdu) / n                 # (M,)
            dLdz = np.outer(dLdu, w)                     # (n, M)
            dLdv = np.dot(x.T, dLdz * z * (1 - z)) / n   # (D, M)
            # Same order as the parameter list so an optimizer can pair
            # parameters[i] with its own gradient.
            return [dLdv, dLdw]

        # Small random initial weights.
        w = np.random.randn(self.M) * 0.01
        v = np.random.randn(D, self.M) * 0.01

        self.parameters = optimizer.run(gradient, x, y, [v, w])
        return self

    def predict(self, x):
        """Return the predicted probability of class 1.

        ``x`` may be an (N, D) array (returns shape (N,)) or a single sample
        of shape (D,) (returns a scalar). Requires a prior call to ``fit``.
        """
        v, w = self.parameters
        z = logistic(np.dot(x, v))
        yh = logistic(np.dot(z, w))
        return yh
            

In [None]:
class GradientDescent:
    
    def __init__(self, learning_rate=0.001, max_iters=1e4, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.max_iters = max_iters
        self.epsilon = epsilon
        
    def run(self, gradient_fn, x, y, params):
        norms = np.array([np.inf])
        t = 1
        while np.any(norms > self.epsilon) and t < self.max_iters:
            grad = gradient_fn(x, y, params)
            for p in range(len(params)):
                params[p] -= self.learning_rate * grad[p]
            t += 1
            norms = np.array([])