In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

Linear regression and logistic regression can each be viewed as the simplest kind of neural network: one with no hidden layer.

The basic steps are similar: **feedforward**, **compute cost**, **backpropagate**, **update weights**.

Neural network units can act as logic gates, and with billions of logic gates we can build a computer.

| x1| x2|and|or |nand|nor|xor|
|:--:|:-:|:-:|:-:|:-:|:-:|:-:|
| 0 | 0 | 0 | 0 | 1 | 1 | 0 |  
| 0 | 1 | 0 | 1 | 1 | 0 | 1 | 
| 1 | 0 | 0 | 1 | 1 | 0 | 1 | 
| 1 | 1 | 1 | 1 | 0 | 0 | 0 | 

## Neural network

Build logic gates with a neural network (using the sigmoid as the activation function).

### Step 1: single unit

### 1. Feedforward
\begin{equation}
\mathbf{z} = XW
\end{equation}

\begin{equation}
\mathbf{\hat{y}} = \sigma(\mathbf{z})
\end{equation}

\begin{equation}
\sigma(z) = \frac{1}{1+e^{-z}}
\end{equation}

### 2. Compute cost function
\begin{equation}
L(\mathbf{y}, \mathbf{\hat{y}}) = -\frac{1}{m}\big(\mathbf{y}^T\log(\mathbf{\hat{y}})+(1-\mathbf{y})^T\log(1-\mathbf{\hat{y}})\big)
\end{equation}

### 3. Backpropagation
\begin{equation}
\frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial W} 
= \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial \mathbf{\hat{y}}} \cdot \frac{\partial \mathbf{\hat{y}}}{\partial W} 
= \frac{1}{m}X^T(\mathbf{\hat{y}} - \mathbf{y})
\end{equation}

### 4. Gradient descent
\begin{equation}
W = W - \alpha  \frac{\partial L(\mathbf{y}, \mathbf{\hat{y}})}{\partial W}
\end{equation}

In [None]:
class NeuralNetwork:
    """Single sigmoid unit (logistic regression) trained by batch gradient descent.

    There is no hidden layer: the prediction is ``sigmoid(X.dot(W))`` where ``X``
    is the training data with a leading column of ones for the bias term.

    Attributes set by ``fit``
    -------------------------
    weights_ : ndarray, shape (n_features + 1, n_outputs)
        Learned coefficients; row 0 multiplies the bias column.
    costs_ : ndarray, shape (iterations,)
        Cross-entropy loss recorded at every iteration.
    output : ndarray, shape (n_samples, n_outputs)
        Predictions on the training data for the final ``weights_``.
    """

    # Predictions are clipped to [_EPS, 1 - _EPS] before taking logarithms so a
    # saturated sigmoid (exactly 0.0 or 1.0) cannot turn the loss into nan/inf.
    _EPS = 1e-12

    def __init__(self):
        pass

    def _init_params(self, x, y, iterations, learning_rate):
        """ Initialize parameters.

        Stores the bias-augmented design matrix, the targets and the learning
        rate, then draws random starting weights and allocates the cost history.

        ----------
        x : ndarray, shape (n_samples, n_features)
            Training data
        y : ndarray, shape (n_samples,) or (n_samples, n_outputs)
            Target data; a 1-D vector is treated as a single output column
        iterations : int
            Number of gradient-descent steps (length of ``costs_``)
        learning_rate : float
            Step size used by ``_backprop``
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if y.ndim == 1:
            # Promote a flat target vector to a column so that shape[1] exists
            # and the subtraction in _backprop does not broadcast to a matrix.
            y = y.reshape(-1, 1)

        # Prepend a column of ones so the bias is learned as weights_[0].
        self._X = np.hstack([np.ones((x.shape[0], 1)), x])
        self._y = y
        self._learning_rate = learning_rate

        self.weights_ = np.random.rand(self._X.shape[1], self._y.shape[1])
        self.costs_ = np.zeros(iterations)

    def _sigmoid(self, x):
        """ Computes the element-wise sigmoid 1 / (1 + exp(-x)). """
        return 1.0/(1 + np.exp(-x))

    def _feedforward(self):
        """ Computes sigmoid(np.dot(X, W)) and stores it in ``self.output``. """
        self.output = self._sigmoid(self._X.dot(self.weights_))

    def _backprop(self):
        """ Update weights with one gradient-descent step.

        Uses the cross-entropy gradient X^T (y_hat - y) / m, where ``y_hat`` is
        the ``self.output`` produced by the latest ``_feedforward`` call.
        """
        m = self._y.shape[0]
        gradient = self._X.T.dot(self.output - self._y) / m
        self.weights_ -= self._learning_rate * gradient

    def _get_cost(self):
        """ Compute the mean cross-entropy loss of ``self.output`` as a float.

        The loss is summed over all samples (and output columns) and divided by
        the number of samples, so the result is always a scalar that can be
        stored directly in ``costs_[i]``.
        """
        m = self._y.shape[0]
        y_hat = np.clip(self.output, self._EPS, 1 - self._EPS)
        log_likelihood = self._y * np.log(y_hat) + (1 - self._y) * np.log(1 - y_hat)
        return float(-np.sum(log_likelihood) / m)

    def fit(self, x, y, iterations=1000, learning_rate=0.5):
        """ Fit model.

        ----------
        x : ndarray, shape (n_samples, n_features)
            Training data
        y : ndarray, shape (n_samples,) or (n_samples, n_outputs)
            Target data
        iterations : int
            Number of gradient-descent steps
        learning_rate : float
            Gradient-descent step size

        Returns
        -------
        self : NeuralNetwork
            The fitted model, so calls can be chained.
        """
        self._init_params(x, y, iterations, learning_rate)

        # train model
        for i in range(iterations):
            self._feedforward()
            # The recorded cost belongs to the weights used for this forward
            # pass, i.e. the weights before this iteration's update.
            self.costs_[i] = self._get_cost()
            self._backprop()

        # Refresh the predictions so that `output` matches the final weights.
        self._feedforward()
        return self

In [None]:
# All four combinations of two binary inputs, one row per sample.
x = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])

# Expected output column of every gate, in the same row order as `x`.
gates = dict(
    AND=np.array([0, 0, 0, 1]).reshape(-1, 1),
    OR=np.array([0, 1, 1, 1]).reshape(-1, 1),
    NAND=np.array([1, 1, 1, 0]).reshape(-1, 1),
    NOR=np.array([1, 0, 0, 0]).reshape(-1, 1),
    XOR=np.array([0, 1, 1, 0]).reshape(-1, 1),
)

# Train a fresh single-unit model per gate and report its predictions and
# the cost recorded at the last iteration.
for key in gates:
    value = gates[key]
    nn = NeuralNetwork()
    model = nn.fit(x, value)
    print(f"{key}:\n", model.output)
    print(f'loss: {model.costs_[-1]:.2f}')

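A neural network without a hidden layer is equivalent to a single logistic regression unit. It is a simple linear classifier, which is why it can learn AND, OR, NAND and NOR but not XOR: XOR is not linearly separable.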

## Neural network

Build logic gates with neural network. 

### Step 2: with 1 hidden layer

### 1. Feedforward
\begin{equation}
z_1 = W_1^TX
\end{equation}

\begin{equation}
a_1 = \sigma(z_1)
\end{equation}

\begin{equation}
z_2 = W_2^Ta_1
\end{equation}

\begin{eqnarray}
\hat{y} &=& \sigma(z_2) \\
&=& \sigma(W_2^T\sigma(W_1^TX))
\end{eqnarray}

\begin{equation}
\sigma(z) = \frac{1}{1+e^{-z}}
\end{equation}

### 2. Compute cost function
\begin{equation}
Loss(y, \hat{y}) = \frac{1}{2n}(\hat{y} - y)^T \cdot (\hat{y} - y)
\end{equation}

### 3. Backpropagation
\begin{eqnarray}
\frac{\delta Loss(y, \hat{y})}{\delta W_2} &=& \frac{\delta Loss(y, \hat{y})}{\delta \hat{y}}\cdot\frac{\delta \hat{y}}{\delta z_2}\cdot\frac{\delta z_2}{\delta W_2} \\
&=& \frac{1}{n}(\hat{y}-y)\cdot \hat{y}(1-\hat{y})\cdot a_1
\end{eqnarray}

\begin{eqnarray}
\frac{\delta Loss(y, \hat{y})}{\delta W_1} &=& \frac{\delta Loss(y, \hat{y})}{\delta \hat{y}}\cdot\frac{\delta \hat{y}}{\delta z_2}\cdot\frac{\delta z_2}{\delta a_1}\cdot\frac{\delta a_1}{\delta z_1}\cdot\frac{\delta z_1}{\delta W_1} \\
&=& \frac{1}{n}(\hat{y}-y)\cdot \hat{y}(1-\hat{y})\cdot W_2 \cdot a_1(1-a_1)\cdot X
\end{eqnarray}

### 4. Gradient descent
\begin{equation}
W_1 = W_1 - \alpha  \frac{\delta Loss(y, \hat{y})}{\delta W_1}
\end{equation}

\begin{equation}
W_2 = W_2 - \alpha  \frac{\delta Loss(y, \hat{y})}{\delta W_2}
\end{equation}

In [None]:
class NeuralNetwork:
    """ Fully connected sigmoid network trained by batch gradient descent.

    ``neurons`` lists the layer sizes from input to output, e.g. ``[2, 2, 1]``
    for two inputs, one hidden layer of two units and one output. Any number
    of hidden layers is supported. The loss being minimised is the halved mean
    squared error ``1/(2m) * sum((y_hat - y)**2)`` computed by ``_get_cost``.

    Internally samples are stored column-wise: activations have shape
    (units, n_samples) and each weight matrix has shape (next, previous + 1),
    with column 0 multiplying the bias row.

    Attributes set by ``fit``
    -------------------------
    costs_ : ndarray, shape (iterations,)
        Loss recorded at every iteration.
    output : ndarray, shape (n_samples, n_outputs)
        Predictions on the training data for the final weights.
    """
    def __init__(self, neurons):
        if len(neurons) < 2:
            raise ValueError("neurons must contain at least an input and an output layer size")
        self._layers = len(neurons)
        # One matrix per pair of consecutive layers; "+1" makes room for the bias.
        self._weights = [np.random.rand(nex, pre+1) for pre, nex in zip(neurons[:-1], neurons[1:])]

    def _init_params(self, x, y, iterations, learning_rate):
        """ Initialize parameters.

        Stores the transposed data (features x samples), the learning rate and
        an empty cost history. A 1-D ``y`` is treated as one output column.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if y.ndim == 1:
            # Without this, y.T stays 1-D and _get_cost cannot read shape[1].
            y = y.reshape(-1, 1)
        self._x = x.T
        self._y = y.T
        self._learning_rate = learning_rate
        self.costs_ = np.zeros(iterations)

    def _sigmoid(self, x):
        """ Computes the element-wise sigmoid 1 / (1 + exp(-x)). """
        return 1.0/(1 + np.exp(-x))

    def _feedforward(self, a):
        """ Propagate ``a`` (features x samples) through every layer.

        Returns a list ``a_s`` with one entry per layer: ``a_s[k]`` for every
        layer but the last is the input of weight matrix ``k`` with the bias
        row of ones prepended; ``a_s[-1]`` is the network output (no bias row).
        """
        a_s = []
        for w in self._weights:
            a = np.vstack([np.ones((1, a.shape[1])), a])
            a_s.append(a)
            z = w.dot(a)
            a = self._sigmoid(z)
        a_s.append(a)
        return a_s

    def _backprop(self, a_s):
        """ Update weights with one gradient-descent step on the MSE loss.

        ``a_s`` is the list returned by ``_feedforward``. The gradients are the
        exact derivatives of ``_get_cost`` with respect to each weight matrix.
        """
        m = self._y.shape[1]
        delta_weights = [np.zeros(w.shape) for w in self._weights]
        y_hat = a_s[-1]
        # Output layer: dLoss/dz = (y_hat - y) * sigmoid'(z) / m, where
        # sigmoid'(z) = y_hat * (1 - y_hat).
        delta = (y_hat - self._y) * y_hat * (1 - y_hat) / m
        delta_weights[-1] = delta.dot(a_s[-2].T)
        # Hidden layers, from the last one back to the first.
        for L in range(2, self._layers):
            # Push the error back through the weights, scale by the sigmoid
            # derivative and drop row 0: the bias unit has no incoming weights.
            delta = (self._weights[-L+1].T.dot(delta)*a_s[-L]*(1-a_s[-L]))[1:]
            delta_weights[-L] = delta.dot(a_s[-L-1].T)
        # Update all weights
        self._weights = [w - self._learning_rate * dw for w, dw in zip(self._weights, delta_weights)]

    def _get_cost(self, a3):
        """ Compute the halved mean squared error between ``a3`` and the targets. """
        m = self._y.shape[1]
        return 0.5/m * np.sum((self._y - a3)**2)

    def fit(self, x, y, iterations=1000, learning_rate=0.5):
        """ Fit model.

        ----------
        x : ndarray, shape (n_samples, n_features)
            Training data
        y : ndarray, shape (n_samples,) or (n_samples, n_outputs)
            Target data
        iterations : int
            Number of gradient-descent steps
        learning_rate : float
            Gradient-descent step size

        Returns
        -------
        self : NeuralNetwork
            The fitted model, so calls can be chained.
        """
        self._init_params(x, y, iterations, learning_rate)

        # train model
        for i in range(iterations):
            a_s = self._feedforward(self._x)
            # The recorded cost belongs to the weights used for this forward
            # pass, i.e. the weights before this iteration's update.
            self.costs_[i] = self._get_cost(a_s[-1])
            self._backprop(a_s)

        # Final forward pass so that `output` matches the trained weights.
        self.output = self._feedforward(self._x)[-1].T
        return self

In [None]:
hidden_neurons = 2

# input vector: all four combinations of two binary inputs, one row per sample
x = np.array([[0,0], [0,1], [1,0], [1,1]])
# logic gates: expected output column of each gate, in the same row order as x
gates = {
    'AND':  np.array([[0],[0],[0],[1]]),
    'OR':   np.array([[0],[1],[1],[1]]),
    'NAND': np.array([[1],[1],[1],[0]]),
    'NOR':  np.array([[1],[0],[0],[0]]),
    'XOR':  np.array([[0],[1],[1],[0]])
}

# Keep every trained model by gate name so the plot below can pick one
# explicitly instead of relying on whatever `model` the loop finished with.
models = {}
for key,value in gates.items():
    x_features = x.shape[1]
    y_features = value.shape[1]
    nn = NeuralNetwork([x_features, hidden_neurons, y_features])
    model = nn.fit(x, value)
    models[key] = model
    print(f"{key}:\n", model.output)
    print(f'loss: {model.costs_[-1]:.2f}')

# Learning curve of the XOR model (the gate that was trained last above).
plotted_gate = 'XOR'
fig, ax = plt.subplots(figsize=(10,5))
ax.grid(color='#b7b7b7', linestyle='-', linewidth=0.5, alpha=0.5)
ax.plot(models[plotted_gate].costs_, color='#212121')
ax.set(
    title=f'{plotted_gate} gate: training cost ({hidden_neurons} hidden neurons)',
    xlabel='Iteration',
    ylabel='Cost',
)
plt.show()