# Extending Gradient Descent
The multilayer perceptron and backpropagation covered in this notebook both build on gradient descent for neural networks.

### Multilayer Perceptron

In [4]:
import numpy as np

def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + e^(-x)), evaluated element-wise via np.exp.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Network size
N_input = 4
N_hidden = 3
N_output = 2

np.random.seed(42)
# Make some fake data: a single sample with one value per input unit.
# Sized by N_input (rather than a literal 4) so it always matches the weights.
X = np.random.randn(N_input)

# One row per sending unit, one column per receiving unit
weights_input_to_hidden = np.random.normal(0, scale=0.1, size=(N_input, N_hidden))
weights_hidden_to_output = np.random.normal(0, scale=0.1, size=(N_hidden, N_output))


# Forward pass through the network

# Weighted sum of the inputs for every hidden unit, then the activation
hidden_layer_in = np.dot(X, weights_input_to_hidden)
hidden_layer_out = sigmoid(hidden_layer_in)

print('Hidden-layer Output:')
print(hidden_layer_out)

# The hidden activations are the inputs of the output layer
output_layer_in = np.dot(hidden_layer_out, weights_hidden_to_output)
output_layer_out = sigmoid(output_layer_in)

print('Output-layer Output:')
print(output_layer_out)

Hidden-layer Output:
[0.41492192 0.42604313 0.5002434 ]
Output-layer Output:
[0.49815196 0.48539772]


#### Let's have a look at how the np.dot() thing works

Each perceptron (3 in total) receives every input (there are 4 inputs). So we need four weights per perceptron (one for every input to the perceptron). Each column in the matrix is one perceptron of the (first and only) hidden layer.

In [16]:
# Display the weight matrix: one row per input, one column per hidden unit.
weights_input_to_hidden

array([[-0.02341534, -0.0234137 ,  0.15792128],
       [ 0.07674347, -0.04694744,  0.054256  ],
       [-0.04634177, -0.04657298,  0.02419623],
       [-0.19132802, -0.17249178, -0.05622875]])

In [20]:
# Check the matrix dimensions, created as (N_input, N_hidden).
weights_input_to_hidden.shape

(4, 3)

We have four input features.

In [17]:
# Display the input vector that was drawn with np.random.randn.
X

array([ 0.49671415, -0.1382643 ,  0.64768854,  1.52302986])

In [19]:
# X is one-dimensional: a flat vector without an explicit row/column axis.
X.shape

(4,)

In [15]:
# Indexing with None appends a new axis, turning X into a column vector.
X[:,None]

array([[ 0.49671415],
       [-0.1382643 ],
       [ 0.64768854],
       [ 1.52302986]])

In [21]:
# The column vector has one row per element of X and a single column.
X[:,None].shape

(4, 1)

The signals arrive at the perceptrons. They are modulated by the weights and then summed up. This is what the numpy dot product of two arrays is for (np.dot()).

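But let's take a closer look at what is happening under the hood. Each input has to scale its own row of the weights matrix, so we multiply the weights matrix element-wise with the features arranged as a column vector. So far our features are only present as a flat (row-like) vector. We can manually translate them to a column vector.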

In [23]:
# Broadcasting X as a column scales row i of the weight matrix by input i.
weighted_X = np.multiply(weights_input_to_hidden, X[:, np.newaxis])
weighted_X

array([[-0.01163073, -0.01162991,  0.07844174],
       [-0.01061088,  0.00649115, -0.00750167],
       [-0.03001503, -0.03016478,  0.01567162],
       [-0.29139829, -0.26271014, -0.08563807]])

Finally we sum up each column (i.e. we sum along axis 0), which yields one value per perceptron.

In [24]:
# Sum down each column (axis 0): one total weighted input per hidden unit.
weighted_X.sum(axis=0)

array([-0.34365494, -0.29801368,  0.00097362])

This can be done more easily with the numpy dot product.

In [18]:
# np.dot performs the multiply-and-sum over the inputs in a single call.
hidden_layer_in = np.dot(X, weights_input_to_hidden)
hidden_layer_in

array([-0.34365494, -0.29801368,  0.00097362])

### Multilayer Perceptron with Backpropagation

In [1]:
import numpy as np


def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + e^(-x)), evaluated element-wise via np.exp.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator


# One training example, its target value and the step size
x = np.array([0.5, 0.1, -0.2])
target = 0.6
learnrate = 0.5

# One row per input, one column per hidden unit
weights_input_hidden = np.array([[0.5, -0.6],
                                 [0.1, -0.2],
                                 [0.1, 0.7]])

# One weight per hidden unit feeding the single output unit
weights_hidden_output = np.array([0.1, -0.3])

## Forward pass
hidden_layer_input = np.dot(x, weights_input_hidden)
hidden_layer_output = sigmoid(hidden_layer_input)

output_layer_in = np.dot(hidden_layer_output, weights_hidden_output)
output = sigmoid(output_layer_in)

## Backwards pass
# Output error: how far the prediction is from the target
error = target - output

# Error term of the output unit: error times the sigmoid slope output * (1 - output)
output_error_term = error * output * (1 - output)

# Error term of the hidden layer: the output error term sent back through the
# hidden-to-output weights, times each hidden unit's sigmoid slope
hidden_error_term = weights_hidden_output * output_error_term * \
                    hidden_layer_output * (1 - hidden_layer_output)

# Weight step for hidden layer -> output layer
delta_w_h_o = learnrate * output_error_term * hidden_layer_output

# Weight step for input layer -> hidden layer; x as a column gives a step
# with the same shape as weights_input_hidden
delta_w_i_h = learnrate * hidden_error_term * x[:, np.newaxis]

print('Change in weights for hidden layer to output layer:', delta_w_h_o, sep='\n')
print('Change in weights for input layer to hidden layer:', delta_w_i_h, sep='\n')

Change in weights for hidden layer to output layer:
[0.00804047 0.00555918]
Change in weights for input layer to hidden layer:
[[ 1.77005547e-04 -5.11178506e-04]
 [ 3.54011093e-05 -1.02235701e-04]
 [-7.08022187e-05  2.04471402e-04]]


### Full implementation of a Neural Network with Forward Pass and Backpropagation

Data preparation

In [3]:
import numpy as np
import pandas as pd

admissions = pd.read_csv('../data/backpropagation.csv')

# Make dummy variables for rank.
# The dtype is pinned to uint8 because newer pandas versions emit boolean
# dummies by default; mixed with float columns these turn `features.values`
# into an object array that NumPy ufuncs such as np.exp cannot process.
rank_dummies = pd.get_dummies(admissions['rank'], prefix='rank', dtype=np.uint8)
data = pd.concat([admissions, rank_dummies], axis=1)
data = data.drop('rank', axis=1)

# Standardize features (subtract the mean, divide by the standard deviation).
# Assigning the whole column, rather than writing through .loc[:, field],
# lets a column that was read as integers be replaced by float values.
for field in ['gre', 'gpa']:
    mean, std = data[field].mean(), data[field].std()
    data[field] = (data[field] - mean) / std

# Split off random 10% of the data for testing
np.random.seed(21)
sample = np.random.choice(data.index, size=int(len(data)*0.9), replace=False)
# `sample` holds index labels, so select the training rows with the
# label-based .loc, consistent with the label-based drop for the test rows.
data, test_data = data.loc[sample], data.drop(sample)

# Split into features and targets
features, targets = data.drop('admit', axis=1), data['admit']
features_test, targets_test = test_data.drop('admit', axis=1), test_data['admit']

Implementation of the algorithm

In [None]:
import numpy as np
from data_prep import features, targets, features_test, targets_test

# Seed NumPy's global RNG so the random weight initialisation in this cell is reproducible.
np.random.seed(21)

def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + e^(-x)), evaluated element-wise via np.exp.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator


# Hyperparameters
n_hidden = 2  # number of hidden units
epochs = 900
learnrate = 0.005

# Work on plain float arrays so the NumPy math below does not depend on the
# column dtypes of the incoming frames (e.g. boolean dummy columns).
train_inputs = np.asarray(features, dtype=float)
test_inputs = np.asarray(features_test, dtype=float)

n_records, n_features = train_inputs.shape
last_loss = None
# Initialize weights
weights_input_hidden = np.random.normal(scale=1 / n_features ** .5,
                                        size=(n_features, n_hidden))
weights_hidden_output = np.random.normal(scale=1 / n_features ** .5,
                                         size=n_hidden)

# Report the training loss ten times per run; max() keeps the interval
# valid (every epoch) when fewer than ten epochs are requested.
report_every = max(1, epochs // 10)

for e in range(epochs):
    # Weight steps accumulated over all records of this epoch
    del_w_input_hidden = np.zeros(weights_input_hidden.shape)
    del_w_hidden_output = np.zeros(weights_hidden_output.shape)
    for x, y in zip(train_inputs, targets):
        ## Forward pass ##
        # Calculate the output for this single record
        hidden_input = np.dot(x, weights_input_hidden)
        hidden_output = sigmoid(hidden_input)
        output = sigmoid(np.dot(hidden_output, weights_hidden_output))

        ## Backward pass ##
        # The network's prediction error
        error = y - output

        # Error term for the output unit (error times the sigmoid slope)
        output_error_term = error * output * (1 - output)

        ## propagate errors to hidden layer

        # The hidden layer's contribution to the error
        hidden_error = np.dot(output_error_term, weights_hidden_output)

        # The error term for the hidden layer
        hidden_error_term = hidden_error * hidden_output * (1 - hidden_output)

        # Accumulate the change in weights; x as a column gives a step with
        # the same shape as weights_input_hidden
        del_w_hidden_output += output_error_term * hidden_output
        del_w_input_hidden += hidden_error_term * x[:, None]

    # Update weights with the step averaged over all records
    weights_input_hidden += learnrate * del_w_input_hidden / n_records
    weights_hidden_output += learnrate * del_w_hidden_output / n_records

    # Printing out the mean square error on the training set
    if e % report_every == 0:
        # Evaluate on the full training set, not just the last record seen
        hidden_output = sigmoid(np.dot(train_inputs, weights_input_hidden))
        out = sigmoid(np.dot(hidden_output,
                             weights_hidden_output))
        loss = np.mean((out - targets) ** 2)

        # Compare against None explicitly so a previous loss of 0.0 still counts
        if last_loss is not None and last_loss < loss:
            print("Train loss: ", loss, "  WARNING - Loss Increasing")
        else:
            print("Train loss: ", loss)
        last_loss = loss

# Calculate accuracy on test data
hidden = sigmoid(np.dot(test_inputs, weights_input_hidden))
out = sigmoid(np.dot(hidden, weights_hidden_output))
predictions = out > 0.5
accuracy = np.mean(predictions == targets_test)
print("Prediction accuracy: {:.3f}".format(accuracy))
