# Appendix

## Cross entropy loss

In [None]:
def cross_entropy(prediction, Y, bprop=False, eps=1e-7):
    """Element-wise binary cross-entropy loss, or its gradient.

    Args:
        prediction: Predicted probabilities (NumPy array or scalar),
            expected to lie in [0, 1].
        Y: Target values, broadcastable against ``prediction``.
        bprop: When True, return the derivative of the loss with respect
            to ``prediction`` instead of the loss itself.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before use.

    Returns:
        ``-Y * log(p) - (1 - Y) * log(1 - p)`` element-wise, or
        ``-Y / p + (1 - Y) / (1 - p)`` when ``bprop`` is True, where ``p``
        is the clipped prediction.
    """
    # A saturated output (exactly 0 or 1) would otherwise hit log(0) or a
    # division by zero and inject inf/nan values into training.
    clipped = np.clip(prediction, eps, 1.0 - eps)
    if bprop:
        return -1.0 * Y / clipped + (1.0 - Y) / (1.0 - clipped)
    return -1.0 * Y * np.log(clipped) - (1.0 - Y) * np.log(1.0 - clipped)

In [None]:
# Network with one 50-unit sigmoid hidden layer, trained on the cross-entropy loss.
nn_mnist_lr = NeuralNetworkActivation(
    hidden_neurons=[50],
    outputs=10,
    loss_function=cross_entropy,
    learning_rate=0.1,
    learning_rate_layer_decay=1,
    momentum=0.1,
    dropout=0.8,
    activation_function=sigmoid,
)

In [None]:
# Train for a single epoch, then score the network on the held-out test set.
train_learning_rate_decay(
    nn_mnist_lr,
    X_train,
    Y_train,
    epochs=1,
    print_msg=True,
    learning_rate_anneal=1,
)
accuracy = net_accuracy(nn_mnist_lr, X_test, Y_test, predict=True)

In [None]:
# Same architecture as the cross-entropy run, but trained on mean squared error.
nn_mnist_lr = NeuralNetworkActivation(
    hidden_neurons=[50],
    outputs=10,
    loss_function=mean_square_error,
    learning_rate=0.1,
    learning_rate_layer_decay=1,
    momentum=0.1,
    dropout=0.8,
    activation_function=sigmoid,
)

In [None]:
# One training epoch followed by a test-set accuracy measurement.
train_learning_rate_decay(
    nn_mnist_lr,
    X_train,
    Y_train,
    epochs=1,
    print_msg=True,
    learning_rate_anneal=1,
)
accuracy = net_accuracy(nn_mnist_lr, X_test, Y_test, predict=True)

In [None]:
# Leaky-ReLU hidden layer with the cross-entropy loss; dropout=0 is falsy,
# so the dropout branch of the forward pass is skipped.
nn_mnist_lr = NeuralNetworkActivation(
    hidden_neurons=[50],
    outputs=10,
    loss_function=cross_entropy,
    learning_rate=0.1,
    learning_rate_layer_decay=1,
    momentum=0.1,
    dropout=0,
    activation_function=leaky_relu,
)

In [None]:
# Fit for one epoch and measure accuracy on the test split.
train_learning_rate_decay(
    nn_mnist_lr,
    X_train,
    Y_train,
    epochs=1,
    print_msg=True,
    learning_rate_anneal=1,
)
accuracy = net_accuracy(nn_mnist_lr, X_test, Y_test, predict=True)

# Appendix

# Learning rate decay

We may want to decay the learning rate over time, so that the net takes large steps early in training and smaller, more careful steps later on. See [the CS231n notes on annealing the learning rate](http://cs231n.github.io/neural-networks-3/#anneal) for an example.

Here's how we might implement this:

In [None]:
def train_learning_rate_decay(net, X_train, Y_train, epochs=5, print_msg=True,
                              learning_rate_anneal=2.0):
    """Train ``net`` with a learning rate that changes linearly per epoch.

    The schedule is ``epochs`` evenly spaced values running from
    ``net.learning_rate * learning_rate_anneal`` to
    ``net.learning_rate / learning_rate_anneal``. The data is shuffled once,
    before the first epoch.

    Args:
        net: Network to train; its ``learning_rate`` attribute is overwritten
            each epoch and is left at the last scheduled value.
        X_train: Training inputs, passed to ``shuffle_data`` and ``one_epoch``.
        Y_train: Training targets, passed to ``shuffle_data`` and ``one_epoch``.
        epochs: Number of passes over the data.
        print_msg: When True, print the rate before and a message after
            every epoch.
        learning_rate_anneal: Factor by which the starting rate is multiplied
            and the final rate is divided.
    """
    X_train, Y_train = shuffle_data(X_train, Y_train)

    start_rate = net.learning_rate * learning_rate_anneal
    final_rate = net.learning_rate / learning_rate_anneal
    schedule = np.linspace(start_rate, final_rate, epochs)

    for epoch_number, rate in enumerate(schedule, start=1):
        net.learning_rate = rate
        if print_msg:
            print("Learning rate on epoch", epoch_number,
                  "is", net.learning_rate)
        one_epoch(net, X_train, Y_train)
        if print_msg:
            print("Done with epoch", epoch_number)

## Testing learning rate decay

In [None]:
# Two hidden layers (75 and 25 units) trained on mean squared error.
nn_mnist_lr = NeuralNetworkLR(
    hidden_neurons=[75, 25],
    outputs=10,
    loss_function=mean_square_error,
    learning_rate=0.3,
)

In [None]:
# Only run the five-epoch training when the train_all flag is set.
if train_all:
    train_learning_rate_decay(
        nn_mnist_lr,
        X_train,
        Y_train,
        epochs=5,
        print_msg=True,
        learning_rate_anneal=2,
    )

## Evaluating learning rate decay

In [None]:
# Report test-set accuracy for the network trained with learning rate decay.
if train_all:
    accuracy = net_accuracy(nn_mnist_lr, X_test, Y_test)
    # Scale to a percentage before rounding: multiplying an already-rounded
    # float by 100 can reintroduce representation noise in the printed value.
    print("Neural Net MNIST Classification Accuracy:", round(accuracy * 100, 1), "percent")

## Different Activation Functions

In [None]:
def leaky_relu(x, alpha=0.01, bprop=False):
    """Leaky ReLU activation, or its derivative.

    Args:
        x: NumPy array of input values.
        alpha: Slope applied where ``x`` is negative.
        bprop: When True, return the element-wise derivative instead of
            the activation.

    Returns:
        ``max(alpha * x, x)`` element-wise, or, when ``bprop`` is True, an
        array holding 1 where ``x >= 0`` and ``alpha`` elsewhere.
    """
    if not bprop:
        return np.maximum(alpha * x, x)
    # Slope is 1 on the non-negative side (including x == 0), alpha otherwise.
    return np.where(x >= 0, 1, alpha)

In [None]:
def tanh(x, bprop=False):
    """Hyperbolic tangent activation, or its derivative.

    Args:
        x: Input values (NumPy array or scalar). Never modified.
        bprop: When True, return the derivative of tanh evaluated at ``x``
            instead of the activation.

    Returns:
        ``np.tanh(x)``, or ``1 - np.tanh(x) ** 2`` when ``bprop`` is True.
    """
    activation = np.tanh(x)
    if bprop:
        # d/dx tanh(x) = 1 - tanh(x)^2. The previous expression
        # (e^{2x} - 1) / (e^{2x} + 1) is tanh(x) itself, not its slope.
        # np.tanh saturates at +/-1 without overflowing, so the input does
        # not need to be capped (and is therefore not mutated).
        return 1.0 - activation ** 2
    return activation

In [None]:
def cap_sigmoid_input(a, limit=100):
    """Return ``a`` with every value limited to ``[-limit, limit]``.

    Args:
        a: Values to cap (NumPy array, scalar, or array-like).
        limit: Largest magnitude allowed in the result.

    Returns:
        A new array (or NumPy scalar) with the capped values.
    """
    # np.clip returns a fresh array, so the caller's data is no longer
    # overwritten in place, and scalar or list inputs work too.
    return np.clip(a, -limit, limit)

In [None]:
def setup_layers(hidden_neurons, outputs, 
                 learning_rate=1.0, 
                 learning_rate_layer_decay=1.0, 
                 momentum=0.1,
                 activation_function=sigmoid):
    """Build the list of layers for a fully connected network.

    Args:
        hidden_neurons: Sequence giving the neuron count of each hidden layer.
        outputs: Neuron count of the output layer.
        learning_rate: Base learning rate, divided per layer by a power of
            ``learning_rate_layer_decay``.
        learning_rate_layer_decay: Base of the per-layer divisor.
        momentum: Value stored on every layer's ``momentum`` attribute.
        activation_function: Activation used by the hidden layers; the
            output layer always uses ``sigmoid``.

    Returns:
        List of ``FullyConnectedXavier`` layers, hidden layers first and the
        output layer last.
    """
    def build_layer(neurons, activation, decay_exponent):
        # Create one layer and attach its training hyperparameters.
        layer = FullyConnectedXavier(neurons=neurons, activation_function=activation)
        layer.learning_rate = learning_rate / (learning_rate_layer_decay ** decay_exponent)
        layer.momentum = momentum
        return layer

    layers = [build_layer(width, activation_function, depth)
              for depth, width in enumerate(hidden_neurons)]

    # NOTE(review): hidden layers use exponents 0..len(hidden_neurons)-1, yet
    # the output layer uses len(hidden_neurons) + 1, skipping one step. This
    # looks like an off-by-one but is kept as-is here -- confirm the intent.
    layers.append(build_layer(outputs, sigmoid, len(hidden_neurons) + 1))
    return layers

In [None]:
class NeuralNetworkActivation(NeuralNetworkDropout):
    """Dropout network whose hidden layers use a configurable activation.

    All constructor arguments except ``activation_function`` are forwarded
    unchanged to ``NeuralNetworkDropout``. ``activation_function`` is passed
    to ``setup_layers`` when the layers are first built.
    """

    def __init__(self, hidden_neurons, outputs, loss_function, learning_rate, 
                 learning_rate_layer_decay, momentum, dropout, activation_function):
        NeuralNetworkDropout.__init__(self, hidden_neurons, outputs, loss_function, 
                                      learning_rate, learning_rate_layer_decay, momentum, dropout)
        self.activation_function = activation_function

    def forwardpass(self, X, predict=False):
        """Calculate an output Y for the given input X.

        Args:
            X: Input batch, indexed as ``X[:, columns]``. It is not modified.
            predict: When True, dropout is skipped.

        Returns:
            The output of the last layer.
        """
        # Layers are built lazily on the first forward pass.
        if not self.layers_setup:
            self.layers = setup_layers(self.hidden_neurons, 
                                       self.outputs, 
                                       self.learning_rate,
                                       self.learning_rate_layer_decay, 
                                       self.momentum, 
                                       self.activation_function)
            self.layers_setup = True

        X_next = X
        for layer in self.layers:
            if self.dropout and not predict:
                # NOTE(review): indices are drawn from range(layer.n_neurons)
                # but zero columns of the layer's *input*. If n_neurons is the
                # layer's own width, the two sizes need not match -- confirm.
                zero_indices = np.random.choice(range(layer.n_neurons), 
                                                size=int(layer.n_neurons * (1 - self.dropout)), 
                                                replace=False)
                if X_next is X:
                    # Zero the columns on a copy so the caller's batch (often
                    # a view of the training set) is never overwritten.
                    X_next = X.copy()
                X_next[:, zero_indices] = 0.0
            X_next = layer.fprop(X_next)
        prediction = X_next
        return prediction

In [None]:
# Leaky-ReLU hidden layer trained on mean squared error, with dropout=0.8.
nn_mnist_lr = NeuralNetworkActivation(
    hidden_neurons=[50],
    outputs=10,
    loss_function=mean_square_error,
    learning_rate=0.1,
    learning_rate_layer_decay=1,
    momentum=0.1,
    dropout=0.8,
    activation_function=leaky_relu,
)

In [None]:
# Run one epoch of training and compute accuracy on the test data.
train_learning_rate_decay(
    nn_mnist_lr,
    X_train,
    Y_train,
    epochs=1,
    print_msg=True,
    learning_rate_anneal=1,
)
accuracy = net_accuracy(nn_mnist_lr, X_test, Y_test, predict=True)