## Deep Learning

#### Import packages

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split

#### Neural network class

In [11]:

class NeuralNetwork(object):
    def __init__(self, input_size, hidden_size, output_size):        
        #Define Hyperparameters
        self.input_layer_size = input_size
        self.hidden_layer_size = hidden_size
        self.output_layer_size = output_size
        
        # Weights
        self.weight_input_hidden = np.random.randn(self.input_layer_size,self.hidden_layer_size)
        self.weight_hidden_output = np.random.randn(self.hidden_layer_size,self.output_layer_size)

        # Biases
        self.biases_input_hidden = np.zeros((1, self.hidden_layer_size))
        self.biases_hidden_output = np.zeros((1, self.output_layer_size))

    def sigmoid(self, z):
        # Sigmoid activation function
        return 1/(1+np.exp(-z))
    
    def sigmoid_derivative(self, x):
        # Derivative of the sigmoid activation function
        # Assumes x is the output of the sigmoid function
        return x * (1 - x)
        
    def forward_propagation(self, inputs):
        # Propagate inputs through network
        # Takes inputs which should be [ n x input_layer_size]
        self.hidden_input = np.dot(inputs, self.weight_input_hidden) + self.biases_input_hidden
        self.hidden_output = self.sigmoid(self.hidden_input)
        self.output_input = np.dot(self.hidden_output, self.weight_hidden_output) + self.biases_hidden_output
        self.output_output = self.sigmoid(self.output_input) 

        
    def backward_propagation(self, inputs, output, learn_rate):
        #Compute derivative with respect to each weight matrix for a given input output pair
        self.forward_propagation(inputs)
        
        delta2 = np.multiply(-(output-self.output_output), self.sigmoid_derivative(self.output_output))
        d_W2 = np.dot(self.hidden_output.T, delta2)
        d_bias2 = np.sum(delta2, axis=0)
        
        delta1 = np.dot(delta2, self.weight_hidden_output.T)*self.sigmoid_derivative(self.hidden_output)
        d_W1 = np.dot(inputs.T, delta1)
        d_bias1 = np.sum(delta1, axis=0)
            
        self.weight_input_hidden += (learn_rate * d_W1)
        self.biases_input_hidden += (learn_rate * d_bias1)
        self.weight_hidden_output += (learn_rate * d_W2)
        self.biases_hidden_output += (learn_rate * d_bias2)

    def train(self, inputs, outputs, epochs, learn_rate):
        # Train the neural network using gradient descent 
        for epoch in range(epochs):
            self.backward_propagation(inputs, outputs, learn_rate)


    def train_miniBatch(self, inputs, outputs, epochs, learn_rate, batch_size):
        # Determine the number of batches
        num_batches = len(inputs) // batch_size

        for epoch in range(epochs):
            # Shuffle the data at the beginning of each epoch
            indices = np.random.permutation(len(inputs))
            shuffled_inputs = inputs[indices]
            shuffled_outputs = outputs[indices]

            # Loop through each mini-batch
            for batch_id in range(num_batches):
                # Extract the current mini-batch
                start_id = batch_id * batch_size
                end_id = (batch_id + 1) * batch_size
                mini_batch_inputs = shuffled_inputs[start_id:end_id]
                mini_batch_outputs = shuffled_outputs[start_id:end_id]

                # Perform backward propagation on the mini-batch
                self.backward_propagation(mini_batch_inputs, mini_batch_outputs, learn_rate)

    
    def cost(self, inputs, outputs):
        # Calculate the loss function
        self.forward_propagation(inputs)
        return 0.5 * np.mean(np.square(outputs - self.output_output))
    
    def predict(self, inputs):
        # Make predictions for the test set
        self.forward_propagation(inputs)
        return self.output_output

#### Import and process data

Data source: https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset
- instant: record index
- dteday : date
- season : season (1:springer, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : weather day is holiday(1) or not(0) (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week (0-6) (sunday-saturday)
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
+ weathersit : 
    - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
- hum: Normalized humidity. The values are divided to 100 (max)
- windspeed: Normalized wind speed. The values are divided to 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered

This is a regression problem, we will try to use a neural network to establish a relationship between cnt (the number of bikes rented) and the other variables.

In [13]:
df = pd.read_csv('hour.csv')
#df = df.head(10)

# input_data is the input data in the shape (m x n) aka (examples x features) 
# Dropping some irrelevant / ill formatted columns 
input_data = df.drop(columns=['cnt']).drop(columns=['instant']).drop(columns=['dteday']).values

# output_data is true/target values (number of bikes rented)
output_data = df['cnt'].values

# Normalize data
transformer = Normalizer(norm='max').fit(input_data)
input_data = transformer.transform(input_data)

# Normalizing 1D output data
highest_val = output_data.max()
lowest_val = output_data.min()
print("Range of bikes rented: ")
print("max: ", highest_val)
print("min", lowest_val)
output_data = (output_data - lowest_val) / (highest_val - lowest_val)
output_data = output_data.reshape(-1, 1)
# Scaling output so the sigmoid used will actually get the right answer.  
# When we care what the real answer is, just undo the normalization.

# Split dataset into training set and test set
train_input, test_input, train_output, test_output = train_test_split(input_data, output_data, test_size=0.3) # 70% training and 30% test

Range of bikes rented: 
max:  977
min 1


#### Train and test model

In [14]:

# Initialize neural network
input_size = np.shape(input_data)[1] # Each input node is one feature of the data
hidden_size = 8 # Arbitrary number of hidden nodes
output_size = 1 # Looking for an output that a single value
learning_rate = -0.001
epochs = 1000

# Create an instance of the NeuralNetwork class
nn = NeuralNetwork(input_size, hidden_size, output_size)

# Train the neural network
nn.train(train_input, train_output, epochs, learning_rate)

# Calculate the cost after training
training_cost = nn.cost(train_input, train_output)
print("Training cost:", training_cost)

# Make predictions on the test set
predictions = nn.predict(test_input)

# Returning results to original range
predictions_whole = ((predictions) * (highest_val - lowest_val)) + lowest_val
# You cannot rent 1/2 a bike.
predictions_whole = np.round(predictions_whole)

print("Range of bikes rented predictions: ")
print("predicted max: ", predictions_whole.max())
print("predicted min: ", predictions_whole.min())

# Calculate the mean of absolute errors
test_output_whole = test_output * (highest_val - lowest_val)
mae = np.mean(np.abs(predictions_whole - test_output_whole))
print("MAE: ", mae)

Training cost: 0.006253569325818177
Range of bikes rented predictions: 
predicted max:  416.0
predicted min:  1.0
MAE:  67.00805523590334


#### Analysis of results 

WHOA! That result is very wrong... 

Why though? Well, I'll be honest, I don't know. I re-coded this twice and made no progress solving the issue. It seems that the delta values added to the weights are *always* positive so the weights and biases just converge to their maximum value of 1 rather than performing gradient descent as expected. The only thing that seems to help is making the learning rate negative. I don't think this indicates I missed a negative sign somewhere, though it is entirely possible. I think this just causes the wrong answer to be found slower so the less wrong answer of essentially random weights is still better. 

Here's what the issue probably isn't:

Data set: I tried an alternate dataset to make sure that wasn't the problem and it did not help.

Hyperparamers: Shrinking the learning rate just slows the speed at which our weights approach their maximum. Increasing the epochs just gives more time to approach their maximum. For this reason, setting both values to zero seems to be the best since random guessing on an untrained network is better than definitely wrong.


#### Explanation of choices:

Fortunately, the way this is coded at least gets the wrong answer pretty darn fast. Thank god for small miracles... Using matices allows efficient computation without any additional efficiency measures. 

It is more complicated and less efficient to use mini-batch gradient descent. Since linear algebra is really efficent, making the matrixes smaller shouldn't make a marked difference but being forced to train for epochs*batches rather than just epochs causes a non insignifigant slow down. To demonstrait this, batch gradient decent is presented below. Not that with 10 batches, it is about 10 times slower.


This series of videos was really fantastic for explaining the ideas and coding pricipals behind making a neural network from scratch.
https://www.youtube.com/watch?v=bxe2T-V8XRs&list=PLiaHhY2iBX9hdHaRr6b7XevZtgZRa1PoU&index=1

In [12]:
# Training mini batch 

# Initialize neural network
input_size = np.shape(input_data)[1] # Each input node is one feature of the data
hidden_size = 8 # Arbitrary number of hidden nodes
output_size = 1 # Looking for an output that a single value
learning_rate = -0.001
epochs = 1000

# Create an instance of the NeuralNetwork class
nn = NeuralNetwork(input_size, hidden_size, output_size)

# Train the neural network
nn.train_miniBatch(train_input, train_output, epochs, learning_rate, 10)

# Calculate the cost after training
training_cost = nn.cost(train_input, train_output)
print("Training cost:", training_cost)

# Make predictions on the test set
predictions = nn.predict(test_input)

# Returning results to original range
predictions_whole = ((predictions) * (highest_val - lowest_val)) + lowest_val
# You cannot rent 1/2 a bike.
predictions_whole = np.round(predictions_whole)

print("Range of bikes rented predictions: ")
print(" predicted max: ", predictions_whole.max())
print(" predicted min: ", predictions_whole.min())

# Calculate the mean of absolute errors
test_output_whole = test_output * (highest_val - lowest_val)
mae = np.mean(np.abs(predictions_whole - test_output_whole))
print("MAE: ", mae)


Training cost: 0.0034250489506197964
 predicted max:  549.0
 predicted min:  1.0
MAE:  53.27752205600307
