# Data and Libraries loading  

In [None]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

# Plotting library
from matplotlib import pyplot

# Optimization module in scipy
from scipy import optimize

# will be used to load MATLAB mat datafile format
from scipy.io import loadmat

# library written for this exercise providing additional functions for assignment submission, and others
import utils

### Loading the data 

In [2]:
# Load the training set from the MATLAB file Data/ex4data1.mat.
# X holds the examples (one per row); y holds the labels, flattened to 1-D by ravel().
data = loadmat(os.path.join('Data', 'ex4data1.mat'))
X, y = data['X'], data['y'].ravel()

# set the zero digit to 0, rather than its mapped 10 in this dataset
# This is an artifact due to the fact that this dataset was used in 
# MATLAB where there is no index 0
# (the assignment below modifies y in place through a boolean mask)
y[y == 10] = 0

# Number of training examples
m = y.size

## Neural network

### Model representation
![](Figures/neural_network.png)

### Setting up the neural network 

##### input layer size : 
The first layer (input) consists of $400$ neurons (not counting the bias unit). 
Each image is a $20 \times 20$ pixel grid, i.e. a matrix in $\mathbb{R}^{20 \times 20}$, which we unroll into a vector in $\mathbb{R}^{400}$.

In this way every image in our dataset X is turned into a vector in $\mathbb{R}^{400}$. 
Finally, our dataset of $5000$ images becomes a matrix in $\mathbb{R}^{5000 \times 400}$. 

`m` is the number of training examples.  
Every row of the matrix `X` represents one training example. 


$$ X = \begin{bmatrix} - \left(x^{(1)} \right)^T - \\
- \left(x^{(2)} \right)^T - \\
\vdots \\
- \left(x^{(m)} \right)^T - \\
\end{bmatrix}
$$


Later we will also build a matrix that represents the expected outputs `y`. 

In [3]:
# each 20x20 image is unrolled into a 400-element input vector (bias unit not counted)
input_layer_size  = 400  

### Second layer 

In [4]:
# 25 hidden units (neurons) in the second layer, not counting the bias unit
hidden_layer_size = 25  

### Third layer (output layer): 

In [5]:
# 10 labels, from 0 to 9 (one class per digit)
num_labels = 10          

### Weights Loading 

In [3]:
# Load the weights into variables Theta1 and Theta2
weights = loadmat(os.path.join('Data', 'ex4weights.mat'))

# Theta1 has size 25 x 401
# Theta2 has size 10 x 26
Theta1, Theta2 = weights['Theta1'], weights['Theta2']

# Shift the rows of Theta2 down by one so that the last row wraps around to
# become the first. This is a legacy of MATLAB's 1-based indexing: the weight
# file ex4weights.mat was saved with the digit 0 mapped to label 10, so its
# row is presumably stored last and has to come first to match label 0 here.
Theta2 = np.roll(Theta2, 1, axis=0)

# Unroll parameters into one flat vector: all of Theta1 (row-major) followed by all of Theta2
nn_params = np.concatenate([Theta1.ravel(), Theta2.ravel()])

### Cost function 

#### Without regularization : 

$$ J(\theta) = \frac{1}{m} \sum_{i=1}^{m}\sum_{k=1}^{K} \left[ - y_k^{(i)} \log \left( \left( h_\theta \left( x^{(i)} \right) \right)_k \right) - \left( 1 - y_k^{(i)} \right) \log \left( 1 - \left( h_\theta \left( x^{(i)} \right) \right)_k \right) \right]$$


#### With regularization : 

$$ J(\theta) = \frac{1}{m} \sum_{i=1}^{m}\sum_{k=1}^{K} \left[ - y_k^{(i)} \log \left( \left( h_\theta \left( x^{(i)} \right) \right)_k \right) - \left( 1 - y_k^{(i)} \right) \log \left( 1 - \left( h_\theta \left( x^{(i)} \right) \right)_k \right) \right] + \frac{\lambda}{2 m} \left[ \sum_{j=1}^{25} \sum_{k=1}^{400} \left( \Theta_{j,k}^{(1)} \right)^2 + \sum_{j=1}^{10} \sum_{k=1}^{25} \left( \Theta_{j,k}^{(2)} \right)^2 \right] $$


#### Feedforward Propagation 
To implement the cost function, we first have to implement the feedforward propagation function. 

Given a dataset X and the parameters Theta1 and Theta2, we will be able to compute 
h(X)=a_3 for the whole dataset or for just one training example. 

In [6]:
def feedForwardProp(Theta1, Theta2 , X): 
    """
    Run one forward pass through the 3-layer network.

    Parameters
    ----------
    Theta1 : array_like
        Weights from layer 1 to layer 2, shape (hidden units, features + 1).
        The first column multiplies the bias unit.
    Theta2 : array_like
        Weights from layer 2 to layer 3, shape (labels, hidden units + 1).
        The first column multiplies the bias unit.
    X : array_like
        Either a single example of shape (features,) or a batch of shape
        (m, features), without the bias column.

    Returns
    -------
    tuple
        ``(a_3, z_3, a_2, z_2, a_1)`` where

        - ``a_3`` : output activations, shape (labels, m)
        - ``z_3`` : pre-activation of the output layer, shape (labels, m)
        - ``a_2`` : hidden activations with a leading row of ones (bias),
          shape (hidden units + 1, m)
        - ``z_2`` : pre-activation of the hidden layer, shape (hidden units, m)
        - ``a_1`` : the input with a leading column of ones (bias),
          shape (m, features + 1)

    Notes
    -----
    ``utils.sigmoid`` is presumably the element-wise logistic function;
    it is defined outside this file.
    """
    if X.ndim == 1:
        X = X[None]  # promote a single example to a 1 x features batch

    m = X.shape[0]

    # layer 1: prepend the bias column
    a_1 = np.concatenate([np.ones((m, 1)), X], axis=1)

    # layer 2: examples are laid out as columns from here on
    z_2 = Theta1 @ a_1.T
    a_2 = np.concatenate([np.ones((1, m)), utils.sigmoid(z_2)], axis=0)

    # layer 3: network output
    z_3 = Theta2 @ a_2
    a_3 = utils.sigmoid(z_3)

    return (a_3, z_3, a_2, z_2, a_1)

#### cost function 

In [7]:
def nnCostFunction(nn_params,
                   input_layer_size,
                   hidden_layer_size,
                   num_labels,
                   X, y, lambda_=0.0):
    """
    Compute the regularized cost and gradient of the 3-layer neural network.

    Parameters
    ----------
    nn_params : array_like
        Flat parameter vector: Theta1 (row-major) followed by Theta2.
    input_layer_size : int
        Number of input features (bias unit not counted).
    hidden_layer_size : int
        Number of hidden units (bias unit not counted).
    num_labels : int
        Number of output classes.
    X : array_like
        Training examples, shape (m, input_layer_size), without bias column.
    y : array_like
        Integer class labels, shape (m,), each in ``0 .. num_labels - 1``.
        They are used directly as column indices of the one-hot matrix.
    lambda_ : float, optional
        Regularization strength. Bias weights (first column of each Theta)
        are never regularized.

    Returns
    -------
    J : float
        Scalar value of the cost function.
    grad : numpy.ndarray
        Flat gradient vector laid out like ``nn_params``
        (Theta1 gradient followed by Theta2 gradient).
    """
    # Recover the two weight matrices from the flat parameter vector
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split],
                        (hidden_layer_size, input_layer_size + 1))
    Theta2 = np.reshape(nn_params[split:],
                        (num_labels, hidden_layer_size + 1))

    m = y.size

    # One-hot encode the labels into an (m, K) matrix: row i has a single 1
    # in column y[i], i.e. the expected output of every unit of layer 3
    y_transformed = np.zeros((m, num_labels))
    y_transformed[np.arange(m), y] = 1

    # A single forward pass over the whole training set.
    # a_3: (K, m), a_2: (hidden + 1, m) with bias row, a_1: (m, n + 1) with bias column
    a_3, _, a_2, _, a_1 = feedForwardProp(Theta1, Theta2, X)

    # Cost: cross-entropy summed over all examples and all output units
    h = a_3.T  # (m, K), aligned with y_transformed
    J = np.sum(-y_transformed * np.log(h)
               - (1 - y_transformed) * np.log(1 - h)) / m

    # Regularization term: squared weights, skipping the bias column
    squared_weights = np.sum(Theta1[:, 1:] ** 2) + np.sum(Theta2[:, 1:] ** 2)
    J += (lambda_ * squared_weights) / (2 * m)

    # Backpropagation
    layer3_error = a_3 - y_transformed.T                       # (K, m)

    # Propagate the error back through Theta2; a_2 * (1 - a_2) is the sigmoid
    # derivative. The first row belongs to the bias unit and is dropped.
    layer2_error = ((Theta2.T @ layer3_error) * (a_2 * (1 - a_2)))[1:]

    # The matrix products accumulate the per-example outer products
    delta_2 = layer3_error @ a_2.T                             # (K, hidden + 1)
    delta_1 = layer2_error @ a_1                               # (hidden, n + 1)

    Theta1_grad = delta_1 / m
    Theta2_grad = delta_2 / m

    # Regularize every column except the bias column
    Theta1_grad[:, 1:] += (lambda_ / m) * Theta1[:, 1:]
    Theta2_grad[:, 1:] += (lambda_ / m) * Theta2[:, 1:]

    grad = np.concatenate([Theta1_grad.ravel(), Theta2_grad.ravel()])

    return J, grad
    