# Digit Dataset

In [1]:
from sklearn.datasets import load_digits # scikit-learn's bundled handwritten-digits dataset (a small digits set, not the full MNIST)
from sklearn.preprocessing import StandardScaler  # It is important in neural networks to scale the data
from sklearn.model_selection import train_test_split  # The standard train/test split, used to detect overfitting and choose hyperparameters
from sklearn.metrics import accuracy_score # Fraction of correctly classified samples
import numpy as np
import numpy.random as r # We will randomly initialize our weights
import matplotlib.pyplot as plt 
from math import exp

In [2]:
# Load the digits dataset and keep its feature matrix and integer labels.
digits = load_digits()
X, y = digits.data, digits.target

In [3]:
# Standardize the digit features; X_scale keeps the fitted scaler.
# NOTE(review): the scaler is fit on every sample, including the rows that are
# later held out as the test set -- consider fitting on the training split only.
X_scale = StandardScaler().fit(digits.data)
X = X_scale.transform(digits.data)

In [4]:
# Hold out 40% of the samples for testing.
# A fixed random_state keeps the split identical on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [16]:
def convert_y_to_vect(y, n_classes=10):
    """One-hot encode a sequence of integer class labels.

    Parameters
    ----------
    y : sequence of int
        Class labels; each label is used as a column index.
    n_classes : int, optional
        Width of the one-hot vectors. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(y), n_classes) holding 1 at [i, y[i]] and 0 elsewhere.
    """
    labels = np.asarray(y, dtype=int)
    y_vect = np.zeros((len(labels), n_classes))
    # Fancy indexing sets one entry per row in a single step.
    y_vect[np.arange(len(labels)), labels] = 1
    return y_vect

In [17]:
# One-hot encode the labels of both splits.
y_v_train, y_v_test = map(convert_y_to_vect, (y_train, y_test))

In [18]:
# sigmoid
def f(z):
    """Logistic sigmoid 1 / (1 + e^(-z)), evaluated element-wise via np.exp."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

# sigmoid derivative
def f_deriv(z):
    """Derivative of the sigmoid, computed as f(z) * (1 - f(z))."""
    activation = f(z)
    return activation * (1 - activation)

# softmax
def softmax(z):
    """Softmax of z, normalised along axis 0.

    The global maximum of z is subtracted before exponentiating so that
    np.exp never sees a large positive argument.
    """
    shifted_exp = np.exp(z - np.max(z))
    return shifted_exp / np.sum(shifted_exp, axis=0, keepdims=True)

### Creating and initializing W and b

In [19]:
def setup_and_init_weights(nn_structure, mode = False):
    """Create randomly initialised weights and biases for every layer transition.

    Parameters
    ----------
    nn_structure : sequence of int
        Number of units in each layer, input layer first.
    mode : bool, optional
        If True, weights are drawn with He normal initialisation: standard
        normal samples scaled by sqrt(2 / fan_in). If False (default), weights
        are uniform samples from [0.0, 1.0).

    Returns
    -------
    (dict, dict)
        W[l] has shape (nn_structure[l], nn_structure[l-1]) and b[l] has shape
        (nn_structure[l],), for l = 1 .. len(nn_structure) - 1. Biases are
        uniform samples from [0.0, 1.0) in both modes.
    """
    W = {}
    b = {}
    if mode:
        print("He normal innitialization")
    else:
        print("Uniform innitialization")
    for l in range(1, len(nn_structure)):
        fan_out, fan_in = nn_structure[l], nn_structure[l-1]
        if mode:
            # He normal needs zero-mean Gaussian samples; uniform [0, 1) samples
            # would make every weight positive.
            W[l] = r.standard_normal((fan_out, fan_in)) * np.sqrt(2 / fan_in)
        else:
            W[l] = r.random_sample((fan_out, fan_in))
        b[l] = r.random_sample((fan_out,))
    return W, b

### Initializing $\triangledown W$ and $\triangledown b$

In [24]:
def init_tri_values(nn_structure):
    """Return zero-filled accumulators shaped like the weights and biases.

    tri_W[l] has shape (nn_structure[l], nn_structure[l-1]) and tri_b[l] has
    shape (nn_structure[l],), for l = 1 .. len(nn_structure) - 1.
    """
    layer_ids = range(1, len(nn_structure))
    tri_W = {l: np.zeros((nn_structure[l], nn_structure[l-1])) for l in layer_ids}
    tri_b = {l: np.zeros((nn_structure[l],)) for l in layer_ids}
    return tri_W, tri_b

### Feed forward 

In [48]:
def feed_forward(x, W, b):
    """Propagate one input vector through the network with sigmoid activations.

    Returns (a, z): a[1] is the input x; for every layer l = 2 .. len(W) + 1,
    z[l] = W[l-1].dot(a[l-1]) + b[l-1] and a[l] = f(z[l]).
    """
    a = {1: x}
    z = {}
    for layer in range(2, len(W) + 2):
        prev = layer - 1
        z[layer] = W[prev].dot(a[prev]) + b[prev]
        a[layer] = f(z[layer])
    return a, z

### Feed forward with softmax

In [45]:
def feed_forward_softmax(x, W, b, nn_structure):
    """Propagate one input vector through the network with a softmax output layer.

    Hidden layers use the sigmoid f; only the final layer (the one produced by
    W[len(W)]) uses softmax.

    Parameters
    ----------
    x : array
        Input vector, stored as a[1].
    W, b : dict
        Weights and biases keyed by layer index 1 .. len(W).
    nn_structure : sequence of int
        Kept for interface compatibility; the output layer is identified by its
        index rather than by its width, so this argument is not consulted.

    Returns
    -------
    (dict, dict)
        a and z, keyed by layer index (a from 1, z from 2).
    """
    a = {1: x} # create a dictionary for holding the a values for all levels
    z = { } # create a dictionary for holding the z values for all the layers
    last = len(W)
    for l in range(1, last + 1): # for each layer
        node_in = a[l]
        z[l+1] = W[l].dot(node_in) + b[l]  # z^(l+1) = W^(l)*a^(l) + b^(l)
        # Select the output layer by position. Comparing widths would also apply
        # softmax to any hidden layer that has as many units as the output layer.
        if l == last:
            a[l+1] = softmax(z[l+1])
        else:
            a[l+1] = f(z[l+1]) # a^(l+1) = f(z^(l+1))
    return a, z

## Compute $\delta$

In [33]:
def calculate_out_layer_delta(y, a_out, z_out):
    """Output-layer delta for a sigmoid output.

    delta^(nl) = -(y_i - a_i^(nl)) * f'(z_i^(nl))
    """
    residual = y - a_out
    return -residual * f_deriv(z_out)

def calculate_out_layer_delta_softmax(y, a_out, z_out):
    """Output-layer delta for the softmax output: -(y - a_out).

    z_out is accepted so the signature matches calculate_out_layer_delta; it
    is not used.
    """
    residual = y - a_out
    return -residual

def calculate_hidden_delta(delta_plus_1, w_l, z_l):
    """Hidden-layer delta.

    delta^(l) = (transpose(W^(l)) * delta^(l+1)) * f'(z^(l))
    """
    back_propagated = np.dot(np.transpose(w_l), delta_plus_1)
    return back_propagated * f_deriv(z_l)

## The Back Propagation Algorithm 

In [57]:
def train_nn(nn_structure, X, y, iter_num=3000, alpha=0.25, innit_mode = False):
    """Train a sigmoid network with full-batch gradient descent and backpropagation.

    Every iteration accumulates the gradients over all samples and then applies
    one update of size alpha * (accumulated gradient / N).

    Parameters
    ----------
    nn_structure : sequence of int
        Units per layer, input layer first.
    X : 2-D array
        One sample per row (indexed as X[i, :]).
    y : 2-D array
        One target vector per row (indexed as y[i, :]).
    iter_num : int
        Number of gradient-descent iterations.
    alpha : float
        Step size.
    innit_mode : bool
        Forwarded to setup_and_init_weights as ``mode``.

    Returns
    -------
    (W, b, avg_cost_func)
        Trained weights and biases, plus a list holding, per iteration, the mean
        over samples of np.linalg.norm(y_i - output activation).
    """
    W, b = setup_and_init_weights(nn_structure, mode=innit_mode)
    n_layers = len(nn_structure)
    N = len(y)
    avg_cost_func = []
    print('Starting gradient descent for {} iterations'.format(iter_num))
    cnt = 0
    while cnt < iter_num:
        if cnt%1000 == 0:
            print('Iteration {} of {}'.format(cnt, iter_num))
        tri_W, tri_b = init_tri_values(nn_structure)
        avg_cost = 0
        for i in range(N):
            delta = {}
            # forward pass: keep every a and z for the backward pass
            a, z = feed_forward(X[i, :], W, b)
            # backward pass, from the output layer down to layer 1
            for l in range(n_layers, 0, -1):
                if l == n_layers:
                    delta[l] = calculate_out_layer_delta(y[i,:], a[l], z[l])
                    avg_cost += np.linalg.norm((y[i,:]-a[l]))
                    continue
                if l > 1:
                    delta[l] = calculate_hidden_delta(delta[l+1], W[l], z[l])
                # triW^(l) += delta^(l+1) * transpose(a^(l)), as a column-by-row product
                tri_W[l] += np.dot(delta[l+1][:, np.newaxis], a[l][np.newaxis, :])
                # trib^(l) += delta^(l+1)
                tri_b[l] += delta[l+1]
        # gradient descent step on the averaged gradients
        for l in range(n_layers - 1, 0, -1):
            W[l] -= alpha * (1.0/N * tri_W[l])
            b[l] -= alpha * (1.0/N * tri_b[l])
        # record the average cost of this iteration
        avg_cost_func.append(1.0/N * avg_cost)
        cnt += 1
    return W, b, avg_cost_func


def predict_y(W, b, X, n_layers, function = 0):
    """Predict a class index for every row of X with the sigmoid network.

    Each row is passed through feed_forward and the prediction is the argmax of
    the activations of layer ``n_layers``. ``function`` is accepted but unused.
    Returns a float array of shape (X.shape[0],).
    """
    def winning_unit(sample):
        activations, _ = feed_forward(sample, W, b)
        return np.argmax(activations[n_layers])

    return np.array([winning_unit(X[i, :]) for i in range(X.shape[0])], dtype=float)

## The Back Propagation Algorithm with Softmax Extension


In [58]:
def train_nn_softmax(nn_structure, X, y, iter_num=3000, alpha=0.25, innit_mode=False):
    """Train a network with a softmax output layer using full-batch gradient descent.

    Mirrors train_nn, but the forward pass is feed_forward_softmax and the
    output delta is calculate_out_layer_delta_softmax.

    Parameters
    ----------
    nn_structure : sequence of int
        Units per layer, input layer first.
    X : 2-D array
        One sample per row (indexed as X[i, :]).
    y : 2-D array
        One target vector per row (indexed as y[i, :]).
    iter_num : int
        Number of gradient-descent iterations.
    alpha : float
        Step size.
    innit_mode : bool, optional
        Forwarded to setup_and_init_weights as ``mode``, matching train_nn.
        Defaults to False, the value that was previously hard-coded.

    Returns
    -------
    (W, b, avg_cost_func)
        Trained weights and biases, plus a list holding, per iteration, the mean
        over samples of np.linalg.norm(y_i - output activation).
    """
    W, b = setup_and_init_weights(nn_structure, mode=innit_mode)
    cnt = 0
    N = len(y)
    n_layers = len(nn_structure)
    avg_cost_func = []
    print('Starting gradient descent for {} iterations'.format(iter_num))
    while cnt < iter_num:
        if cnt%1000 == 0:
            print('Iteration {} of {}'.format(cnt, iter_num))
        tri_W, tri_b = init_tri_values(nn_structure)
        avg_cost = 0
        for i in range(N):
            delta = {}
            # perform the feed forward pass and return the stored a and z values, to be used in the
            # gradient descent step
            a, z = feed_forward_softmax(X[i, :], W, b, nn_structure)
            # loop from the output layer nl down to layer 1, backpropagating the errors
            for l in range(n_layers, 0, -1):
                if l == n_layers:
                    delta[l] = calculate_out_layer_delta_softmax(y[i,:], a[l], z[l])
                    avg_cost += np.linalg.norm((y[i,:]-a[l]))
                else:
                    if l > 1:
                        delta[l] = calculate_hidden_delta(delta[l+1], W[l], z[l])
                    # triW^(l) = triW^(l) + delta^(l+1) * transpose(a^(l))
                    # np.newaxis turns the 1-D vectors into a column and a row
                    tri_W[l] += np.dot(delta[l+1][:,np.newaxis], np.transpose(a[l][:,np.newaxis]))
                    # trib^(l) = trib^(l) + delta^(l+1)
                    tri_b[l] += delta[l+1]
        # perform the gradient descent step for the weights in each layer
        for l in range(n_layers - 1, 0, -1):
            W[l] += -alpha * (1.0/N * tri_W[l])
            b[l] += -alpha * (1.0/N * tri_b[l])
        # complete the average cost calculation
        avg_cost = 1.0/N * avg_cost
        avg_cost_func.append(avg_cost)
        cnt += 1
    return W, b, avg_cost_func

def predict_y_softmax(W, b, X, n_layers, nn_structure):
    """Predict a class index for every row of X with the softmax network.

    Each row is passed through feed_forward_softmax and the prediction is the
    argmax of the activations of layer ``n_layers``.
    Returns a float array of shape (X.shape[0],).
    """
    def winning_unit(sample):
        activations, _ = feed_forward_softmax(sample, W, b, nn_structure)
        return np.argmax(activations[n_layers])

    return np.array([winning_unit(X[i, :]) for i in range(X.shape[0])], dtype=float)

## Assessing accuracy 

In [59]:
# Layer sizes: 64 inputs, 30 hidden units, 10 outputs.
nn_structure = [64, 30, 10]
W, b, avg_cost_func = train_nn(nn_structure, X_train, y_v_train, iter_num=3000)

Uniform innitialization
Starting gradient descent for 3000 iterations
Iteration 0 of 3000
Iteration 1000 of 3000
Iteration 2000 of 3000


In [60]:
# Predict from the activations of layer 3 (the output layer) and report test accuracy.
y_pred = predict_y(W, b, X_test, n_layers=3)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Prediction accuracy is {}%'.format(accuracy_pct))

Prediction accuracy is 89.98609179415855%


## Accuracy softmax

In [61]:
# Same layer sizes as the sigmoid run, trained with the softmax output layer.
nn_structure = [64, 30, 10]
W, b, avg_cost_func = train_nn_softmax(nn_structure, X_train, y_v_train, iter_num=3000)

Uniform innitialization
Starting gradient descent for 3000 iterations
Iteration 0 of 3000
Iteration 1000 of 3000
Iteration 2000 of 3000


In [62]:
# Predict with the softmax network and report the test accuracy.
y_pred = predict_y_softmax(W, b, X_test, 3, nn_structure)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Prediction accuracy is {}%'.format(accuracy_pct))

Prediction accuracy is 95.96662030598053%


# Wines dataset

In [64]:
from sklearn.datasets import load_wine

In [65]:
# Load the wine dataset, standardize its features and hold out 40% for testing.
wines = load_wine()
y = wines.target
# NOTE(review): the scaler is fit on all samples before the split, so the
# held-out rows influence the scaling statistics -- consider fitting on X_train only.
X_scale = StandardScaler()
X = X_scale.fit_transform(wines.data)
# A fixed random_state keeps the split identical on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [71]:
def convert_y_to_vect(y, n_classes=3):
    """One-hot encode a sequence of integer class labels.

    Parameters
    ----------
    y : sequence of int
        Class labels; each label is used as a column index.
    n_classes : int, optional
        Width of the one-hot vectors. Defaults to 3, the value that was
        previously hard-coded in this definition.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(y), n_classes) holding 1 at [i, y[i]] and 0 elsewhere.
    """
    labels = np.asarray(y, dtype=int)
    y_vect = np.zeros((len(labels), n_classes))
    # Fancy indexing sets one entry per row in a single step.
    y_vect[np.arange(len(labels)), labels] = 1
    return y_vect

In [78]:
# One-hot encode the labels of both splits, then show the training-split shape.
y_v_train, y_v_test = map(convert_y_to_vect, (y_train, y_test))
print("The shape of the wines dataset:")
print(X_train.shape)

The shape of the wines dataset:
(106, 13)


## Accuracy

In [75]:
# Layer sizes for the wine network: 13 inputs, 30 hidden units, 3 outputs.
nn_structure = [13, 30, 3]

In [76]:
# Train the sigmoid-output network on the wine training split.
W, b, avg_cost_func = train_nn(nn_structure, X_train, y_v_train, iter_num=1000)

Uniform innitialization
Starting gradient descent for 1000 iterations
Iteration 0 of 1000


In [77]:
# Predict from the activations of layer 3 (the output layer) and report test accuracy.
y_pred = predict_y(W, b, X_test, n_layers=3)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Prediction accuracy is {}%'.format(accuracy_pct))

Prediction accuracy is 61.111111111111114%


## Accuracy softmax

In [79]:
# Train the softmax-output network on the wine training split.
W, b, avg_cost_func = train_nn_softmax(nn_structure, X_train, y_v_train, iter_num=1000)

Uniform innitialization
Starting gradient descent for 1000 iterations
Iteration 0 of 1000


In [80]:
# Predict with the softmax network and report the test accuracy.
y_pred = predict_y_softmax(W, b, X_test, 3, nn_structure)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Prediction accuracy is {}%'.format(accuracy_pct))

Prediction accuracy is 97.22222222222221%
