# Using Neural Networks for Classification on MNIST and Iris Datasets

I will be using Neural Networks with Softmax in the last layer on classification problems: MNIST and Iris datasets. First I will solve them "by hand" (not using Neural Networks's implementations on any library). Later, I will solve the problems using the Scikit library and compare the results obtained from my "by hand" solution with the results obtained from the solution using the Scikit library

# By Hand

### Defining functions to be used

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score # 
from sklearn import preprocessing
import numpy as np
import numpy.random as r
import matplotlib.pyplot as plt 
import timeit

def convert_y_to_vect(y, outputs):
    y_vect = np.zeros((len(y), outputs))
    for i in range(len(y)):
        y_vect[i, y[i]] = 1
    return y_vect

def f(z): 
#sigmoid
    return 1 / (1 + np.exp(-z))
    

def f_deriv(z):
#sigmoid
    return f(z) * (1 - f(z))

def softmax(z):
    sum=0

    for i in range(len(z)):
        sum = sum  + np.exp(z[i])
    z=np.exp(z)/sum

    return z

def softmax_deriv(z):
    return softmax(z)*(1-softmax(z))

def setup_and_init_weights(nn_structure):
    W = {} 
    b = {}
    for l in range(1, len(nn_structure)):
        W[l] = r.random_sample((nn_structure[l], nn_structure[l-1]))
        b[l] = r.random_sample((nn_structure[l],))
    return W, b

def init_tri_values(nn_structure):
    tri_W = {}
    tri_b = {}
    for l in range(1, len(nn_structure)):
        tri_W[l] = np.zeros((nn_structure[l], nn_structure[l-1]))
        tri_b[l] = np.zeros((nn_structure[l],))
    return tri_W, tri_b

def feed_forward(x, W, b):
    a = {1: x} # create a dictionary for holding the a values for all levels
    z = { } # create a dictionary for holding the z values for all the layers
    for l in range(1, len(W) + 1): # for each layer
        node_in = a[l]
        z[l+1] = W[l].dot(node_in) + b[l]  # z^(l+1) = W^(l)*a^(l) + b^(l)
        a[l+1] = f(z[l+1]) # a^(l+1) = f(z^(l+1))

    return a, z

def calculate_out_layer_delta(y, a_out, z_out):
    # delta^(nl) = -(y_i - a_i^(nl)) * f'(z_i^(nl))
    return -(y-a_out) * softmax_deriv(z_out) 


def calculate_hidden_delta(delta_plus_1, w_l, z_l):
    # delta^(l) = (transpose(W^(l)) * delta^(l+1)) * f'(z^(l))
    return np.dot(np.transpose(w_l), delta_plus_1) * f_deriv(z_l)


def train_nn(nn_structure, X, y, lamb, iter_num=3000, alpha=0.25):
    W, b = setup_and_init_weights(nn_structure)
    cnt = 0
    N = len(y)
    avg_cost_func = []
    print('Starting gradient descent for {} iterations'.format(iter_num))
    while cnt < iter_num:
        if cnt%1000 == 0:
            print('Iteration {} of {}'.format(cnt, iter_num))
        tri_W, tri_b = init_tri_values(nn_structure)
        avg_cost = 0
        for i in range(N):
            delta = {}
            # perform the feed forward pass and return the stored a and z values, to be used in the
            # gradient descent step
            a, z = feed_forward(X[i, :], W, b)
            # loop from nl-1 to 1 backpropagating the errors
            for l in range(len(nn_structure), 0, -1):
                if l == len(nn_structure):
                    delta[l] = calculate_out_layer_delta(y[i,:], a[l], z[l])
                    
                    avg_cost += np.linalg.norm((y[i,:]-a[l]))
                else:
                    if l > 1:
                        delta[l] = calculate_hidden_delta(delta[l+1], W[l], z[l])
                    # triW^(l) = triW^(l) + delta^(l+1) * transpose(a^(l))
                    tri_W[l] += np.dot(delta[l+1][:,np.newaxis], np.transpose(a[l][:,np.newaxis]))# np.newaxis increase the number of dimensions
                    # trib^(l) = trib^(l) + delta^(l+1)
                    tri_b[l] += delta[l+1]
        # perform the gradient descent step for the weights in each layer
        for l in range(len(nn_structure) - 1, 0, -1):
            #Question a
            W[l] += -alpha * (1.0/N * tri_W[l] +  lamb * W[l])
            #W[l] += -alpha * (1.0/N * tri_W[l])
            b[l] += -alpha * (1.0/N * tri_b[l])
        # complete the average cost calculation
        avg_cost = 1.0/N * avg_cost
        avg_cost_func.append(avg_cost)
        cnt += 1
    return W, b, avg_cost_func


def predict_y(W, b, X, n_layers):
    N = X.shape[0]
    y = np.zeros((N,))
    for i in range(N):
        a, z = feed_forward(X[i, :], W, b)
        y[i] = np.argmax(a[n_layers])
    return y


### MNIST Dataset

In [6]:
from sklearn.datasets import load_digits 

# Load data
digits=load_digits()
X = digits.data
y = digits.target

# Scale data
X_scale = StandardScaler()
X = X_scale.fit_transform(digits.data)

# Split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# convert digits to vectors
y_v_train = convert_y_to_vect(y_train, 10)
y_v_test = convert_y_to_vect(y_test, 10)

# Neural Network
nn_structure = [64, 30, 10]
    
# train the NN
start = timeit.default_timer()
W, b, avg_cost_func = train_nn(nn_structure, X_train, y_v_train, 0.001, 3000, 0.25)
stop = timeit.default_timer()

print('Time: ', stop - start)

# get the prediction accuracy and print
y_pred = predict_y(W, b, X_test, 3)
print('Prediction accuracy is {}%'.format(accuracy_score(y_test, y_pred) * 100))

Starting gradient descent for 3000 iterations
Iteration 0 of 3000
Iteration 1000 of 3000
Iteration 2000 of 3000
Time:  235.37646590000077
Prediction accuracy is 94.1585535465925%


### Iris Dataset

In [7]:
import pandas as pd 

# Reading data
df = pd.read_csv('datasets/iris.csv', header=None, names = ["sepal length[cm]","sepal width[cm]","petal length[cm]", "petal width", "label"])
df['label'] = df.label.map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})

# Splitting data
names = ["sepal length[cm]","sepal width[cm]","petal length[cm]", "petal width"]
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df[names],df['label'], random_state=0)

X_train=df_X_train.to_numpy()
X_test=df_X_test.to_numpy()
y_train=df_y_train.to_numpy()
y_test=df_y_test.to_numpy()

# Scaling data
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# converting to vectors
y_v_train = convert_y_to_vect(y_train, 3)
y_v_test = convert_y_to_vect(y_test, 3)

nn_structure = [4, 3, 3]
    
# training the NN
start = timeit.default_timer()
W, b, avg_cost_func = train_nn(nn_structure, X_train, y_v_train, 0.001, 3000, 0.25)
stop = timeit.default_timer()

print('Time: ', stop - start)

# getting the prediction accuracy and printing
y_pred = predict_y(W, b, X_test, 3)
print('Prediction accuracy is {}%'.format(accuracy_score(y_test, y_pred) * 100))

Starting gradient descent for 3000 iterations
Iteration 0 of 3000
Iteration 1000 of 3000
Iteration 2000 of 3000
Time:  18.313312599999335
Prediction accuracy is 97.36842105263158%


# Scikit Implementation

### MNIST Dataset

In [8]:
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
import numpy as np
import numpy.random as r
from sklearn.neural_network import MLPClassifier

# Loading data
digits=load_digits()
X = digits.data
y = digits.target

# Scaling data
X_scale = StandardScaler()
X = X_scale.fit_transform(digits.data)

# Split the data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Function to convert y to vector
def convert_y_to_vect(y, outputs):
    y_vect = np.zeros((len(y), outputs))
    for i in range(len(y)):
        y_vect[i, y[i]] = 1
    return y_vect

# convert to vectors
y_v_train = convert_y_to_vect(y_train, 10)
y_v_test = convert_y_to_vect(y_test, 10)

# Running the algorithm
clf = MLPClassifier(solver='sgd', activation='logistic', alpha=1e-5, hidden_layer_sizes=(30,), random_state=1, learning_rate_init=0.5)
clf.fit(X_train, y_v_train)
clf.out_activation_ = 'softmax'

# Predicting on test data
score = clf.score(X_test, y_v_test)
print("Accuracy:", score * 100)

Accuracy: 97.77468706536857


### Iris Dataset

In [14]:
import pandas as pd 

# Reading data
df = pd.read_csv('datasets/iris.csv', header=None, names = ["sepal length[cm]","sepal width[cm]","petal length[cm]", "petal width", "label"])
df['label'] = df.label.map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})

# Splitting data
names = ["sepal length[cm]","sepal width[cm]","petal length[cm]", "petal width"]
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df[names],df['label'], random_state=0)

X_train=df_X_train.to_numpy()
X_test=df_X_test.to_numpy()
y_train=df_y_train.to_numpy()
y_test=df_y_test.to_numpy()

# Scaling data
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# convert to vectors
y_v_train = convert_y_to_vect(y_train, 3)
y_v_test = convert_y_to_vect(y_test, 3)

# Running the algorithm
clf = MLPClassifier(solver='sgd', alpha=1e-5,activation='logistic', hidden_layer_sizes=(3,), random_state=1, learning_rate_init=2)
clf.fit(X_train, y_v_train)
clf.out_activation_ = 'softmax'

# Predicting on test data
score = clf.score(X_test, y_v_test)
print("Accuracy:", score * 100)

Accuracy: 97.36842105263158


### Results

The results obtained were the same for the Iris dataset (97.36% accuracy). However, the Scikit implementation performed better than the "by hand" implementation for the MNIST dataset (97.77% vs 94.15% accuracy)