In [34]:
import numpy as np
import time
from sklearn.utils import shuffle

[Activation functions](https://github.com/Kulbear/deep-learning-nano-foundation/wiki/ReLU-and-Softmax-Activation-Functions)

In [38]:
#Sigmoid
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
    
def deriv_sigmoid(x):  
    sig=sigmoid(x)
    return sig*(1-sig)
    
#ReLU
def ReLU(x):
    return x*(x>0)

def deriv_ReLU(x):
    return np.where(x <= 0, 0, 1)

#Softmax
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def deriv_softmax(x):  
    soft=softmax(x)
    return soft*(1-soft)

    
    

In [39]:
import os
from sklearn.datasets import fetch_mldata

# Fetch MNIST dataset and create a local copy.
if os.path.exists('mnist.npz'):
    with np.load('mnist.npz', 'r') as data:
        X = data['X']
        y = data['y']
else:
    mnist = fetch_mldata("mnist-original")
    X, y = mnist.data / 255.0, mnist.target
    np.savez('mnist.npz', X=X, y=y)

In [40]:
print(np.shape(y.reshape(-1,1)))

(70000, 1)


In [41]:
from sklearn.preprocessing import OneHotEncoder
# Convert labels to One Hot Encoded
num_digits = 10
encoder = OneHotEncoder(categorical_features =[0]) 
y_one_hot = encoder.fit_transform(y.reshape(-1, 1)).toarray()
print(y_one_hot[50000,:])

# def to_one_hot(y): 
#     one_hot_y = np.zeros((len(y), num_digits))
#     for i in range(len(y)):
#         one_hot_y[i, int(one_hot_y[i])] = 1
#     return one_hot_y

[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]


In [42]:
from sklearn.utils import shuffle
X,y=shuffle(X,y)
#Split into test , train based on Kaggle
X_train=X[:60000,:]
y_train=y_one_hot[:60000,:]

X_test=X[60000:,:]
y_test=y_one_hot[60000:,:]
print(np.shape(X_train))
print(np.shape(y_test))

(60000, 784)
(10000, 10)


In [43]:
def he_initialization(size_prev, size_next):  # HE initialization for matrix weights
    return np.random.randn(size_prev, size_next) * np.sqrt(2.0/size_prev)

In [44]:
#Expects weights and bias to be a matrix and a vector, activation_function to be a value= 1,2 or 3
#Use when need to store z, h at each layer (for deltas)
def layer_predict(inputs, weights, bias, activation_function):
    #main sum counting z
    #bias as vector adds to each row of weight matrix
    #example: weights (300, 784), inputs (10000,784), bias (300,) then out (10000, 300)
    out = np.add(np.dot(weights, inputs.T).T, bias)
    # return z and h=f(z), f activation as parameter
    if activation_function == 1:
        return out, sigmoid(out)
    if activation_function == 2:
        return out, ReLU(out)
    if activation_function == 3:
        return out, softmax(out)
#Expects z to be a matrix , activation_function to be a value= 1,2 or 3
#Use when counting delta back propagate using derivatives
def layer_back_propagate_derivatives(z, activation_function):
    if activation_function == 1:
        return deriv_sigmoid(z)
    if activation_function == 2:
        return deriv_ReLU(z)
    if activation_function == 3:
        return deriv_softmax(z)
    
#Expects weights and bias to be a list of matrixes and a list of vectors 
#Use when given input data gets output without storing hidden layers values
def neural_predict(x_data, weights, bias, neural_architecture):  
    #for each layer in neural architecture (same dim as weights)
    activation_functions=neural_architecture[:,2]
    
    for layer in range(0, len(neural_architecture)):  # forward one batch
    
        #initial layer then take x_data as input
        if layer == 0:
            z_temp, h_temp = layer_predict(x_data, weights[layer],biases[layer], activation_functions[layer])
            h = h_temp
        else:
            #take h-previous layer output as input
            z_temp, h_temp = layer_predict(h, weights[layer],biases[layer], activation_functions[layer])
            h = h_temp

    return h
    
def convert_to_binary(vector):
    row_maxes = vector.max(axis=1).reshape(-1, 1)
    vector[:] = np.where(vector == row_maxes, 1, 0)
    return vector

def root_mean_square_error(y_predict,y_actual):
    y_predict=convert_to_binary(y_predict)
    mse=np.sum(np.power(y_actual-y_predict,2))/len(y_predict)
    return np.sqrt(mse)


In [124]:
def mini_batch_gradient_descent(X_train,y_train,neural_architecture,learning_rate,num_epoches,batch_size):
    #A list of matrixes size of (new_layer_neurons_amount,prev_layer_neurons_amount)
    weights=[]
    #A list of vectors size of (new_layer_neurons_amount,1)
    biases=[]
   
    for layer in range(0, len(neural_architecture)):
        #For first time forward propagation random initialization
        weights.append(he_initialization(neural_architecture[layer,1],neural_architecture[layer,0]))#(300, 784) (100, 300) (10, 100)
        biases.append(np.zeros(neural_architecture[layer][1])) #(300,) (100,) (10,)
        #Just to store as list
        activation_functions=neural_architecture[:,2]
        
    it=0
    for it in range(num_epoches):
        #A list of outputs of each layer
        h =[]
        #A list of weightned sum for each layer
        z=[]
    
        X_train,y_train=shuffle(X_train,y_train)
        n_minibatches = X_train.shape[0] // batch_size #1875
        i = 0
        for i in range(n_minibatches):
            X_mini=X_train[i * batch_size:(i + 1)*batch_size, :] #(32,784)
            y_mini=y_train[i * batch_size:(i + 1)*batch_size,:] #(32,10)
            #print(np.shape(X_mini))   
            #print(np.shape(y_mini)) 
            N=X_mini.shape[0] #32
            #PARAMETERS
            
            #Forward propagation
            layers_amount=len(neural_architecture)
            for layer in range(0, layers_amount):
                #For initial layer take X as input
                if layer==0:
                    inputs=X_mini
                    #z_temp, h_temp= layer_predict(X_mini, weights[layer], biases[layer], activation_functions[layer])
                #For any other layer take previous layer output as input
                else:
                    inputs=h[layer-1]
                    #z_temp, h_temp= layer_predict(h[layer-1], weights[layer], biases[layer], activation_functions[layer])
                #Store as lists
                z_temp, h_temp= layer_predict(inputs, weights[layer], biases[layer], activation_functions[layer])
                z.append(z_temp) #0=(32,300) 1=(32,100) 2=(32, 10)
                h.append(h_temp)
                
            #y_predict=neural_predict(X_mini,weights,biases,neural_architecture) same as h[layers_amount - 1]
            # z,y_predict=layer_predict(h[layer_amount-1],weights[layer_amount-1],biases[layer_amount-1],
            #activation_functions[layer_amount-1])
            #error=y_mini-y_predict 
            
            #DELTA INITIAL AND HIDDEN HERE
            
            #Initialize empty list size as amount of layers in network
            deltas= np.empty(layers_amount, dtype=object)
            #Count last delta based on (1)
            #(32,10)
            delta_last = np.multiply((h[layers_amount - 1] - y_mini),layer_back_propagate_derivatives(z[layers_amount - 1], activation_functions[layers_amount - 1])) 
            #Store last delta in list                                                                               
            deltas[layers_amount-1]=delta_last # matrix size (dataset_size,amount_of_neurons_on_this_layer) transpose weights
            #Back propagating with step=1 , stop=-1 (need count on layer=0) 
            for layer in range(layers_amount-2,-1,-1):
                #Counting delta on hidden layers based on (2)
                deltas[layer]=np.multiply((deltas[layer+1]).dot(weights[layer+1]),layer_back_propagate_derivatives(z[layer], activation_functions[layer]))    
            #PARAMETERS UPDATES USING DELTAS
            for layer in range(0, layers_amount):
                #For initial layer take X as input
                if layer==0:
                    inputs=X_mini
                #For any other layer take previous layer output as input
                else:
                    inputs=h[layer-1]
                #update weights
                biases[layer]=biases[layer]-learning_rate*np.sum(deltas[layer],axis=0)/N
                weights[layer]=weights[layer]-learning_rate*((deltas[layer].T).dot(inputs))/N
                #biases[layer]=biases[layer]-learning_rate*np.sum(deltas[layer],axis=0)
                #biases[layer]=biases[layer]-0.001*np.sum(deltas[layer],axis=0)
        y_predict=neural_predict(X_train,weights,biases,neural_architecture)
        rmse.append(root_mean_square_error(y_predict,y_train))
        print(rmse)
    return weights,biases

In [None]:
rmse=[]
omega=[]
omega0=[]
rmse_train=[]

#Already initialized
#X_test (10000, 784)
#y_test (10000, 10)

#X_train (60000, 784)
#y_train (60000, 10)
      
start = time.time()
    
neural_architecture= np.array([[784, 300, 1, 0],[300, 100, 1, 0],[100, 10, 3, 0]])

weights_item,bias_item=mini_batch_gradient_descent(X_train,y_train,neural_architecture,learning_rate = 0.01,num_epoches=30,batch_size=32)
stop = time.time()
duration = stop-start
print(duration)
    
        
y_predict=neural_predict(X_test,weights_item,bias_item,neural_architecture)
rmse.append(root_mean_square_error(y_predict,y_test))
#NEED TO PLOT RMSE

[1.3426342266852378]
[1.3426342266852378, 1.3438253854823055]
[1.3426342266852378, 1.3438253854823055, 1.3430065772983641]
[1.3426342266852378, 1.3438253854823055, 1.3430065772983641, 1.3422369388450015]
[1.3426342266852378, 1.3438253854823055, 1.3430065772983641, 1.3422369388450015, 1.3428328265275615]
[1.3426342266852378, 1.3438253854823055, 1.3430065772983641, 1.3422369388450015, 1.3428328265275615, 1.342994167274502]
[1.3426342266852378, 1.3438253854823055, 1.3430065772983641, 1.3422369388450015, 1.3428328265275615, 1.342994167274502, 1.3431306712304651]
[1.3426342266852378, 1.3438253854823055, 1.3430065772983641, 1.3422369388450015, 1.3428328265275615, 1.342994167274502, 1.3431306712304651, 1.3426218132196919]
[1.3426342266852378, 1.3438253854823055, 1.3430065772983641, 1.3422369388450015, 1.3428328265275615, 1.342994167274502, 1.3431306712304651, 1.3426218132196919, 1.341777428140251]
[1.3426342266852378, 1.3438253854823055, 1.3430065772983641, 1.3422369388450015, 1.3428328265275

In [114]:
neural_architecture= np.array([[784, 300, 1, 0],[300, 100, 1, 0],[100, 10, 3, 0]])
activation_functions=neural_architecture[:,2]


In [None]:
deltas= np.empty(5, dtype=object)
print(deltas)
omega=np.full((3, 2), 0.5)
deltas[2]=omega
print(deltas)

In [115]:
inputs=X_test 
weights=[]

biases=[]
#A list of outputs of each layer
h =[]
#A list of weightned sum for each layer
z=[]
deltas= np.zeros(len(neural_architecture), dtype=object)    
for layer in range(0, len(neural_architecture)):
    #For first time forward propagation random initialization
    weights.append(he_initialization(neural_architecture[layer,1],neural_architecture[layer,0]))
    biases.append(np.zeros(neural_architecture[layer][1]))
    #Just to store as list
    activation_functions=neural_architecture[:,2]
    biases[layer]=biases[layer]-0.001*np.sum(deltas[layer],axis=0)
out = np.add(np.dot(weights[0], inputs.T).T, biases[0])
print(np.shape(weights))
print( len(neural_architecture))
print(len(weights))


(3,)
3
3


In [118]:
for layer in range(0, len(neural_architecture)):
    #print(np.shape(weights[layer]))
    print(np.shape(biases[layer]))

(300,)
(100,)
(10,)
