In [13]:
import numpy as np
import time
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
import math

[Activation functions](https://github.com/Kulbear/deep-learning-nano-foundation/wiki/ReLU-and-Softmax-Activation-Functions)

In [14]:
#Sigmoid
def sigmoid_f(x):
    return 1 / (1 + np.exp(-x))
    
def sigmoid_df(x):  
    sig=sigmoid_f(x)
    return sig*(1-sig)
    
#ReLU
def ReLU_f_over(x):
    return x*(x>0)

def ReLU_f(x): 
    return np.where(x > 0, x, 0)


def ReLU_df(x):
    return np.where(x <= 0, 0, 1)

#Softmax
def softmax_my(x):
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def softmax_df(x):  
    soft=softmax_f(x)
    return soft*(1-soft)

def softmax_f(x):  # работает
    temp = math.e**x
    div = temp.sum(axis=1)
    return np.transpose((np.divide(np.transpose(temp), np.transpose(div))))

    
    

In [15]:
import os
from sklearn.datasets import fetch_mldata

# Fetch MNIST dataset and create a local copy.
if os.path.exists('mnist.npz'):
    with np.load('mnist.npz', 'r') as data:
        X = data['X']
        y = data['y']
else:
    mnist = fetch_mldata("mnist-original")
    X, y = mnist.data / 255.0, mnist.target
    np.savez('mnist.npz', X=X, y=y)

In [16]:
print(np.shape(y.reshape(-1,1)))

(70000, 1)


In [17]:
from sklearn.preprocessing import OneHotEncoder
# Convert labels to One Hot Encoded
num_digits = 10
encoder = OneHotEncoder(categorical_features =[0]) 
y_one_hot = encoder.fit_transform(y.reshape(-1, 1)).toarray()
print(y_one_hot[50000,:])

# def to_one_hot(y): 
#     one_hot_y = np.zeros((len(y), num_digits))
#     for i in range(len(y)):
#         one_hot_y[i, int(one_hot_y[i])] = 1
#     return one_hot_y

[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]


In [18]:
from sklearn.utils import shuffle
X,y=shuffle(X,y)
#Split into test , train based on Kaggle
X_train=X[:60000,:]
y_train=y_one_hot[:60000,:]

X_test=X[60000:,:]
y_test=y_one_hot[60000:,:]
print(np.shape(X_train))
print(np.shape(y_test))

(60000, 784)
(10000, 10)


In [19]:
def he_initialization(size_prev, size_next):  # HE initialization for matrix weights
    return np.random.randn(size_prev, size_next) * np.sqrt(2.0/size_prev)

In [20]:
#Expects weights and bias to be a matrix and a vector, activation_function to be a value= 1,2 or 3
#Use when need to store z, h at each layer (for deltas)
def layer_predict(inputs, weights, bias, activation_function):
    #main sum counting z
    #bias as vector adds to each row of weight matrix
    #example: weights (300, 784), inputs (10000,784), bias (300,) then out (10000, 300)
    out = np.add(np.dot(weights, inputs.T).T, bias)
    # return z and h=f(z), f activation as parameter
    if activation_function == 1:
        return out, sigmoid_f(out)
    if activation_function == 2:
        return out, ReLU_f(out)
    if activation_function == 3:
        return out, softmax_f(out)
#Expects z to be a matrix , activation_function to be a value= 1,2 or 3
#Use when counting delta back propagate using derivatives
def layer_back_propagate_derivatives(z, activation_function):
    if activation_function == 1:
        return sigmoid_df(z)
    if activation_function == 2:
        return ReLU_df(z)
    if activation_function == 3:
        return softmax_df(z)
    
#Expects weights and bias to be a list of matrixes and a list of vectors 
#Use when given input data gets output without storing hidden layers values
def neural_predict(x_data, weights_in, biases_in, neural_architecture):  
    #for each layer in neural architecture (same dim as weights)
    activation_functions=neural_architecture[:,2]
    
    for layer in range(0, len(neural_architecture)):  
    
        #initial layer then take x_data as input
        if layer == 0:
            z_temp, h_temp = layer_predict(x_data, weights_in[layer],biases_in[layer], activation_functions[layer])
            h = h_temp
        else:
            #take h-previous layer output as input
            z_temp, h_temp = layer_predict(h, weights_in[layer],biases_in[layer], activation_functions[layer])
            h = h_temp

    return h
    
def convert_to_binary(vector):
    row_maxes = vector.max(axis=1).reshape(-1, 1)
    vector[:] = np.where(vector == row_maxes, 1, 0)
    return vector

def root_mean_square_error(y_predict,y_actual):
    y_predict=convert_to_binary(y_predict)
    mse=np.sum(np.power(y_actual-y_predict,2))/len(y_predict)
    return np.sqrt(mse)


In [31]:
def one_epoch_mini_batch_gradient_step(X_train,y_train,w_weights,b_biases,neural_architecture,activation_functions,learning_rate,num_epoches,batch_size):
    #A list of outputs of each layer
    h =[]
    #A list of weightned sum for each layer
    z=[]
    weights = w_weights.copy()
    biases = b_biases.copy()
    X_train,y_train=shuffle(X_train,y_train)
    n_minibatches = X_train.shape[0] // batch_size #1875
    i = 0
    for i in range(n_minibatches):
        #A list of outputs of each layer
        h =[]
        #A list of weightned sum for each layer
        z=[]
        deltas=[]
        X_mini=X_train[i * batch_size:(i + 1)*batch_size, :] #(32,784)
        y_mini=y_train[i * batch_size:(i + 1)*batch_size,:] #(32,10)
        #print(np.shape(X_mini))   
        #print(np.shape(y_mini)) 
        N=X_mini.shape[0] #32...
        #PARAMETERS
            
        #Forward propagation
        layers_amount=len(neural_architecture)
        #weights=w_weights.copy()
        #biases=b_biases.copy()
        for layer in range(0, layers_amount):
            #For initial layer take X as input
            if layer==0:
                inputs=X_mini
                #z_temp, h_temp= layer_predict(X_mini, weights[layer], biases[layer], activation_functions[layer])
            #For any other layer take previous layer output as input
            else:
                inputs=h[layer-1]
                #z_temp, h_temp= layer_predict(h[layer-1], weights[layer], biases[layer], activation_functions[layer])
            #Store as lists
            z_temp, h_temp= layer_predict(inputs, weights[layer], biases[layer], activation_functions[layer])
            z.append(z_temp) #0=(32,300) 1=(32,100) 2=(32,10)
            h.append(h_temp)
                
        #y_predict=neural_predict(X_mini,weights,biases,neural_architecture) same as h[layers_amount - 1]
        # z,y_predict=layer_predict(h[layer_amount-1],weights[layer_amount-1],biases[layer_amount-1],
        #activation_functions[layer_amount-1])
        #error=y_mini-y_predict 
            
        #DELTA INITIAL AND HIDDEN HERE
            
        #Initialize empty list size as amount of layers in network
        deltas= np.empty(layers_amount, dtype=object)
        #Count last delta based on (1)
        #(32,10)
        delta_last = np.multiply((h[layers_amount - 1] - y_mini),layer_back_propagate_derivatives(z[layers_amount - 1], activation_functions[layers_amount - 1])) 
        #Store last delta in list
#         print("Delta Last")
#         print(h[layers_amount - 1])
#         print(z[layers_amount - 1])
        deltas[layers_amount-1]=delta_last # matrix size (dataset_size,amount_of_neurons_on_this_layer) transpose weights
        #Back propagating with step=1 , stop=-1 (need count on layer=0) 
        for layer in range(layers_amount-2,-1,-1):
            #Counting delta on hidden layers based on (2)
            deltas[layer]=np.multiply((deltas[layer+1]).dot(weights[layer+1]),layer_back_propagate_derivatives(z[layer], activation_functions[layer]))    
#         for layer in range(0, layers_amount):
#             print("Deltas")
#             print(i)
#             print(deltas[layer][::100])
        #PARAMETERS UPDATES USING DELTAS
        for layer in range(0, layers_amount):
            #For initial layer take X as input
            if layer==0:
                inputs=X_mini
            #For any other layer take previous layer output as input
            else:
                inputs=h[layer-1]
            #update weights
            lN=learning_rate/N
            biases[layer]=biases[layer]-lN*np.sum(deltas[layer],axis=0)
            weights[layer]=weights[layer]-lN*((deltas[layer].T).dot(inputs))
#             print("Learned")
#             print(layer)
#             print(learning_rate*np.sum(deltas[layer],axis=0)/N )
#             print(layer)
#             print("Weights")
            #print(learning_rate*((deltas[layer].T).dot(inputs))/N )
#             print(learning_rate*((deltas[layer].T).dot(inputs))/N)
    return weights,biases

In [34]:
def mini_batch_gradient_descent(X_train,y_train,neural_architecture,learning_rate,num_epoches,batch_size):
    #A list of matrixes size of (new_layer_neurons_amount,prev_layer_neurons_amount)
    w_weights=[]
    #A list of vectors size of (new_layer_neurons_amount,1)
    b_biases=[]
    accuracy=[]
   
    for layer in range(0, len(neural_architecture)):
        #For first time forward propagation random initialization
        w_weights.append(np.full((neural_architecture[layer][1], neural_architecture[layer][0]), 0))
        #w_weights.append(he_initialization(neural_architecture[layer,1],neural_architecture[layer,0]))#(300, 784) (100, 300) (10, 100)
        b_biases.append(np.zeros(neural_architecture[layer][1])) #(300,) (100,) (10,)
    #Just to store as list    
    activation_functions=neural_architecture[:,2]
#     print("Initial weights random")
#     print(w_weights[0][::100,::50])
#     print(b_biases[0][::100])
    it=0
    for it in range(num_epoches):
        #print(weights[0][::100,::50])
        #print(biases[0][::100])
        w1=w_weights
        b1=b_biases
        w_weights,b_biases=one_epoch_mini_batch_gradient_step(X_train,y_train,w_weights,b_biases,neural_architecture,activation_functions,learning_rate,num_epoches,batch_size)
#         print("Epoches changed from initial?")
#         print(w_weights[0][::100,::50])
#         print(b_biases[0][::100])
        y_predict=neural_predict(X_train,w_weights,b_biases,neural_architecture)
        convert_to_binary(y_predict)
        print(y_predict[::10000])
        print(it)
        print(np.array_equal(w1, w_weights) )
        print(np.array_equal(b1, b_biases) )
        accuracy.append(accuracy_score(y_train,y_predict)*100)
        print(accuracy)
    return w_weights,b_biases

In [35]:
rmse=[]
omega=[]
omega0=[]
rmse_train=[]

#Already initialized
#X_test (10000, 784)
#y_test (10000, 10)

#X_train (60000, 784)
#y_train (60000, 10)
      
start = time.time()
    
neural_architecture= np.array([[784, 300, 2, 0],[300, 10, 3, 0]])

weights_item,bias_item=mini_batch_gradient_descent(X_train,y_train,neural_architecture,learning_rate = 50,num_epoches=30,batch_size=32)
stop = time.time()
duration = stop-start
print(duration)
    
        
y_predict=neural_predict(X_test,weights_item,bias_item,neural_architecture)
rmse.append(root_mean_square_error(y_predict,y_test))
#NEED TO PLOT RMSE

[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
0
False
False
[9.751666666666667]
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]
1
False
False
[9.751666666666667, 9.035]
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
2
False
False
[9.751666666666667, 9.035, 9.751666666666667]
[[0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
3
False
False
[9.751666666666667, 9.035, 9.751666666666667, 9.7516666

  
  return umr_maximum(a, axis, None, out, keepdims, initial)


[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
10
False
False
[9.751666666666667, 9.035, 9.751666666666667, 9.751666666666667, 9.915000000000001, 10.441666666666666, 11.236666666666666, 9.915000000000001, 9.035, 9.915000000000001, 0.0]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
11
False
False
[9.751666666666667, 9.035, 9.751666666666667, 9.751666666666667, 9.915000000000001, 10.441666666666666, 11.236666666666666, 9.915000000000001, 9.035, 9.915000000000001, 0.0, 0.0]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
12
False
False
[9.75

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
29
False
False
[9.751666666666667, 9.035, 9.751666666666667, 9.751666666666667, 9.915000000000001, 10.441666666666666, 11.236666666666666, 9.915000000000001, 9.035, 9.915000000000001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
469.01992893218994


In [None]:
from matplotlib import pyplot as plt

def mse_plot(rmse):  
    plt.figure(figsize=(7, 7))
    plt.plot(rmse, label="MSE train")
    plt.legend()
    plt.show()
    
mse_plot(rmse)    

In [None]:
deltas= np.empty(5, dtype=object)
print(deltas)
omega=np.full((3, 2), 0.5)
deltas[2]=omega
print(deltas)

In [None]:
X_test,y_test=shuffle(X_test,y_test)
inputs=X_test[:100,:]  
weights=[]
neural_architecture= np.array([[784, 300, 1, 0],[300, 100, 1, 0],[100, 10, 3, 0]])
activation_functions=neural_architecture[:,2]
biases=[]
#A list of outputs of each layer
h =[]
#A list of weightned sum for each layer
z=[]
deltas= np.zeros(len(neural_architecture), dtype=object)    
for layer in range(0, len(neural_architecture)):
    #For first time forward propagation random initialization
    #weights.append(he_initialization(neural_architecture[layer,1],neural_architecture[layer,0]))
    #weights.append(np.zeros(neural_architecture[layer][1],neural_architecture[layer][0]))
    weights.append(np.full((neural_architecture[layer][1], neural_architecture[layer][0]), 0))
    biases.append(np.zeros(neural_architecture[layer][1]))
    #Just to store as list

    
out = np.add(np.dot(weights[0], inputs.T).T, biases[0])
#print(np.shape(weights))
#print( len(neural_architecture))
#print(len(weights))

z,y_predict=layer_predict(inputs,weights[0],biases[0],1)
z,y2=layer_predict(y_predict,weights[1],biases[1],1)
z,y3=layer_predict(y2,weights[2],biases[2],3)

y3_2=neural_predict(inputs,weights, biases, neural_architecture)
#print(np.shape(y3_2))
#convert_to_binary(y_predict)
#print(softmax_f(out3))
layers_amount=len(neural_architecture)
        #weights=w_weights.copy()
        #biases=b_biases.copy()
        #A list of outputs of each layer
h =[]
#A list of weightned sum for each layer
z=[]
for layer in range(0, layers_amount):
            #For initial layer take X as input
    if layer==0:
        inputs1=X_test[:100,:]
                #z_temp, h_temp= layer_predict(X_mini, weights[layer], biases[layer], activation_functions[layer])
            #For any other layer take previous layer output as input
    else:
        inputs1=h[layer-1]
                #z_temp, h_temp= layer_predict(h[layer-1], weights[layer], biases[layer], activation_functions[layer])
            #Store as lists
    z_temp, h_temp= layer_predict(inputs1, weights[layer], biases[layer], activation_functions[layer])
    z.append(z_temp) #0=(32,300) 1=(32,100) 2=(32,10)
    h.append(h_temp)
#print(np.shape(h[layers_amount - 1] - y_test[:100,:]))
#print(np.shape(layer_back_propagate_derivatives(z[layers_amount - 1], activation_functions[layers_amount - 1])))
delta_last = np.multiply((h[layers_amount - 1] - y_test[:100,:]),layer_back_propagate_derivatives(z[layers_amount - 1], activation_functions[layers_amount - 1]))
#print( h[layers_amount - 1] - y_test[:100,:])
#print(delta_last)
deltas[layers_amount-1]=delta_last

for layer in range(layers_amount-2,-1,-1):
    #Counting delta on hidden layers based on (2)
    #print(np.shape(layer_back_propagate_derivatives(z[layer], activation_functions[layer])))
    deltas[layer]=np.multiply((deltas[layer+1]).dot(weights[layer+1]),layer_back_propagate_derivatives(z[layer], activation_functions[layer])) 

print(np.shape(deltas[2]))

for layer in range(0, layers_amount):
    #For initial layer take X as input
    if layer==0:
        inputs2=X_test[:100,:]
    #For any other layer take previous layer output as input
    else:
        inputs2=h[layer-1]
                #update weights
    N=100
    #print(np.shape(weights[layer]))
    biases[layer]=biases[layer]-0.001*np.sum(deltas[layer],axis=0)/N
    weights[layer]=weights[layer]-0.001*((deltas[layer].T).dot(inputs2))/N
    #print(weights[layer])
print(deltas[2].T)
print(h[1])
print((deltas[2].T).dot(h[1]))
print(weights[2])


In [None]:

layer=0
for layer in range(len(neural_architecture)):
    #print(np.shape(weights[layer]))
    print(len(neural_architecture))

In [None]:
xs = np.array([[0.5, 0.5, 0.5, 0.5,0.5,0.5, 0.5, 0.5, 0.5,0.5],
             [0.5, 0.5, 0.5, 0.5,0.5,0.5, 0.5, 0.5, 0.5,0.5],
             [0.5, 0.5, 0.5, 0.5,0.5,0.5, 0.5, 0.5, 0.5,0.5]])
print(softmax_f(xs)) 

In [None]:
print(X_train[::2000,::50])

In [None]:
#print(-5.62601022e-051 )
numbers = [1.74408899e-012, 2.37943956e-004, 2.63039636e-006, 2.55056039e-010,
  3.21408160e-051, 9.99759377e-001, 2.54537571e-008, 2.25822913e-008,
  2.00556809e-037, 2.19572601e-128]

for number in numbers:
    print(f'{number:9.51f}')