In [100]:
import numpy as np
import h5py
import tensorflow as tf
import matplotlib.pyplot as plt
import scipy
import decimal
# decimal.getcontext().prec = 1000
# np.seterr(divide='ignore', invalid='ignore')

In [101]:
def load_data():
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    return x_train, y_train, x_test, y_test

In [102]:
def network_initialization(num_input, hidden_units, num_classes, x_train):
    #number of classes is 10 --> 0-9
    #number of input features are 28 * 28
    #xTw + b ==> x = [1, number of features][number of features, number of class]
    # ==> [1, number of classes] <== b
    w1 = np.random.rand(num_input, hidden_units)
    b1 = np.random.rand(1, num_classes)
    w2 = np.random.rand(hidden_units, hidden_units)
    b2 = np.random.rand(1, hidden_units)
    w_out = np.random.rand(hidden_units, num_classes)
    b_out = np.random.rand(1, num_classes)
    param = {}
    param["w1"] = w1
    param["w2"] = w2
    param["w_out"] = w_out
    param["b1"] = b1
    param["b2"] = b2
    param["b_out"] = b_out
                     
    return param

In [103]:
def oneHot(y):
    m = y.shape[0]
    OH = scipy.sparse.csr_matrix((np.ones(m), (y, np.array(range(m)))))
    OH = np.array(OH.todense()).T
    return OH

In [104]:
def test_preparation(x_train, x_test, y_train, y_test):
    hyperparam = {}
    hyperparam["num_epochs"] = 1000
    hyperparam["batch_size"] = 64
    hyperparam["num_samples"] = x_train.shape[0]
    hyperparam["mini_BGD"] = False
    hyperparam["num_batch"] = int(hyperparam["num_samples"] / hyperparam["batch_size"])
    hyperparam["learning_rate"] = 0.001
    last_batch = hyperparam["num_samples"] % hyperparam["batch_size"]

    y_train = oneHot(y_train)
    y_test = oneHot(y_test)

    loss_epoch_report = 0
    loss_batch_report = 0
    x_train = x_train.reshape((x_train.shape[0], x_train.shape[1] * x_train.shape[2]))
    x_test = x_test.reshape((x_test.shape[0], x_test.shape[1] * x_test.shape[2]))
    return x_train, x_test, y_train, y_test, hyperparam

In [105]:
def logloss(y_hat, y):
    eps = np.finfo(float).eps
    predictions = np.clip(y_hat, eps, 1 - eps)
#     print("q")
    predictions /= np.sum(predictions)[np.newaxis]
    rows = y.shape[0]
    vsota = np.sum(y * np.log(predictions))
#     print("l")
    value = (-1.0 / rows * vsota)
#     print("c")
    return value

In [118]:
def softmax(z):
#     z = np.exp(z - np.max(z, axis = 1)[:, np.newaxis])
#     z = z / np.sum(z, axis = 1)[:, np.newaxis] #normalizing for softmax
    a = np.exp(z) / np.sum(np.exp(z), axis = 1)[:, np.newaxis]
    a_output = np.zeros_like(a)
    a_output[np.arange(len(a)), a.argmax(1)] = 1
    return a_output, a

In [119]:
def ReLU(x):
    return x * (x > 0)

def dReLU(x):
    return 1. * (x > 0)

In [151]:
x_train, y_train, x_test, y_test = load_data()
from sklearn import preprocessing
param = network_initialization(784, 10, 10, x_train)

x_train, x_test, y_train, y_test, hyperparam = test_preparation(
    x_train, x_test, y_train, y_test)

# print(x_train[0])
x_train = x_train / np.sum(x_train, axis = 1)[:, np.newaxis]
# x_train.shape
# print(x_train[0])

In [152]:
np.sum(x_train[2])

1.0

In [153]:
# param = network_initialization(784, 10, x_train)
# def fit_SGD(x_train, y_train, y_test, hyperparam, param):
import copy
loss_epoch_report = 0
accuracy = 0
loss1 = 0
loss2 = 0
learning_rate = 0.01 #hyperparam["learning_rate"]
# y_epoch = copy.deepcopy(y_train)
# x_epoch = copy.deepcopy(x_train)
for epoch in range(hyperparam["num_epochs"]):

    shuffling = np.arange(hyperparam["num_samples"])
    np.random.RandomState(seed=1).shuffle(shuffling)
    x_epoch = x_train[shuffling]
    y_epoch = y_train[shuffling]

    #calculating xTw + b
#     print(param["w1"][0])
    z1 = np.dot(x_epoch, param["w1"]) # + param["b1"]
    a1 = np.tanh(z1) # 6000 * 10
#     print(a1[0])
  
    z_out = np.dot(a1, param["w_out"]) # + param["b_out"]
    a_out, a_out_prob = softmax(z_out)
#     print(a_out_prob[0])

#     z2 = np.dot(a1, param["w2"]) # + param["b2"]
#     a2 = np.tanh(z2)


#     z_out = np.dot(a2, param["w_out"]) # + param["b_out"]
#     a_out, a_out_prob = softmax(z_out)


    #softmax class output
    y_hat = np.argmax(a_out, axis = 1)[:, np.newaxis]
    
    #class output of true labels
    y_label = np.argmax(y_epoch, axis = 1)[:, np.newaxis]
    print(y_hat, y_label)

    #calculating accuracy of the model
    accuracy = (y_hat == y_label).all(axis = 1).mean()
    
    #back_propagation calculation
    #it is like calculating the layer values based on its output
    # and its supposed output --> based on this we can calculate
    # the weights change
#     dz_out = a_out_prob - y_epoch #6000*10
#     dw_out = np.matmul(a2.T, dz_out) #10*10
    
    dz_out = a_out_prob - y_epoch #6000*10
    print(np.sum(dz_out))
#     print(dz_out[0])
#     print(np.sum(a1.T))
    dw_out = np.dot(a1.T, dz_out) #10*10
#     print(dw_out[0])

    #now again, we have to calculate the layer values change based
    # on its output and its supposed output --> then calculate the
    # weights
#     dz2 = np.dot(dz_out, param["w_out"].T) * (1 - np.power(a2, 2))
#     dw2 = np.dot(a1.T, dz2)  #784*10
    
    print(dz_out.shape, param["w_out"].T[0])
    dz1 = np.dot(dz_out, param["w_out"].T) * a1 *(1 - a1)
#     print(dz1[0])
    dw1 = np.dot(x_epoch.T, dz1)
#     dz1 = np.dot(dz2, param["w2"].T) * (1 - np.power(a1, 2))
#     dw1 = np.dot(x_epoch.T, dz1)
#     print(dw1[0])
#     print(x_epoch.T[0])
    
#     print("a")
    
    param["w1"] = param["w1"] - learning_rate * dw1
#     print(param["w1"][0])
#     param["w2"] = param["w2"] - learning_rate * dw2
    param["w_out"] = param["w_out"] - learning_rate * dw_out
    
#     print("f")
    #calling logloss function
    loss_epoch_report = logloss(y_epoch, a_out)

    #calculating gradient of weight and bias
#     weight_grad = np.dot(x_epoch.T, softmax_out - y_epoch)
#     bias_grad = -(softmax_out - y_epoch)

    #updating parameters
#     param["weight"] =  param["weight"] - learning_rate * weight_grad
#     param["bias"] = param["bias"] - (learning_rate * bias_grad)
#     print("g")
    #report so far
    print("accuracy is: ", accuracy)
    print("loss for epoch ", epoch, " is: ", loss_epoch_report)
    
    if epoch % 60 == 0:
        learning_rate /= 2
    loss2 = loss_epoch_report
    loss_epoch_report = 0
    accuracy = 0
# return param

[[8]
 [1]
 [9]
 ...
 [5]
 [9]
 [9]] [[3]
 [2]
 [7]
 ...
 [0]
 [9]
 [0]]
-59990.00000000001
(60000, 10) [0.99359561 0.84545361 0.46470173 0.53080517 0.26132875 0.38718854
 0.58506732 0.09596762 0.69085826 0.76676904]
accuracy is:  0.15011666666666668
loss for epoch  0  is:  41.635000129058426
[[5]
 [5]
 [5]
 ...
 [5]
 [5]
 [5]] [[3]
 [2]
 [7]
 ...
 [0]
 [9]
 [0]]
-59989.99999999999
(60000, 10) [27.69001251 27.94495759 25.09298358 28.62185457 27.56237093 26.59642753
 28.0718432  26.38011257 27.1840102  26.3820183 ]
accuracy is:  0.09665
loss for epoch  1  is:  43.562134130263246
[[1]
 [1]
 [5]
 ...
 [1]
 [1]
 [1]] [[3]
 [2]
 [7]
 ...
 [0]
 [9]
 [0]]
-59990.00000000001
(60000, 10) [46.74530519 45.56201311 41.69536974 47.03865151 45.22105546 45.17584214
 46.69360477 41.92082172 45.02519218 44.90436844]
accuracy is:  0.1258
loss for epoch  2  is:  42.511461633970455


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]] [[3]
 [2]
 [7]
 ...
 [0]
 [9]
 [0]]
nan
(60000, 10) [76.35433579 75.1713449  71.30423977 76.64810527 74.83060009 74.78481212
 76.30293103 71.53039014 74.63453344 74.51337894]
accuracy is:  0.09871666666666666
loss for epoch  3  is:  43.48764391325905
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]] [[3]
 [2]
 [7]
 ...
 [0]
 [9]
 [0]]
nan
(60000, 10) [nan nan nan nan nan nan nan nan nan nan]
accuracy is:  0.09871666666666666
loss for epoch  4  is:  43.48764391325905
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]] [[3]
 [2]
 [7]
 ...
 [0]
 [9]
 [0]]
nan
(60000, 10) [nan nan nan nan nan nan nan nan nan nan]
accuracy is:  0.09871666666666666
loss for epoch  5  is:  43.48764391325905
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]] [[3]
 [2]
 [7]
 ...
 [0]
 [9]
 [0]]
nan
(60000, 10) [nan nan nan nan nan nan nan nan nan nan]
accuracy is:  0.09871666666666666
loss for epoch  6  is:  43.48764391325905
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]] [[3]
 [2]
 [7]
 ...
 [0]
 [9]
 [0]]
nan
(60000, 10) [nan na

KeyboardInterrupt: 