In [5]:
###################
# Pick An Example #
###################

#[Input Vars]
#  1. <int> i: the index of the training dataset
#  2. <dataframe> DF_TRAIN: Dataframe-like format
#
#[Output Vars]
#  1. <ndarray> X: Image
#  2. <ndarray> Y: The label to the image

import numpy as np

def pick_an_example(i, DF_TRAIN):
    """Return the i-th training sample as an (image, label) pair.

    Args:
        i: Positional row index into ``DF_TRAIN``.
        DF_TRAIN: DataFrame whose first column holds the label and whose
            remaining columns hold the 784 flattened pixel values.

    Returns:
        X: ndarray of shape (28, 28, 1) built from the pixel columns.
        Y: The value stored in the first column of the selected row.
    """
    # Slice out a one-row frame and take that row as a flat ndarray.
    row = DF_TRAIN[i:i + 1].values[0]
    label, pixels = row[0], row[1:]
    return pixels.reshape(28, 28, 1), label

#i = np.random.randint(len(DF_TRAIN))
#X, Y = pick_an_example(i, DF_TRAIN)
#print(f"The Label of the {i}-th example is {Y}\nThe corresponding data is at below.")
#plt.imshow(X, cmap=plt.get_cmap('gray'))
#plt.show()

In [6]:
################
# Zero Padding #
################

#[Input Vars]
#  1. <ndarray> X: Unpadded image. The shape is (n_H_prev, n_W_prev, n_C_prev).
#  2. <int> pad: number of zero rows/columns added on each side of the height and width axes.
#
#[Output Vars]
#  1. <ndarray> X_pad: Padded image. The shape is (n_H_prev + 2 * pad, n_W_prev + 2 * pad, n_C_prev).

import numpy as np

def __zero_pad(X, pad):
    """Surround the two leading (spatial) axes of X with a border of zeros.

    Args:
        X: 3-D array, laid out as (height, width, channels).
        pad: Number of zeros added before and after each spatial axis.

    Returns:
        Array of shape (H + 2 * pad, W + 2 * pad, C); the last axis is
        left unpadded.
    """
    border = (pad, pad)
    untouched = (0, 0)
    return np.pad(X, (border, border, untouched), mode="constant", constant_values=0)

#X_pad = __zero_pad(X, 2)

#plt.imshow(X_pad, cmap=plt.get_cmap('gray'))
#plt.show()

In [7]:
####################
# Conv Single Step #
####################

#[Input Vars]
#  1. <ndarray> s_slice: slice of the previous feature maps. The shape is (f, f, n_C_prev).
#  2. <ndarray> K: A single weight matrix (kernel). The shape is (f, f, n_C_prev).
#  3. <ndarray> b: A single bias term. The shape is (1, 1, 1).
#
#[Output Vars]
#  1. <float> Z: a scalar derived from convolution operation.

import numpy as np

def __conv_single_step(s_slice, K, b):
    """Apply one kernel to one input window and add the bias.

    Args:
        s_slice: Window of the (padded) input, shape (f, f, n_C_prev).
        K: A single kernel with the same shape as ``s_slice``.
        b: The kernel's bias: a scalar or an array holding exactly one
            element (e.g. shape (1, 1, 1)).

    Returns:
        Scalar: sum of the element-wise product of ``s_slice`` and ``K``
        plus the bias.

    Raises:
        ValueError: If ``b`` holds more than one element.
    """
    weighted = np.multiply(s_slice, K)
    # float() on an ndarray with ndim > 0 is deprecated since NumPy 1.25;
    # .item() extracts the single element regardless of dimensionality.
    bias = float(np.asarray(b).item())
    return np.sum(weighted) + bias

In [8]:
############################
# Conv Forward Propagation #
############################

#[Input Vars]
#  1. <ndarray> S_prev: The previous feature maps (after activation and pooling). The shape is (n_H_prev, n_W_prev, n_C_prev).
#  2. <ndarray> K: Kernels in a layer. The shape is (f, f, n_C_prev, n_C).
#  3. <ndarray> b: biases in a layer. The shape is (1, 1, 1, n_C).
#  4. <dictionary> hparam: this contains hyper parameters like "pad" and "stride".
#
#[Output Vars]
#  1. <ndarray> C: This would be the feature map in the next layer (but before activation). The shape is (n_H, n_W, n_C).
#  2. <tuple> cache: (S_prev, K, b, hparam), the values needed for backward propagation.

import numpy as np

def conv_forward(S_prev, K, b, hparam):
    """Forward pass of a convolution (cross-correlation) layer for one example.

    Args:
        S_prev: Input feature maps, shape (n_H_prev, n_W_prev, n_C_prev).
        K: Square kernels, shape (f, f, n_C_prev, n_C).
        b: Biases, shape (1, 1, 1, n_C); one bias per kernel.
        hparam: Dict providing integer "stride" and "pad" (number of zeros
            added on each side of the two spatial axes).

    Returns:
        C: Pre-activation feature maps, shape (n_H, n_W, n_C), where
            n_H = (n_H_prev - f + 2 * pad) // stride + 1 (n_W likewise).
        cache: Tuple (S_prev, K, b, hparam) consumed by conv_backward.

    Raises:
        ValueError: If the kernels are not square, if their depth differs
            from the channel count of ``S_prev``, or if the kernel does not
            fit inside the padded input.
    """
    # 1. Shapes of the input and of the kernels.
    (n_H_prev, n_W_prev, n_C_prev) = S_prev.shape
    (f, f_w, n_C_kernel, n_C) = K.shape

    # 2. Validate up front: a depth mismatch could otherwise broadcast
    #    silently (e.g. a 1-channel input against a 6-channel kernel).
    if f != f_w:
        raise ValueError(f"kernels must be square, got {f}x{f_w}")
    if n_C_kernel != n_C_prev:
        raise ValueError(
            f"kernel depth {n_C_kernel} does not match input channels {n_C_prev}"
        )

    # 3. Hyper parameters.
    stride = hparam["stride"]
    pad = hparam["pad"]

    # 4. Output size. Floor division keeps the arithmetic in integers.
    n_H = (n_H_prev - f + 2 * pad) // stride + 1
    n_W = (n_W_prev - f + 2 * pad) // stride + 1
    if n_H < 1 or n_W < 1:
        raise ValueError(
            f"a {f}x{f} kernel does not fit a {n_H_prev}x{n_W_prev} input with pad={pad}"
        )

    # 5. One output channel per kernel.
    C = np.zeros((n_H, n_W, n_C))

    # 6. Pad the input once.
    S_prev_pad = __zero_pad(S_prev, pad)

    # 7. Cross-correlation. The window depends only on (h, w), so it is
    #    sliced once and reused for every kernel.
    for h in range(n_H):
        top = h * stride
        for w in range(n_W):
            left = w * stride
            window = S_prev_pad[top:top + f, left:left + f, :]
            for c in range(n_C):
                C[h, w, c] = __conv_single_step(window, K[:, :, :, c], b[:, :, :, c])

    # 8. Values needed by the backward pass.
    cache = (S_prev, K, b, hparam)

    return C, cache

In [9]:
############################
# Pool Forward Propagation #
############################

#[Input Vars]
#  1. <ndarray> A_prev: The previous feature maps (after activation). The shape is (n_H_prev, n_W_prev, n_C_prev).
#  2. <dictionary> hparam: It contains "f" and "stride".
#  3. <string> mode: Switch between "maxpooling" and "avgpooling".
#
#[Output Vars]
#  1. <ndarray> S: The output feature map after pooling operation. The shape is (n_H, n_W, n_C). (n_C = n_C_prev)
#  2. <tuple> cache: (A_prev, hparam), the values needed for backward propagation.

import numpy as np

def pool_forward(A_prev, hparam, mode = "maxpooling"):
    """Forward pass of a pooling layer for one example.

    Args:
        A_prev: Input feature maps, shape (n_H_prev, n_W_prev, n_C_prev).
        hparam: Dict providing integer "f" (window size) and "stride".
        mode: "maxpooling" or "avgpooling".

    Returns:
        S: Pooled feature maps, shape (n_H, n_W, n_C_prev), where
            n_H = (n_H_prev - f) // stride + 1 (n_W likewise).
        cache: Tuple (A_prev, hparam) consumed by pool_backward.

    Raises:
        ValueError: If ``mode`` is not one of the two supported modes
            (previously this silently produced an all-zero output).
    """
    # 1. Pick the window reduction up front so that a typo in `mode` fails
    #    immediately instead of yielding zeros.
    reducers = {"maxpooling": np.max, "avgpooling": np.mean}
    if mode not in reducers:
        raise ValueError(
            f"unknown pooling mode {mode!r}; expected 'maxpooling' or 'avgpooling'"
        )
    reduce_window = reducers[mode]

    # 2. Shape of the input and hyper parameters.
    (n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    f = hparam["f"]
    stride = hparam["stride"]

    # 3. Output shape (clamped at 0 when the window is larger than the input).
    n_H = max((n_H_prev - f) // stride + 1, 0)
    n_W = max((n_W_prev - f) // stride + 1, 0)
    n_C = n_C_prev

    # 4. Output buffer.
    S = np.zeros((n_H, n_W, n_C))

    # 5. Pool every window; channels are independent, so each window is
    #    reduced over its two spatial axes for all channels at once.
    for h in range(n_H):
        top = h * stride
        for w in range(n_W):
            left = w * stride
            window = A_prev[top:top + f, left:left + f, :]
            S[h, w, :] = reduce_window(window, axis=(0, 1))

    # 6. Values needed by the backward pass.
    cache = (A_prev, hparam)

    return S, cache

In [10]:
#############################
# Conv Backward Propagation #
#############################

#[Input Vars]
#  1. <ndarray> dC: gradient of the cost with respect to the output of the conv layer (C). The shape is (n_H, n_W, n_C).
#  2. <tuple> cache: The cache (S_prev, K, b, hparam) returned by conv_forward()
#
#[Output Vars]
#  1. <ndarray> dS_prev: gradient of the cost w.r.t. the input of the conv layer (S). The shape is (n_H_prev, n_W_prev, n_C_prev).
#  2. <ndarray> dK: gradient of the cost w.r.t. the weights of the conv layer (K). The shape is (f, f, n_C_prev, n_C).
#  3. <ndarray> db: gradient of the cost w.r.t. the biases of the conv layer (b). The shape is (1, 1, 1, n_C).

def conv_backward(dC, cache):
    """Backward pass of a convolution layer for one example.

    Args:
        dC: Gradient of the cost w.r.t. the layer output, shape (n_H, n_W, n_C).
        cache: Tuple (S_prev, K, b, hparam) produced by conv_forward.

    Returns:
        dS_prev: Gradient w.r.t. the layer input, shape
            (n_H_prev, n_W_prev, n_C_prev).
        dK: Gradient w.r.t. the kernels, shape (f, f, n_C_prev, n_C).
        db: Gradient w.r.t. the biases, shape (1, 1, 1, n_C).
    """
    # Unpack what the forward pass stored (the bias values are not needed).
    S_prev, K, b, hparam = cache
    n_H_prev, n_W_prev, _ = S_prev.shape
    # Kernel size and input depth come from the kernel tensor.
    _, f, n_C_prev, _ = K.shape
    stride, pad = hparam["stride"], hparam["pad"]
    # Output size and channel count come from the upstream gradient.
    n_H, n_W, n_C = dC.shape

    # Gradient buffers.
    dS_prev = np.zeros((n_H_prev, n_W_prev, n_C_prev))
    dK = np.zeros((f, f, n_C_prev, n_C))
    db = np.zeros((1, 1, 1, n_C))

    # Work on padded copies so that windows line up with the forward pass.
    S_prev_pad = __zero_pad(S_prev, pad)
    dS_prev_pad = __zero_pad(dS_prev, pad)

    for h in range(n_H):
        rows = slice(h * stride, h * stride + f)
        for w in range(n_W):
            cols = slice(w * stride, w * stride + f)
            window = S_prev_pad[rows, cols, :]
            for c in range(n_C):
                upstream = dC[h, w, c]
                # Each output value spreads its gradient over its window ...
                dS_prev_pad[rows, cols, :] += K[:, :, :, c] * upstream
                # ... and contributes to its kernel and bias.
                dK[:, :, :, c] += window * upstream
                db[:, :, :, c] += upstream

    # Strip the padding again (pad == 0 needs no slicing).
    if pad == 0:
        dS_prev = dS_prev_pad
    else:
        dS_prev[:, :, :] = dS_prev_pad[pad:-pad, pad:-pad, :]

    assert dS_prev.shape == (n_H_prev, n_W_prev, n_C_prev)

    return dS_prev, dK, db

In [11]:
############################
# Max Pool Backward helper #
############################

import numpy as np

def __create_mask_from_window(s):
    """Return a boolean mask marking where ``s`` attains its maximum.

    Every position equal to the maximum is marked, so ties yield several
    True entries.
    """
    peak = np.max(s)
    return s == peak

############################
# Avg Pool Backward helper #
############################

def __distribute_value(ds, shape):
    """Spread the scalar ``ds`` evenly over a matrix of the given shape.

    Args:
        ds: Value to distribute.
        shape: Pair (n_H, n_W) giving the size of the result.

    Returns:
        Array of shape ``shape`` whose entries all equal ds / (n_H * n_W).
    """
    rows, cols = shape
    share = ds / (rows * cols)
    return share * np.ones(shape)

In [14]:
#############################
# Pool Backward Propagation #
#############################

#[Input Vars]
#  1. <ndarray> dS: gradient of cost w.r.t. the output of the pooling layer. The shape is the same as the shape of S.
#  2. <tuple> cache: It contains (A_prev, hparam) from the forward pass.
#  3. <string> mode: Switch between "maxpooling" and "avgpooling".
#
#[Output Vars]
#  1. <ndarray> dA_prev: gradient of cost w.r.t. the input of the pooling layer. The shape is the same as the shape of A_prev.

import numpy as np

def pool_backward(dS, cache, mode = "maxpooling"):
    """Backward pass of a pooling layer for one example.

    Args:
        dS: Gradient of the cost w.r.t. the pooling output, shape (n_H, n_W, n_C).
        cache: Tuple (A_prev, hparam) produced by pool_forward; ``hparam``
            provides "f" and "stride".
        mode: "maxpooling" or "avgpooling"; must match the forward pass.

    Returns:
        dA_prev: Gradient of the cost w.r.t. the pooling input, same shape
        as ``A_prev``.

    Raises:
        ValueError: If ``mode`` is not one of the two supported modes
            (previously this silently returned an all-zero gradient).
    """
    # 1. Reject unknown modes before doing any work.
    if mode not in ("maxpooling", "avgpooling"):
        raise ValueError(
            f"unknown pooling mode {mode!r}; expected 'maxpooling' or 'avgpooling'"
        )

    # 2. Retrieve the forward-pass input and the hyper parameters.
    (A_prev, hparam) = cache
    stride = hparam["stride"]
    f = hparam["f"]

    # 3. The loops run over the output positions.
    n_H, n_W, n_C = dS.shape

    # 4. Gradient buffer with the shape of the pooling input.
    dA_prev = np.zeros(A_prev.shape)

    # 5. Route each output gradient back into its window.
    for h in range(n_H):
        rows = slice(h * stride, h * stride + f)
        for w in range(n_W):
            cols = slice(w * stride, w * stride + f)
            for c in range(n_C):
                if mode == "maxpooling":
                    # Only the position(s) holding the window maximum
                    # receive the gradient.
                    mask = __create_mask_from_window(A_prev[rows, cols, c])
                    dA_prev[rows, cols, c] += np.multiply(mask, dS[h, w, c])
                else:
                    # Average pooling shares the gradient equally over the
                    # f x f window.
                    dA_prev[rows, cols, c] += __distribute_value(dS[h, w, c], (f, f))

    return dA_prev

In [25]:
#####################################
# Functions for Forward Propagation #
#####################################

# [Input Vars]
#   1. <ndarray> Z
#
# [Output Vars]
#   1. <ndarray> A

import numpy as np

def activation_forward(Z, mode):
    """Apply an element-wise activation function.

    Args:
        Z: Pre-activation values (ndarray or NumPy scalar).
        mode: "sigmoid" or "relu".

    Returns:
        A: Activated values, same shape as ``Z``.

    Raises:
        ValueError: If ``mode`` is not a supported activation (previously
            this surfaced as an UnboundLocalError).
    """
    if mode == "sigmoid":
        # 1 / (1 + exp(-Z)) rewritten as exp(-log(1 + exp(-Z))): logaddexp
        # evaluates the logarithm without overflowing for large negative Z.
        return np.exp(-np.logaddexp(0, -Z))
    if mode == "relu":
        # max(Z, 0) also maps -inf to 0, where Z * (Z > 0) would give NaN.
        return np.maximum(Z, 0)
    raise ValueError(f"unknown activation mode {mode!r}; expected 'sigmoid' or 'relu'")

def activation_backward(X, mode):
    """Local derivative of an activation, expressed through its OUTPUT.

    Args:
        X: Activation output A = act(Z) (not the pre-activation Z).
        mode: "sigmoid" or "relu".

    Returns:
        D_Z_local: dA/dZ evaluated element-wise, same shape as ``X``.
        ``X`` itself is left untouched.

    Raises:
        ValueError: If ``mode`` is not a supported activation (previously
            this surfaced as an UnboundLocalError).
    """
    if mode == "sigmoid":
        # sigmoid'(Z) = A * (1 - A)
        return np.multiply(1 - X, X)
    if mode == "relu":
        # relu'(Z) is 1 where the output is positive and 0 elsewhere. Build a
        # new array: writing into X would corrupt the caller's activations.
        X = np.asarray(X)
        return (X > 0).astype(X.dtype)
    raise ValueError(f"unknown activation mode {mode!r}; expected 'sigmoid' or 'relu'")

# [Input Vars]
#   1. <ndarray> A
#
# [Output Vars]
#   1. <ndarray> Y_pred
def __softmax(A):
    """Softmax over ALL elements of ``A``.

    The global maximum is subtracted before exponentiating so that the
    largest exponent is exp(0); this does not change the result.
    """
    exponentials = np.exp(A - np.max(A))
    return exponentials / np.sum(exponentials)

In [16]:
# Initialize the Kernels, Biases, and hparams

def Initialize_Parameters(low, high):
    """Create the randomly initialised parameters of the network.

    Every kernel, bias and dense weight is drawn from the uniform
    distribution on [low, high) using NumPy's global random state.

    Returns:
        Tuple (K_C1, b_C1, hparam_C1, hparam_S2, K_C3, b_C3, hparam_C3,
        hparam_S4, K_C5, b_C5, hparam_C5, W7, W8).
    """
    def draw(*shape):
        # One uniform sample per entry of an array with the given shape.
        return np.random.uniform(low=low, high=high, size=shape)

    # Convolution layers: 5x5 kernels plus one bias per kernel. The draws
    # are made in network order (C1, C3, C5, W7, W8).
    K_C1, b_C1 = draw(5, 5, 1, 6), draw(1, 1, 1, 6)
    K_C3, b_C3 = draw(5, 5, 6, 16), draw(1, 1, 1, 16)
    K_C5, b_C5 = draw(5, 5, 16, 120), draw(1, 1, 1, 120)

    # Dense layers (no bias terms).
    W7 = draw(120, 84)
    W8 = draw(84, 10)

    # Hyper parameters: only C1 pads its input; both pooling layers use a
    # 2x2 window with stride 2.
    hparam_C1 = {"stride": 1, "pad": 2}
    hparam_S2 = {"f": 2, "stride": 2}
    hparam_C3 = {"stride": 1, "pad": 0}
    hparam_S4 = {"f": 2, "stride": 2}
    hparam_C5 = {"stride": 1, "pad": 0}

    return (K_C1, b_C1, hparam_C1, hparam_S2,
            K_C3, b_C3, hparam_C3, hparam_S4,
            K_C5, b_C5, hparam_C5, W7, W8)

In [17]:

# LeNet5 - Forward Propagation

def LeNet5_forward(X, K_C1, b_C1, hparam_C1, hparam_S2, K_C3, b_C3, hparam_C3, hparam_S4, K_C5, b_C5, hparam_C5, W7, W8, pool_mode = "avgpooling", act_mode = "sigmoid"):
    """Run one example through the network.

    Pipeline: conv C1 -> act -> pool S2 -> conv C3 -> act -> pool S4 ->
    conv C5 -> act -> flatten to (1, 120) -> dense W7 -> act ->
    dense W8 -> act -> softmax.

    Args:
        X: Input image, shape (height, width, channels).
        K_C1, b_C1, hparam_C1: Kernels, biases and hyper parameters of C1.
        hparam_S2: Hyper parameters of pooling layer S2.
        K_C3, b_C3, hparam_C3: Same for C3.
        hparam_S4: Hyper parameters of pooling layer S4.
        K_C5, b_C5, hparam_C5: Same for C5; its activated output must hold
            exactly 120 values.
        W7, W8: Dense weight matrices (applied without bias).
        pool_mode: Pooling mode forwarded to pool_forward.
        act_mode: Activation mode forwarded to activation_forward.

    Returns:
        Tuple (cache_C1, X_A1, cache_S2, cache_C3, X_A3, cache_S4, cache_C5,
        X_A5, X_A6, X_A7, X_A8, Y_pred): the layer caches and activations
        needed for the backward pass, followed by the softmax output.
    """
    def conv_then_activate(feature_maps, kernels, biases, hparam):
        # Convolution followed by the configured activation.
        pre_activation, conv_cache = conv_forward(feature_maps, kernels, biases, hparam)
        return activation_forward(pre_activation, act_mode), conv_cache

    # Convolutional feature extractor.
    X_A1, cache_C1 = conv_then_activate(X, K_C1, b_C1, hparam_C1)
    X_S2, cache_S2 = pool_forward(X_A1, hparam_S2, pool_mode)

    X_A3, cache_C3 = conv_then_activate(X_S2, K_C3, b_C3, hparam_C3)
    X_S4, cache_S4 = pool_forward(X_A3, hparam_S4, pool_mode)

    X_A5, cache_C5 = conv_then_activate(X_S4, K_C5, b_C5, hparam_C5)

    # Flatten into a single row vector for the dense layers.
    X_A6 = X_A5.reshape(1, 120)

    # Dense layers, each followed by the activation.
    X_A7 = activation_forward(np.dot(X_A6, W7), act_mode)
    X_A8 = activation_forward(np.dot(X_A7, W8), act_mode)

    # Class probabilities.
    Y_pred = __softmax(X_A8)

    return (cache_C1, X_A1, cache_S2, cache_C3, X_A3, cache_S4,
            cache_C5, X_A5, X_A6, X_A7, X_A8, Y_pred)

In [18]:
def cross_entropy(Y_pred, Y_truth):
    """Cross-entropy between predicted probabilities and one-hot targets.

    Args:
        Y_pred: Predicted probabilities.
        Y_truth: Target distribution with the same shape as ``Y_pred``.

    Returns:
        Scalar loss  -sum(Y_truth * log(Y_pred)).
    """
    # A probability that underflowed to exactly 0 would give log(0) = -inf,
    # and 0 * -inf = NaN would poison the whole sum. Clip to the smallest
    # positive normal float; every other value is left unchanged.
    floor = np.finfo(float).tiny
    safe_pred = np.clip(np.asarray(Y_pred, dtype=float), floor, None)
    Error = -np.sum(np.asarray(Y_truth) * np.log(safe_pred))
    return Error

In [19]:
# LeNet5 - Backward Propagation
def LeNet5_backward(cache_C1, X_A1, cache_S2, cache_C3, X_A3, cache_S4, cache_C5, X_A5, X_A6, X_A7, X_A8, Y_pred, Y_truth, pool_mode = "avgpooling", act_mode = "sigmoid", W7 = None, W8 = None):
    """Back-propagate the cross-entropy loss of one example through the network.

    Args:
        cache_C1, cache_C3, cache_C5: Caches returned by conv_forward.
        cache_S2, cache_S4: Caches returned by pool_forward.
        X_A1, X_A3, X_A5, X_A7, X_A8: Activation outputs of the forward pass.
        X_A6: Flattened X_A5 (input of the first dense layer).
        Y_pred: Softmax output of the forward pass.
        Y_truth: One-hot target with the same shape as ``Y_pred``.
        pool_mode: Pooling mode used in the forward pass.
        act_mode: Activation mode used in the forward pass.
        W7, W8: Dense weight matrices used in the forward pass. They are
            required to propagate the gradient through the dense layers.
            NOTE(review): when omitted, module-level variables named ``W7``
            and ``W8`` are used so that existing call sites keep working;
            new code should pass them explicitly.

    Returns:
        Tuple (D_W8, D_W7, D_K_C5, D_b_C5, D_K_C3, D_b_C3, D_K_C1, D_b_C1)
        of gradients w.r.t. the trainable parameters.

    Raises:
        ValueError: If the dense weights are neither passed nor available
            as module-level variables.
    """
    # 0. Resolve the dense weights first. The gradient w.r.t. a dense layer's
    #    input is (upstream gradient) . W^T; using the weight GRADIENT there
    #    instead, as this function used to, yields a wrong gradient for
    #    every earlier layer.
    if W7 is None:
        W7 = globals().get("W7")
    if W8 is None:
        W8 = globals().get("W8")
    if W7 is None or W8 is None:
        raise ValueError(
            "LeNet5_backward needs the dense weights: pass W7=... and W8=..."
        )

    # 1. Softmax + cross-entropy: gradient w.r.t. the softmax input.
    D_A8 = Y_pred - Y_truth

    # 2. Dense layer 8 (activation, then weights).
    D_Z8 = np.multiply(activation_backward(X_A8, act_mode), D_A8)
    D_W8 = np.outer(X_A7, D_Z8)
    D_A7 = np.dot(D_Z8, W8.T)

    # 3. Dense layer 7.
    D_Z7 = np.multiply(activation_backward(X_A7, act_mode), D_A7)
    D_W7 = np.outer(X_A6, D_Z7)
    D_A6 = np.dot(D_Z7, W7.T)

    # 4. Undo the flattening.
    D_A5 = D_A6.reshape(X_A5.shape)

    # 5. Convolution C5.
    D_C5 = np.multiply(activation_backward(X_A5, act_mode), D_A5)
    D_S4, D_K_C5, D_b_C5 = conv_backward(D_C5, cache_C5)

    # 6. Pooling S4 and convolution C3.
    D_A3 = pool_backward(D_S4, cache_S4, pool_mode)
    D_C3 = np.multiply(activation_backward(X_A3, act_mode), D_A3)
    D_S2, D_K_C3, D_b_C3 = conv_backward(D_C3, cache_C3)

    # 7. Pooling S2 and convolution C1 (the input gradient D_X is unused).
    D_A1 = pool_backward(D_S2, cache_S2, pool_mode)
    D_C1 = np.multiply(activation_backward(X_A1, act_mode), D_A1)
    D_X, D_K_C1, D_b_C1 = conv_backward(D_C1, cache_C1)

    return D_W8, D_W7, D_K_C5, D_b_C5, D_K_C3, D_b_C3, D_K_C1, D_b_C1

In [20]:
def update_trainable_parameters(lr, D_W8, W8, D_W7, W7, D_K_C5, K_C5, D_b_C5, b_C5, D_K_C3, K_C3, D_b_C3, b_C3, D_K_C1, K_C1, D_b_C1, b_C1):
    """Apply one plain gradient-descent step to every trainable parameter.

    Each parameter P with gradient D_P becomes ``P - lr * D_P``; the inputs
    are not modified.

    Returns:
        Tuple (W8, W7, K_C5, b_C5, K_C3, b_C3, K_C1, b_C1) of updated values.
    """
    # (parameter, gradient) pairs listed in the order they are returned.
    pairs = (
        (W8, D_W8),
        (W7, D_W7),
        (K_C5, D_K_C5),
        (b_C5, D_b_C5),
        (K_C3, D_K_C3),
        (b_C3, D_b_C3),
        (K_C1, D_K_C1),
        (b_C1, D_b_C1),
    )
    return tuple(param - lr * grad for param, grad in pairs)

In [34]:
# Import packages needed to read and print our data
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

# Directory holding the dataset CSV files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
ROOT_PATH = "C:/Users/USER/Desktop/Datasets/Kaggle_Digit_Recognizer/"

# List the files in ROOT_PATH (raises if the directory does not exist).
print(f"In the Root Path, there are {os.listdir(ROOT_PATH)}")

# Full paths of the test and training CSV files.
CSV_TEST = ROOT_PATH + "test.csv"
CSV_TRAIN= ROOT_PATH + "train.csv"

# Draw all kernels, biases and dense weights uniformly from [-0.01, 0.01)
# and get the fixed per-layer hyper parameters.
K_C1, b_C1, hparam_C1, hparam_S2, K_C3, b_C3, hparam_C3, hparam_S4, K_C5, b_C5, hparam_C5, W7, W8 = Initialize_Parameters(-0.01, 0.01)

# Read the training CSV through pandas.
DF_TRAIN = pd.read_csv(CSV_TRAIN)
# Training frame without its "label" column (not used further in this cell).
DF_TRAIN_NO_LABLE = DF_TRAIN.drop(columns=["label"], axis=0)
# Number of passes over the training set, and the mean loss of each pass.
Epoch = 5
loss_list = []

# `i` is overwritten by the inner loop below; `lr` is the learning rate.
i = 1
lr = 0.1
for epoch in range(Epoch):
    # Per-example losses of the current epoch, and the epoch start time.
    tmp_list = []
    tic = time.time()
    # Per-sample gradient descent: the weights are updated after every example.
    for i in range(len(DF_TRAIN)):
        
        # 1. Pick one example and one-hot encode its label into a (1, 10) row.
        #    Pixel scaling is currently disabled, so raw pixel values are fed in.
        X, Y = pick_an_example(i, DF_TRAIN)
        #X = X / 255.0
        Y_truth = np.zeros((1,10))
        Y_truth[0][Y] = 1
    
        # 2. Forward Pass (average pooling, ReLU activations)
        cache_C1, X_A1, cache_S2, cache_C3, X_A3, cache_S4, cache_C5, X_A5, X_A6, X_A7, X_A8, Y_pred = LeNet5_forward(X, K_C1, b_C1, hparam_C1, hparam_S2, K_C3, b_C3, hparam_C3, hparam_S4, K_C5, b_C5, hparam_C5, W7, W8, "avgpooling", "relu")
    
        # 3. Cross Entropy Loss of this example
        tmp_list.append(cross_entropy(Y_pred, Y_truth))
        
        # 4. Backward Pass (same pooling/activation modes as the forward pass)
        D_W8, D_W7, D_K_C5, D_b_C5, D_K_C3, D_b_C3, D_K_C1, D_b_C1 = LeNet5_backward(cache_C1, X_A1, cache_S2, cache_C3, X_A3, cache_S4, cache_C5, X_A5, X_A6, X_A7, X_A8, Y_pred, Y_truth, "avgpooling", "relu")
    
        # 5. Update Weights: rebind the notebook-level parameters to the updated values.
        #    The learning-rate decay below is disabled.
        W8, W7, K_C5, b_C5, K_C3, b_C3, K_C1, b_C1 = update_trainable_parameters(lr, D_W8, W8, D_W7, W7, D_K_C5, K_C5, D_b_C5, b_C5, D_K_C3, K_C3, D_b_C3, b_C3, D_K_C1, K_C1, D_b_C1, b_C1)
        #if epoch % 10 == 0: lr = lr * 0.3
        
        # Per-example progress report (one message per training example).
        print(f"[{round(cross_entropy(Y_pred, Y_truth),4)}] The {i}-th example: \nY_pred: {Y_pred},\nY_truth: {Y_truth}\n")
    # Record the mean loss of the epoch and report its duration; the
    # prediction shown is the one for the last example of the epoch.
    loss_list.append(np.mean(tmp_list))
    toc = time.time()
    print(f"[{round(toc - tic, 2)} (s)] \nIn The {epoch}-th epoch: the avg. loss is {np.mean(tmp_list)}\nThe {i}-th example: \nY_pred: {Y_pred},\nY_truth: {Y_truth}\n")
    
    
# Plot the mean loss per epoch.
plt.plot(loss_list)

In the Root Path, there are ['sample_submission.csv', 'test.csv', 'train.csv']
[2.3026] The 0-th example: 
Y_pred: [[0.10000071 0.09999954 0.10000022 0.09999914 0.10000015 0.10000048
  0.09999922 0.10000106 0.10000035 0.09999914]],
Y_truth: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]

[2.3026] The 1-th example: 
Y_pred: [[0.09999982 0.09999898 0.10000174 0.09999891 0.10000116 0.10000128
  0.09999891 0.10000052 0.09999975 0.09999891]],
Y_truth: [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

[2.3026] The 2-th example: 
Y_pred: [[0.10000033 0.0999994  0.10000066 0.0999994  0.09999995 0.10000063
  0.0999994  0.10000087 0.09999994 0.0999994 ]],
Y_truth: [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]

[2.3026] The 3-th example: 
Y_pred: [[0.10000044 0.09999924 0.10000127 0.09999902 0.10000067 0.1000009
  0.09999902 0.10000045 0.09999995 0.09999902]],
Y_truth: [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]

[2.3026] The 4-th example: 
Y_pred: [[0.09999947 0.099999   0.10000141 0.099999   0.10000155 0.10000117
  0.099999   0.10000041 0.1   

KeyboardInterrupt: 

In [21]:
import pandas as pd
import os
import collections

# Directory holding the dataset CSV files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
ROOT_PATH = "C:/Users/USER/Desktop/Datasets/Kaggle_Digit_Recognizer/"

# List the files in ROOT_PATH (raises if the directory does not exist).
print(f"In the Root Path, there are {os.listdir(ROOT_PATH)}")

# Full paths of the test and training CSV files.
CSV_TEST = ROOT_PATH + "test.csv"
CSV_TRAIN= ROOT_PATH + "train.csv"

# Load the training set.
DF_TRAIN = pd.read_csv(CSV_TRAIN)

# Count how often each label occurs among the first 50 training rows.
print(collections.Counter(DF_TRAIN["label"][:50]))

In the Root Path, there are ['sample_submission.csv', 'test.csv', 'train.csv']
Counter({1: 8, 3: 7, 4: 6, 9: 6, 0: 5, 7: 5, 2: 5, 8: 3, 6: 3, 5: 2})
