In [None]:
import numpy as np
from torchvision.datasets import MNIST


def download_mnist(is_train: bool):
    dataset = MNIST(root='./data',
                    transform= lambda x: np.array(x).flatten(),
                    download=True,
                    train=is_train)
    mnist_data= []
    mnist_labels= []
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)

    return mnist_data, mnist_labels

def process_data(mnist_data, mnist_labels):
    mnist_data = np.array(mnist_data).reshape(-1, 784)
    mnist_labels = np.array(mnist_labels).reshape(-1, 1)

    processed_mnist_labels = np.zeros((len(mnist_labels), 10))
    for idx,mnist_label in enumerate(mnist_labels):
        # One hot encoding
        value = mnist_label[0]
        processed_mnist_labels[idx, value] = 1
    
    # normalization
    normalize_mnist_data = mnist_data / 255
    return normalize_mnist_data, processed_mnist_labels

u = 0 , sigma = radical(2/fain_in + fan_out)

In [182]:
# split in batches
# 2. Split data into batches
def split_in_batches(data, labels, batch_size=100):
    # Shuffle the data and labels in unison
    permutation = np.random.permutation(len(data))
    shuffled_data = data[permutation]
    shuffled_labels = labels[permutation]
    
    # Split into batches
    batched_data = []
    batched_labels = []
    for i in range(0, len(shuffled_data), batch_size):
        batched_data.append(shuffled_data[i:i+batch_size])
        batched_labels.append(shuffled_labels[i:i+batch_size])
    
    # Convert lists to numpy arrays
    batched_data = np.array(batched_data)
    batched_labels = np.array(batched_labels)
    
    return batched_data, batched_labels 

In [None]:
# weight and biass initialization with Xavier


def xavier_init():
    fan_out = 10
    fan_in = 784
    fan_hidden = 100
    # we use normal distribution with function np.random.randn
    w1 = np.random.randn(fan_in, fan_hidden) * np.sqrt(2 / (fan_in + fan_hidden))
    b1 = np.random.randn(1, fan_hidden) * np.sqrt(2 / (1 + fan_hidden))
    w2 = np.random.randn(fan_hidden, fan_out) * np.sqrt(2 / (fan_hidden + fan_out))
    b2 = np.random.randn(1, fan_out) * np.sqrt(2 / (1 + fan_out))
    return w1, b1, w2, b2 


In [226]:
def dropout_function(layer_output, dropout_rate):
    # Generate a mask that will zero out neurons with probability dropout_rate
    mask = (np.random.rand(*layer_output.shape) < (1- dropout_rate)).astype(float)
    # Apply the mask to the layer output and scale neurons that remain active 
    return layer_output * mask / (1 - dropout_rate)

In [517]:
#2 activation function 
#  tanh for hidden layer
def activation_tanh(z):
    return  np.tanh(z) #(np.exp(z) - np.exp(z))/(np.exp(z) + np.exp(-z)) # or : np.tanh(z)
def activation_softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True) # normalization 

# derivative of activation function
def derivative_tanh(z):
    return 1 - np.power(activation_tanh(z), 2) # derivata lui tanh este (1 - a1^2) unde a1 este rezultatul functiei de activare

# z is the output of the activation function
def derivative_softmax(a2_softmax, y_target):
    """Derivative of the softmax function with respect to the cross-entropy loss"""
    return a2_softmax - y_target

def forward_propagation(x_img,w1,b1,w2,b2):
    z1 = x_img @ w1 + b1
    a1 = activation_tanh(z1)
    z2 = a1 @ w2 +  b2
    a2 = activation_softmax(z2)
    return z1, a1, z2, a2

# optimization technique
def forward_propagation_with_dropout(x_img,w1,b1,w2,b2):
    z1 = x_img @ w1 + b1
    a1 = activation_tanh(z1) 
    # here we apply dropout
    a1_dropped = dropout_function(a1, 0.10) 
    z2 = a1_dropped @ w2 + b2 
    a2 = activation_softmax(z2) 
    return z1, a1_dropped, z2, a2

# def regulariation_L1_L2(w1,w2,b1,b2):
#     # L1 regularization
#     w1 = w1 - 0.01*()

A = np.array([[1,2,3],[4,5,6]]) 
print(np.exp(A)) # compute e^A

[[  2.71828183   7.3890561   20.08553692]
 [ 54.59815003 148.4131591  403.42879349]]


In [178]:
# loss function
def cross_entropy_loss(y_target,y_prediction_after_softmax):
    length = y_target.shape[0] 
    return -np.sum(y_target * np.log(y_prediction_after_softmax)) / length




In [1]:

def train_epoch(process_train_X, process_train_Y, w1,b1,w2,b2):
    batched_train_data, batched_train_labels = split_in_batches(process_train_X, process_train_Y)
    
    learning_rate = 0.01
    lambda_L1 = 0.0001  
    lambda_L2 = 0.0001  # Coeficient de regularizare pentru L2

    epoch_loss = 0
    

    for batch_data, batch_labels in zip(batched_train_data, batched_train_labels):
        size_batch = batch_data.shape[1] # Dimension of the batch_data is [600,100,784]
        z1, a1_tanh, z2, a2_softmax = forward_propagation_with_dropout(batch_data,w1,b1,w2,b2)

        # We apply chain rule
        
        # Derivates for Loss function and softmax
        dC_dz2 = derivative_softmax(a2_softmax,batch_labels) # a2 - y , y -> target ,a2 -> prediction 
        
        # dC_dz2 = a2 - y
        # dz2_da1 =  ...
        # da1_dw2 = ....
        # dC_dw2 = dC_dz2 * dz2_da1*da1_w2
        
        
        # Derivates for w2,b2
        dC_dw2 = (a1_tanh.T @ dC_dz2) / size_batch 
        dC_db2 = np.sum(dC_dz2,axis=0,keepdims=True) / size_batch 
        
        # Derivates for activation function tanh and z1
        dC_da1 = dC_dz2 @ w2.T 
        dC_dz1 = dC_da1 * derivative_tanh(a1_tanh) 
      
        # Derivates for w1,b1 ( batch_data = x ,inputul de pe primul layer
        dC_dw1 = (batch_data.T @ dC_dz1) / size_batch  
        dC_db1 = np.sum(dC_dz1,axis=0,keepdims=True)/size_batch 
        
        # Apply regularization L1, L2

# Update weight-urile cu Elastic Net (L1 + L2)
    ##L1 regularization penalize weight that approach to 0 and L2 regularization penalize weight that are too large
        
    # lambda_L1 * np.sign(w1) -> L1 regularization
        # np.sign(w1) -> return 1 if w1 > 0, -1 if w1 < 0, 0 if w1 = 0  --> actually its show the direction of the weight(derivative) 
    # lambda_L2 * w1 -> L2 regularization ,is the sum of the square of the weights but in gradient descent we use the derivative of the square of the weights(2*w1) but 2 is absorbed in the learning rate, so we use only w1
        # L2 regularization is the sum of the square of the weights 
        
        # Gradients calculus
          # Weight and biass actualization
            
        w1 -= learning_rate * (dC_dw1 + lambda_L1 * np.sign(w1) + 2 * lambda_L2 * w1)  
        # w1 -= learning_rate * dC_dw1 
        b1 -= learning_rate * dC_db1 
        # w2 -= learning_rate * dC_dw2 
        w2 -= learning_rate * (dC_dw2 + lambda_L2 * np.sign(w2) + 2 * lambda_L2 * w2)
        b2 -= learning_rate * dC_db2
        
        # Compute the loss
        C_loss = cross_entropy_loss(batch_labels, a2_softmax)
        epoch_loss += C_loss 
        
    regularization_loss = (lambda_L1 * np.sum(np.abs(w1)) 
                      + lambda_L2 * np.sum(w1**2) 
                      + lambda_L1 * np.sum(np.abs(w2)) 
                      + lambda_L2 * np.sum(w2**2))
    
    print(f"Regularization loss: {regularization_loss} , {regularization_loss.shape}")
    epoch_loss /= len(batched_train_data) + regularization_loss
    # epoch_loss /= len(batched_train_data)
    print(f"Epoch loss: {epoch_loss}")

    return epoch_loss

In [645]:
# main
train_X, train_Y = download_mnist(True)
test_x, test_y = download_mnist(False)

process_train_X, process_train_Y = process_data(train_X, train_Y)
process_test_x, process_test_y = process_data(test_x, test_y)

# initialization
w1,b1,w2,b2 = xavier_init() # weight and biass initialization with Xavier on each layer one time


In [681]:

# train
import time
print("Training")
# print("--------w1:",w1 ,"---b1---",b1)
# print("--------w2:",w2 ,"---b2---",b2)
start_time = time.time()
for epoch in range(50):
    print(f"Epoch :{epoch}")
    train_epoch(process_train_X, process_train_Y, w1,b1,w2,b2)
stop_time= time.time() - start_time
print(f"Training time: {stop_time} seconds")


Training
Epoch :0
Regularization loss: 0.2952854013853178 , ()
Epoch loss: 0.1124608941857557
Epoch :1
Regularization loss: 0.2920543443864804 , ()
Epoch loss: 0.11613324196783607
Epoch :2
Regularization loss: 0.28886418841883027 , ()
Epoch loss: 0.1177100528388101
Epoch :3
Regularization loss: 0.2857165543043999 , ()
Epoch loss: 0.11912745021712076
Epoch :4
Regularization loss: 0.2826123806839247 , ()
Epoch loss: 0.11944634226790196
Epoch :5
Regularization loss: 0.27954765409494403 , ()
Epoch loss: 0.12085512770218906
Epoch :6
Regularization loss: 0.27650987855755926 , ()
Epoch loss: 0.12088858648350775
Epoch :7
Regularization loss: 0.2735235770935996 , ()
Epoch loss: 0.12052700606633086
Epoch :8
Regularization loss: 0.27057880970914994 , ()
Epoch loss: 0.12561384522515356
Epoch :9
Regularization loss: 0.26767004884852863 , ()
Epoch loss: 0.12422413787747127
Epoch :10
Regularization loss: 0.26479249992410403 , ()
Epoch loss: 0.1258532097501346
Epoch :11
Regularization loss: 0.26197206

In [212]:
def accuracy(data,labels,w1,b1,w2,b2):
    w1_copy= w1.copy()
    b1_copy= b1.copy()
    w2_copy= w2.copy()
    b2_copy= b2.copy()
    _,_,_,a2 = forward_propagation(data,w1_copy,b1_copy,w2_copy,b2_copy)
    predictions = np.argmax(a2, axis=1)
    labels = np.argmax(labels, axis=1)
    accuracy = np.mean(predictions == labels)
    return f"{accuracy * 100}%"

In [680]:
print("Accuracy : ",accuracy(process_test_x,process_test_y,w1,b1,w2,b2))

Accuracy :  97.28999999999999%


In [579]:
import pickle
# Save the model parameters to a file

# Create a dictionary with the model parameters
model_parameters = {
    "w1": w1,
    "b1": b1,
    "w2": w2,
    "b2": b2
}

# Save the model parameters to a file
with open("model_parameters_after_dropout.pkl", "wb") as file:
    pickle.dump(model_parameters, file) 


In [669]:
import pickle

# Load the model parameters from a file


# with open("model_parameters_after_dropout.pkl", "rb") as file:
#     model_parameters = pickle.load(file)

with open("model_parameters_before_dropout.pkl", "rb") as file:
    model_parameters = pickle.load(file)

# Extract the model parametersZ
w1 = model_parameters["w1"]
b1 = model_parameters["b1"]
w2 = model_parameters["w2"]
b2 = model_parameters["b2"]


In [191]:
print(process_train_Y.shape)
print(process_train_X.shape)
print("A")

(60000, 10)
(60000, 784)
A


<img src="./img/tanh.png" alt="Alt text" width="500"/>