# Machine Learning - Task 6
## Neural Network for the MNIST dataset
### Or segal, ID: 313151862

In [1]:
import numpy as np
import math
import time
from sklearn.model_selection import train_test_split

In [2]:
class NeuralNetwork():
    """Fully connected classifier with two ReLU hidden layers and a softmax output.

    The network is ``input -> hidden_size_1 (ReLU) -> hidden_size_2 (ReLU)
    -> nof_classes (softmax)`` and is trained with mini-batch gradient
    descent. Labels passed to ``fit``/``score`` are used as integer indices
    into a one-hot matrix, so they must be integers in ``[0, nof_classes)``.

    Attributes:
        Ws: list of the three weight matrices, rebuilt on every ``fit``.
        biases: list of the three bias row-vectors, rebuilt on every ``fit``.
    """

    def __init__(self, epochs=10, batch_size=16, learning_rate=0.1, nof_classes=10,
                 hidden_size_1=512, hidden_size_2=256, verbose=False):
        """Store the hyper-parameters; weights are created lazily by ``fit``.

        Args:
            epochs: number of full passes over the training data.
            batch_size: number of samples per gradient step.
            learning_rate: step size of the gradient-descent update.
            nof_classes: size of the output layer.
            hidden_size_1: width of the first hidden layer.
            hidden_size_2: width of the second hidden layer.
            verbose: when true, progress and timing messages are printed.
        """
        self.epochs = epochs
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.nof_classes = nof_classes
        self.hidden_size_1 = hidden_size_1
        self.hidden_size_2 = hidden_size_2
        self.Verbose = verbose
        self.Ws = []
        self.biases = []

    def xavier_init(self, shape):
        """Return a ``shape``-sized matrix drawn from the Xavier/Glorot uniform range.

        Values are uniform in ``[-limit, limit]`` with
        ``limit = sqrt(6 / (fan_in + fan_out))``.
        """
        fan_in = shape[0]
        fan_out = shape[1]
        limit = np.sqrt(6 / (fan_in + fan_out))
        return np.random.uniform(-limit, limit, shape)

    def init_weights(self, X, y):
        """Create fresh weights and zero biases for all three layers.

        The input width is taken from ``X.shape[1]``; ``y`` is accepted for
        interface symmetry but is not used.

        The parameter lists are rebuilt from scratch on every call. Previously
        they were appended to, so a second ``fit`` silently kept using the
        first three (stale) matrices and failed outright when the number of
        features changed.
        """
        if self.Verbose:
            print("Starts initializing weights...")
        self.nof_features = X.shape[1]
        layer_sizes = [self.nof_features, self.hidden_size_1,
                       self.hidden_size_2, self.nof_classes]
        # Weight matrices are drawn in layer order (W1, W2, W3).
        self.Ws = [self.xavier_init((n_in, n_out))
                   for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])]
        self.biases = [np.zeros((1, n_out)) for n_out in layer_sizes[1:]]
        if self.Verbose:
            print("Finished initializing weights.")

    def shuffle_data(self, X, y):
        """Return ``X`` and ``y`` reordered by one shared random permutation."""
        indices = np.random.permutation(len(X))
        return X[indices], y[indices]

    def get_batch(self, X, y, start_idx):
        """Return the ``start_idx``-th mini-batch of ``X`` and ``y``.

        ``start_idx`` is a batch number, not a row offset; the last batch is
        shorter when the sample count is not a multiple of ``batch_size``.
        """
        first_row = start_idx * self.batch_size
        # Clamp so the final, possibly partial, batch stops at the data end.
        last_row = min(first_row + self.batch_size, X.shape[0])
        return X[first_row:last_row], y[first_row:last_row]

    def softmax(self, z):
        """Return the row-wise softmax of the 2-D array ``z``.

        The row maximum is subtracted before exponentiating. Softmax is
        invariant to that shift, and it keeps ``np.exp`` from overflowing to
        ``inf`` (which would turn the whole row into ``nan``).
        """
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def relu(self, x):
        """Return ``max(0, x)`` element-wise."""
        return np.maximum(0, x)

    def cross_entropy(self, y_true, y_pred, epsilon=1e-10):
        """Return the cross-entropy between one-hot ``y_true`` and ``y_pred``.

        Predictions are clipped to ``[epsilon, 1 - epsilon]`` to avoid
        ``log(0)``. NOTE: the mean is taken over every entry of the
        (samples x classes) matrix, so the value is the per-sample
        cross-entropy divided by the number of classes.
        """
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(y_true * np.log(y_pred))

    def relu_derivative(self, x):
        """Return the ReLU derivative: 0 where ``x <= 0``, 1 elsewhere."""
        return np.where(x <= 0, 0, 1)

    def feed_forward(self, X):
        """Run a forward pass and return the softmax output for ``X``.

        The input and every intermediate pre-activation/activation are cached
        on ``self`` because ``back_backpropagation`` reads them.
        """
        self.X = X
        self.Z1 = np.dot(self.X, self.Ws[0]) + self.biases[0]   # hidden layer 1 input
        self.A1 = self.relu(self.Z1)                            # hidden layer 1 output
        self.Z2 = np.dot(self.A1, self.Ws[1]) + self.biases[1]  # hidden layer 2 input
        self.A2 = self.relu(self.Z2)                            # hidden layer 2 output
        self.Z3 = np.dot(self.A2, self.Ws[2]) + self.biases[2]  # output layer input
        self.A3 = self.softmax(self.Z3)                         # network output
        return self.A3

    def back_backpropagation(self, y):
        """Apply one gradient-descent step using the last ``feed_forward`` pass.

        Args:
            y: integer labels of the batch most recently passed to
                ``feed_forward``.
        """
        m = self.X.shape[0]
        one_hot_y = np.eye(self.nof_classes)[y]

        # Output layer: softmax combined with cross-entropy gives (A3 - y).
        self.dZ3 = self.A3 - one_hot_y
        dW3 = np.dot(self.A2.T, self.dZ3) / m
        dB3 = np.mean(self.dZ3, axis=0, keepdims=True)

        # Hidden layer 2.
        self.dZ2 = np.dot(self.dZ3, self.Ws[2].T) * self.relu_derivative(self.Z2)
        dW2 = np.dot(self.A1.T, self.dZ2) / m
        dB2 = np.mean(self.dZ2, axis=0, keepdims=True)

        # Hidden layer 1.
        self.dZ1 = np.dot(self.dZ2, self.Ws[1].T) * self.relu_derivative(self.Z1)
        dW1 = np.dot(self.X.T, self.dZ1) / m
        dB1 = np.mean(self.dZ1, axis=0, keepdims=True)

        # All gradients are computed above before any parameter is changed.
        gradients = ((dW1, dB1), (dW2, dB2), (dW3, dB3))
        for layer, (dW, dB) in enumerate(gradients):
            self.Ws[layer] = self.Ws[layer] - self.learning_rate * dW
            self.biases[layer] = self.biases[layer] - self.learning_rate * dB

    def fit(self, X, y):
        """Train the network on ``X``/``y`` from freshly initialised weights.

        Data is reshuffled at the start of every epoch. In verbose mode the
        loss over the full (shuffled) training set is printed after each
        epoch, followed by the total fitting time.

        Returns:
            The tuple ``(self.Ws, self.biases)``.
        """
        if self.Verbose:
            start = time.time()
            print("Starts fitting ...")
        self.init_weights(X, y)
        # Round up so a trailing partial batch is still used.
        batches_amount = math.ceil(X.shape[0] / self.batch_size)
        for epoch in range(self.epochs):
            shuffled_X, shuffled_y = self.shuffle_data(X, y)
            for batch_idx in range(batches_amount):
                batch_X, batch_y = self.get_batch(shuffled_X, shuffled_y, batch_idx)
                self.feed_forward(batch_X)
                self.back_backpropagation(batch_y)
            if self.Verbose:
                error = self.cross_entropy(np.eye(self.nof_classes)[shuffled_y], self.feed_forward(shuffled_X))
                print(f"Epoch {epoch + 1} of {self.epochs} finished with error: {error:.5f}")

        if self.Verbose:
            end = time.time()
            print(f"finished fitting in {end-start:.2f} seconds.")
        return self.Ws, self.biases

    def predict(self, X):
        """Return the index of the highest-probability class for each row of ``X``."""
        return np.argmax(self.feed_forward(X), axis=1)

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        predictions = self.predict(X)
        return np.mean(predictions == y)

# Choose which dataset to load (run only one of the two options below; both define X and y)

# 1. YOUR DATASET

In [3]:
# Read the two local .npy files into X and y
# (presumably the images and their labels, going by the file names).
X = np.load(file='MNIST-data.npy')
y = np.load(file="MNIST-lables.npy")

In [4]:
# Collapse axes 1 and 2 of every sample into one feature axis, then divide by
# 255 (assumes 8-bit pixel values, so the result lies in [0, 1] — TODO confirm).
X = X.reshape((X.shape[0], X.shape[1] * X.shape[2])) / 255

# 2. 'MY' DATASET

In [6]:
from sklearn.datasets import fetch_openml

# Download version 1 of the 'mnist_784' dataset from OpenML, with caching enabled.
mnist = fetch_openml(
    name='mnist_784',
    version=1,
    cache=True,
)

In [8]:
# Get the input data and target labels
X = np.array(mnist.data, dtype='float32') / 255 # Convert to NumPy array and normalize
y = np.array(mnist.target, dtype='int') # Convert to NumPy array

# Main

In [2]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train with mini-batches of 16 samples and report the accuracy on the held-out part.
model = NeuralNetwork(batch_size=16)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)

print(f"run with score of: {score:.5f}")