In [31]:
import numpy as np

import tensorflow as tf

class Network():
    def __init__(self, nnodes, ninputs, lamda=0):
        self.lamda = lamda
        self.weight = np.random.randn(ninputs, nnodes) * np.sqrt(2. / ninputs) #xaiver initialization
        self.bias = np.random.rand(nnodes) * 0.01
        self.sdw = np.zeros((ninputs, nnodes))
        self.sdb = np.zeros(nnodes)
        self.vdw = np.zeros((ninputs, nnodes))
        self.vdb = np.zeros(nnodes)
        self.t = 0

    def forward(self, inputs):
        self.input = inputs
        self.output = np.dot(inputs, self.weight) + self.bias
        return self.output

    def backward(self, gradient):
        self.gradient_weight = np.dot(self.input.T, gradient)
        self.gradient_bias = np.sum(gradient, axis=0)
        self.gradient_input = np.dot(gradient, self.weight.T)
        return self.gradient_input

    def calculate(self, optimizer):
        if optimizer == 'adam':
            self.t += 1
            beta1, beta2 = 0.9, 0.999
            epsilon = 1e-8

            self.sdw = beta2 * self.sdw + (1 - beta2) * (self.gradient_weight ** 2)
            self.sdb = beta2 * self.sdb + (1 - beta2) * (self.gradient_bias ** 2)

            self.vdw = beta1 * self.vdw + (1 - beta1) * self.gradient_weight
            self.vdb = beta1 * self.vdb + (1 - beta1) * self.gradient_bias

            # Bias correction for adam optimizer for the starting difference while using exponantially weighted average
            sdw_corrected = self.sdw / (1 - beta2 ** self.t)
            sdb_corrected = self.sdb / (1 - beta2 ** self.t)
            vdw_corrected = self.vdw / (1 - beta1 ** self.t)
            vdb_corrected = self.vdb / (1 - beta1 ** self.t)

            self.sdw_corrected = sdw_corrected
            self.sdb_corrected = sdb_corrected
            self.vdw_corrected = vdw_corrected
            self.vdb_corrected = vdb_corrected

    def update(self, learning_rate, optimizer):
        if optimizer == 'adam':
            self.weight -= learning_rate * self.vdw_corrected / (np.sqrt(self.sdw_corrected) + 1e-8)
            self.bias -= learning_rate * self.vdb_corrected / (np.sqrt(self.sdb_corrected) + 1e-8)
        else:
            self.weight -= learning_rate * self.gradient_weight
            self.bias -= learning_rate * self.gradient_bias

    def l2(self):
        return np.sum(self.weight ** 2)

class Relu():
    def forward(self, inputs):
        self.input = inputs
        self.output = np.maximum(0, inputs)
        return self.output

    def backward(self, gradients):
        self.gradient = gradients * (self.input > 0) #why not self.output>>>because we need a boolean return
        return self.gradient

class Sigmoid():
    def forward(self, inputs):
        self.input = inputs
        self.output = 1 / (1 + np.exp(-inputs))
        return self.output

    def backward(self, dvalues):
        self.dinputs = dvalues * (1 - self.output) * self.output
        return self.dinputs

class Softmax():
    def forward(self, inputs):
        self.input = inputs
        exp = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        probabilities = exp / np.sum(exp, axis=1, keepdims=True)
        self.output = probabilities
        return self.output

    def backward(self, gradient):
        return gradient

class CategoricalCrossEntropyLoss():
    def forward(self, probs, true_outputs, layers):
        clipped_probs = np.clip(probs, 1e-7, 1 - 1e-7)
        loss_data = -np.sum(true_outputs * np.log(clipped_probs)) / (len(true_outputs) + 1e-8)

        l2_terms = [layer.lamda * np.sum(layer.l2()) for layer in layers]
        loss_weight = 0.5 * np.sum(l2_terms) / (len(true_outputs) +  1e-8)
        return loss_data + loss_weight

    def accuracy(self, probs, true_outputs):

        prediction=np.argmax(probs, axis=1)
        true_label=np.argmax(true_outputs, axis=1)
        accuracy=np.mean(prediction == true_label)
        return accuracy

    def backward(self, probs, true_outputs):
        samples = len(true_outputs)

        self.dinputs = (probs - true_outputs) / samples
        return self.dinputs

class BinaryCrossEntropyLoss():
    def forward(self, y_pred, y_true, layers):
        y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
        loss_data = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
        return loss_data

    def backward(self, dvalues, y_true):
        dvalues = np.clip(dvalues, 1e-7, 1 - 1e-7)
        self.dinputs = (dvalues - y_true) / len(y_true)
        return self.dinputs

class Fit():
    def __init__(self, layers,epochs ,loss_function, layers_for_fit, learning_rate, optimizer='gradient'):
        self.layers_for_fit = layers_for_fit
        self.layers = layers
        self.loss_function = loss_function
        self.learning_rate = learning_rate
        self.optimizer = optimizer
        self.epochs=epochs
    def fit(self, X_train, y_train,X_test,y_test, batch_size):
        for epoch in range(self.epochs):
            epoch_loss = 0
            epoch_loss_val = 0
            for i in range(0, len(X_train), batch_size):
                batch_inputs = X_train[i:i + batch_size]
                batch_validate=X_test[i:i + batch_size]
                batch_true_outputs = y_train[i:i + batch_size]
                batch_validate_outputs = y_test[i:i + batch_size]

                x = batch_inputs
                for layer in self.layers_for_fit:
                    x = layer.forward(x)

                loss = self.loss_function.forward(x, batch_true_outputs, self.layers)
                epoch_loss += loss  # Accumulate batch loss

                gradient = self.loss_function.backward(x, batch_true_outputs)
                for layer in reversed(self.layers_for_fit):
                    gradient = layer.backward(gradient)

                for layer in self.layers:
                    layer.calculate(self.optimizer)

                for layer in self.layers:
                    layer.update(self.learning_rate, self.optimizer)


            print(f"Epoch {epoch + 1}, Loss: {epoch_loss / len(X_train) * batch_size}")  # Print average loss for the epoch
            epoch_accuracy = 0
            epoch_loss_val = 0
            for i in range(0,len(X_test),batch_size):
                batch_validate = X_test[i:i + batch_size]
                batch_validate_true = y_test[i:i + batch_size]

                x2=batch_validate
                for layer in self.layers_for_fit:
                    x2=layer.forward(x2)

                loss_validate = self.loss_function.forward(x2, batch_validate_true, self.layers)
                accurate=self.loss_function.accuracy(x2, batch_validate_true)
                epoch_loss_val += loss_validate
                epoch_accuracy+=accurate
            print(f"Epoch {epoch + 1}, val_Loss: {epoch_loss_val / len(X_test) * batch_size},val_accuracy:{epoch_accuracy/ len(X_test) * batch_size}")  # Print average loss for the epoch





In [32]:

# Load MNIST data
data = tf.keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = data.load_data()

X_train = X_train.reshape(-1, 28*28) / 255.0
X_test = X_test.reshape(-1, 28*28) / 255.0

y_train = np.eye(10)[y_train]
y_test = np.eye(10)[y_test]
# Define network
layer1 = Network(128, 784, lamda=0.01)
layer2 = Network(64, 128, lamda=0.01)
layer3 = Network(10, 64, lamda=0)
relu1 = Relu()
relu2 = Relu()
softmax = Softmax()
loss_function = CategoricalCrossEntropyLoss()

layers_for_fit = [layer1, relu1, layer2, relu2, layer3, softmax]
layers = [layer1, layer2, layer3]

# Train model
batch_size = 32
learning_rate = 0.0001
optimizer = 'adam'
model = Fit(layers, 5,loss_function, layers_for_fit, learning_rate, optimizer)
model.fit(X_train, y_train,X_test, y_test, batch_size)



Epoch 1, Loss: 0.6327762747199365
Epoch 1, val_Loss: 0.33753377576406185,val_accuracy:0.9251
Epoch 2, Loss: 0.3103527863087198
Epoch 2, val_Loss: 0.2755986077950051,val_accuracy:0.9421
Epoch 3, Loss: 0.2603352757623868
Epoch 3, val_Loss: 0.24368666568203756,val_accuracy:0.9512
Epoch 4, Loss: 0.23122326944442514
Epoch 4, val_Loss: 0.22309545098250638,val_accuracy:0.957
Epoch 5, Loss: 0.21130484296992344
Epoch 5, val_Loss: 0.20947531802239328,val_accuracy:0.9616
