In [1]:
import numpy as np
import tensorflow as tf # only for data import
import matplotlib.pyplot as plt

In [2]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
def to_categorical(x, n_col=None):
  """ One hot encoding function"""
  if not n_col:
    n_col = np.amax(x) + 1
  
  one_hot = np.zeros((x.shape[0], n_col))
  one_hot[np.arange(x.shape[0]), x] = 1
  return one_hot

In [4]:
def accuracy(y_true, y_pred):
  ### Returns accuracy ###
  return ((np.sum(y_true == y_pred, axis = 0) / len(y_true))*100)

In [5]:
def batch_loader(X, y = None, batch_size=64):
  """ Generates batches for training"""
  n_samples = X.shape[0]
  for i in np.arange(0, n_samples, batch_size):
    begin, end = i, min(i + batch_size, n_samples)
    if y is not None:
      yield X[begin:end], y[begin: end]
    else:
      yield X[begin:end]

In [6]:
y_train, y_test = to_categorical(y_train.astype("int")), to_categorical(y_test.astype("int"))
X_train, X_test = X_train / 255.0, X_test / 255.0

In [7]:
X_train, X_test = X_train.reshape(-1, 28*28), X_test.reshape(-1, 28*28)
X_train.shape, X_test.shape

((60000, 784), (10000, 784))

In [8]:
n_input_dim = 28*28 # 784
n_out = 10 # 10 classes

In [9]:
#Implements the loss and activation functions
class CrossEntropy():
  def __init__(self): pass

  def loss(self, y, p):
    p = np.clip(p, 1e-15, 1- 1e-15)
    return -y*np.log(p) - (1 - y) * np.log(1- p)
  
  def gradient(self, y, p):
    p = np.clip(p, 1e-15, 1- 1e-15)
    return -(y/p) + (1 - y) / (1 - p)

In [10]:
class ReLU(): #Activation function
  def __init__(self, alpha = 0.2):
    self.alpha = alpha
  
  def __call__(self, x):
    return self.activation(x)
  
  def activation(self, x):
    return np.where(x >= 0, x, self.alpha * x)
  
  def gradient(self, x):
    return np.where(x >= 0, 1, self.alpha)

In [11]:
class Softmax(): #Activation Function
  def __init__(self): pass
  
  def __call__(self, x):
    return self.activation(x)
  
  def activation(self, x):
    e_x = np.exp(x - np.max(x, axis = -1, keepdims=True))
    return e_x / np.sum(e_x, axis=-1, keepdims = True)
  
  def gradient(self, x):
    p = self.activation(x)
    return p * (1 - p)

In [12]:
class Activation():
  def __init__(self, activation, name="activation"):
    self.activation = activation
    self.gradient = activation.gradient
    self.input = None
    self.output = None
    self.name = name
  
  def forward(self, x):
    self.input = x
    self.output = self.activation(x)
    return self.output
  
  def backward(self, output_error, lr = 0.01):
    return self.gradient(self.input) * output_error
  
  def __call__(self, x):
    return self.forward(x)

In [13]:
class Linear():
  def __init__(self, n_in, n_out, name="linear"):
    limit = 1 / np.sqrt(n_in)
    self.W = np.random.uniform(-limit, limit, (n_in, n_out))
    self.b = np.zeros((1, n_out)) # Biases
    self.input = None
    self.output = None
    self.name = name
  
  def forward(self, x):
    self.input = x
    self.output = np.dot(self.input, self.W) + self.b # Wx + b
    return self.output
  
  def backward(self, output_error, lr = 0.01):
    input_error = np.dot(output_error, self.W.T)
    delta = np.dot(self.input.T, output_error) # Calculate the weights error

    # using simple SGD
    self.W -= lr * delta
    self.b -= lr * np.mean(output_error)
    # Weights are updated below
    return input_error
  
  def __call__(self, x):
    return self.forward(x)

In [14]:
class Network():
  def __init__(self, input_dim, output_dim, lr=0.01):
    # input_dim = 784, output_dim = 10 for mnist
    self.layers = [
                   Linear(input_dim, 256, name="input"),
                   Activation(ReLU(), name="relu1"),
                   Linear(256, 128, name="input"),
                   Activation(ReLU(), name="relu2"),
                   Linear(128, output_dim, name="output"),
                   Activation(Softmax(), name="softmax")
    ]
    self.lr = lr
  
  def forward(self, x):
    for layer in self.layers:
      x = layer(x)
    return x
  
  def backward(self, loss_grad):
    for layer in reversed(self.layers):
      loss_grad = layer.backward(loss_grad, self.lr)
    # Iterating backwards through the layers
  
  def __call__(self, x):
    return self.forward(x)

In [15]:
criterion = CrossEntropy()
model = Network(n_input_dim, n_out, lr=1e-3)

In [16]:
EPOCHS = 10

In [17]:
for epoch in range(EPOCHS):
  loss = []
  acc = []
  for x_batch, y_batch in batch_loader(X_train, y_train):
    out = model(x_batch) # Forward pass
    loss.append(np.mean(criterion.loss(y_batch, out))) # Loss - for display
    acc.append(accuracy(np.argmax(y_batch, axis=1), np.argmax(out, axis=1))) # Accuracy - FOr display
    error = criterion.gradient(y_batch, out) # Calculate gradient of loss
    model.backward(error) # Backpropagation
  
  print(f"Epoch {epoch + 1}, Loss: {np.mean(loss)}, Acc: {np.mean(acc)}")

Epoch 1, Loss: 0.09329290355615685, Acc: 84.99300373134328
Epoch 2, Loss: 0.04409986590562006, Acc: 92.47234808102345
Epoch 3, Loss: 0.03340807513351868, Acc: 94.38299573560768
Epoch 4, Loss: 0.026561364404123353, Acc: 95.60401119402985
Epoch 5, Loss: 0.02201701719286328, Acc: 96.42024253731343
Epoch 6, Loss: 0.0187234983963393, Acc: 96.97328091684435
Epoch 7, Loss: 0.016176914723832037, Acc: 97.42470682302772
Epoch 8, Loss: 0.014113739378294228, Acc: 97.78451492537313
Epoch 9, Loss: 0.012398874672675739, Acc: 98.0527052238806
Epoch 10, Loss: 0.010953694233745911, Acc: 98.29424307036247


In [18]:
out = model(X_test) # Now we run the model on the test set
accuracy(np.argmax(y_test, axis=1), np.argmax(out, axis=1)) # We get an accuracy of 96%

97.42