### **INITIALIZATION**

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
!pip install kaggle

In [None]:
# configure path of the kaggle.json file
# The three shell commands below create ~/.kaggle if it does not exist yet,
# copy kaggle.json from the current working directory into it, and restrict
# the copied file to owner read/write only (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the files of the "digit-recognizer" competition with the Kaggle CLI.
# NOTE(review): the extraction cell expects the archive at
# /content/digit-recognizer.zip, so this presumably runs with /content as the
# working directory -- confirm.
!kaggle competitions download -c digit-recognizer

In [None]:
# Extract the compressed dataset into the current working directory.
from zipfile import ZipFile
dataset = '/content/digit-recognizer.zip'
# The archive handle is named `archive`, not `zip`: in a notebook the `with`
# target stays in the global namespace after the cell finishes, so calling it
# `zip` would shadow the built-in zip() for every later cell.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()
print("dataset is extracted")

### **DIVIDING THE DATASET**

In [None]:
# Read the training CSV, preview the first rows, then keep the values as a
# plain NumPy array in `data` (one row per example) for the cells below.
train_frame = pd.read_csv("/content/train.csv")
print(train_frame.head())
data = np.array(train_frame)

In [None]:
# m = number of rows (examples), n = number of columns (label + pixels).
m = data.shape[0]
n = data.shape[1]
print(m, n) # 42000 examples, 784 pixels + 1 label

In [12]:
# Shuffle the rows in place so the dev/train split below is random.
np.random.shuffle(data)

# Dev split: the first 1000 shuffled rows, transposed so that every column is
# one example. Row 0 holds the labels, rows 1..n-1 the pixels (scaled by 1/255).
data_dev = data[:1000].T
y_dev, x_dev = data_dev[0], data_dev[1:n] / 255

# Training split: all remaining rows, laid out the same way.
data_train = data[1000:m].T
y_train, x_train = data_train[0], data_train[1:n] / 255

### **DEFINING THE NEURAL NETWORK FUNCTIONS**

In [22]:
def init_params():
  """Create randomly initialised parameters for the two-layer network.

  Every entry is drawn with np.random.rand and shifted by -0.5, i.e. it lies
  in [-0.5, 0.5).

  Returns:
    Tuple (w1, b1, w2, b2) with shapes (10, 784), (10, 1), (10, 10), (10, 1).
  """
  shapes = ((10, 784), (10, 1), (10, 10), (10, 1))
  # The generator is consumed left to right, so the draws happen in the
  # order w1, b1, w2, b2.
  w1, b1, w2, b2 = (np.random.rand(*shape) - 0.5 for shape in shapes)
  return w1, b1, w2, b2

def ReLU(z):
  """Rectified linear unit: element-wise maximum of `z` and 0."""
  rectified = np.maximum(z, 0)
  return rectified

def softmax(z):
  """Softmax along axis 0 (one probability distribution per column).

  The largest value along axis 0 is subtracted before exponentiating. This
  leaves the result mathematically unchanged but keeps np.exp from
  overflowing to inf (and the quotient from becoming NaN) for large inputs.

  Args:
    z: array of scores; for 2-D input each column is one example.

  Returns:
    Float array with the same shape as `z` whose entries along axis 0 sum to 1.
  """
  z = np.asarray(z, dtype=float)
  shifted = z - np.max(z, axis=0, keepdims=True)
  exp_z = np.exp(shifted)
  # np.sum with an explicit axis replaces the built-in sum(), which iterates
  # over the rows in Python.
  return exp_z / np.sum(exp_z, axis=0, keepdims=True)

def forward_prop(w1,b1,w2,b2,x):
  """Run the forward pass of the two-layer network.

  Args:
    w1, b1: weights and bias of the first layer.
    w2, b2: weights and bias of the second layer.
    x: input matrix; the notebook stores one example per column.

  Returns:
    Tuple (z1, a1, z2, a2): the pre-activation and ReLU activation of the
    first layer, then the pre-activation and softmax output of the second.
  """
  z1 = np.dot(w1, x) + b1
  a1 = ReLU(z1)
  z2 = np.dot(w2, a1) + b2
  return z1, a1, z2, softmax(z2)

def deriv_ReLU(z):
  """Derivative of ReLU: boolean mask that is True where `z` is positive."""
  return z > 0

def one_hot(y, num_classes=None):
  """One-hot encode integer labels, one column per example.

  Args:
    y: 1-D array of non-negative integer class labels.
    num_classes: number of rows (classes) in the result. Defaults to
      y.max() + 1; pass it explicitly when `y` may not contain the highest
      class, otherwise the result has too few rows.

  Returns:
    Float array of shape (num_classes, y.size) with a single 1 per column.
  """
  y = np.asarray(y)
  if num_classes is None:
    num_classes = y.max() + 1
  one_hot_y = np.zeros((y.size, num_classes))
  one_hot_y[np.arange(y.size), y] = 1
  return one_hot_y.T

def back_prop(z1,a1,z2,a2,w1,w2,x,y):
  """Compute the gradients of the two-layer network for one batch.

  Args:
    z1, a1, z2, a2: values returned by the forward pass for `x`.
    w1, w2: layer weights (w1 is accepted for signature compatibility only).
    x: input matrix, one example per column.
    y: 1-D array of integer labels, one per column of `x`.

  Returns:
    Tuple (dw1, db1, dw2, db2). dw1/dw2 have the shapes of w1/w2; db1/db2 are
    column vectors with one entry per unit of the respective layer.

  Raises:
    ValueError: if `y` is empty.
  """
  y = np.asarray(y)
  # Average over the examples actually passed in, not over the notebook-level
  # row count `m` (which also includes the dev rows).
  batch_size = y.size
  if batch_size == 0:
    raise ValueError("back_prop needs at least one example")
  # Size the one-hot matrix from the network output so it always matches a2,
  # even when the batch does not contain the highest class.
  one_hot_y = one_hot(y, a2.shape[0])
  dz2 = a2 - one_hot_y
  dw2 = dz2.dot(a1.T) / batch_size
  # Sum over the example axis only: each unit gets its own bias gradient.
  # Summing every element would give one scalar shared by all units (and for
  # dz2 that scalar is ~0, because each softmax column sums to 1).
  db2 = np.sum(dz2, axis=1, keepdims=True) / batch_size
  dz1 = w2.T.dot(dz2) * deriv_ReLU(z1)
  dw1 = dz1.dot(x.T) / batch_size
  db1 = np.sum(dz1, axis=1, keepdims=True) / batch_size
  return dw1, db1, dw2, db2

def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, alpha):
  """Apply one gradient-descent step to every parameter.

  Each parameter p with gradient dp becomes p - alpha * dp.

  Returns:
    Tuple (w1, b1, w2, b2) holding the updated parameters.
  """
  def step(param, grad):
    return param - alpha*grad

  return step(w1, dw1), step(b1, db1), step(w2, dw2), step(b2, db2)

### **TRAINING THE MODEL**

In [31]:
def get_predictions(a2):
  """Return, for every column of `a2`, the row index of its largest entry."""
  predicted_classes = np.argmax(a2, axis=0)
  return predicted_classes

def get_accuracy(predictions, y):
  """Print both arrays, then return the fraction of predictions equal to `y`."""
  print(predictions, y)
  matches = predictions == y
  return np.sum(matches) / y.size

def gradient_descent(x, y, iterations, alpha):
  """Train the network with full-batch gradient descent.

  Args:
    x: input matrix, one example per column.
    y: labels for the columns of `x`.
    iterations: number of update steps to run.
    alpha: learning rate passed to update_params.

  Returns:
    Tuple (w1, b1, w2, b2) with the parameters after the last step.
  """
  w1, b1, w2, b2 = init_params()
  for step in range(iterations):
    z1, a1, z2, a2 = forward_prop(w1, b1, w2, b2, x)
    grads = back_prop(z1, a1, z2, a2, w1, w2, x, y)
    w1, b1, w2, b2 = update_params(w1, b1, w2, b2, *grads, alpha)
    if step % 50:
      continue
    # Every 50th step: report accuracy of the outputs computed before this
    # step's update.
    print("iteration: ", step)
    print("accuracy: ", get_accuracy(get_predictions(a2), y))
  return w1, b1, w2, b2

In [None]:
# Train on the training split: 500 gradient-descent steps, learning rate 0.10.
w1, b1, w2, b2 = gradient_descent(x_train, y_train, iterations=500, alpha=0.10)

### **TESTING ON THE TRAINING DATASET**

In [36]:
def make_predictions(x,w1,b1,w2,b2):
  """Run the forward pass on `x` and return the predicted class per column."""
  a2 = forward_prop(w1,b1,w2,b2,x)[-1]  # only the final output is needed
  return get_predictions(a2)

def test_predictions(index,w1,b1,w2,b2):
  """Print prediction and label for one training example and display its image.

  Reads the example from the notebook-level `x_train` / `y_train` arrays.

  Args:
    index: column of `x_train` (and position in `y_train`) to inspect.
    w1, b1, w2, b2: network parameters used for the prediction.
  """
  # Indexing with None keeps the selection 2-D: a single-column matrix.
  example = x_train[:, index, None]
  predictions = make_predictions(example, w1,b1,w2,b2)
  label = y_train[index]
  print("predictions: ", predictions)
  print("label: ", label)
  # Undo the /255 scaling and restore the 28x28 layout for display.
  image = example.reshape((28,28))*255
  plt.gray()
  plt.imshow(image, interpolation = 'nearest')
  plt.show()

In [None]:
# Spot-check the first three training examples.
for sample_index in range(3):
  test_predictions(sample_index, w1,b1,w2,b2)

### **CHECKING THE ACCURACY ON THE HELD-OUT DEV SET**

In [None]:
# Predict on the dev split (the 1000 rows held out before training) and
# compare with its labels. The bare call on the last line is what the cell
# displays as its output.
dev_predictions = make_predictions(x_dev, w1,b1,w2,b2)
get_accuracy(dev_predictions, y_dev)

### **THE DEV-SET ACCURACY IS AROUND 85%**