In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [130]:
df = pd.read_csv('train.csv')

In [131]:
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [132]:
y = df.label
X_full = df.drop(['label'],axis = 1,inplace=False)
X_full = X_full/255

X_dev = X_full.iloc[0:1000,:]
y_dev = y.iloc[0:1000]

X_train = X_full.iloc[1000:X_full.shape[0],:]
y_train = y.iloc[1000:y.shape[0]]



In [133]:
X_train.iloc[0]

pixel0      0.0
pixel1      0.0
pixel2      0.0
pixel3      0.0
pixel4      0.0
           ... 
pixel779    0.0
pixel780    0.0
pixel781    0.0
pixel782    0.0
pixel783    0.0
Name: 1000, Length: 784, dtype: float64

In [134]:
y_train.iloc[0]

1

## Part 1: Neural Network Implementation

### i)

In [135]:
def init_params(layer_sizes=[784,120,45,10]):
    num_layers = len(layer_sizes) - 1
    network = []
    for i in range(num_layers):
        weights = np.random.rand(layer_sizes[i+1],layer_sizes[i])
        biases = np.random.rand(layer_sizes[i+1],1)
        network.append([weights,biases])
    return network

In [136]:
network = init_params()
print(len(network))

3


### ii) Activation Functions

In [137]:
def ReLU(a):
    return np.maximum(0, a)

In [138]:
def softmax(z):
    z_max = np.max(z)
    z_exp = np.exp(z - z_max)
    sum_exp = np.sum(z_exp)
    softmax_z = np.round(z_exp / sum_exp, 3)
    return softmax_z

### iii) Forward Propagation

In [153]:
def forward_propagation(network,inp):
    pre_activation = [] ## Z values
    activation = [inp] ## A values, where inp is A0
    for i in range(len(network)):
        if(i!=len(network)-1):
            pre_activation.append(np.dot(network[i][0],activation[i]) + network[i][1])
            activation.append(ReLU(pre_activation[i]))
            print(pre_activation[i].shape,activation[i+1].shape)
        else:
            pre_activation.append(np.dot(network[i][0],activation[i]) + network[i][1])
            activation.append(softmax(pre_activation[i]))
            print(pre_activation[i].shape,activation[i+1].shape)
    return (pre_activation,activation)
    

In [140]:
print(ReLU(np.random.rand(120,1)).shape)

(120, 1)


In [155]:
# inp = np.random.rand(784,1)
inp = X_train.iloc[0,:]
print(inp.shape)
print(inp.head())
pre_activation,activation = forward_propagation(network,inp)
print(len(activation))
print(pre_activation[0].shape)
print(activation[0].shape)
print(activation[1].shape)
print(activation[2].shape)
print(activation[3].shape)

(784,)
pixel0    0.0
pixel1    0.0
pixel2    0.0
pixel3    0.0
pixel4    0.0
Name: 1000, dtype: float64
(120, 120) (120, 120)
(45, 120) (45, 120)


ValueError: shapes (10,45) and (784,) not aligned: 45 (dim 1) != 784 (dim 0)

In [142]:
print(activation[3])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### iv) One-Hot Encoding

In [143]:
def one_hot(Y):
    n_classes = 10  # We assume there are 10 possible classes (digits 0-9)
    try:
        m = len(Y)  # Number of examples
    except TypeError:
        m = 1       
    
    # Create an empty one-hot encoded array
    one_hot_Y = np.zeros((n_classes, m))
    
    # Assign 1 to the corresponding index for each element in Y
    try:
        for i, y in enumerate(Y):
            one_hot_Y[y, i] = 1
    except TypeError:
        one_hot_Y[Y] = 1
    return one_hot_Y.T

In [144]:
y_train.iloc[-1]

9

In [145]:
y_train.shape
one_hot(y_train)

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

## Part 2: Backward Propagation and Model Training

### i) Backward Propagation

In [146]:
def backward_propagation(pre_activation,activation,network,Y):
    pre_activation_gradients = []
    weight_gradients = []
    bias_gradients = []
    
    for i in reversed(range(len(network))):
        if(i!=len(network)-1):
            pre_activation_gradients.insert(0,np.multiply(np.dot(network[i+1][0].T, pre_activation_gradients[0]), np.int64(activation[i+1] > 0)))
            weight_gradients.insert(0,np.dot(pre_activation_gradients[0],activation[i].T))
            bias_gradients.insert(0,np.sum(pre_activation_gradients[0], axis=1, keepdims=True))
        else:
            ## for the end element, which needs to be dealt with first.
            pre_activation_gradients.insert(0,activation[i+1]-Y)
            weight_gradients.insert(0,np.dot(pre_activation_gradients[0],activation[i].T))
            bias_gradients.insert(0,np.sum(pre_activation_gradients[0], axis=1, keepdims=True))
    return weight_gradients,bias_gradients

In [148]:
activation[3].shape

(10, 120)

### ii) Update parameters

In [147]:
weight_gradients,bias_gradients = backward_propagation(pre_activation,activation,network,Y = one_hot(y_train.iloc[0]))
print(len(weight_gradients),len(bias_gradients))

ValueError: operands could not be broadcast together with shapes (10,120) (1,10) 

In [None]:
def update_params(network,weight_gradients,bias_gradients,alpha):
    for layer in range(len(network)):
        network[layer][0] -= weight_gradients[layer]*alpha
        network[layer][1] -= bias_gradients[layer]*alpha
        