In [1]:
import pandas as pd
import numpy as np

## Part 1 - Load and preprocess data

In [2]:
train_data = np.loadtxt("coding_test_dl_intern/mnist_train.csv", delimiter=',')
test_data = np.loadtxt("coding_test_dl_intern/mnist_test.csv", delimiter=',')

In [3]:
train_labels = train_data[:, 0]
train_images = train_data[:, 1:]

X_train = train_images[(train_labels == 2) | (train_labels == 7)] / 255.

y_train = train_labels[(train_labels == 2) | (train_labels == 7)]
y_train[y_train == 2] = 0
y_train[y_train == 7] = 1

In [4]:
test_labels = test_data[:, 0]
test_images = test_data[:, 1:]

X_test = test_images[(test_labels == 2) | (test_labels == 7)] / 255.
y_test = test_labels[(test_labels == 2) | (test_labels == 7)]
y_test[y_test == 2] = 0
y_test[y_test == 7] = 1

## Part 2 - Neural Networks implementation

In [5]:
def init_weights(n_nodes, input_dim, seed=1749):
    """
    Each row is a node, each column is a dimension of the input,
    so dimension is (n_nodes, input_dim).
    We initialize with a random normal in order to break symmetry.
    """
    np.random.seed(seed)
    W = np.random.normal(size=(n_nodes, input_dim))
    return W

In [6]:
def init_bias(n_nodes, seed=1749):
    """
    The bias is added to Wx, therefore has dimension (n_nodes, 1).
    Bias can be initialized to 0 since it isn't affected by the problem
    of symmetry.
    """
    np.random.seed(seed)
    b = np.zeros((n_nodes, 1))
    return b

In [7]:
def sigmoid(z):
    """
    This is the activation function. It takes as input
    z = Wx + b, and applies the sigmoid function.
    """
    return 1/(1 + np.exp(-z))

In [8]:
def deriv_sigmoid(z):
    """
    The derivation of the sigmoid function
    """
    return sigmoid(z) * (1 - sigmoid(z))

In [9]:
def compute_loss(y_hat, y, offset=True):
    '''
    Here we compute the loss function between two predictions.
    If offset is true, we use a small value of epsilon to avoid numerical problems.
    '''
    if offset:
        epsilon = 10e-30
    else:
        epsilon = 0
        
    loss = - (y * np.log(y_hat + epsilon) + (1-y) * np.log(1 - y_hat + epsilon))
    return np.average(loss)

In [10]:
# FORWARD PROP
def forward_prop(W1, b1, W2, b2, X, y):
    '''
    This process the forward propagation step
    '''
    # First hidden layer
    Z1 = W1.dot(X) + b1
    A1 = sigmoid(Z1)

    # 2nd layer (output layer)
    Z2 = W2.dot(A1) + b2
    A2 = sigmoid(Z2)
    
    return A1, A2, Z1, Z2

In [20]:
# BACKPROP
def backprop(A1, A2, W1, W2, Z1, Z2, X, y):
    """
    This process the backward propagation algorithm.
    """
    m = X.shape[1]
    
    # Compute derivatives for second (output) layer
    dZ2 = A2 - y
    dW2 = (1/m) * dZ2.dot(A1.T)
    db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)

    # Computer derivatives for first layer
    dZ1 = W2.T.dot(dZ2) * deriv_sigmoid(Z1)
    dW1 = (1/m) * dZ1.dot(X.T)
    db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)
    
    return dW1, dW2, db1, db2

In [38]:
def train(X, y, lr=0.001, n_iter=1000, display_training=True):
    n_nodes = 300
    
    # Initialize the first layer
    W1 = init_weights(n_nodes=n_nodes, input_dim=X.shape[0])
    b1 = init_bias(n_nodes=n_nodes)

    # Initialize the second layer
    W2 = init_weights(n_nodes=1, input_dim=n_nodes)
    b2 = init_bias(n_nodes=1)

    # Run for n_iter iterations
    for n in range(1, n_iter+1):
        A1, A2, Z1, Z2 = forward_prop(W1, b1, W2, b2, X, y)
        dW1, dW2, db1, db2 = backprop(A1, A2, W1, W2, Z1, Z2, X, y)

        # Update all the layers
        W1 = W1 - lr * dW1
        W2 = W2 - lr * dW2
        b1 = b1 - lr * db1
        b2 = b2 - lr * db2

        # Display loss
        if n % 10 == 0 and display_training:
            loss = compute_loss(A2, y)
            print(f"Iteration: {n}.\t Loss: {loss}")
    
    return W1, W2, b1, b2

In [88]:
def predict(X, W1, W2, b1, b2):
    # First hidden layer
    Z1 = W1.dot(X) + b1
    A1 = sigmoid(Z1)

    # 2nd layer (output layer)
    Z2 = W2.dot(A1) + b2
    A2 = sigmoid(Z2)
    
    y_proba = np.squeeze(A2)
    y_pred = np.around(y_proba)
    
    return y_pred, y_proba

In [39]:
W1, W2, b1, b2 = train(X_train.T, y_train, n_iter=200)

Iteration: 10.	 Loss: 6.713632961735607
Iteration: 20.	 Loss: 6.437458379899252
Iteration: 30.	 Loss: 6.171760037183202
Iteration: 40.	 Loss: 5.917094214671635
Iteration: 50.	 Loss: 5.6739369084114015
Iteration: 60.	 Loss: 5.44269225762907
Iteration: 70.	 Loss: 5.223684248138618
Iteration: 80.	 Loss: 5.017127780658234
Iteration: 90.	 Loss: 4.823092122567815
Iteration: 100.	 Loss: 4.641478353974736
Iteration: 110.	 Loss: 4.472022080326518
Iteration: 120.	 Loss: 4.314317604484327
Iteration: 130.	 Loss: 4.167852677303077
Iteration: 140.	 Loss: 4.032042924745048
Iteration: 150.	 Loss: 3.9062598426177857
Iteration: 160.	 Loss: 3.789851786845553
Iteration: 170.	 Loss: 3.682159769130984
Iteration: 180.	 Loss: 3.5825298541361152
Iteration: 190.	 Loss: 3.4903232891432743
Iteration: 200.	 Loss: 3.404924703943308


In [90]:
y_pred, y_proba = predict(X_test.T, W1, W2, b1, b2)

In [95]:
print("Test Loss:", compute_loss(y_proba, y_test))

Test Loss: 3.2195844421660076


In [98]:
def accuracy(y_pred, y):
    return np.average(y_pred == y)

In [99]:
accuracy(y_pred, y_test)

0.44854368932038835