# Digit Recognizer
## I will be using Theano to create a neural network that learns to recognize handwritten digits

In [None]:
import csv
import cPickle as pickle
import matplotlib.pyplot as plt
import numpy as np
import os
import theano
import theano.tensor as T

# Configure the matplotlib backend to plot inline in IPython
%matplotlib inline

In [None]:
# Load train.csv into a single float matrix.
with open("train.csv", "rb") as csvfile:
    trainreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    # Discard the first row (presumably the column header).
    next(trainreader)
    # Pull every remaining row into memory in one go.
    data = list(trainreader)
# Convert the list of string rows into a 2-D float array.
data = np.array(data, dtype=float)

In [None]:
# Normalize data: standardize every column after the first to zero mean and
# unit standard deviation, then shuffle the rows in place.
# (Column 0 is left untouched; it is read as the target elsewhere in the
# notebook.)
X = data[:, 1:]
mu = np.mean(X, axis=0)
sigma = np.std(X, axis=0)
# A column with zero spread would evaluate 0/0, emitting a RuntimeWarning and
# relying on nan_to_num to hide the NaN.  Dividing by 1 instead yields the
# same value (exactly 0) without the invalid operation.
safe_sigma = np.where(sigma == 0, 1.0, sigma)
# nan_to_num is kept so NaN/inf values already present in the input are still
# mapped to finite numbers, as before.
data[:, 1:] = np.nan_to_num((X - mu) / safe_sigma)
np.random.shuffle(data)

In [None]:
# The first 33600 (already shuffled) rows form the training set; the
# remainder is held out for cross validation.
n_train_rows = 33600
train, cv = data[:n_train_rows], data[n_train_rows:]

In [None]:
# Network dimensions derived from the training matrix.
num_examples, n_columns = train.shape
# Every column except the first is an input feature.
nn_input_dim = n_columns - 1
# One output unit per class.
nn_output_dim = 10

In [None]:
# Symbolic placeholders: a float matrix of inputs and an int64 vector of
# targets.
Xvec, yvec = T.matrix('X'), T.lvector('y')

In [None]:
# Hyper-parameters:
#   nn_hdim - width of the hidden layer
#   lmbda   - weight of the squared-weight penalty in the loss
#   epsilon - step size used in the gradient updates
nn_hdim, lmbda, epsilon = 3, 0.5, 0.5

In [None]:
# Trainable parameters held as Theano shared variables.
# Weight matrices start from standard-normal noise (W1 is drawn before W2),
# biases start at zero.
W1 = theano.shared(value=np.random.randn(nn_input_dim, nn_hdim), name='W1')
W2 = theano.shared(value=np.random.randn(nn_hdim, nn_output_dim), name='W2')
b1 = theano.shared(value=np.zeros(nn_hdim), name='b1')
b2 = theano.shared(value=np.zeros(nn_output_dim), name='b2')

### Feed-forward Propagation

In [None]:
# Hidden layer: affine transform followed by tanh.
z1 = T.dot(Xvec, W1) + b1
a1 = T.tanh(z1)
# Output layer: affine transform followed by a row-wise softmax.
z2 = T.dot(a1, W2) + b2
prob = T.nnet.softmax(z2)

# Predicted class = index of the largest probability in each row.
prediction = prob.argmax(axis=1)

### Cost Function

In [None]:
# Penalty on the squared entries of both weight matrices (biases excluded).
# NOTE(review): num_examples is a plain Python value baked in when this graph
# is built, not the size of whatever batch is later passed in -- confirm this
# is the intended scaling.
l2_scale = 1. / num_examples * lmbda / 2
reg_loss = l2_scale * (T.sqr(W1).sum() + T.sqr(W2).sum())
# Mean cross-entropy over the batch plus the penalty term.
data_loss = T.nnet.categorical_crossentropy(prob, yvec).mean()
loss = data_loss + reg_loss

### Back Propagation

In [None]:
# Symbolic gradients of the loss with respect to every parameter, requested
# in a single call; T.grad returns them in the same order as the list.
dW1, db1, dW2, db2 = T.grad(loss, [W1, b1, W2, b2])

### Functions

In [None]:
# Compiled read-only helpers: class probabilities, scalar loss, and
# predicted class indices.
forward_prop = theano.function(inputs=[Xvec], outputs=prob)
calculate_loss = theano.function(inputs=[Xvec, yvec], outputs=loss)
predict = theano.function(inputs=[Xvec], outputs=prediction)

# One gradient-descent step: returns the loss and moves every shared
# parameter against its gradient, scaled by epsilon.
gradient_step = theano.function(
    inputs=[Xvec, yvec],
    outputs=loss,
    updates=[
        (W2, W2 - epsilon * dW2),
        (W1, W1 - epsilon * dW1),
        (b2, b2 - epsilon * db2),
        (b1, b1 - epsilon * db1),
    ])

## Train function

In [None]:
# Cross-validation inputs (all columns after the first) and targets (column 0).
Xcv, ycv = cv[:, 1:], cv[:, 0]

def train_func(num_examples, it=5000, printloss=False):
    """Train from scratch on the first `num_examples` rows of `train`.

    Parameters
    ----------
    num_examples : int
        Number of leading rows of `train` to fit on.  With 0 no training is
        done and only the cross-validation cost of the fresh weights is
        computed.
    it : int
        Number of full-batch gradient steps.
    printloss : bool
        When true, print the training cost every 1000 iterations.

    Returns
    -------
    (train_cost, cv_cost)
        `train_cost` is the value returned by the last `gradient_step` call
        (0 when `num_examples` is 0); `cv_cost` is the loss on the
        cross-validation set after training.
    """
    # The compiled Theano functions are bound to the module-level shared
    # variables.  Creating new local shared variables here would leave those
    # untouched, so every run would silently continue from the previous run's
    # weights.  Overwrite the existing variables in place instead.
    W1.set_value(np.random.randn(nn_input_dim, nn_hdim))
    b1.set_value(np.zeros(nn_hdim))
    W2.set_value(np.random.randn(nn_hdim, nn_output_dim))
    b2.set_value(np.zeros(nn_output_dim))

    # Targets are stored as floats in the data matrix; the graph expects ints.
    ycv_int = np.array(ycv, dtype=int)

    if num_examples == 0:
        if printloss: print("Number of examples: %d\nIteration #: %d\nTraining Cost: %f\n" % (num_examples, 0, 0.0))
        return 0, calculate_loss(Xcv, ycv_int)

    Xtrain = train[:num_examples, 1:]
    # Convert once instead of on every iteration.
    ytrain_int = np.array(train[:num_examples, 0], dtype=int)

    train_cost = None
    for i in range(it):
        train_cost = gradient_step(Xtrain, ytrain_int)

        if printloss and (i+1)%1000 == 0:
            print("Number of examples: %d\nIteration #: %d\nTraining Cost: %f\n" % (num_examples, i+1, train_cost))

    if train_cost is None:
        # No gradient step was taken (it <= 0): report the cost of the
        # freshly initialised weights rather than failing on an unset name.
        train_cost = calculate_loss(Xtrain, ytrain_int)

    # Only the final cross-validation cost is returned, so evaluate it once
    # after training instead of once per iteration.
    cv_cost = calculate_loss(Xcv, ycv_int)
    return train_cost, cv_cost

In [None]:
def train_all(pfile=None, it=5000, printloss=False):
    """Compute learning-curve data, caching the result in a pickle file.

    Parameters
    ----------
    pfile : str or None
        Path of the cache file.  If it exists, its contents are loaded and
        returned without training.  If it is given but missing, results are
        written to it after training.  With None nothing is read or written.
    it : int
        Number of gradient steps passed to `train_func` for each size.
    printloss : bool
        Forwarded to `train_func`.

    Returns
    -------
    (m_list, train_cost_list, cv_cost_list)
        Training-set sizes (0, 1000, 2000, ...) and the matching training and
        cross-validation costs.
    """
    # Name -> module-level shared variable.  These are only read here, never
    # rebound, so they stay global lookups.  (Assigning to W1 etc. inside
    # this function would make them locals and break the save path below.)
    shared_params = (("W1", W1), ("b1", b1), ("W2", W2), ("b2", b2))

    if pfile and os.path.isfile(pfile):
        with open(pfile, "rb") as f:
            # NOTE: unpickling can execute arbitrary code; only point this at
            # cache files produced by this notebook.
            cached = pickle.load(f)
        # Push the stored weights into the live model.
        for key, shared in shared_params:
            value = cached[key]
            # Tolerate caches that stored the shared variable object itself.
            if hasattr(value, "get_value"):
                value = value.get_value()
            shared.set_value(value)
        return cached["m"], cached["train_cost"], cached["cv_cost"]

    m_list = []
    train_cost_list = []
    cv_cost_list = []

    for m in range(0, train.shape[0], 1000):
        train_cost, cv_cost = train_func(m, it=it, printloss=printloss)
        m_list.append(m)
        train_cost_list.append(train_cost)
        cv_cost_list.append(cv_cost)

    if pfile:
        # Build the payload before opening the file so that a failure here
        # cannot leave a truncated cache behind.  Plain arrays are stored
        # rather than Theano objects.
        payload = {"m": m_list, "train_cost": train_cost_list, "cv_cost": cv_cost_list}
        for key, shared in shared_params:
            payload[key] = shared.get_value()
        with open(pfile, "wb") as f:
            pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)
    return m_list, train_cost_list, cv_cost_list

### Run #1 with the following parameters:

nn_hdim = 3; lmbda = 0.5; epsilon = 0.5

In [None]:
# Build the learning-curve data with 2000 gradient steps per training-set
# size, using "first.pickle" as the cache file.
m_list, train_cost_list, cv_cost_list = train_all(
    pfile="first.pickle",
    it=2000,
)

### Plot learning curve

In [None]:
# Learning curve: training cost (red) and cross-validation cost (green)
# against the number of training examples m.
plt.xlabel("m")
plt.ylabel("Cost")
plt.plot(m_list, train_cost_list, c="r", label="Training Cost")
plt.plot(m_list, cv_cost_list, c="g", label="Cross Validation Cost")
# The label= arguments above are only rendered once a legend is requested.
plt.legend()
plt.show()