In [1]:
import numpy as np 
np.random.seed(1234)
from numpy import array,zeros,multiply,tanh,exp
import pickle
from collections import deque

In [26]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files you created yourself or otherwise trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Train / validation / test inputs (x) and targets (y), one pickle file each.
xtrain = _load_pickle('train_x.pkl')
ytrain = _load_pickle('train_y.pkl')
xvalid = _load_pickle('valid_x.pkl')
yvalid = _load_pickle('valid_y.pkl')
xtest = _load_pickle('test_x.pkl')
ytest = _load_pickle('test_y.pkl')

In [27]:
# Divide every input split by the same constant 255.0
# (presumably 8-bit pixel intensities being mapped to [0, 1] -- TODO confirm).
# Caution: the names are rebound to the scaled arrays, so running this cell a
# second time without reloading the data divides by 255 again.
xtrain, xvalid, xtest = (split / 255.0 for split in (xtrain, xvalid, xtest))

# Layer Design

In [4]:
def mini_batch(X,Y,batch_size = 25):
    """Split X and Y row-wise into aligned mini-batches.

    Parameters
    ----------
    X, Y : arrays with the same number of rows (axis 0).
    batch_size : target number of rows per batch (must be positive).

    Returns
    -------
    An iterator of (x_batch, y_batch) pairs. The number of batches is
    int(rows / batch_size), but never less than one, so data sets smaller
    than `batch_size` come back as a single batch instead of raising.
    np.array_split makes the batches as equal as possible, so when the row
    count is not a multiple of `batch_size` some batches hold more than
    `batch_size` rows.

    Raises
    ------
    ValueError if `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")
    # array_split needs an integer section count of at least 1.
    num_batches = max(1, int(X.shape[0] / batch_size))
    return zip(np.array_split(X, num_batches, axis=0),
               np.array_split(Y, num_batches, axis=0))

In [34]:
def sig(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), applied elementwise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def tanh(x):
    """Hyperbolic tangent of x, delegated to np.tanh.

    This definition replaces the `tanh` name imported from numpy at the top
    of the notebook.
    """
    activated = np.tanh(x)
    return activated
def tanh_prime(x):
    """Return 1 - x**2.

    This is the derivative of tanh written in terms of tanh's OUTPUT, so the
    argument must already be an activation value, not a pre-activation.
    """
    squared = x ** 2.0
    return 1.0 - squared
def sig_prime(x):
    """Return x * (1 - x), elementwise.

    This is the derivative of the sigmoid written in terms of the sigmoid's
    OUTPUT, so the argument must already be an activation value.
    """
    complement = 1 - x
    return np.multiply(x, complement)
def softmax(x):
    """Row-wise softmax of a 2-D array.

    Each row is shifted by its own maximum before exponentiating. The shift
    cancels in the ratio, so the result is mathematically unchanged, but it
    keeps np.exp from overflowing to inf (and the ratio from becoming nan)
    when the inputs are large.

    Parameters
    ----------
    x : array with samples along axis 0 and class scores along axis 1.

    Returns
    -------
    Array of the same shape whose rows are non-negative and sum to 1.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

In [6]:
class Sigmoid():
    """Elementwise sigmoid activation layer; it has no trainable parameters."""

    def output(self, X):
        """Forward pass: apply sig to X."""
        activated = sig(X)
        return activated

    def input_grad(self, X, output_prime):
        """Gradient with respect to this layer's input.

        X is passed to sig_prime, which computes x * (1 - x); that is the
        sigmoid derivative only when X is this layer's OUTPUT. `backward`
        calls it that way (it passes the value popped from the hidden stack).
        output_prime is the gradient arriving from the layer above.
        """
        local_slope = sig_prime(X)
        return np.multiply(local_slope, output_prime)

    def par_grad(self, X, output_grad):
        """Return an empty list: there are no parameters to update."""
        return list()

In [35]:
class Tanh():
    """Elementwise tanh activation layer; it has no trainable parameters."""

    def output(self, X):
        """Forward pass: apply tanh to X."""
        activated = tanh(X)
        return activated

    def input_grad(self, X, output_prime):
        """Gradient with respect to this layer's input.

        X is passed to tanh_prime, which computes 1 - x**2; that is the tanh
        derivative only when X is this layer's OUTPUT. `backward` calls it
        that way (it passes the value popped from the hidden stack).
        output_prime is the gradient arriving from the layer above.
        """
        local_slope = tanh_prime(X)
        return np.multiply(local_slope, output_prime)

    def par_grad(self, X, output_grad):
        """Return an empty list: there are no parameters to update."""
        return list()

In [7]:
class Softmax_Cost_Layer():
    """Softmax output layer paired with a cross-entropy cost.

    The layer has no trainable parameters.
    """

    def cost(self, pred, target):
        """Mean cross-entropy between predictions and targets.

        pred   : predicted probabilities, one row per sample.
        target : array of the same shape as pred (one-hot rows in this notebook).

        Returns -sum(target * log(pred)) divided by the number of rows.
        Predictions are floored at a tiny positive value before the log so
        that a probability of exactly 0 yields a large finite penalty (or 0
        where the target is 0) instead of -inf / nan.
        """
        safe_pred = np.maximum(pred, 1e-12)
        temp = -(target * np.log(safe_pred)).sum()
        return temp / pred.shape[0]

    def input_grad(self, pred, target):
        """Gradient of the mean cost w.r.t. the softmax input: (pred - target) / rows."""
        return (pred - target) / pred.shape[0]

    def output(self, X):
        """Forward pass: row-wise softmax of X."""
        return softmax(X)

    def par_grad(self, X, output_grad):
        """Return an empty list: there are no parameters to update."""
        return []

In [8]:
class Linear():
    """Fully connected layer computing inp.dot(W) + B.

    Attributes
    ----------
    W  : (inp, out) weight matrix.
    B  : (out,) bias vector.
    vw : velocity buffer shaped like W, used by the momentum-based updaters.
    vb : velocity buffer shaped like B, used by the momentum-based updaters.
    """

    def __init__(self, inp, out):
        # Weights are standard-normal draws scaled by 0.2; biases start at zero.
        self.W = np.random.randn(inp, out) * 0.2
        self.B = np.zeros(out)
        # Velocities start at zero.
        self.vw = np.zeros_like(self.W)
        self.vb = np.zeros_like(self.B)

    def output(self, inp):
        """Forward pass for a batch `inp` with one sample per row."""
        projected = inp.dot(self.W)
        return projected + self.B

    def input_grad(self, inp, output_prime):
        """Gradient w.r.t. the layer input; `inp` itself is not used."""
        weights_t = self.W.T
        return output_prime.dot(weights_t)

    def par_grad(self, inp, out):
        """Parameter gradients as one flat Python list.

        `inp` is the batch fed to the layer and `out` is the gradient w.r.t.
        the layer output. The list holds the flattened weight gradient
        (inp.T.dot(out)) followed by the bias gradient (column sums of out),
        which is the layout the update_* functions slice apart.
        """
        weight_part = inp.T.dot(out).flatten()
        bias_part = np.sum(out, axis=0).flatten()
        flat = np.concatenate((weight_part, bias_part), axis=0)
        return flat.tolist()

In [9]:
def create_model(layer_arch,activation='sig'):
    """Build the layer list for a feed-forward classifier.

    Parameters
    ----------
    layer_arch : sequence of hidden-layer widths, e.g. [30, 30]. An empty
        sequence gives a model with a single Linear layer and no hidden
        activation.
    activation : hidden activation, 'sig' (or 'sigmoid') for Sigmoid or
        'tanh' for Tanh. Previously this argument was ignored and Sigmoid
        was always used; the default still selects Sigmoid.

    Returns
    -------
    list of layers: Linear + activation for every hidden width, then a final
    Linear into the output size followed by Softmax_Cost_Layer.

    Raises
    ------
    ValueError if `activation` is not one of the supported names.

    Notes
    -----
    Input and output sizes are read from the module-level `xtrain` and
    `ytrain` arrays (their second dimension).
    """
    # Resolve the class lazily so only the requested activation must exist.
    if activation in ('sig', 'sigmoid'):
        activation_cls = Sigmoid
    elif activation == 'tanh':
        activation_cls = Tanh
    else:
        raise ValueError("unknown activation: %r (expected 'sig' or 'tanh')" % (activation,))

    sizes = [xtrain.shape[1]] + list(layer_arch) + [ytrain.shape[1]]
    layer_stack = []
    # Hidden blocks: every consecutive size pair except the last one.
    for n_in, n_out in zip(sizes[:-2], sizes[1:-1]):
        layer_stack.append(Linear(n_in, n_out))
        layer_stack.append(activation_cls())
    # Output block.
    layer_stack.append(Linear(sizes[-2], sizes[-1]))
    layer_stack.append(Softmax_Cost_Layer())
    return layer_stack

In [10]:
def forward(inp,NN):
    """Run `inp` through every layer of NN in order.

    Returns a list whose first element is `inp` and whose following elements
    are the outputs of each successive layer, so it has len(NN) + 1 entries
    and the last one is the network's final output.
    """
    activations = [inp]
    for layer in NN:
        # Each layer consumes the most recent activation.
        activations.append(layer.output(activations[-1]))
    return activations

In [11]:
def backward(hidden_stack,T,NN):
    """Back-propagate through NN and collect each layer's parameter gradients.

    Parameters
    ----------
    hidden_stack : list produced by `forward` (the input followed by every
        layer's output). It is no longer modified: the function works on a
        shallow copy, so the caller can keep using the activations.
    T : targets, handed to the last layer's input_grad.
    NN : list of layers, in forward order.

    Returns
    -------
    list with one entry per layer (forward order) holding whatever that
    layer's par_grad returned.
    """
    stack = list(hidden_stack)  # shallow copy; leave the caller's list alone
    upstream = None             # gradient w.r.t. the current layer's output
    grad_que = deque()
    for h_layer in reversed(NN):
        layer_out = stack.pop()
        if upstream is not None:
            downstream = h_layer.input_grad(layer_out, upstream)
        else:
            # Top (cost) layer: its gradient is computed from the targets.
            downstream = h_layer.input_grad(layer_out, T)
        # After the pop, stack[-1] is the value that was fed INTO this layer.
        # For the top layer `upstream` is still None here, as before.
        grad_que.appendleft(h_layer.par_grad(stack[-1], upstream))
        upstream = downstream
    return list(grad_que)

In [12]:
def update_SG(NN,grad_NN,alpha):
    """Plain gradient-descent step, applied in place.

    grad_NN[i] is the flat gradient list for NN[i] (flattened weight gradient
    followed by the bias gradient); layers with an empty list are skipped.
    Each parameter is decreased by alpha times its gradient.
    """
    for idx, flat_grad in enumerate(grad_NN):
        if len(flat_grad) == 0:
            continue  # parameter-free layer
        layer = NN[idx]
        n_bias = layer.B.shape[0]
        # The trailing n_bias entries belong to B, everything before to W.
        bias_step = alpha * np.array(flat_grad[-n_bias:])
        weight_step = alpha * np.array(flat_grad[:-n_bias]).reshape((layer.W.shape[0], layer.W.shape[1]))
        layer.B -= bias_step
        layer.W -= weight_step

In [13]:
def update_Momentum(NN,grad_NN,alpha=0.1,mu=0.83):
    """Gradient step with classical momentum, applied in place.

    For every layer with a non-empty flat gradient list in grad_NN, the
    velocity becomes mu * velocity - alpha * gradient and is then added to
    the parameter. Velocities live on the layer as `vw` and `vb`.
    """
    for idx, flat_grad in enumerate(grad_NN):
        if len(flat_grad) == 0:
            continue  # parameter-free layer
        layer = NN[idx]
        n_bias = layer.B.shape[0]
        # The trailing n_bias entries belong to B, everything before to W.
        bias_step = - alpha * np.array(flat_grad[-n_bias:])
        weight_step = - alpha * np.array(flat_grad[:-n_bias]).reshape((layer.W.shape[0], layer.W.shape[1]))
        layer.vb = mu * layer.vb + bias_step
        layer.vw = mu * layer.vw + weight_step
        layer.B += layer.vb
        layer.W += layer.vw

In [14]:
def update_NAG(NN,grad_NN,alpha=0.1,mu=0.83):
    """Nesterov-style momentum step, applied in place.

    For every layer with a non-empty flat gradient list in grad_NN:
        v_prev = v
        v      = mu * v - alpha * gradient
        param += -mu * v_prev + (1 + mu) * v
    Velocities live on the layer as `vw` and `vb`.
    """
    for idx, flat_grad in enumerate(grad_NN):
        if len(flat_grad) == 0:
            continue  # parameter-free layer
        layer = NN[idx]
        n_bias = layer.B.shape[0]
        # The trailing n_bias entries belong to B, everything before to W.
        bias_step = - alpha * np.array(flat_grad[-n_bias:])
        weight_step = - alpha * np.array(flat_grad[:-n_bias]).reshape((layer.W.shape[0], layer.W.shape[1]))
        # Remember the old velocities before they are replaced.
        old_vb, old_vw = layer.vb, layer.vw
        layer.vb = mu * layer.vb + bias_step
        layer.vw = mu * layer.vw + weight_step
        # Combine the old and new velocities for the parameter update.
        layer.B += -mu * old_vb + (1 + mu) * layer.vb
        layer.W += -mu * old_vw + (1 + mu) * layer.vw

In [36]:
def update_ADAM(self,dx,alpha,t,eps=1e-8,beta1=0.9,beta2=0.999):
    """Placeholder for an Adam update; not implemented.

    The function does nothing with its arguments and returns None.
    TODO: implement, or remove if the other update_* functions are sufficient.
    """
    return None

In [15]:
def anneal_alpha(self):
    """Halve the `alpha` attribute of the object passed as `self`.

    This is a module-level function, not a method; nothing visible in this
    notebook calls it (the only reference is commented out).
    """
    halved = self.alpha / 2
    self.alpha = halved

def error(x,y):
    """Fraction of rows where argmax(x[i]) equals argmax(y[i]).

    Despite the name, this is the share of MATCHING rows (an accuracy), not
    an error rate: 1.0 means every row agrees.
    """
    n_rows = x.shape[0]
    hits = sum(1.0 for i in range(n_rows) if np.argmax(x[i]) == np.argmax(y[i]))
    return hits / n_rows

In [16]:
def performance_measure(xtrain,ytrain,xvalid,yvalid,NN,epochs=20,alpha=0.1,btch=25):
    """Train NN with mini-batch NAG updates and record per-epoch metrics.

    Parameters
    ----------
    xtrain, ytrain : training inputs and targets.
    xvalid, yvalid : validation inputs and targets.
    NN     : list of layers; updated in place. NN[-1] must provide `cost`.
    epochs : number of passes over the training data.
    alpha  : learning rate handed to update_NAG. (Previously accepted but
             ignored; the default equals update_NAG's own default, so
             default calls behave as before.)
    btch   : mini-batch size handed to mini_batch. (Previously accepted but
             ignored; the default equals mini_batch's own default.)

    Returns
    -------
    (train_cost, valid_cost, train_acc, valid_acc): four lists with one
    entry per epoch. The *_acc values come from `error`, which returns the
    fraction of rows whose predicted and target argmax agree.
    """
    train_cost,valid_cost,train_acc,valid_acc = [],[],[],[]

    for epoch in range(epochs):
        print(epoch)
        for mini_x,mini_y in mini_batch(xtrain,ytrain,batch_size=btch):
            activations = forward(mini_x,NN)
            back_grad = backward(activations,mini_y,NN)
            update_NAG(NN,back_grad,alpha)
        # Training accuracy and cost on the full training set
        pred = forward(xtrain,NN)
        train_acc.append(error(pred[-1],ytrain))
        train_cost.append(NN[-1].cost(pred[-1],ytrain))
        # Validation accuracy and cost
        pred = forward(xvalid,NN)
        valid_acc.append(error(pred[-1],yvalid))
        valid_cost.append(NN[-1].cost(pred[-1],yvalid))
        # TODO: learning-rate annealing on a validation-accuracy drop was
        # sketched here but never implemented.

    return train_cost,valid_cost,train_acc,valid_acc

In [17]:
# Build a network with two hidden layers of 30 units each.
model = create_model([30,30])
# Train it with performance_measure's default settings; the four results are
# per-epoch lists: training cost, validation cost, training accuracy and
# validation accuracy. The epoch index is printed as training progresses.
t_cost,v_cost,t_acc,v_acc = performance_measure(xtrain,ytrain,xvalid,yvalid,model)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [18]:
# Training-set cost recorded after each epoch.
t_cost

[0.25179446434529362,
 0.18625151001367715,
 0.14991622277573907,
 0.12779668998045157,
 0.11441389050316539,
 0.1007319329884636,
 0.092612594032525489,
 0.084695543104319454,
 0.081162898617131399,
 0.07715309312604704,
 0.07272205239208876,
 0.072481110789210254,
 0.070957417856538779,
 0.067502639952847263,
 0.067974231773120494,
 0.06180963862838218,
 0.063353672398673219,
 0.060632121742573944,
 0.059215709537530727,
 0.054993350424434553]

In [19]:
# Training-set accuracy (fraction of rows classified correctly) after each epoch.
t_acc

[0.92224,
 0.94154,
 0.9523,
 0.95946,
 0.96358,
 0.96778,
 0.97022,
 0.97286,
 0.97422,
 0.97492,
 0.97602,
 0.97622,
 0.97684,
 0.97782,
 0.97834,
 0.98052,
 0.97972,
 0.98068,
 0.98138,
 0.98264]

In [20]:
# Validation-set cost recorded after each epoch.
v_cost

[0.24213008015754084,
 0.1953865336930963,
 0.17010793973601546,
 0.15901174880559998,
 0.15581319302998101,
 0.1496955278258581,
 0.14763452670968019,
 0.14598762198507972,
 0.14732195767827111,
 0.14837414679954711,
 0.14979326481114397,
 0.15508279073015457,
 0.15721855507292981,
 0.15794999628555878,
 0.16325783206863473,
 0.16017737764661713,
 0.16762350143981597,
 0.16615121409618838,
 0.16997730438480385,
 0.1671816429418983]

In [21]:
# Validation-set accuracy (fraction of rows classified correctly) after each epoch.
v_acc

[0.9297,
 0.9426,
 0.9514,
 0.9546,
 0.9549,
 0.9585,
 0.9596,
 0.9611,
 0.9601,
 0.9601,
 0.9605,
 0.9594,
 0.9589,
 0.9583,
 0.9591,
 0.9591,
 0.9589,
 0.9589,
 0.9581,
 0.9593]

In [29]:
def test_performance_measure(xtest,ytest,NN):
    test_acc, test_cost = [],[]
    pred1 = forward(xtest,NN)
    test_acc.append(error(pred1[-1],ytest))
    test_cost.append(NN[-1].cost(pred1[-1],ytest))
    return test_acc,test_cost

In [31]:
# Evaluate the trained model once on the test split; each result is a one-element list.
test_acc,test_cost = test_performance_measure(xtest,ytest,model)

In [32]:
# Cost of the trained model on the test split.
test_cost

[0.15921880852907819]

In [33]:
# Accuracy (fraction of rows classified correctly) of the trained model on the test split.
test_acc

[0.9602]

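# Data Preprocessing (one-time step, kept commented out: converts the MNIST CSV files into the `.pkl` files loaded at the top of the notebook)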

In [22]:
# from numpy import genfromtxt

In [23]:
# data = genfromtxt('mnist_train.csv', delimiter=',')
# test_data = genfromtxt('mnist_test.csv', delimiter=',')

In [24]:
# train = data[:50000]
# valid = data[50000:]
# train_y = train[:,0]
# train_x = train[:,1:]
# train_y_1 = np.eye(10)[train_y.astype(int)]
# valid_y = valid[:,0]
# valid_x = valid[:,1:]
# valid_y_1 = np.eye(10)[valid_y.astype(int)]
# test_y = test_data[:,0]
# test_x = test_data[:,1:]
# test_y_1 = np.eye(10)[test_y.astype(int)]

In [25]:
# pickle.dump(train_x,open("train_x.pkl",'wb'))
# pickle.dump(train_y_1,open("train_y.pkl",'wb'))
# pickle.dump(valid_x,open("valid_x.pkl",'wb'))
# pickle.dump(valid_y_1,open("valid_y.pkl",'wb'))
# pickle.dump(test_x,open("test_x.pkl",'wb'))
# pickle.dump(test_y_1,open("test_y.pkl",'wb'))