In [2]:
import keras
from keras.datasets import mnist
from keras.utils import to_categorical
import numpy as np
from sklearn.metrics import classification_report

Using TensorFlow backend.


In [3]:
(x_train_origin, y_train_origin),(x_test_origin, y_test_origin) = mnist.load_data()
num_class = 10

In [4]:
x_train = x_train_origin.reshape((60000, 28 * 28))
x_train = x_train.astype('float32') / 255

x_test = x_test_origin.reshape((10000, 28 * 28))
x_test = x_test.astype('float32') / 255

In [5]:
y_train = to_categorical(y_train_origin, num_classes=10) 
y_test = to_categorical(y_test_origin, num_classes=10) 

In [6]:
def init_params(dim):
    # w = feature * class
    # b = 1 * class
    w = np.zeros((dim, 10))
    b = np.zeros((1, 10))
    
    return w,b

the reference for softmax: (https://segmentfault.com/a/1190000010039529)

In [7]:
def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    Arguments:
    x -- A N dimensional vector or M x N dimensional numpy matrix.

    Return:
    x -- You are allowed to modify x in-place
    """
    orig_shape = x.shape

    if len(x.shape) > 1:
        # Matrix
        exp_minmax = lambda x: np.exp(x - np.max(x))
        denom = lambda x: 1.0 / np.sum(x)
        x = np.apply_along_axis(exp_minmax,1,x)
        denominator = np.apply_along_axis(denom,1,x) 
        
        if len(denominator.shape) == 1:
            denominator = denominator.reshape((denominator.shape[0],1))
        
        x = x * denominator
    else:
        # Vector
        x_max = np.max(x)
        x = x - x_max
        numerator = np.exp(x)
        denominator =  1.0 / np.sum(numerator)
        x = numerator.dot(denominator)
        
    assert x.shape == orig_shape
    return x

In [8]:
def propagate(w, b, X, Y):
    
    m = X.shape[0]
    A = softmax((np.dot(X,w)) + b)
    cost  = -1/m * np.sum(Y*np.log(A))
    dw = -1/m * np.dot(X.T, (Y-A))
    db = -1/m * np.sum(Y-A)
    
    
    grads = {"dw":dw, "db": db}
    return grads, cost

In [49]:
def optimize(w, b, X, Y, num_iters, batch_size, learning_rate, print_cost):
    
    costs = []
    m = X.shape[0]
    for i in range(num_iters):
        #cost_batch is used to collect the cost during one iteration over different batches
        cost_batch = []
        #stochastic gradient descent
        shuffled_indices = np.random.permutation(m)
        X_shuffled = X[shuffled_indices,:]
        y_shuffled = Y[shuffled_indices,:]
        #mini-batch
        for j in range(0, m, batch_size):
            x_batch = X_shuffled[j:j+batch_size,:]
            y_batch = y_shuffled[j:j+batch_size,:]
            grads, cost = propagate(w, b, x_batch, y_batch)
            dw = grads['dw']
            db = grads['db']
            
            w = w - learning_rate * dw
            b = b - learning_rate * db

            cost_batch.append(cost)
            # the cost of one iteration is the average number over batches
            cost = np.mean(cost_batch)
        costs.append(cost)
        
        if print_cost:
            print("Cost after iteration %i: %f" % (i+1, cost))

    
    params = {"w": w,
              "b": b}
    
    grads = {"dw": dw,
             "db": db}
    
    
    return params, grads, costs

In [50]:
def predict(w, b, X):
    return softmax((np.dot(X, w) + b))

In [51]:
def accuracy(y_hat, Y):
    
    max_index = np.argmax(y_hat, axis=1)
    y_hat[np.arange(y_hat.shape[0]), max_index] = 1
    accuracy = np.sum(np.argmax(y_hat, axis=1)==np.argmax(Y, axis=1))   
    accuracy = accuracy *1.0/Y.shape[0]
    return accuracy

In [60]:
def model(x_train, y_train, x_test, y_test, num_iters=20, batch_size = 5, learning_rate=0.5, print_cost=False):
    
    w, b = init_params(x_train.shape[1])
    
    parameters, grads, costs = optimize(w, b, x_train, y_train, num_iters, batch_size, learning_rate, print_cost)
    
    w = parameters['w']
    b = parameters['b']
    
    
    y_pred_test = predict(w, b, x_test)
    y_pred_train = predict(w, b, x_train)
    
    train_accuracy = accuracy(y_pred_train, y_train)
    test_accuracy = accuracy(y_pred_test, y_test)
    
    print("train accuracy: {} %".format(100*train_accuracy))
    print("test accuracy: {} %".format(100*test_accuracy))
    
    
    d = {
        'w':w,
        'b':b,
        'costs':costs,
        'Y_pred_test': y_pred_test, 
        'Y_pred_train' : y_pred_train, 
        'iterations':num_iters,
        'learning_rate':learning_rate,
        'num_iters': num_iters}
    return d, train_accuracy, test_accuracy

In [61]:
d, train, test = model(x_train, y_train, x_test, y_test, num_iters=10, batch_size=1000, learning_rate=0.01, print_cost=True)

Cost after iteration 1: 2.030298
Cost after iteration 2: 1.616022
Cost after iteration 3: 1.350116
Cost after iteration 4: 1.173601
Cost after iteration 5: 1.050627
Cost after iteration 6: 0.960783
Cost after iteration 7: 0.892426
Cost after iteration 8: 0.838652
Cost after iteration 9: 0.795176
Cost after iteration 10: 0.759242
train accuracy: 84.62166666666666 %
test accuracy: 85.64 %
