In [378]:
%pylab inline
import numpy as np
from scipy import special
import pandas as pd
import os

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [379]:
METHOD_NAME = "nn_part-32_dropout_layer2"
DATA_FOLDER = "data/"
MODEL_FOLDER = "models/" + METHOD_NAME + "/"
OUTPUT_FOLDER = "outputs/"
TRAIN_FILE_PATH = DATA_FOLDER + "spam_train.csv"
TEST_FILE_PATH = DATA_FOLDER + "spam_test.csv"

In [380]:
if not os.path.exists(MODEL_FOLDER):
    os.makedirs(MODEL_FOLDER)
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

## Read Training Data

In [381]:
train_data = pd.read_csv(TRAIN_FILE_PATH, header=None)
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,1,0.0,0.0,1.13,0.0,0.37,0.0,0.0,0.0,0.0,...,0.0,0.145,0.0,0.436,0.0,0.0,1.792,55,147,0
1,2,0.0,0.0,0.6,0.0,0.0,0.6,0.0,0.0,0.6,...,0.0,0.143,0.047,0.191,0.143,0.0,2.041,31,196,1
2,3,0.0,0.0,0.48,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.45,0.0,0.0,1.138,4,41,0
3,4,0.0,0.51,0.0,0.0,0.51,0.51,1.02,0.0,0.0,...,0.0,0.142,0.0,0.071,1.212,0.0,7.025,130,281,1
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64,...,0.0,0.116,0.0,0.232,0.0,0.0,1.551,6,45,0


In [382]:
x_data_rows = train_data[list(range(1, 58))].as_matrix()
y_data_rows = train_data[[58]].as_matrix()

## Select Data

In [383]:
x_select_col_idxs = [26, 40, 31, 24,  3, 41, 54, 55, 45, 52, 28, 22, 53, 34, 43, 47, 6, 44, 15, 25, 16, 19, 38,  8, 56, 32,  4, 35, 27,  5, 48, 14]
x_data_rows = x_data_rows[:, x_select_col_idxs]

## Preprocess Data

## Transform Data

In [384]:
# Normalize
x_train_means = np.mean(x_data_rows, axis=0)
x_train_stds =  np.std(x_data_rows, axis=0)
x_data_rows = (x_data_rows - x_train_means) / x_train_stds

In [385]:
# print(np.mean(x_data_rows, axis=0), np.std(x_data_rows, axis=0))

In [386]:
# add 1 in the last column, so that b in including in w
x_data_rows = np.c_[x_data_rows, np.ones(x_data_rows.shape[0]) ]

In [387]:
# pre-transpose x for matrix multiplication
x_data_rows_trans = np.transpose(x_data_rows)

In [388]:
x_row_num, x_col_num = x_data_rows.shape  # 4001, 58

## Train

In [389]:
# reset parameters
w = np.random.randn(x_col_num, 1)  # shape = (x_col_num, 1). not (1, x_col_num) for the convenience of matrix multiplication
total_epoch = 0

In [390]:
layer_1_node_num = 20
layer_2_node_num = layer_1_node_num

In [391]:
w_1 = np.random.randn(x_col_num, layer_1_node_num)  # dot(x, w_1) output a x_row_num * layer_1_node_num matrix
w_2 = np.random.randn(layer_1_node_num, layer_2_node_num)
w_out = np.random.randn(layer_2_node_num, 1)  # dot(layer_1_output, w_2) output a x_row_num * 1 matrix
# v_w_1 = np.random.randn(x_col_num, layer_1_node_num)  # dot(x, w_1) output a x_row_num * layer_1_node_num matrix
# v_w_2 = np.random.randn(layer_1_node_num, 1)  # dot(layer_1_output, w_2) output a x_row_num * 1 matrix

In [392]:
dropout_rate = 0.5

In [393]:
# momentum
gamma = 0.9
# vw = np.zeros((x_col_num, 1))

In [394]:
lr = (0.5) / x_row_num 

In [395]:
def to_bool(x):
    return 1 if x > 0.5 else 0

to_bool = np.vectorize(to_bool)

In [396]:
def sigmoid_deriv(x):  # input may be an matrix
    return x * (1-x)  # element-wise

In [397]:
# a = np.ones((3, 10))
# m = np.random.binomial(1, 0.5, 10)
# a * m

In [398]:
# training iterations

# special.expit is sigmoid

while True:
    layer_0_output = x_data_rows
    
    layer_1_output = special.expit(np.dot(layer_0_output, w_1))  # shape = (x_row_num, layer_1_node_num)
#     mask_1 = numpy.random.binomial(1, dropout_rate, layer_1_node_num) * (1.0/(1-dropout_rate))
#     layer_1_output *= mask_1

    layer_2_output = special.expit(np.dot(layer_1_output, w_2))  # shape = (x_row_num, layer_2_node_num)
    mask_2 = numpy.random.binomial(1, dropout_rate, layer_2_node_num) * (1.0/(1-dropout_rate))
    layer_2_output *= mask_2
    
    layer_out_output = special.expit(np.dot(layer_2_output, w_out))  # shape = (x_row_num, 1)
    
    # backpropagation
    layer_out_error = layer_out_output - y_data_rows
    layer_out_delta = layer_out_error * sigmoid_deriv(layer_out_output)
    
    layer_2_error = layer_out_delta.dot(w_out.T)
    layer_2_delta = layer_2_error * sigmoid_deriv(layer_2_output)
    
    layer_1_error = layer_2_delta.dot(w_2.T)  # shape = (x_row_num, layer_1_node_num)
    layer_1_delta = layer_1_error * sigmoid_deriv(layer_1_output)  # shape = (x_row_num, layer_1_node_num)

    w_out -= lr * (layer_2_output.T).dot(layer_out_delta)
    w_2 -= lr * (layer_1_output.T).dot(layer_2_delta)  # shape = (layer_1_node_num, 1)
    w_1 -= lr * (layer_0_output.T).dot(layer_1_delta)  # shape = (x_row_num, layer_1_node_num)
    
#     v_w_2 = lr * (layer_1_output.T).dot(layer_2_delta) + gamma * v_w_2
#     v_w_1 = lr * (layer_0_output.T).dot(layer_1_delta) + gamma * v_w_1
#     w_2 += v_w_2  # shape = (layer_1_node_num, 1)
#     w_1 += v_w_1  # shape = (x_row_num, layer_1_node_num)

    # write model
    if (total_epoch < 10000 and total_epoch % 1000 == 0) or (total_epoch % 10000 == 0):
        pred_error = np.sum(np.abs(to_bool(layer_out_output) - y_data_rows))
        rms_error = np.mean(np.square(layer_out_error))
        
        model_file_info = "epo" + str(total_epoch) + "_err" + str(rms_error)[:8]
        print(model_file_info)
        print('rms_error', rms_error)
        print('pred_error', pred_error, '/', x_row_num, '=', pred_error/x_row_num)
        
        np.savez(MODEL_FOLDER + model_file_info, w_1=w_1, w_2=w_2, w_out=w_out, lr=lr, total_epoch=total_epoch, x_train_means=x_train_means, x_train_stds=x_train_stds)

#     vw = lr * w_grad + gamma * vw
#     w = w - vw

    total_epoch += 1
    

epo0_err0.276644
rms_error 0.27664478588
pred_error 1810 / 4001 = 0.452386903274
epo1000_err0.135710
rms_error 0.135710463909
pred_error 665 / 4001 = 0.166208447888
epo2000_err0.117128
rms_error 0.117128432283
pred_error 484 / 4001 = 0.120969757561
epo3000_err0.099466
rms_error 0.0994663839613
pred_error 435 / 4001 = 0.108722819295
epo4000_err0.217477
rms_error 0.217477676739
pred_error 1554 / 4001 = 0.388402899275
epo5000_err0.312395
rms_error 0.312395803329
pred_error 2447 / 4001 = 0.611597100725
epo6000_err0.118168
rms_error 0.118168803768
pred_error 638 / 4001 = 0.159460134966
epo7000_err0.105465
rms_error 0.105465758826
pred_error 392 / 4001 = 0.0979755061235
epo8000_err0.081541
rms_error 0.0815414188583
pred_error 430 / 4001 = 0.107473131717
epo9000_err0.101638
rms_error 0.101638653706
pred_error 447 / 4001 = 0.111722069483
epo10000_err0.105603
rms_error 0.105603271408
pred_error 543 / 4001 = 0.135716070982
epo20000_err0.068449
rms_error 0.0684499229302
pred_error 389 / 4001 = 0.

KeyboardInterrupt: 

## Test

In [None]:
test_data = pd.read_csv(TEST_FILE_PATH, header=None)

In [None]:
_id = test_data[0].apply(str)

In [None]:
x_test_data_rows = test_data[list(range(1, 58))].as_matrix()

In [None]:
x_test_data_rows = x_test_data_rows[:, x_select_col_idxs]

In [None]:
# read model

In [None]:
# normalize
x_test_data_rows = (x_test_data_rows - x_train_means) / x_train_stds
# add 1
x_test_data_rows = np.c_[x_test_data_rows, np.ones(x_test_data_rows.shape[0]) ]

In [None]:
# test
test_layer_1_output = special.expit(np.dot(x_test_data_rows, w_1))
test_layer_2_output = special.expit(np.dot(test_layer_1_output, w_2))
test_layer_out_output = special.expit(np.dot(test_layer_2_output, w_out))
y_test_rows = to_bool(test_layer_out_output)

y_test_series = pd.Series(y_test_rows.flatten())

# concat id and y
output = pd.concat([_id, y_test_series], axis=1)
output.columns=["id","label"]

# write file
output.to_csv(OUTPUT_FOLDER + "output_" + METHOD_NAME + "_" + model_file_info + ".csv", index=False)