In [1]:
%pylab inline
import numpy as np
from scipy import special
import pandas as pd
import os

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Experiment identifier: it names the model sub-folder and the prediction file.
METHOD_NAME = "nn_part-32-4"

# Directory layout. Every folder keeps its trailing "/" because later cells
# build file paths by plain string concatenation.
DATA_FOLDER = "data/"
OUTPUT_FOLDER = "outputs/"
MODEL_FOLDER = "models/{}/".format(METHOD_NAME)

# Input CSV files.
TRAIN_FILE_PATH = "".join([DATA_FOLDER, "spam_train.csv"])
TEST_FILE_PATH = "".join([DATA_FOLDER, "spam_test.csv"])

In [3]:
# Create the model and output directories if they are missing.
# exist_ok=True replaces the check-then-create pattern: it cannot race with
# another process creating the same folder, and the cell stays safe to re-run.
os.makedirs(MODEL_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

## Read Training Data

In [4]:
# Load the training CSV. header=None: the file has no header row, so pandas
# labels the columns with integers starting at 0.
train_data = pd.read_csv(TRAIN_FILE_PATH, header=None)
# Preview the first rows as the cell output.
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,1,0.0,0.0,1.13,0.0,0.37,0.0,0.0,0.0,0.0,...,0.0,0.145,0.0,0.436,0.0,0.0,1.792,55,147,0
1,2,0.0,0.0,0.6,0.0,0.0,0.6,0.0,0.0,0.6,...,0.0,0.143,0.047,0.191,0.143,0.0,2.041,31,196,1
2,3,0.0,0.0,0.48,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.45,0.0,0.0,1.138,4,41,0
3,4,0.0,0.51,0.0,0.0,0.51,0.51,1.02,0.0,0.0,...,0.0,0.142,0.0,0.071,1.212,0.0,7.025,130,281,1
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64,...,0.0,0.116,0.0,0.232,0.0,0.0,1.551,6,45,0


In [5]:
# Split the frame into a feature matrix and a target column.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in 1.0; it returns the same ndarray.

# Features: columns 1..57. Column 0 is skipped (the test cells use column 0
# as the row id).
x_data_rows = train_data[list(range(1, 58))].values
# Target: column 58, selected with a list so the result stays 2-D,
# shape (n_rows, 1), matching the network output it is compared against.
y_data_rows = train_data[[58]].values

## Select Data

In [6]:
# Indices of the 32 feature columns kept for this model, in the order they are
# fed to the network. They index into the 57-column feature matrix (0-based),
# not into the raw CSV columns. The same list is reused for the test set.
x_select_col_idxs = [
    26, 40, 31, 24, 3, 41, 54, 55,
    45, 52, 28, 22, 53, 34, 43, 47,
    6, 44, 15, 25, 16, 19, 38, 8,
    56, 32, 4, 35, 27, 5, 48, 14,
]

# NOTE: this overwrites x_data_rows, so the cell is not idempotent -- re-run
# the extraction cell first if this one has to be executed again.
x_data_rows = x_data_rows[:, x_select_col_idxs]

## Preprocess Data

## Transform Data

In [7]:
# Normalize: standardise every feature column to zero mean / unit variance.
# The training statistics are kept because the test set is normalised with
# them and they are stored alongside the weights in each checkpoint.
x_train_means = np.mean(x_data_rows, axis=0)
x_train_stds = np.std(x_data_rows, axis=0)
# A constant column has std 0, and 0/0 would yield NaN that then spreads to
# every weight during training. Dividing by 1 instead leaves such a column at 0.
x_train_stds[x_train_stds == 0] = 1.0
x_data_rows = (x_data_rows - x_train_means) / x_train_stds

In [8]:
# print(np.mean(x_data_rows, axis=0), np.std(x_data_rows, axis=0))

In [9]:
# Append a constant column of ones so that the bias b is included in the
# weight matrix instead of being a separate parameter.
bias_column = np.ones(x_data_rows.shape[0])
x_data_rows = np.c_[x_data_rows, bias_column]

In [10]:
# Transposed view of the design matrix, kept ready for matrix multiplication.
# NOTE(review): no other visible cell reads x_data_rows_trans (the training
# loop uses layer_0_output.T) -- confirm it is still needed.
x_data_rows_trans = x_data_rows.T

In [11]:
# x_row_num: number of training rows (the earlier note said 4001 -- not
# verifiable from this cell). x_col_num: 32 selected features + 1 bias
# column = 33; the earlier "58" predates the column selection.
x_row_num, x_col_num = x_data_rows.shape

## Train

In [12]:
# reset parameters
# NOTE(review): `w` is not read by any other visible cell (the network uses
# w_1 / w_2); it looks like a leftover from a single-layer model -- confirm
# before removing.
w = np.random.randn(x_col_num, 1)  # shape = (x_col_num, 1). not (1, x_col_num) for the convenience of matrix multiplication
# Epoch counter incremented by the training loop; re-running this cell
# restarts the count at 0.
total_epoch = 0

In [13]:
# Number of units in the hidden layer (second dimension of w_1, first of w_2).
# NOTE(review): METHOD_NAME ends in "32-4" while this is 50 -- confirm which
# hidden size the saved models are meant to have.
layer_1_node_num = 50

In [14]:
# Random standard-normal initialisation of both weight matrices.
# No seed is set, so every run starts from different weights.
w_1_shape = (x_col_num, layer_1_node_num)  # input (+ bias) -> hidden
w_2_shape = (layer_1_node_num, 1)          # hidden -> single output unit

w_1 = np.random.randn(*w_1_shape)  # x.dot(w_1) has shape (x_row_num, layer_1_node_num)
w_2 = np.random.randn(*w_2_shape)  # layer_1_output.dot(w_2) has shape (x_row_num, 1)

# Velocity buffers for the disabled momentum variant:
# v_w_1 = np.random.randn(x_col_num, layer_1_node_num)  # dot(x, w_1) output a x_row_num * layer_1_node_num matrix
# v_w_2 = np.random.randn(layer_1_node_num, 1)  # dot(layer_1_output, w_2) output a x_row_num * 1 matrix

In [15]:
# momentum
# Momentum coefficient. It is only referenced by the commented-out momentum
# update in the training cell, so it has no effect on the active code path.
gamma = 0.9
# vw = np.zeros((x_col_num, 1))

In [36]:
# Learning rate. The weight updates sum the gradient over all training rows,
# so dividing by the row count makes this a step of 10 on the mean gradient.
lr = 10 / x_row_num

In [17]:
def sigmoid_deriv(x):
    """Derivative of the sigmoid, expressed through the sigmoid's *output*.

    ``x`` must already be a sigmoid activation s (not the pre-activation
    input); the derivative is then s * (1 - s). Works element-wise on scalars
    and NumPy arrays alike.
    """
    complement = 1 - x
    return x * complement

In [40]:
# training iterations
#
# Full-batch gradient descent for a network with one sigmoid hidden layer and
# a single sigmoid output unit. The loop has no stopping condition: interrupt
# the kernel to stop training. Every 10000 epochs the weights, the learning
# rate, the epoch counter and the normalisation statistics are written to
# MODEL_FOLDER, and `model_file_info` (reused later to name the prediction
# file) is refreshed.

# special.expit is sigmoid

while True:
    # forward pass
    layer_0_output = x_data_rows
    layer_1_output = special.expit(np.dot(layer_0_output, w_1))  # shape = (x_row_num, layer_1_node_num)
    layer_2_output = special.expit(np.dot(layer_1_output, w_2))  # shape = (x_row_num, 1)
    y_diff_rows = (y_data_rows - layer_2_output)  # shape = (x_row_num, 1)

    # backpropagation
    layer_2_error = y_diff_rows  # shape = (x_row_num, 1)
    layer_2_delta = layer_2_error * sigmoid_deriv(layer_2_output)  # shape = (x_row_num, 1)

    layer_1_error = layer_2_delta.dot(w_2.T)  # shape = (x_row_num, layer_1_node_num)
    layer_1_delta = layer_1_error * sigmoid_deriv(layer_1_output)  # shape = (x_row_num, layer_1_node_num)

    # y_diff_rows is (target - output), so adding the products below moves the
    # weights downhill on the squared error.
    w_2 += lr * (layer_1_output.T).dot(layer_2_delta)  # shape = (layer_1_node_num, 1)
    w_1 += lr * (layer_0_output.T).dot(layer_1_delta)  # shape = (x_col_num, layer_1_node_num)

    # momentum variant (disabled)
#     v_w_2 = lr * (layer_1_output.T).dot(layer_2_delta) + gamma * v_w_2
#     v_w_1 = lr * (layer_0_output.T).dot(layer_1_delta) + gamma * v_w_1
#     w_2 += v_w_2  # shape = (layer_1_node_num, 1)
#     w_1 += v_w_1  # shape = (x_col_num, layer_1_node_num)

    # write model
    if total_epoch % 10000 == 0:
        # Threshold inline instead of calling to_bool(): that helper is only
        # defined in a later cell, so using it here raised NameError at
        # epoch 0 whenever the notebook was run top to bottom on a fresh kernel.
        pred_labels = (layer_2_output > 0.5).astype(int)
        # Number of misclassified training rows.
        pred_error = np.sum(np.abs(y_data_rows - pred_labels))
        # NOTE: despite its name this is the mean squared error (no square
        # root is taken). Both metrics come from the forward pass above, i.e.
        # from the weights *before* this epoch's update, while the weights
        # saved below already include the update.
        rms_error = np.mean(np.square(y_diff_rows))

        model_file_info = "epo" + str(total_epoch) + "_err" + str(rms_error)[:8]
        print(model_file_info)
        print('rms_error', rms_error)
        print('pred_error', pred_error)

        np.savez(MODEL_FOLDER + model_file_info, w_1=w_1, w_2=w_2, lr=lr, total_epoch=total_epoch, x_train_means=x_train_means, x_train_stds=x_train_stds)

    # leftover single-layer momentum update (disabled)
#     vw = lr * w_grad + gamma * vw
#     w = w - vw

    total_epoch += 1
    

epo690000_err0.017855
rms_error 0.0178557186413
pred_error 73
epo700000_err0.017839
rms_error 0.0178397118103
pred_error 73
epo710000_err0.017823
rms_error 0.0178233689029
pred_error 73
epo720000_err0.017806
rms_error 0.0178069274799
pred_error 73
epo730000_err0.017790
rms_error 0.0177903895768
pred_error 73
epo740000_err0.017773
rms_error 0.0177737747818
pred_error 73
epo750000_err0.017757
rms_error 0.0177571362433
pred_error 73
epo760000_err0.017740
rms_error 0.0177405408139
pred_error 73
epo770000_err0.017723
rms_error 0.0177239895236
pred_error 73
epo780000_err0.017707
rms_error 0.0177074698454
pred_error 73
epo790000_err0.017690
rms_error 0.0176909402527
pred_error 73
epo800000_err0.017674
rms_error 0.0176742739149
pred_error 73
epo810000_err0.017657
rms_error 0.0176578629523
pred_error 73
epo820000_err0.017641
rms_error 0.0176414229246
pred_error 73
epo830000_err0.017624
rms_error 0.0176248189091
pred_error 73
epo840000_err0.017608
rms_error 0.0176080374672
pred_error 73
epo85000

KeyboardInterrupt: 

## Test

In [41]:
# Load the test CSV (no header row, so columns are labelled 0, 1, 2, ...).
test_data = pd.read_csv(TEST_FILE_PATH, header=None)
# Show (n_rows, n_columns) as the cell output.
test_data.shape

(600, 58)

In [42]:
# Column 0 of the test file converted to strings; it becomes the "id" column
# of the prediction file written at the end of the notebook.
_id = test_data[0].apply(str)

In [43]:
# Test feature matrix: columns 1..57, the same range used for the training set.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in 1.0; it returns the same ndarray.
x_test_data_rows = test_data[list(range(1, 58))].values

In [44]:
# Keep the same feature columns, in the same order, that were selected for
# training, so that they line up with the rows of w_1.
x_test_data_rows = x_test_data_rows[:, x_select_col_idxs]

In [45]:
# read model

In [46]:
# Standardise the test features with the statistics computed on the TRAINING
# set, so both sets go through exactly the same transformation.
x_test_data_rows = (x_test_data_rows - x_train_means) / x_train_stds

# Append the constant bias column of ones, mirroring the training matrix.
test_bias_column = np.ones(x_test_data_rows.shape[0])
x_test_data_rows = np.c_[x_test_data_rows, test_bias_column]

In [47]:
def to_bool(x):
    """Threshold values into hard 0/1 labels.

    Parameters
    ----------
    x : scalar or array-like
        Values to threshold (in this notebook: sigmoid outputs of the network).

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``x`` (0-d for a scalar input)
        holding 1 where ``x > 0.5`` and 0 elsewhere. Exactly 0.5 and NaN
        both map to 0.

    Notes
    -----
    This is a direct array comparison rather than ``np.vectorize`` over a
    scalar function: ``np.vectorize`` runs a Python-level loop and, without
    ``otypes``, raises ``ValueError`` on size-0 input.
    """
    return np.where(np.asarray(x) > 0.5, 1, 0)

In [48]:
# Forward pass over the test set with the weights currently in memory.
test_layer_1_output = special.expit(x_test_data_rows.dot(w_1))
test_layer_2_output = special.expit(test_layer_1_output.dot(w_2))

# Hard 0/1 predictions, flattened from (n_rows, 1) into a 1-D Series.
y_test_rows = to_bool(test_layer_2_output)
y_test_series = pd.Series(y_test_rows.flatten())

# Put the ids and the predicted labels side by side.
output = pd.concat([_id, y_test_series], axis=1)
output.columns = ["id", "label"]

# Write the prediction file. model_file_info is the tag of the most recent
# checkpoint set by the training cell; the in-memory weights used above may
# have been updated further since that checkpoint was saved.
output_file_path = "{}output_{}_{}.csv".format(OUTPUT_FOLDER, METHOD_NAME, model_file_info)
output.to_csv(output_file_path, index=False)