In [73]:
%pylab inline
import numpy as np
from scipy import special
import pandas as pd
import os

Populating the interactive namespace from numpy and matplotlib


In [74]:
METHOD_NAME = "logistic_part_2"

DATA_FOLDER = "data/"
MODEL_FOLDER = "models/" + METHOD_NAME + "/"
OUTPUT_FOLDER = "outputs/"

TRAIN_FILE_PATH = DATA_FOLDER + "spam_train.csv"
TEST_FILE_PATH = DATA_FOLDER + "spam_test.csv"

In [75]:
if not os.path.exists(MODEL_FOLDER):
    os.makedirs(MODEL_FOLDER)
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

## Read Training Data

In [76]:
train_data = pd.read_csv(TRAIN_FILE_PATH, header=None)
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,1,0.0,0.0,1.13,0.0,0.37,0.0,0.0,0.0,0.0,...,0.0,0.145,0.0,0.436,0.0,0.0,1.792,55,147,0
1,2,0.0,0.0,0.6,0.0,0.0,0.6,0.0,0.0,0.6,...,0.0,0.143,0.047,0.191,0.143,0.0,2.041,31,196,1
2,3,0.0,0.0,0.48,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.45,0.0,0.0,1.138,4,41,0
3,4,0.0,0.51,0.0,0.0,0.51,0.51,1.02,0.0,0.0,...,0.0,0.142,0.0,0.071,1.212,0.0,7.025,130,281,1
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64,...,0.0,0.116,0.0,0.232,0.0,0.0,1.551,6,45,0


In [77]:
x_data_rows = train_data[list(range(1, 58))].as_matrix()
y_data_rows = train_data[[58]].as_matrix()

## Select Data

In [78]:
x_select_col_idxs = [26, 40, 31, 24,  3, 41, 54, 55, 45, 52, 28, 22, 53, 34, 43, 47, 6, 44, 15, 25, 16, 19, 38,  8, 56, 32,  4, 35, 27,  5, 48, 14]
x_data_rows = x_data_rows[:, x_select_col_idxs]

In [79]:
x_data_rows.shape

(4001, 32)

## Preprocess Data

## Transform Data

In [80]:
# Normalize
x_train_means = np.mean(x_data_rows, axis=0)
x_train_stds =  np.std(x_data_rows, axis=0)
x_data_rows = (x_data_rows - x_train_means) / x_train_stds

In [81]:
# print(np.mean(x_data_rows, axis=0), np.std(x_data_rows, axis=0))

In [82]:
# add 1 in the last column, so that b in including in w
x_data_rows = np.c_[x_data_rows, np.ones(x_data_rows.shape[0]) ]

In [83]:
# pre-transpose x for matrix multiplication
x_data_rows_trans = np.transpose(x_data_rows)

In [84]:
x_row_num, x_col_num = x_data_rows.shape  # 4001, 58

## Train

In [85]:
# reset parameters
w = np.random.randn(x_col_num, 1)  # shape = (x_col_num, 1). not (1, x_col_num) for the convenience of matrix multiplication
total_epoch = 0

In [None]:
# momentum
gamma = 0.9
vw = np.zeros((x_col_num, 1))

In [86]:
lr = (10) / x_row_num 

In [87]:
# training iterations

while True:
    y_pred_rows = special.expit(np.dot(x_data_rows, w))  # shape = (x_row_num, 1)
    y_diff_rows = (y_data_rows - y_pred_rows)  # shape = (x_row_num, 1)
    w_grad = -np.dot(x_data_rows_trans, y_diff_rows)

#     # print info
#     if total_epoch % 10000 == 0:
        
#         print('total_epoch', total_epoch)
#         print('rms_error', rms_error)

    # write model
    if total_epoch % 10000 == 0:
        rms_error = np.sum(np.square(y_diff_rows)) / x_row_num
        model_file_info = "epo" + str(total_epoch) + "_err" + str(rms_error)[:6]
        print(model_file_info)
        print('rms_error', rms_error)
        np.savez(MODEL_FOLDER + model_file_info, w=w, vw=vw, lr=lr, total_epoch=total_epoch, x_train_means=x_train_means, x_train_stds=x_train_stds)

    vw = lr * w_grad + gamma * vw
    w = w - vw

    total_epoch += 1
    

epo0_err0.5203
rms_error 0.520331924599
epo10000_err0.0597
rms_error 0.0597931033622
epo20000_err0.0593
rms_error 0.0593718677942
epo30000_err0.0591
rms_error 0.059172115291
epo40000_err0.0590
rms_error 0.0590595781678
epo50000_err0.0589
rms_error 0.0589895809672
epo60000_err0.0589
rms_error 0.0589443198333
epo70000_err0.0589
rms_error 0.0589142699985
epo80000_err0.0588
rms_error 0.0588936602908
epo90000_err0.0588
rms_error 0.0588789918783
epo100000_err0.0588
rms_error 0.0588681698102
epo110000_err0.0588
rms_error 0.0588599340136
epo120000_err0.0588
rms_error 0.058853509853
epo130000_err0.0588
rms_error 0.0588484046459
epo140000_err0.0588
rms_error 0.0588442920429
epo150000_err0.0588
rms_error 0.0588409465639
epo160000_err0.0588
rms_error 0.0588382061276
epo170000_err0.0588
rms_error 0.0588359501458
epo180000_err0.0588
rms_error 0.0588340863393
epo190000_err0.0588
rms_error 0.0588325425221
epo200000_err0.0588
rms_error 0.0588312612792
epo210000_err0.0588
rms_error 0.05883019638
epo2200

KeyboardInterrupt: 

## Test

In [None]:
test_data = pd.read_csv(TEST_FILE_PATH, header=None)
test_data.shape

In [None]:
_id = test_data[0].apply(str)

In [None]:
x_test_data_rows = test_data[list(range(1, 58))].as_matrix()

In [None]:
# x_select_col_idxs = [26, 40, 31, 24,  3, 41, 54, 55, 45, 52, 28, 22, 53, 34]
x_test_data_rows = x_test_data_rows[:, x_select_col_idxs]

In [None]:
# read model

In [None]:
# normalize
x_test_data_rows = (x_test_data_rows - x_train_means) / x_train_stds
# add 1
x_test_data_rows = np.c_[x_test_data_rows, np.ones(x_test_data_rows.shape[0]) ]

In [None]:
def to_bool(x):
    return 1 if x > 0.5 else 0

to_bool = np.vectorize(to_bool)

In [None]:
# test
y_test_rows = to_bool(special.expit(np.dot(x_test_data_rows, w)))

y_test_series = pd.Series(y_test_rows.flatten())

# concat id and y
output = pd.concat([_id, y_test_series], axis=1)
output.columns=["id","label"]

# write file
output.to_csv(OUTPUT_FOLDER + "output_" + METHOD_NAME + "_" + model_file_info + ".csv", index=False)