In [17]:
import numpy as np
from scipy import special
import pandas as pd
import os
import sys

In [18]:
METHOD_NAME = "hw_logistic"

TRAIN_FILE_PATH = "data/spam_train.csv"# sys.argv[1]
MODEL_FILE_PATH = "model"# sys.argv[2] # output model

## Read Training Data

In [19]:
train_data = pd.read_csv(TRAIN_FILE_PATH, header=None)

In [20]:
x_data_rows = train_data[list(range(1, 58))].as_matrix()
y_data_rows = train_data[[58]].as_matrix()

## Select Data

In [21]:
x_select_col_idxs = [26, 40, 31, 24,  3, 41, 54, 55, 45, 52, 28, 22, 53, 34, 43, 47, 6, 44, 15, 25, 16, 19, 38,  8, 56, 32,  4, 35, 27,  5, 48, 14]
x_data_rows = x_data_rows[:, x_select_col_idxs]

In [22]:
x_data_rows.shape

(4001, 32)

## Preprocess Data

## Transform Data

In [23]:
# Normalize
x_train_means = np.mean(x_data_rows, axis=0)
x_train_stds =  np.std(x_data_rows, axis=0)
x_data_rows = (x_data_rows - x_train_means) / x_train_stds

In [24]:
# print(np.mean(x_data_rows, axis=0), np.std(x_data_rows, axis=0))

In [25]:
# add 1 in the last column, so that b in including in w
x_data_rows = np.c_[x_data_rows, np.ones(x_data_rows.shape[0]) ]

In [26]:
# pre-transpose x for matrix multiplication
x_data_rows_trans = np.transpose(x_data_rows)

In [27]:
x_row_num, x_col_num = x_data_rows.shape  # 4001, 58

## Train

In [28]:
# reset parameters
w = np.random.randn(x_col_num, 1)  # shape = (x_col_num, 1). not (1, x_col_num) for the convenience of matrix multiplication
total_epoch = 0

In [29]:
# momentum
gamma = 0.9
vw = np.zeros((x_col_num, 1))

In [30]:
lr = (10) / x_row_num 

In [31]:
def to_bool(x):
    return 1 if x > 0.5 else 0

to_bool = np.vectorize(to_bool)

In [32]:
# training iterations

for i in range(200000):
    # forward
    y_pred_rows = special.expit(np.dot(x_data_rows, w))  # shape = (x_row_num, 1)
    y_diff_rows = (y_data_rows - y_pred_rows)  # shape = (x_row_num, 1)
    w_grad = -np.dot(x_data_rows_trans, y_diff_rows)
    
    # write model
    if total_epoch % 10000 == 0:
        train_error_count = np.sum(np.abs(to_bool(y_pred_rows) - y_data_rows))
        train_accuracy = float(x_row_num - train_error_count) / x_row_num
        
        model_file_info = "epo" + str(total_epoch) + "_acc" + str(train_accuracy)[:6]
        print(model_file_info)
        np.savez(MODEL_FILE_PATH, w=w, vw=vw, lr=lr, total_epoch=total_epoch, x_train_means=x_train_means, x_train_stds=x_train_stds)

    # update weights. using momentum
    vw = lr * w_grad + gamma * vw
    w = w - vw

    total_epoch += 1

# write model
train_error_count = np.sum(np.abs(to_bool(y_pred_rows) - y_data_rows))
train_accuracy = float(x_row_num - train_error_count) / x_row_num
model_file_info = "epo" + str(total_epoch) + "_acc" + str(train_accuracy)[:6]
print(model_file_info)
np.savez(MODEL_FILE_PATH, w=w, vw=vw, lr=lr, total_epoch=total_epoch, x_train_means=x_train_means, x_train_stds=x_train_stds)

epo0_acc0.4676
epo10000_acc0.9232
epo20000_acc0.9230
epo30000_acc0.9230
epo40000_acc0.9230
epo50000_acc0.9230
epo60000_acc0.9230
epo70000_acc0.9230
epo80000_acc0.9230
epo90000_acc0.9230
epo100000_acc0.9230
epo110000_acc0.9230
epo120000_acc0.9230
epo130000_acc0.9230
epo140000_acc0.9230
epo150000_acc0.9230
epo160000_acc0.9230
epo170000_acc0.9230
epo180000_acc0.9230
epo190000_acc0.9230
epo200000_acc0.9230


## Test

In [33]:
test_data = pd.read_csv(TEST_FILE_PATH, header=None)
test_data.shape

NameError: name 'TEST_FILE_PATH' is not defined

In [None]:
_id = test_data[0].apply(str)

In [None]:
x_test_data_rows = test_data[list(range(1, 58))].as_matrix()

In [None]:
# x_select_col_idxs = [26, 40, 31, 24,  3, 41, 54, 55, 45, 52, 28, 22, 53, 34]
x_test_data_rows = x_test_data_rows[:, x_select_col_idxs]

In [None]:
# read model

In [None]:
# normalize
x_test_data_rows = (x_test_data_rows - x_train_means) / x_train_stds
# add 1
x_test_data_rows = np.c_[x_test_data_rows, np.ones(x_test_data_rows.shape[0]) ]

In [None]:
# test
y_test_rows = to_bool(special.expit(np.dot(x_test_data_rows, w)))

y_test_series = pd.Series(y_test_rows.flatten())

# concat id and y
output = pd.concat([_id, y_test_series], axis=1)
output.columns=["id","label"]

# write file
output.to_csv(OUTPUT_FOLDER + "output_" + METHOD_NAME + "_" + model_file_info + ".csv", index=False)