In [1]:
%pylab inline
import numpy as np
from scipy import special
import pandas as pd
import os

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Experiment name; it namespaces the model folder and the output file name.
METHOD_NAME = "logistic_all_2"

# Folder layout (relative paths, each ending with a slash so they can be
# concatenated directly with a file name).
DATA_FOLDER = "data/"
OUTPUT_FOLDER = "outputs/"
MODEL_FOLDER = "".join(["models/", METHOD_NAME, "/"])

# Input CSV files.
TRAIN_FILE_PATH = "".join([DATA_FOLDER, "spam_train.csv"])
TEST_FILE_PATH = "".join([DATA_FOLDER, "spam_test.csv"])

In [3]:
# Create the model and output folders up front so later cells can write into them.
# exist_ok=True keeps the cell safe to re-run and avoids the race between a
# separate os.path.exists() check and the makedirs() call.
for folder in (MODEL_FOLDER, OUTPUT_FOLDER):
    os.makedirs(folder, exist_ok=True)

## Read Training Data

In [4]:
# Load the training CSV. header=None tells pandas the file has no header row,
# so the columns get integer labels starting at 0.
train_data = pd.read_csv(TRAIN_FILE_PATH, header=None)
# Show the first rows as the cell output for a quick sanity check.
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,1,0.0,0.0,1.13,0.0,0.37,0.0,0.0,0.0,0.0,...,0.0,0.145,0.0,0.436,0.0,0.0,1.792,55,147,0
1,2,0.0,0.0,0.6,0.0,0.0,0.6,0.0,0.0,0.6,...,0.0,0.143,0.047,0.191,0.143,0.0,2.041,31,196,1
2,3,0.0,0.0,0.48,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.45,0.0,0.0,1.138,4,41,0
3,4,0.0,0.51,0.0,0.0,0.51,0.51,1.02,0.0,0.0,...,0.0,0.142,0.0,0.071,1.212,0.0,7.025,130,281,1
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64,...,0.0,0.116,0.0,0.232,0.0,0.0,1.551,6,45,0


In [5]:
# Columns 1..57 are used as the features and column 58 as the target;
# column 0 is not used for training.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and works on both old and new pandas.
x_data_rows = train_data[list(range(1, 58))].values
# Selecting with a list ([[58]]) keeps the result 2-D, i.e. one column per row.
y_data_rows = train_data[[58]].values

## Select Data

## Preprocess Data

## Transform Data

In [6]:
# Normalize: standardize every feature column with the training mean and std.
# The statistics are kept in x_train_means / x_train_stds so the test set can
# be scaled with exactly the same values.
x_train_means = np.mean(x_data_rows, axis=0)
x_train_stds = np.std(x_data_rows, axis=0)
# A constant column has std == 0 and would become NaN/inf after the division.
# Dividing by 1 instead leaves such a column at 0 once it has been centered.
x_train_stds = np.where(x_train_stds == 0, 1.0, x_train_stds)
x_data_rows = (x_data_rows - x_train_means) / x_train_stds

In [7]:
# print(np.mean(x_data_rows, axis=0), np.std(x_data_rows, axis=0))

In [8]:
# Append a column of ones as the last column, so that the bias b is included in w
bias_column = np.ones(x_data_rows.shape[0])
x_data_rows = np.column_stack((x_data_rows, bias_column))

In [9]:
# Take the transpose of x once, ahead of time, for the matrix multiplication
x_data_rows_trans = x_data_rows.T

In [10]:
# Row count and column count of x_data_rows as it stands at this point.
x_row_num, x_col_num = x_data_rows.shape  # 4001, 58

## Train

In [11]:
# reset parameters
# The initial weights come from a dedicated, seeded generator so that
# Restart & Run All reproduces the same starting point; the global numpy
# random state is left untouched.
w_init = np.random.RandomState(0).randn(x_col_num, 1)  # shape = (x_col_num, 1). not (1, x_col_num) for the convenience of matrix multiplication
total_epoch = 0

In [12]:
# Learning rate, scaled down by the number of training rows.
# NOTE(review): the training cell loops `for lr in lrs`, which rebinds `lr`,
# so this value appears to be unused there — confirm before relying on it.
lr = (10**-1) / x_row_num 

In [13]:
# training iterations
# Full-batch gradient descent for logistic regression, repeated once per
# learning rate. Every run starts from the same w_init so the error curves
# collected in errors_by_lr are directly comparable.

N_ITERATIONS = 1001      # epochs 0..1000, so the last checkpoint lands on epoch 1000
CHECKPOINT_EVERY = 1000  # a model file is written whenever total_epoch is a multiple of this

lrs = [
    (100) / x_row_num,
    (10) / x_row_num,
    (1) / x_row_num,
    (10**-1) / x_row_num,
    (10**-2) / x_row_num,
    (10**-3) / x_row_num,
]
errors_by_lr = []  # errors_by_lr[k] is the per-epoch error list for lrs[k]

for lr in lrs:
    print("==== lr", lr)
    w = w_init  # never modified in place below, so w_init stays intact
    total_epoch = 0
    errors = []
    for i in range(N_ITERATIONS):
        y_pred_rows = special.expit(np.dot(x_data_rows, w))  # shape = (x_row_num, 1)
        y_diff_rows = (y_data_rows - y_pred_rows)  # shape = (x_row_num, 1)
        w_grad = -np.dot(x_data_rows_trans, y_diff_rows)

        # Root-mean-square error of the predicted probabilities. The square
        # root was previously missing, so the value named/printed/plotted as
        # "rms" was actually the mean squared error.
        rms_error = np.sqrt(np.sum(np.square(y_diff_rows)) / x_row_num)
        errors.append(rms_error)

        # write model (the weights saved are those used for this epoch's
        # prediction, i.e. before the update below)
        # NOTE(review): the file name contains only the epoch and the error,
        # not lr, so runs whose names coincide overwrite each other.
        if total_epoch % CHECKPOINT_EVERY == 0:
            model_file_info = "epo" + str(total_epoch) + "_err" + str(rms_error)[:6]
            print(model_file_info)
            np.savez(MODEL_FOLDER + model_file_info, w=w, lr=lr, total_epoch=total_epoch, x_train_means=x_train_means, x_train_stds=x_train_stds)

        w = w - lr * w_grad

        total_epoch += 1

    print('rms_error', rms_error)
    errors_by_lr.append(errors)

# After this cell, `w` and `model_file_info` belong to the LAST entry of lrs.

NameError: name 'lrs' is not defined

## Plot

In [None]:
# Error curves: one line per learning rate, labelled with that learning rate.
rcParams.update({'font.size': 12})
figure(figsize=(6, 4), dpi=240, edgecolor='k')

for curve_idx, curve in enumerate(errors_by_lr):
    plot(curve, label=str(lrs[curve_idx]))

xlabel('iterations')
ylabel('rms error')
# Anchor the legend just outside the right edge of the axes.
legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0)


## Test

In [None]:
# Load the test CSV the same way as the training file (no header row, so
# the columns get integer labels).
test_data = pd.read_csv(TEST_FILE_PATH, header=None)
# Show (rows, columns) as the cell output.
test_data.shape

In [None]:
# Column 0 of the test file converted to strings; it becomes the "id" column
# of the output file written at the end of the notebook.
_id = test_data[0].apply(str)

In [None]:
# Same feature columns (1..57) as used for training.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and works on both old and new pandas.
x_test_data_rows = test_data[list(range(1, 58))].values

In [None]:
# read model

In [None]:
# normalize with the statistics computed on the training set
x_test_centered = x_test_data_rows - x_train_means
x_test_scaled = x_test_centered / x_train_stds
# add 1 as the last column, matching the layout of the training matrix
x_test_bias = np.ones(x_test_data_rows.shape[0])
x_test_data_rows = np.column_stack((x_test_scaled, x_test_bias))

In [None]:
def to_bool(x):
    """Threshold values at 0.5 and return integer class labels.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to threshold, e.g. predicted probabilities.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``x``: 1 where ``x > 0.5``
        (strictly greater), 0 elsewhere.

    Notes
    -----
    This is a single vectorized comparison instead of ``np.vectorize`` around
    a Python function: it avoids a Python-level loop and, unlike
    ``np.vectorize`` without ``otypes``, it also accepts empty input.
    """
    return np.where(np.asarray(x) > 0.5, 1, 0)

In [None]:
# test
# Predicted probabilities from the current weights `w`, thresholded into 0/1.
y_test_probs = special.expit(np.dot(x_test_data_rows, w))
y_test_rows = to_bool(y_test_probs)

# Flatten the (rows, 1) predictions into a single column
y_test_series = pd.Series(y_test_rows.flatten())

# Put the ids and the predicted labels side by side
output = pd.concat([_id, y_test_series], axis=1)
output.columns = ["id", "label"]

# write file; the name combines the method name and the last saved model tag
output_file_name = "output_" + METHOD_NAME + model_file_info + ".csv"
output.to_csv(OUTPUT_FOLDER + output_file_name, index=False)

In [None]:
# Display the shape of the transformed test matrix as a quick sanity check.
x_test_data_rows.shape