In [415]:
%pylab inline
import numpy as np
import pandas as pd
import os

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [416]:
DATA_FOLDER = "data/"
MODEL_FOLDER = "models/hw_linear-regression/"
OUTPUT_FOLDER = "outputs/"
TRAIN_FILE_PATH = DATA_FOLDER + "train.csv"
TEST_FILE_PATH = DATA_FOLDER + "test_X.csv"

In [417]:
if not os.path.exists(MODEL_FOLDER):
    os.makedirs(MODEL_FOLDER)

In [418]:
# read train data
train_data = pd.read_csv(TRAIN_FILE_PATH, header=0, encoding="big5")

In [419]:
# preprocess train data
train_data[train_data=="NR"] = 0

In [420]:
train_data.head(18)

Unnamed: 0,日期,測站,測項,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,2014/1/1,豐原,AMB_TEMP,14.0,14.0,14.0,13.0,12.0,12.0,12.0,...,22.0,22.0,21.0,19.0,17.0,16.0,15.0,15.0,15.0,15.0
1,2014/1/1,豐原,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,2014/1/1,豐原,CO,0.51,0.41,0.39,0.37,0.35,0.3,0.37,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,2014/1/1,豐原,NMHC,0.2,0.15,0.13,0.12,0.11,0.06,0.1,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,2014/1/1,豐原,NO,0.9,0.6,0.5,1.7,1.8,1.5,1.9,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5
5,2014/1/1,豐原,NO2,16.0,9.2,8.2,6.9,6.8,3.8,6.9,...,11.0,11.0,22.0,28.0,19.0,12.0,8.1,7.0,6.9,6.0
6,2014/1/1,豐原,NOx,17.0,9.8,8.7,8.6,8.5,5.3,8.8,...,14.0,13.0,25.0,30.0,21.0,13.0,9.7,8.6,8.7,7.5
7,2014/1/1,豐原,O3,16.0,30.0,27.0,23.0,24.0,28.0,24.0,...,65.0,64.0,51.0,34.0,33.0,34.0,37.0,38.0,38.0,36.0
8,2014/1/1,豐原,PM10,56.0,50.0,48.0,35.0,25.0,12.0,4.0,...,52.0,51.0,66.0,85.0,85.0,63.0,46.0,36.0,42.0,42.0
9,2014/1/1,豐原,PM2.5,26.0,39.0,36.0,35.0,31.0,28.0,25.0,...,36.0,45.0,42.0,49.0,45.0,44.0,41.0,30.0,24.0,13.0


In [421]:
# concat 20 days data
train_array_months = []  # 12 numpy.array, array's shape is (18, 480)
for month in range(12):
    temp_train_array = np.concatenate( [train_data[(360*month+18*day):(360*month+18*(day+1))][list(range(3, 27))].apply(pd.to_numeric).as_matrix() for day in range(20)], axis=1)
    train_array_months.append(temp_train_array)

In [422]:
# use part of data
x_row_idxs = [1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13]  # > 0.2
x_row_num = len(x_row_idxs)
x_col_num = 9

train_array_months_processed = [ta[x_row_idxs, :] for ta in train_array_months]

## Train

In [423]:
w = np.random.randn(x_row_num, x_col_num)
b = np.random.randn()
total_epoch = 0

In [424]:
prev_loss = np.inf

In [425]:
# momentum
gamma = 0.9
vb = 0
vw = np.zeros((x_row_num, x_col_num))

In [426]:
b = np.random.randn()
w = np.random.randn(x_row_num, x_col_num)

In [427]:
lr = 4 * 10**(-9)

In [428]:
# # load model
# model = np.load(MODEL_FOLDER + 'epo2100_los4.9199.npz')
# w = model['w']
# b = model['b']
# vb = model['vb']
# vw = model['vw']
# lr = model['lr']
# total_epoch = model['total_epoch']

In [429]:
while True:
    # one training epoch

    # reset temp variables 
    loss = 0
    b_grad = 0
    w_grad = np.zeros((x_row_num, x_col_num))

    # calculate loss and gradient over all training data
    for month in range(12):
        for hour in range(470):
            # extract x and y from training data
            # train_array_months is a list of 12 arrays
            # x will be an ?x9 numpy array. '?' depends on how we process training data
            x = train_array_months_processed[month][:, hour:hour+9]
            y_data = train_array_months[month][9, hour+9]

            # calculate y from x, w, b
            y_pred= einsum('ij,ij', x, w) + b

            # update loss
            diff = y_data - y_pred
            loss += diff * diff

            # update gradient of b and w
            mult = 2 * diff * (-1)
            b_grad += mult
            w_grad += mult * x  # 2 * diff * (-1) * x

    # calculate rms error
    loss = sqrt(loss/(12*470))

    # save model
    if total_epoch % 100 == 0:
        model_file_info = "epo" + str(total_epoch) + "_los" + str(loss)[:6]
        print(model_file_info)
        np.savez(MODEL_FOLDER + model_file_info, b=b, w=w, vb=vb, vw=vw, lr=lr, total_epoch=total_epoch)

        # stop training in some cases
        if loss < 6:
            break
        
    # update parameters
    vb = lr * b_grad + gamma * vb
    vw = lr * w_grad + gamma * vw
    b = b - vb
    w = w - vw
    total_epoch += 1

print('end of training')

epo0_los495.96
epo100_los13.476
epo200_los9.6748
epo300_los8.2515
epo400_los7.6015
epo500_los7.2443
epo600_los7.0179
epo700_los6.8599
epo800_los6.7423
epo900_los6.6503
epo1000_los6.5755
epo1100_los6.5129
epo1200_los6.4592
epo1300_los6.4122
epo1400_los6.3706
epo1500_los6.3332
epo1600_los6.2995
epo1700_los6.2687
epo1800_los6.2405
epo1900_los6.2145
epo2000_los6.1906
epo2100_los6.1684
epo2200_los6.1479
epo2300_los6.1287
epo2400_los6.1109
epo2500_los6.0942
epo2600_los6.0787
epo2700_los6.0641
epo2800_los6.0504
epo2900_los6.0376
epo3000_los6.0255
epo3100_los6.0142
epo3200_los6.0035
epo3300_los5.9934
end of training


## Test

In [430]:
test_data = pd.read_csv(TEST_FILE_PATH, header=None, encoding="big5")
test_data[test_data=="NR"] = 0
test_array = test_data[:][list(range(2, 11))].apply(pd.to_numeric).as_matrix()
data_num = 240

In [431]:
x_test_arrays = []
for data_idx in range(data_num):
    row_start_idx = data_idx*18
    this_row_idxs = [row_start_idx+i for i in x_row_idxs]
    this_extr_rows = test_array[this_row_idxs, :]
    x_test_arrays.append(this_extr_rows)

In [432]:
_id = pd.Series(["id_" + str(i) for i in range(data_num)])

In [433]:
# # load model
# model_file = 'epo1532682_los5.7251.npz'
# model = np.load(MODEL_FOLDER + model_file)
# w = model['w']
# b = model['b']
# model_file_info = model_file[:-4]

In [434]:
# calc y
y_preds = []
for dataIdx in range(data_num):
    x = x_test_arrays[dataIdx]
    y_pred = einsum('ij,ij', x, w) + b
    y_preds.append(y_pred)
y_pred_series = pd.Series(y_preds)

# concat id and y
output = pd.concat([_id, y_pred_series], axis=1)
output.columns=["id","value"]

# set as 0 if value < 0
output.ix[output["value"] < 0, ["value"]] = 0

# write file
# output.to_csv(OUTPUT_FOLDER + "output_hw_linear-regression_" + model_file_info + ".csv", index=False)
output.to_csv("linear_regression.csv", index=False)