In [1]:
%pylab inline
import numpy as np
import pandas as pd



Populating the interactive namespace from numpy and matplotlib


In [2]:
# Folder layout, relative to the notebook's working directory.
OUTPUT_FOLDER = "outputs/"
MODEL_FOLDER = "models/all-momentum/"
DATA_FOLDER = "data/"

# Both input CSVs live inside DATA_FOLDER.
TRAIN_FILE_PATH = "{}{}".format(DATA_FOLDER, "train.csv")
TEST_FILE_PATH = "{}{}".format(DATA_FOLDER, "test_X.csv")

In [3]:
# Load the training table: the file is Big5-encoded and its first row
# supplies the column names.
train_data = pd.read_csv(
    TRAIN_FILE_PATH,
    encoding="big5",
    header=0,
)

In [4]:
# Overwrite every cell holding the literal string "NR" with 0 (in place),
# so the hourly columns can later be parsed with pd.to_numeric.
train_data[train_data.eq("NR")] = 0

In [5]:
# Preview only the first rows instead of dumping the whole training table.
train_data.head(10)

Unnamed: 0,日期,測站,測項,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,2014/1/1,豐原,AMB_TEMP,14,14,14,13,12,12,12,...,22,22,21,19,17,16,15,15,15,15
1,2014/1/1,豐原,CH4,1.8,1.8,1.8,1.8,1.8,1.8,1.8,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8
2,2014/1/1,豐原,CO,0.51,0.41,0.39,0.37,0.35,0.3,0.37,...,0.37,0.37,0.47,0.69,0.56,0.45,0.38,0.35,0.36,0.32
3,2014/1/1,豐原,NMHC,0.2,0.15,0.13,0.12,0.11,0.06,0.1,...,0.1,0.13,0.14,0.23,0.18,0.12,0.1,0.09,0.1,0.08
4,2014/1/1,豐原,NO,0.9,0.6,0.5,1.7,1.8,1.5,1.9,...,2.5,2.2,2.5,2.3,2.1,1.9,1.5,1.6,1.8,1.5
5,2014/1/1,豐原,NO2,16,9.2,8.2,6.9,6.8,3.8,6.9,...,11,11,22,28,19,12,8.1,7,6.9,6
6,2014/1/1,豐原,NOx,17,9.8,8.7,8.6,8.5,5.3,8.8,...,14,13,25,30,21,13,9.7,8.6,8.7,7.5
7,2014/1/1,豐原,O3,16,30,27,23,24,28,24,...,65,64,51,34,33,34,37,38,38,36
8,2014/1/1,豐原,PM10,56,50,48,35,25,12,4,...,52,51,66,85,85,63,46,36,42,42
9,2014/1/1,豐原,PM2.5,26,39,36,35,31,28,25,...,36,45,42,49,45,44,41,30,24,13


In [6]:
# Stitch each month's 20 days into one continuous hourly series.
#
# Layout relied on by the slicing below:
#   * 18 consecutive rows per day (one row per measured item),
#   * 20 days per month, i.e. 360 rows per month,
#   * the 24 hourly readings sit in column positions 3..26.
#
# Positional `.iloc` is used on purpose: the hour columns are not labelled with
# the integers 3..26, so label-based selection with that list only worked through
# a legacy pandas fallback. `.values` replaces `.as_matrix()`, which was removed
# in pandas 1.0.
ROWS_PER_DAY = 18
DAYS_PER_MONTH = 20
HOUR_COLUMNS = slice(3, 27)

trainDataByMonth = []  # 12 numpy arrays, each of shape (18, 480)
for month in range(12):
    month_start = ROWS_PER_DAY * DAYS_PER_MONTH * month
    daily_blocks = []
    for day in range(DAYS_PER_MONTH):
        day_start = month_start + ROWS_PER_DAY * day
        day_frame = train_data.iloc[day_start:day_start + ROWS_PER_DAY, HOUR_COLUMNS]
        daily_blocks.append(day_frame.apply(pd.to_numeric).values)
    # Days are laid side by side along the hour axis: 20 blocks of (18, 24) -> (18, 480).
    trainDataArr = np.concatenate(daily_blocks, axis=1)
    trainDataByMonth.append(trainDataArr)

## Train

In [7]:
# Random starting point drawn from a standard normal distribution:
# an (18, 9) weight matrix (same shape as one input window) and a scalar bias.
w = np.random.standard_normal(size=(18, 9))
b = np.random.standard_normal()

In [8]:
# Loss of the previous epoch; starts at +infinity so the first epoch can never
# look like a regression and trigger a learning-rate cut.
prevLoss = float("inf")

In [9]:
# Momentum state: the decay factor plus one velocity term for the bias and one
# per weight, all starting at zero.
gamma = 0.9
vb, vw = 0, np.zeros(shape=(18, 9))

In [10]:
# Momentum coefficient: the fraction of the previous velocity carried into each
# update (same value as assigned in the momentum cell).
gamma = 0.9

In [14]:
# Learning rate used to scale the summed gradients in the training loop.
lr = 4e-10

In [12]:
# Resume from a saved checkpoint. This overwrites the weights, bias, momentum
# velocities, learning rate and last loss with the stored values, so any value
# assigned to those names before this cell runs is discarded.
CHECKPOINT_PATH = MODEL_FOLDER + 'los5.692615.npz'

# np.load on an .npz returns a file-backed archive; the `with` block closes it
# once every array has been read out.
with np.load(CHECKPOINT_PATH) as model:
    w = model['w']
    b = model['b']
    vb = model['vb']
    vw = model['vw']
    lr = model['lr']
    prevLoss = model['prevLoss']

In [None]:
# Full-batch gradient descent with momentum on a linear model:
#     predY = sum(x * w) + b
# where x is an (18, 9) window (18 rows, 9 consecutive hours) and the target is
# row 9 of the same month array at the hour right after the window.
#
# The loop runs until the kernel is interrupted. Every 1000 epochs it prints the
# current RMSE and writes a checkpoint named after it into MODEL_FOLDER.

WINDOW = 9               # consecutive hours fed to the model
TARGET_ROW = 9           # row of each (18, 480) month array that is predicted
# NOTE(review): a 480-hour month holds 471 complete windows (hour + 9 <= 479).
# 470 is kept so the loss stays comparable with existing checkpoints -- confirm
# whether dropping the last window was intentional.
WINDOWS_PER_MONTH = 470

# The training windows never change between epochs, so build them once instead
# of re-slicing every sample in every epoch.
train_x = np.array([
    monthArr[:, hour:hour + WINDOW]
    for monthArr in trainDataByMonth
    for hour in range(WINDOWS_PER_MONTH)
])  # shape: (12 * 470, 18, 9)
train_y = np.array([
    monthArr[TARGET_ROW, hour + WINDOW]
    for monthArr in trainDataByMonth
    for hour in range(WINDOWS_PER_MONTH)
])  # shape: (12 * 470,)

try:
    while True:
        # Per-epoch RMSE of the current 1000-epoch round; not read by the loop
        # itself, left in the namespace for inspection.
        losses = []
        for epoch in range(1000):
            # Predictions for every window at once (sum over both window axes).
            predY = np.einsum('nij,ij->n', train_x, w) + b
            diff = train_y - predY

            # Reported loss is the RMSE over all samples.
            loss = np.sqrt(np.mean(diff * diff))
            losses.append(loss)

            # Gradients of the *summed* squared error:
            #   d/db = sum(-2 * diff),  d/dw = sum(-2 * diff * x)
            deltaB = -2 * diff.sum()
            deltaW = -2 * np.einsum('n,nij->ij', diff, train_x)

            # Shrink the step size whenever the loss got worse than last epoch.
            if loss > prevLoss:
                lr *= 0.75

            # Momentum update: blend the old velocity with the new scaled
            # gradient, then step against it.
            vb = gamma * vb + lr * deltaB
            vw = gamma * vw + lr * deltaW
            b = b - vb
            w = w - vw

            prevLoss = loss

        file_name = "los" + str(loss)[:8]
        print(file_name)
        print(lr, loss)

        # Checkpoint everything needed to resume: parameters, velocities,
        # learning rate and last loss.
        np.savez(MODEL_FOLDER + file_name, w=w, b=b, vw=vw, vb=vb, lr=lr, prevLoss=prevLoss)
except KeyboardInterrupt:
    # Interrupting the kernel is the only way to stop training; fall through so
    # the cell ends normally with the most recent parameters still in memory.
    pass

print('end')

los5.692600
4e-10 5.6926008641
los5.692592
4e-10 5.69259227328
los5.692583
4e-10 5.69258371114
los5.692575
4e-10 5.69257517753
los5.692566
4e-10 5.69256667233
los5.692558
4e-10 5.69255819539
los5.692549
4e-10 5.6925497466
los5.692541
4e-10 5.69254132581
los5.692532
4e-10 5.69253293289
los5.692524
4e-10 5.69252456773
los5.692516
4e-10 5.69251623018
los5.692507
4e-10 5.69250792011
los5.692499
4e-10 5.69249963741
los5.692491
4e-10 5.69249138194
los5.692483
4e-10 5.69248315358
los5.692474
4e-10 5.6924749522
los5.692466
4e-10 5.69246677768
los5.692458
4e-10 5.69245862989
los5.692450
4e-10 5.69245050871
los5.692442
4e-10 5.69244241401
los5.692434
4e-10 5.69243434568
los5.692426
4e-10 5.6924263036
los5.692418
4e-10 5.69241828763
los5.692410
4e-10 5.69241029767
los5.692402
4e-10 5.69240233359
los5.692394
4e-10 5.69239439527
los5.692386
4e-10 5.6923864826
los5.692378
4e-10 5.69237859546
los5.692370
4e-10 5.69237073373
los5.692362
4e-10 5.6923628973
los5.692355
4e-10 5.69235508605
los5.692347
4e

## Test

In [None]:
# Load the test table; the file has no header row, so columns are numbered 0..N.
test_data = pd.read_csv(TEST_FILE_PATH, header=None, encoding="big5")
# preprocess: overwrite every literal "NR" cell with 0 so the readings parse as numbers
test_data[test_data == "NR"] = 0
# to numpy array: columns 2..10 hold the 9 hourly readings of each row.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
test_array = test_data.iloc[:, 2:11].apply(pd.to_numeric).values
test_array.shape

In [None]:
# Number of test samples to predict; the prediction loop advances 18 rows of
# test_array per sample.
dataNum = 240

In [None]:
# Predict one value per test sample with the trained linear model.
#
# The model was trained on full (18, 9) windows, so each sample must supply all
# 18 of its rows. The previous version sliced a single row (a (1, 9) array),
# which does not match the (18, 9) weight matrix `w`.
# Assumes each sample's 18 rows appear in the same order as in the training
# table -- TODO confirm against the test file.
ROWS_PER_SAMPLE = 18

predYs = []
for dataIdx in range(dataNum):
    rowIdx = dataIdx * ROWS_PER_SAMPLE
    x = test_array[rowIdx:rowIdx + ROWS_PER_SAMPLE, :]
    predY = np.einsum('ij,ij', x, w) + b
    predYs.append(predY)
predYs

In [None]:
# Wrap the list of predictions in a Series (default 0..n-1 index) and show it.
predYSeries = pd.Series(data=predYs)
predYSeries

In [None]:
# Submission frame: one "id_<n>" label per prediction, placed next to the
# predicted value.
# (Alternative id source considered: column 0 of the test file's PM2.5 rows.)
_id = pd.Series(["id_%d" % n for n in range(dataNum)])
output = pd.concat([_id, predYSeries], axis=1)
output.columns = ["id", "value"]
output

In [None]:
# Clamp negative predictions to 0.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
output.loc[output["value"] < 0, "value"] = 0
output

In [None]:
# Write the submission file into OUTPUT_FOLDER, without the index column.
output.to_csv(
    OUTPUT_FOLDER + "output_linear_only-pm2.5.csv",
    index=False,
)