In [1]:
import pandas as pd
import numpy as np
import sys
import random
import math


TRAIN_PATH = "../data/train.csv"
VALID_PATH = "../data/test.csv"

### Read training data

In [2]:
df = pd.read_csv(TRAIN_PATH, encoding='big5')  ## Read data
df.drop(columns=df.columns[:3], inplace=True) ## Remove first three columns
df.replace('NR', 0, inplace=True)             ## Replace NR to 0
df = df.astype(np.float)
raw_data = df.to_numpy()
# raw_data[raw_data<0] = 0

### Extract Features

In [3]:
month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        sample[:, day * 24 : (day + 1) * 24] = raw_data[18 * (20 * month + day) : 18 * (20 * month + day + 1), :]
    month_data[month] = sample
    
x = np.empty([12 * 471, 18 * 9], dtype = float)
y = np.empty([12 * 471, 1], dtype = float)
for month in range(12):
    for day in range(20):
        for hour in range(24):
            if day == 19 and hour > 14:
                continue
            x[month * 471 + day * 24 + hour, :] = month_data[month][:,day * 24 + hour : day * 24 + hour + 9].reshape(1, -1) #vector dim:18*9 (9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9)
            y[month * 471 + day * 24 + hour, 0] = month_data[month][9, day * 24 + hour + 9] #value

x = np.insert(x, x.shape[1], values=1, axis=1)

### Normalize

In [4]:
mean_x = np.mean(x, axis=0) #18 * 9 
std_x = np.std(x, axis=0) #18 * 9 
max_x = np.max(x, axis=0)
min_x = np.min(x, axis=0)

for i in range(len(x)): #12 * 471
    for j in range(len(x[0])): #18 * 9 
        if std_x[j] != 0:
#             x[i][j] = (x[i][j] - mean_x[j]) / std_x[j]
            x[i][j] = (x[i][j] - min_x[j]) / (max_x[j] - min_x[j])

### K-Fold 

In [5]:
def K_Fold(size, n_series, n_parts):
    n_valid = size//n_parts
    idxs = list(range(size))
    random.seed(0)
    random.shuffle(idxs)
    
    train_idx = []
    valid_idx = []
    for i in range(n_parts):
        train_idx.append(idxs[n_valid*i: n_valid*(i+1)])
        valid_idx.append(idxs[0:n_valid*i]+idxs[n_valid*(i+1):])
            
    return train_idx, valid_idx

### Linear Regression

In [21]:
class LinearRegressionUsingGD:    

    def __init__(self, lr=0.05, eps=1e-10,iters=2000):
        self.lr = lr
        self.eps = eps
        self.iters = iters
        
    def fit(self, x, y):
        m = x.shape[0]
        self.ω = np.zeros((x.shape[1], 1))
        adagrad = np.zeros((x.shape[1], 1))
        
        for t in range(self.iters):
            h = np.dot(x, self.ω)
            loss = h - y
            cost = np.sqrt(np.mean(np.power(loss, 2)))
            gradient = np.dot(x.transpose(), loss) / m
            self.ω = self.ω - self.lr * gradient

            if((t+1)%1000==0):
                print(str(t+1) + ":" + str(cost))
                
        return self
    
    def predict(self, x):
        return np.dot(x, self.ω)
    
    def weight():
        return self.w

### Training

In [22]:
N_FOLDS  = 5
train_idx, valid_idx = K_Fold(x.shape[0], 9, N_FOLDS)

weight_best = None
loss_best = sys.maxsize

for i in range(N_FOLDS):
    LRGD = LinearRegressionUsingGD(lr=0.06, eps=1e-10,iters=10000)
    
    x_train = np.asarray([x[idx] for idx in train_idx[i]])
    y_train = y[train_idx[i]]
    
    x_valid = np.asarray([x[idx] for idx in valid_idx[i]])
    y_valid = y[valid_idx[i]]
    
    LRGD.fit(x_train, y_train)
    y_pred = LRGD.predict(x_valid)
    loss = np.sqrt(np.mean(np.power(y_valid - y_pred, 2)))#rmse
    
    if loss < loss_best:
        weight_best = LRGD.ω
        loss_best = loss
    
    print("Folder: {}, Loss: {}".format(i, loss))
    
np.save("weight.npy", weight_best)

1000:7.430866170956298
2000:6.78333753655135
3000:6.447567922328777
4000:6.238701659371139
5000:6.095579368044176
6000:5.990754186726658
7000:5.910084469507814
8000:5.845631564925082
9000:5.7926382926258935
10000:5.748092225920544
Folder: 0, Loss: 6.403631965910276
1000:7.290969783219211
2000:6.669726804854198
3000:6.372205141710293
4000:6.192024572383281
5000:6.067515834905695
6000:5.974600452504242
7000:5.901811011758901
8000:5.84283393355735
9000:5.793834111185964
10000:5.752319571530369
Folder: 1, Loss: 6.425529134031125
1000:8.006320179193724
2000:7.306434063379626
3000:6.975044452487796
4000:6.773128637470439
5000:6.632475577068642
6000:6.526930109952592
7000:6.444014904955606
8000:6.376805273688396
9000:6.321049257450814
10000:6.273949819081777
Folder: 2, Loss: 6.316852338242566
1000:7.319627690189497
2000:6.612234657724307
3000:6.247391041022926
4000:6.024437378608427
5000:5.876176769796342
6000:5.771909293623724
7000:5.695404595348716
8000:5.637309976548846
9000:5.591907963289

## Testing 

In [8]:
testdata = pd.read_csv(VALID_PATH, header = None, encoding = 'big5')
test_data = testdata.iloc[:, 2:]
test_data[test_data == 'NR'] = 0
test_data = test_data.astype(np.float)
test_data = test_data.to_numpy()
test_x = np.empty([240, 18*9], dtype = float)
for i in range(240):
    test_x[i, :] = test_data[18 * i: 18* (i + 1), :].reshape(1, -1)
    
# test_data[test_data < 0] = 0

    
for i in range(len(test_x)):
    for j in range(len(test_x[0])):
        if std_x[j] != 0:
#             test_x[i][j] = (test_x[i][j] - mean_x[j]) / std_x[j]
            test_x[i][j] = (test_x[i][j] - min_x[j]) / (max_x[j] - min_x[j])
            
test_x = np.insert(test_x, test_x.shape[1], values=1, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)


In [9]:
w = np.load('weight.npy')
ans_y = np.dot(test_x, w)
ans_y

array([[ 10.64479754],
       [ 18.60468392],
       [ 30.62498463],
       [ 10.56232299],
       [ 30.6320705 ],
       [ 23.16501207],
       [ 22.38166553],
       [ 34.14968821],
       [ 24.38926425],
       [ 51.42626162],
       [ 21.03981138],
       [ 12.24485779],
       [ 56.75283804],
       [ 50.43002068],
       [ 22.13163154],
       [ 11.06166302],
       [ 30.83641325],
       [ 64.4231117 ],
       [  3.9461177 ],
       [ 14.85155013],
       [ 41.05543201],
       [ 64.89379391],
       [  8.54916512],
       [ 23.54599126],
       [ 16.59817991],
       [ 35.2809057 ],
       [ 12.77296972],
       [ 74.17091703],
       [  7.76493393],
       [ 55.88545376],
       [ 22.65808116],
       [  5.68937764],
       [  6.63359498],
       [ 29.14055749],
       [ 34.99507344],
       [ 33.78759608],
       [ 39.03481912],
       [ 30.54116472],
       [ 45.26244765],
       [ 40.84287303],
       [  7.42632654],
       [ 39.07789412],
       [ 35.2094105 ],
       [ 49

In [10]:
import csv
with open('submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'value']
    print(header)
    csv_writer.writerow(header)
    for i in range(240):
        row = ['id_' + str(i), ans_y[i][0]]
        csv_writer.writerow(row)
        print(row)

['id', 'value']
['id_0', 10.644797535606381]
['id_1', 18.604683921963627]
['id_2', 30.624984626030923]
['id_3', 10.562322989728049]
['id_4', 30.63207050480257]
['id_5', 23.165012070030233]
['id_6', 22.381665530380815]
['id_7', 34.14968821321543]
['id_8', 24.389264249228255]
['id_9', 51.4262616228506]
['id_10', 21.039811378249063]
['id_11', 12.24485779249173]
['id_12', 56.75283804284453]
['id_13', 50.43002067525627]
['id_14', 22.13163153507384]
['id_15', 11.061663017966193]
['id_16', 30.83641325438021]
['id_17', 64.4231117045459]
['id_18', 3.9461177028651897]
['id_19', 14.85155013266911]
['id_20', 41.055432011354235]
['id_21', 64.89379390830881]
['id_22', 8.54916512276196]
['id_23', 23.545991257449902]
['id_24', 16.598179910607776]
['id_25', 35.28090569934713]
['id_26', 12.77296971942049]
['id_27', 74.17091703315705]
['id_28', 7.76493393281388]
['id_29', 55.88545376041563]
['id_30', 22.658081156729665]
['id_31', 5.689377636836959]
['id_32', 6.633594975595766]
['id_33', 29.14055748943991