In [2]:
import pandas as pd
import numpy as np
import sys
import random
import math

## Read data

In [86]:
def Read_Data(data_path):
    df = pd.read_csv(data_path, encoding='big5')  ## Read data
    df.drop(columns=df.columns[:3], inplace=True) ## Remove first three columns
    df.replace('NR', 0, inplace=True)             ## Replace NR to 0
    df = df.astype(np.float)
    raw_data = df.to_numpy()
    raw_data[raw_data<0] = 0
    
    month_data = {}
    for month in range(12):
        sample = np.empty([18, 480])
        for day in range(20):
            sample[:, day * 24 : (day + 1) * 24] = raw_data[18 * (20 * month + day) : 18 * (20 * month + day + 1), :]
        month_data[month] = sample
    
    X = np.empty([12 * 471, 18 * 9], dtype = float)
    Y = np.empty([12 * 471, 1], dtype = float)
    for month in range(12):
        for day in range(20):
            for hour in range(24):
                if day == 19 and hour > 14:
                    continue
                X[month * 471 + day * 24 + hour, :] = month_data[month][:,day * 24 + hour : day * 24 + hour + 9].reshape(1, -1) #vector dim:18*9 (9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9)
                Y[month * 471 + day * 24 + hour, 0] = month_data[month][9, day * 24 + hour + 9] #value

    ## Normalize  在多少個標準差內
    mean_x = np.mean(X, axis = 0) #18 * 9 
    std_x = np.std(X, axis = 0) #18 * 9 
    for i in range(len(X)): #12 * 471
        for j in range(len(X[0])): #18 * 9 
            if std_x[j] != 0:
                X[i][j] = (X[i][j] - mean_x[j]) / std_x[j]

    X = np.insert(X, X.shape[1], values=1, axis=1)
    return X, Y

In [87]:
def K_Fold(size, n_series, n_parts):
    n_valid = size//n_parts
    
    train_idx = []
    valid_idx = []
    for i in range(n_parts):
        start = list(range(size))
        random.shuffle(start)
        
        train_idx.append(start[n_valid:])
        valid_idx.append(start[:n_valid])
            
            
    return train_idx, valid_idx

## Linear Regression

In [93]:
class LinearRegressionUsingGD:
    
    def __init__(self, lr=20, eps=1e-10,iters=3000):
        self.lr = lr
        self.eps = eps
        self.iters = iters
        
    def fit(self, x, y):
        self.ω = np.zeros((x.shape[1], 1))
        adagrad = np.zeros((x.shape[1], 1))
        
        for t in range(self.iters):
            loss = np.sqrt(np.mean(np.power(np.dot(x, self.ω) - y, 2)))#rmse
            if((t+1)%500==0):
                print(str(t+1) + ":" + str(loss))
            gradient = 2 * np.dot(x.transpose(), np.dot(x, self.ω) - y) #dim*1
            adagrad += gradient ** 2
            self.ω = self.ω - self.lr * gradient / np.sqrt(adagrad + self.eps)
            
        return self
    
    def predict(self, x):
        return np.dot(x, self.ω)
    
    def weight():
        return self.w

## Training

In [94]:
## 前製作業
DATA_PATH = "../data/train.csv"
X, Y = Read_Data(DATA_PATH)

In [95]:
N_FOLDS  = 5
train_idx, valid_idx = K_Fold(X.shape[0], 9, N_FOLDS)

weight_best = None
loss_best = sys.maxsize

for i in range(N_FOLDS):
    LRGD = LinearRegressionUsingGD()
    
    x_train = np.asarray([X[idx] for idx in train_idx[i]])
    y_train = Y[train_idx[i]]
    
    x_valid = np.asarray([X[idx] for idx in valid_idx[i]])
    y_valid = Y[valid_idx[i]]
    
    LRGD.fit(x_train, y_train)
    y_pred = LRGD.predict(x_valid)
    loss = np.sqrt(np.mean(np.power(y_valid - y_pred, 2)))#rmse
    
    if loss < loss_best:
        weight_best = LRGD.weight
        loss_best = loss

    print("Folder: {}, Loss: {}".format(i, loss_best))

500:6.118905961906962
1000:5.77993870977707
1500:5.670052566785369
2000:5.615819290538504
2500:5.586335526349196
3000:5.569560627540151
Folder: 0, Loss: 6.285064241206787
500:6.242019986155406
1000:5.919013056671352
1500:5.814075280506351
2000:5.760122078170419
2500:5.729007297065529
3000:5.710062999146107
Folder: 1, Loss: 5.756882377852687
500:6.485854315461657
1000:6.130348600639498
1500:5.973344458517011
2000:5.883511865566699
2500:5.8282728650356095
3000:5.7932718164097885
Folder: 2, Loss: 5.584119170909978
500:6.225879784561205
1000:5.892436188443931
1500:5.781382278699425
2000:5.726499964972605
2500:5.696638046028178
3000:5.6796453463139365
Folder: 3, Loss: 5.584119170909978
500:6.134328893931342
1000:5.92868984629562
1500:5.849452861022927
2000:5.808884565253724
2500:5.7859967536280985
3000:5.772517000502007
Folder: 4, Loss: 5.453961062511527
