In [1]:
import pandas as pd
import numpy as np
import sys
import random
import math

## Read data

In [2]:
def Read_Data(data_path):
    df = pd.read_csv(data_path, encoding='big5')  ## Read data
    df.drop(columns=df.columns[:3], inplace=True) ## Remove first three columns
    df.replace('NR', 0, inplace=True)             ## Replace NR to 0
    df = df.astype(np.float)
    raw_data = df.to_numpy()
    raw_data[raw_data<0] = 0
    
    month_data = {}
    for month in range(12):
        sample = np.empty([18, 480])
        for day in range(20):
            sample[:, day * 24 : (day + 1) * 24] = raw_data[18 * (20 * month + day) : 18 * (20 * month + day + 1), :]
        month_data[month] = sample
    
    X = np.empty([12 * 471, 18 * 9], dtype = float)
    Y = np.empty([12 * 471, 1], dtype = float)
    for month in range(12):
        for day in range(20):
            for hour in range(24):
                if day == 19 and hour > 14:
                    continue
                X[month * 471 + day * 24 + hour, :] = month_data[month][:,day * 24 + hour : day * 24 + hour + 9].reshape(1, -1) #vector dim:18*9 (9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9)
                Y[month * 471 + day * 24 + hour, 0] = month_data[month][9, day * 24 + hour + 9] #value

    ## Normalize  在多少個標準差內
    mean_x = np.mean(X, axis = 0) #18 * 9 
    std_x = np.std(X, axis = 0) #18 * 9 
    for i in range(len(X)): #12 * 471
        for j in range(len(X[0])): #18 * 9 
            if std_x[j] != 0:
                X[i][j] = (X[i][j] - mean_x[j]) / std_x[j]

    X = np.insert(X, X.shape[1], values=1, axis=1)
    return X, Y

In [3]:
def K_Fold(size, n_series, n_parts):
    n_valid = size//n_parts
    
    train_idx = []
    valid_idx = []
    for i in range(n_parts):
        start = list(range(size))
        random.shuffle(start)
        
        train_idx.append(start[n_valid:])
        valid_idx.append(start[:n_valid])
            
    return train_idx, valid_idx

## Linear Regression

In [8]:
class LinearRegressionUsingGD:    

    def __init__(self, lr=0.05, iters=3000):
        self.lr = lr
        self.iters = iters
        
    def fit(self, x, y):
        m = x.shape[0]
        self.ω = np.zeros((x.shape[1], 1))
        adagrad = np.zeros((x.shape[1], 1))
        
        for t in range(self.iters):
            h = np.dot(x, self.ω)
            loss = h - y
            cost = np.sqrt(np.mean(np.power(loss, 2)))
            gradient = np.dot(x.transpose(), loss) / m
            self.ω = self.ω - self.lr * gradient

            if((t+1)%1000==0):
                print(str(t+1) + ":" + str(cost))
                
        return self
    
    def predict(self, x):
        return np.dot(x, self.ω)
    
    def weight():
        return self.w

## Training

In [9]:
## 前製作業
DATA_PATH = "../data/train.csv"
X, Y = Read_Data(DATA_PATH)

In [10]:
LRGD = LinearRegressionUsingGD()
LRGD.fit(X, Y)

np.save("weight.npy", LRGD.ω)

1000:5.718052134599244
2000:5.675716998908237
3000:5.669107775195088


## Testing

In [35]:
df = pd.read_csv(DATA_PATH, encoding='big5')  ## Read data
df.drop(columns=df.columns[:3], inplace=True) ## Remove first three columns
df.replace('NR', 0, inplace=True)             ## Replace NR to 0
df = df.astype(np.float)
raw_data = df.to_numpy()
raw_data[raw_data<0] = 0

month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        sample[:, day * 24 : (day + 1) * 24] = raw_data[18 * (20 * month + day) : 18 * (20 * month + day + 1), :]
    month_data[month] = sample

X = np.empty([12 * 471, 18 * 9], dtype = float)
Y = np.empty([12 * 471, 1], dtype = float)
for month in range(12):
    for day in range(20):
        for hour in range(24):
            if day == 19 and hour > 14:
                continue
            X[month * 471 + day * 24 + hour, :] = month_data[month][:,day * 24 + hour : day * 24 + hour + 9].reshape(1, -1) #vector dim:18*9 (9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9)
            Y[month * 471 + day * 24 + hour, 0] = month_data[month][9, day * 24 + hour + 9] #value

## Normalize  在多少個標準差內
mean_x = np.mean(X, axis = 0) #18 * 9 
std_x = np.std(X, axis = 0) #18 * 9 
for i in range(len(X)): #12 * 471
    for j in range(len(X[0])): #18 * 9 
        if std_x[j] != 0:
            X[i][j] = (X[i][j] - mean_x[j]) / std_x[j]

X = np.insert(X, X.shape[1], values=1, axis=1)

In [36]:
testdata = pd.read_csv('../data/test.csv', header = None, encoding = 'big5')
test_data = testdata.iloc[:, 2:]
test_data[test_data == 'NR'] = 0
test_data = test_data.astype(np.float)
test_data = test_data.to_numpy()
test_x = np.empty([240, 18*9], dtype = float)
for i in range(240):
    test_x[i, :] = test_data[18 * i: 18* (i + 1), :].reshape(1, -1)
    
test_data[test_data < 0] = 0

    
for i in range(len(test_x)):
    for j in range(len(test_x[0])):
        if std_x[j] != 0:
            test_x[i][j] = (test_x[i][j] - mean_x[j]) / std_x[j]
test_x = np.concatenate((test_x, np.ones([240, 1])), axis = 1).astype(float)
test_x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


array([[-0.25192794, -0.2529226 , -0.41532356, ..., -1.04594393,
         0.07797893,  1.        ],
       [-1.38318472, -1.54531796, -1.54560884, ..., -0.10906991,
        -0.48454426,  1.        ],
       [ 1.52576127,  1.36257161,  1.52230835, ..., -1.04594393,
        -0.57829812,  1.        ],
       ...,
       [ 0.3945045 ,  0.55482451,  0.71496172, ..., -0.20275731,
         1.20302531,  1.        ],
       [-1.86800905, -1.8684168 , -1.86854749, ..., -1.13963133,
        -1.14082131,  1.        ],
       [-1.38318472, -1.38376854, -1.38413951, ...,  3.26367657,
         1.76554849,  1.        ]])

In [37]:
w = np.load('weight.npy')
ans_y = np.dot(test_x, w)
ans_y

array([[ 6.58288868e+00],
       [ 1.84244773e+01],
       [ 2.41221003e+01],
       [ 7.64894653e+00],
       [ 2.70306206e+01],
       [ 2.22013598e+01],
       [ 2.37115018e+01],
       [ 3.05295876e+01],
       [ 1.68113009e+01],
       [ 5.98564496e+01],
       [ 1.17160964e+01],
       [ 9.18483843e+00],
       [ 6.32517336e+01],
       [ 5.31673734e+01],
       [ 2.22127525e+01],
       [ 1.22940312e+01],
       [ 3.23256213e+01],
       [ 6.66815989e+01],
       [-4.03679157e-01],
       [ 1.71196836e+01],
       [ 4.17717543e+01],
       [ 7.22723806e+01],
       [ 9.26547168e+00],
       [ 1.79550655e+01],
       [ 1.48589937e+01],
       [ 3.79754403e+01],
       [ 1.48000817e+01],
       [ 6.76522838e+01],
       [ 7.20493103e+00],
       [ 5.53852252e+01],
       [ 2.44540442e+01],
       [ 8.50821394e+00],
       [ 2.63617610e+00],
       [ 1.87357399e+01],
       [ 2.76451524e+01],
       [ 3.72368521e+01],
       [ 4.33040865e+01],
       [ 2.98286988e+01],
       [ 4.2

In [38]:
import csv
with open('submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'value']
    print(header)
    csv_writer.writerow(header)
    for i in range(240):
        row = ['id_' + str(i), ans_y[i][0]]
        csv_writer.writerow(row)
        print(row)

['id', 'value']
['id_0', 6.582888683657]
['id_1', 18.424477292712464]
['id_2', 24.122100338508115]
['id_3', 7.648946529543143]
['id_4', 27.030620616740055]
['id_5', 22.201359846056846]
['id_6', 23.711501751016343]
['id_7', 30.529587555327996]
['id_8', 16.811300858726543]
['id_9', 59.85644956208702]
['id_10', 11.716096394486517]
['id_11', 9.18483843053859]
['id_12', 63.2517335882297]
['id_13', 53.167373351027024]
['id_14', 22.212752513954744]
['id_15', 12.294031172572373]
['id_16', 32.32562127132431]
['id_17', 66.68159889426246]
['id_18', -0.4036791572454881]
['id_19', 17.11968363161875]
['id_20', 41.771754323137564]
['id_21', 72.27238062391129]
['id_22', 9.26547168437639]
['id_23', 17.955065468436963]
['id_24', 14.858993719792629]
['id_25', 37.97544032323237]
['id_26', 14.800081683691161]
['id_27', 67.65228381345108]
['id_28', 7.204931026916505]
['id_29', 55.38522521373399]
['id_30', 24.454044240240094]
['id_31', 8.508213940975594]
['id_32', 2.6361760992886456]
['id_33', 18.73573994658