In [2]:
import pandas as pd
import numpy as np
import sys
import random
import math

## Read data

In [3]:
def Read_Data(data_path):
    df = pd.read_csv(data_path, encoding='big5')  ## Read data
    df.drop(columns=df.columns[:3], inplace=True) ## Remove first three columns
    df.replace('NR', 0, inplace=True)             ## Replace NR to 0
    df = df.astype(np.float)
    raw_data = df.to_numpy()
    raw_data[raw_data<0] = 0
    
    month_data = {}
    for month in range(12):
        sample = np.empty([18, 480])
        for day in range(20):
            sample[:, day * 24 : (day + 1) * 24] = raw_data[18 * (20 * month + day) : 18 * (20 * month + day + 1), :]
        month_data[month] = sample
    
    X = np.empty([12 * 471, 18 * 9], dtype = float)
    Y = np.empty([12 * 471, 1], dtype = float)
    for month in range(12):
        for day in range(20):
            for hour in range(24):
                if day == 19 and hour > 14:
                    continue
                X[month * 471 + day * 24 + hour, :] = month_data[month][:,day * 24 + hour : day * 24 + hour + 9].reshape(1, -1) #vector dim:18*9 (9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9)
                Y[month * 471 + day * 24 + hour, 0] = month_data[month][9, day * 24 + hour + 9] #value

    ## Normalize  在多少個標準差內
    mean_x = np.mean(X, axis = 0) #18 * 9 
    std_x = np.std(X, axis = 0) #18 * 9 
    for i in range(len(X)): #12 * 471
        for j in range(len(X[0])): #18 * 9 
            if std_x[j] != 0:
                X[i][j] = (X[i][j] - mean_x[j]) / std_x[j]

    X = np.insert(X, X.shape[1], values=1, axis=1)
    return X, Y

In [4]:
def K_Fold(size, n_series, n_parts):
    n_valid = size//n_parts
    
    train_idx = []
    valid_idx = []
    for i in range(n_parts):
        start = list(range(size))
        random.shuffle(start)
        
        train_idx.append(start[n_valid:])
        valid_idx.append(start[:n_valid])
            
            
    return train_idx, valid_idx

## Linear Regression

In [5]:
class LinearRegressionUsingGD:
    
    def __init__(self, lr=20, eps=1e-10,iters=3000):
        self.lr = lr
        self.eps = eps
        self.iters = iters
        
    def fit(self, x, y):
        self.ω = np.zeros((x.shape[1], 1))
        adagrad = np.zeros((x.shape[1], 1))
        
        for t in range(self.iters):
            loss = np.sqrt(np.mean(np.power(np.dot(x, self.ω) - y, 2)))#rmse
            if((t+1)%500==0):
                print(str(t+1) + ":" + str(loss))
            gradient = 2 * np.dot(x.transpose(), np.dot(x, self.ω) - y) #dim*1
            adagrad += gradient ** 2
            self.ω = self.ω - self.lr * gradient / np.sqrt(adagrad + self.eps)
            
        return self
    
    def predict(self, x):
        return np.dot(x, self.ω)
    
    def weight():
        return self.w

## Training

In [6]:
## 前製作業
DATA_PATH = "../data/train.csv"
X, Y = Read_Data(DATA_PATH)

In [25]:
N_FOLDS  = 5
train_idx, valid_idx = K_Fold(X.shape[0], 9, N_FOLDS)

weight_best = None
loss_best = sys.maxsize

for i in range(N_FOLDS):
    LRGD = LinearRegressionUsingGD()
    
    x_train = np.asarray([X[idx] for idx in train_idx[i]])
    y_train = Y[train_idx[i]]
    
    x_valid = np.asarray([X[idx] for idx in valid_idx[i]])
    y_valid = Y[valid_idx[i]]
    
    LRGD.fit(x_train, y_train)
    y_pred = LRGD.predict(x_valid)
    loss = np.sqrt(np.mean(np.power(y_valid - y_pred, 2)))#rmse
    
    if loss < loss_best:
        weight_best = LRGD.ω
        loss_best = loss
    
    print("Folder: {}, Loss: {}".format(i, loss))
    
np.save("weight.npy", weight_best)

500:6.363026237788221
1000:5.939148356633326
1500:5.767370694350936
2000:5.680850329333689
2500:5.6341240924275535
3000:5.607923936258073
Folder: 0, Loss: 6.254216367784773
500:6.246397830077351
1000:5.890743498573801
1500:5.774941249649627
2000:5.717898398703483
2500:5.68685841376956
3000:5.669168801810322
Folder: 1, Loss: 5.907619014784265
500:6.228159976890824
1000:5.902461327280126
1500:5.792831251107579
2000:5.738800011402785
2500:5.709528651927577
3000:5.692914676278069
Folder: 2, Loss: 5.782124626644634
500:6.006423893719245
1000:5.793780369043158
1500:5.715505083414079
2000:5.6764124655381725
2500:5.654467583331398
3000:5.641462859749926
Folder: 3, Loss: 6.032213135573619
500:5.992013441870922
1000:5.759947108735798
1500:5.668871525881338
2000:5.6215654591114905
2500:5.594416095141232
3000:5.578143416039442
Folder: 4, Loss: 6.228075564892298


## Testing

In [26]:
df = pd.read_csv(DATA_PATH, encoding='big5')  ## Read data
df.drop(columns=df.columns[:3], inplace=True) ## Remove first three columns
df.replace('NR', 0, inplace=True)             ## Replace NR to 0
df = df.astype(np.float)
raw_data = df.to_numpy()
raw_data[raw_data<0] = 0

month_data = {}
for month in range(12):
    sample = np.empty([18, 480])
    for day in range(20):
        sample[:, day * 24 : (day + 1) * 24] = raw_data[18 * (20 * month + day) : 18 * (20 * month + day + 1), :]
    month_data[month] = sample

X = np.empty([12 * 471, 18 * 9], dtype = float)
Y = np.empty([12 * 471, 1], dtype = float)
for month in range(12):
    for day in range(20):
        for hour in range(24):
            if day == 19 and hour > 14:
                continue
            X[month * 471 + day * 24 + hour, :] = month_data[month][:,day * 24 + hour : day * 24 + hour + 9].reshape(1, -1) #vector dim:18*9 (9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9)
            Y[month * 471 + day * 24 + hour, 0] = month_data[month][9, day * 24 + hour + 9] #value

## Normalize  在多少個標準差內
mean_x = np.mean(X, axis = 0) #18 * 9 
std_x = np.std(X, axis = 0) #18 * 9 
for i in range(len(X)): #12 * 471
    for j in range(len(X[0])): #18 * 9 
        if std_x[j] != 0:
            X[i][j] = (X[i][j] - mean_x[j]) / std_x[j]

X = np.insert(X, X.shape[1], values=1, axis=1)

In [34]:
testdata = pd.read_csv('../data/test.csv', header = None, encoding = 'big5')
test_data = testdata.iloc[:, 2:]
test_data[test_data == 'NR'] = 0
test_data = test_data.astype(np.float)
test_data = test_data.to_numpy()
test_x = np.empty([240, 18*9], dtype = float)
for i in range(240):
    test_x[i, :] = test_data[18 * i: 18* (i + 1), :].reshape(1, -1)
    
test_data[test_data < 0] = 0

    
for i in range(len(test_x)):
    for j in range(len(test_x[0])):
        if std_x[j] != 0:
            test_x[i][j] = (test_x[i][j] - mean_x[j]) / std_x[j]
test_x = np.concatenate((np.ones([240, 1]), test_x), axis = 1).astype(float)
test_x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


array([[ 1.        , -0.25192794, -0.2529226 , ..., -0.67065391,
        -1.04594393,  0.07797893],
       [ 1.        , -1.38318472, -1.54531796, ...,  0.17279117,
        -0.10906991, -0.48454426],
       [ 1.        ,  1.52576127,  1.36257161, ..., -1.32666675,
        -1.04594393, -0.57829812],
       ...,
       [ 1.        ,  0.3945045 ,  0.55482451, ...,  0.26650729,
        -0.20275731,  1.20302531],
       [ 1.        , -1.86800905, -1.8684168 , ..., -1.04551839,
        -1.13963133, -1.14082131],
       [ 1.        , -1.38318472, -1.38376854, ...,  2.98427476,
         3.26367657,  1.76554849]])

In [35]:
w = np.load('weight.npy')
print(weight_best)
ans_y = np.dot(test_x, weight_best)
ans_y

[[ 5.73336470e-01]
 [-9.59997602e-02]
 [ 6.41488380e-01]
 [-1.54820544e+00]
 [-2.63745987e+00]
 [ 4.05395551e+00]
 [-8.99423285e-02]
 [-2.71346390e+00]
 [ 1.70759005e+00]
 [-4.67402859e-01]
 [-4.17089311e-02]
 [ 3.57868862e-01]
 [ 2.16749954e-01]
 [-2.53677954e-02]
 [-1.14634954e-01]
 [-5.60656373e-02]
 [ 9.19828081e-02]
 [ 5.51848637e-01]
 [ 7.22597441e-02]
 [ 5.02446794e-03]
 [-3.38360278e-02]
 [-1.80206354e-02]
 [ 1.32972355e-01]
 [-4.79722456e-03]
 [-2.24034370e-01]
 [ 1.27059379e-01]
 [ 2.97456020e-01]
 [-3.37651625e-01]
 [ 2.43804875e-01]
 [-1.36965549e-01]
 [ 3.05702812e-01]
 [ 4.30888941e-01]
 [-6.71499662e-01]
 [ 3.11272956e-01]
 [-3.81168258e-02]
 [ 3.12767936e-01]
 [-6.93109780e-03]
 [-7.04101637e-02]
 [ 3.06645990e-01]
 [-1.15388021e-01]
 [ 1.07274397e-01]
 [-3.08840921e-01]
 [ 1.36227636e-01]
 [ 1.44788683e-01]
 [-3.53389639e-01]
 [ 2.74047892e-01]
 [-2.70576212e-01]
 [-2.61083459e-01]
 [-1.97695996e-01]
 [-2.11719099e-01]
 [ 3.90356884e-02]
 [-4.10734016e-01]
 [-6.4924577

array([[-12.55353621],
       [-14.80583801],
       [ -0.49672932],
       [-21.33218731],
       [ 22.74766338],
       [  3.16596186],
       [  7.32454291],
       [ 11.97815811],
       [ 47.23093644],
       [ 50.55311676],
       [-16.79526487],
       [ 97.59928529],
       [ 37.42091173],
       [ 17.7444512 ],
       [-14.84263262],
       [-22.42379144],
       [ 38.86316081],
       [ 33.28795051],
       [  6.62973083],
       [ -2.42125194],
       [  3.01282984],
       [ 40.447779  ],
       [-12.23752153],
       [ 28.00554685],
       [-35.21240975],
       [ 17.65646786],
       [-31.42664621],
       [ 40.07478572],
       [ -4.89050522],
       [ 18.45836594],
       [ 30.812449  ],
       [-15.79114846],
       [-36.03288931],
       [ 29.95455792],
       [ 27.70101829],
       [-14.44725352],
       [ 12.59306182],
       [  5.07931203],
       [ 50.99800139],
       [ 47.27742146],
       [-11.33829437],
       [ 11.88200708],
       [ 43.66353991],
       [ 28

In [36]:


import csv
with open('submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    header = ['id', 'value']
    print(header)
    csv_writer.writerow(header)
    for i in range(240):
        row = ['id_' + str(i), ans_y[i][0]]
        csv_writer.writerow(row)
        print(row)

['id', 'value']
['id_0', -12.553536209913997]
['id_1', -14.805838012980592]
['id_2', -0.49672932092399336]
['id_3', -21.332187310943567]
['id_4', 22.747663379824523]
['id_5', 3.165961860146569]
['id_6', 7.324542913417045]
['id_7', 11.978158106859716]
['id_8', 47.230936441688286]
['id_9', 50.55311676259493]
['id_10', -16.795264873287977]
['id_11', 97.5992852916402]
['id_12', 37.420911734453675]
['id_13', 17.744451199942006]
['id_14', -14.842632617552912]
['id_15', -22.42379143950978]
['id_16', 38.86316081337261]
['id_17', 33.2879505085013]
['id_18', 6.629730827823856]
['id_19', -2.4212519390150016]
['id_20', 3.012829838729523]
['id_21', 40.44777899770341]
['id_22', -12.237521527664937]
['id_23', 28.005546845353063]
['id_24', -35.21240975010662]
['id_25', 17.656467862889215]
['id_26', -31.426646209764804]
['id_27', 40.074785716178454]
['id_28', -4.890505217991339]
['id_29', 18.45836594162746]
['id_30', 30.812448998694478]
['id_31', -15.79114846005303]
['id_32', -36.03288930625813]
['id_3