# HW3 - Stock Movement Prediction

作業檔案：
- hw3.ipynb

資料：
https://www.sharecast.com/index/SP_500/prices/download

- train.csv: S&P 500 訓練資料(2009-2017)
- test.csv: S&P 500 測試資料(2018)


In [127]:
# Read data

import pandas as pd
import numpy as np

train_data_path = './train.csv'
test_data_path = './test.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

print(train_df.shape)
print(train_df.head())
print(test_df.shape)
print(test_df.head())

(2264, 6)
          Date  Open Price  Close Price  High Price  Low Price      Volume
0  02-Jan-2009      902.99       931.80      934.73     899.35  4048270080
1  05-Jan-2009      929.17       927.45      936.63     919.53  5413910016
2  06-Jan-2009      931.17       934.70      943.85     927.28  5392620032
3  07-Jan-2009      927.45       906.65      927.45     902.37  4704940032
4  08-Jan-2009      905.73       909.73      910.00     896.81  4991549952
(252, 6)
          Date  Open Price  Close Price  High Price  Low Price      Volume
0  02-Jan-2018     2683.73      2695.81     2695.89    2682.36  1846463232
1  03-Jan-2018     2697.85      2713.06     2714.37    2697.77  2090595328
2  04-Jan-2018     2719.31      2723.99     2729.29    2719.07  2100767744
3  05-Jan-2018     2731.33      2743.15     2743.45    2727.92  1918869120
4  08-Jan-2018     2742.67      2747.71     2748.51    2737.60  1894823936


In [128]:
# Drop unnecessary columns

drop_col_names = ['Date'] # !--- or you can modify it to drop the columns you don't want ---!

train_df.drop(columns=drop_col_names, inplace=True)
test_df.drop(columns=drop_col_names, inplace=True)

print(train_df.shape)
print(train_df.head())
print(test_df.shape)
print(test_df.head())

(2264, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0      902.99       931.80      934.73     899.35  4048270080
1      929.17       927.45      936.63     919.53  5413910016
2      931.17       934.70      943.85     927.28  5392620032
3      927.45       906.65      927.45     902.37  4704940032
4      905.73       909.73      910.00     896.81  4991549952
(252, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0     2683.73      2695.81     2695.89    2682.36  1846463232
1     2697.85      2713.06     2714.37    2697.77  2090595328
2     2719.31      2723.99     2729.29    2719.07  2100767744
3     2731.33      2743.15     2743.45    2727.92  1918869120
4     2742.67      2747.71     2748.51    2737.60  1894823936


In [129]:
# Add the column `Tomorrow Movement` by comparing the `Close Price` with the previous days as the training target

train_df['Tomorrow Movement'] = np.where(train_df['Close Price'].diff() >= 0, 1, 0)
test_df['Tomorrow Movement'] = np.where(test_df['Close Price'].diff() >= 0, 1, 0)

train_df['Tomorrow Movement'] = train_df['Tomorrow Movement'].shift(-1)
test_df['Tomorrow Movement'] = test_df['Tomorrow Movement'].shift(-1)

print(train_df.head())
print(train_df.tail())

   Open Price  Close Price  High Price  Low Price      Volume  \
0      902.99       931.80      934.73     899.35  4048270080   
1      929.17       927.45      936.63     919.53  5413910016   
2      931.17       934.70      943.85     927.28  5392620032   
3      927.45       906.65      927.45     902.37  4704940032   
4      905.73       909.73      910.00     896.81  4991549952   

   Tomorrow Movement  
0                0.0  
1                1.0  
2                0.0  
3                1.0  
4                0.0  
      Open Price  Close Price  High Price  Low Price      Volume  \
2259     2684.22      2683.34     2685.35    2678.13  1383888512   
2260     2679.09      2680.50     2682.74    2677.96  1103808384   
2261     2682.10      2682.62     2685.64    2678.91  1149108352   
2262     2686.10      2687.54     2687.66    2682.69  1126089856   
2263     2689.15      2673.61     2692.12    2673.61  1332374016   

      Tomorrow Movement  
2259                0.0  
2260      

In [130]:
# !--- You can add your own data preprocessing here ---!

In [131]:
# Drop rows with NaN values

train_df = train_df.dropna()
test_df = test_df.dropna()

print(train_df.shape)

(2263, 6)


In [132]:
# Divide x and y data

train_x_df = train_df.drop(columns=['Tomorrow Movement'])
train_y_df = train_df['Tomorrow Movement']

test_x_df = test_df.drop(columns=['Tomorrow Movement'])
test_y_df = test_df['Tomorrow Movement']

print(train_x_df.shape)
print(train_x_df.head())
print(train_y_df.shape)
print(train_y_df.head())
print('-----')
print(test_x_df.shape)
print(test_x_df.head())
print(test_y_df.shape)
print(test_y_df.head())

(2263, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0      902.99       931.80      934.73     899.35  4048270080
1      929.17       927.45      936.63     919.53  5413910016
2      931.17       934.70      943.85     927.28  5392620032
3      927.45       906.65      927.45     902.37  4704940032
4      905.73       909.73      910.00     896.81  4991549952
(2263,)
0    0.0
1    1.0
2    0.0
3    1.0
4    0.0
Name: Tomorrow Movement, dtype: float64
-----
(251, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0     2683.73      2695.81     2695.89    2682.36  1846463232
1     2697.85      2713.06     2714.37    2697.77  2090595328
2     2719.31      2723.99     2729.29    2719.07  2100767744
3     2731.33      2743.15     2743.45    2727.92  1918869120
4     2742.67      2747.71     2748.51    2737.60  1894823936
(251,)
0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: Tomorrow Movement, dtype: float64


In [133]:
# Normalize data

# !--- Modify here if you want ---!

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_x_df)

normalized_train_x_df = scaler.transform(train_x_df)
normalized_train_x_df = np.transpose(normalized_train_x_df)

normalized_train_x_df = pd.DataFrame({
    'Open Price': normalized_train_x_df[0],
    'Close Price': normalized_train_x_df[1],
    'High Price': normalized_train_x_df[2],
    'Low Price': normalized_train_x_df[3],
    'Volume': normalized_train_x_df[4],
})

normalized_test_x_df = scaler.transform(test_x_df)
normalized_test_x_df = np.transpose(normalized_test_x_df)
normalized_test_x_df = pd.DataFrame({
    'Open Price': normalized_test_x_df[0],
    'Close Price': normalized_test_x_df[1],
    'High Price': normalized_test_x_df[2],
    'Low Price': normalized_test_x_df[3],
    'Volume': normalized_test_x_df[4],
})

print(normalized_train_x_df.head())
print(train_y_df[:5])

   Close Price  High Price  Low Price  Open Price    Volume
0    -1.494607   -1.505683  -1.541181   -1.552572  0.813175
1    -1.503581   -1.501760  -1.499581   -1.498571  1.823826
2    -1.488625   -1.486853  -1.483605   -1.494446  1.808070
3    -1.546489   -1.520714  -1.534956   -1.502119  1.299148
4    -1.540136   -1.556744  -1.546417   -1.546921  1.511255
0    0.0
1    1.0
2    0.0
3    1.0
4    0.0
Name: Tomorrow Movement, dtype: float64


# Logistic Regression

In [134]:
# Train & Predict using Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_model = LogisticRegression() # !--- You can tune parameters here ---!
lr_model.fit(train_x_df, train_y_df)

predict_train_y = lr_model.predict(train_x_df)
print('training accuracy:')
print(accuracy_score(train_y_df, predict_train_y))

lr_predict_test_y = lr_model.predict(test_x_df)
print('\ntesting accuracy:')
print(accuracy_score(test_y_df, lr_predict_test_y))

print('\ntesting result prob:')
print(lr_model.predict_proba(test_x_df))

print('\npredicted testing labels:')
print(lr_predict_test_y)

training accuracy:
0.5475033141847105

testing accuracy:
0.5258964143426295

testing result prob:
[[0.47373765 0.52626235]
 [0.47027306 0.52972694]
 [0.47012876 0.52987124]
 [0.47270983 0.52729017]
 [0.47305113 0.52694887]
 [0.4721649  0.5278351 ]
 [0.47086505 0.52913495]
 [0.47186221 0.52813779]
 [0.47002017 0.52997983]
 [0.46401445 0.53598555]
 [0.46646792 0.53353208]
 [0.46860247 0.53139753]
 [0.46624391 0.53375609]
 [0.47030731 0.52969269]
 [0.46943068 0.53056932]
 [0.46601131 0.53398869]
 [0.46744132 0.53255868]
 [0.46930086 0.53069914]
 [0.47033248 0.52966752]
 [0.46673882 0.53326118]
 [0.46235434 0.53764566]
 [0.46580034 0.53419966]
 [0.46263301 0.53736699]
 [0.45191395 0.54808605]
 [0.44538165 0.55461835]
 [0.46088337 0.53911663]
 [0.45497735 0.54502265]
 [0.44872828 0.55127172]
 [0.46343584 0.53656416]
 [0.4714597  0.5285403 ]
 [0.46672417 0.53327583]
 [0.46846799 0.53153201]
 [0.46933479 0.53066521]
 [0.47063413 0.52936587]
 [0.46937827 0.53062173]
 [0.47075887 0.52924113]
 [

In [135]:
# Print precision, recall, fbeta-score and confusion matrix

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(test_y_df, lr_predict_test_y, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(test_y_df, lr_predict_test_y).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:
(0.2765670386184346, 0.5258964143426295, 0.3624977895207681, None)

confusion matrix(tn, fp, fn, tp):
(0, 119, 0, 132)


# SVM

In [136]:
# Train & Predict with SVC

from sklearn.svm import SVC

svc_model = SVC() # !--- You can tune parameters here ---!
svc_model.fit(normalized_train_x_df, train_y_df)

predict_train_y = svc_model.predict(normalized_train_x_df)
print('training accuracy:')
print(accuracy_score(train_y_df, predict_train_y))

svc_predict_test_y = svc_model.predict(normalized_test_x_df)
print('\ntesting accuracy:')
print(accuracy_score(test_y_df, svc_predict_test_y))

print('\npredicted testing labels:')
print(svc_predict_test_y)

training accuracy:
0.5483870967741935

testing accuracy:
0.5258964143426295


In [137]:
# Print precision, recall, fbeta-score and confusion matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(test_y_df, svc_predict_test_y, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(test_y_df, svc_predict_test_y).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:
(0.2765670386184346, 0.5258964143426295, 0.3624977895207681, None)

confusion matrix(tn, fp, fn, tp):
(0, 119, 0, 132)


# Neural Network

In [138]:
# Define NN output groundtruth

falling_prob = pd.DataFrame(data=np.where(train_y_df == 0, 1, 0)[:])
train_y_df = pd.DataFrame(data=np.where(train_y_df == 0, 0, 1)[:])
train_y_df = pd.concat( [ falling_prob, train_y_df ], axis=1, ignore_index=True )

falling_prob = pd.DataFrame(data=np.where(test_y_df == 0, 1, 0)[:])
test_y_df = pd.DataFrame(data=np.where(test_y_df == 0, 0, 1)[:])
test_y_df = pd.concat( [ falling_prob, test_y_df ], axis=1, ignore_index=True )

print(train_y_df.shape)
print(train_y_df.head())

(2263, 5)
   Close Price  High Price  Low Price  Open Price    Volume
0    -1.494607   -1.505683  -1.541181   -1.552572  0.813175
1    -1.503581   -1.501760  -1.499581   -1.498571  1.823826
2    -1.488625   -1.486853  -1.483605   -1.494446  1.808070
3    -1.546489   -1.520714  -1.534956   -1.502119  1.299148
4    -1.540136   -1.556744  -1.546417   -1.546921  1.511255
(2263, 2)
   0  1
0  1  0
1  0  1
2  1  0
3  0  1
4  1  0


In [139]:
# Define NN structure

import torch
import torch.nn.functional as F

# !--- You can modify the NN structure here ---!
class M_NN(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(M_NN, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        h = self.linear1(x)
        acti_out = F.relu(h)
        y_pred = self.linear2(h)
        return y_pred


# N = batch size, D_in = input size, H = hidden size, D_out = output size
N, D_in, H, D_out = 300, 5, 100, 2  # !--- You can modify here ---!

model = M_NN(D_in, H, D_out)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum') # !--- You can modify here ---!
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) # !--- You can modify here ---!


# Train NN
# !--- You can modify here ---!

for t in range(1000):
    for batch_num in range(N, len(normalized_train_x_df), N): 
        y_pred = model(torch.FloatTensor(normalized_train_x_df[batch_num-N:batch_num].values.tolist()))
        loss = criterion(y_pred, torch.FloatTensor(train_y_df[batch_num-N:batch_num].values.tolist()))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (t%100 == 0):
        print('epoch:%d - loss:%.10f' % (t, loss.item()))

epoch:0 - loss:583.8586425781
epoch:100 - loss:416.2506103516
epoch:200 - loss:415.3554992676
epoch:300 - loss:415.0850524902
epoch:400 - loss:414.9734802246
epoch:500 - loss:414.9225769043
epoch:600 - loss:414.8976745605
epoch:700 - loss:414.8839111328
epoch:800 - loss:414.8742370605
epoch:900 - loss:414.8660583496


In [140]:
# Predict

nn_predict_train_y = model.forward( torch.FloatTensor(normalized_train_x_df.values.tolist()))
result_train = np.where(nn_predict_train_y[:, 0] > nn_predict_train_y[:, 1], 1, 0) # !-- You can modify here --!
print('training accuracy:')
print(accuracy_score(train_y_df[0], result_train))

nn_predict_test_y = model.forward( torch.FloatTensor(normalized_test_x_df.values.tolist()))
result_test = np.where(nn_predict_test_y[:, 0] > nn_predict_test_y[:, 1], 1, 0) # !-- You can modify here --!
print('\ntesting accuracy:')
print(accuracy_score(test_y_df[0], result_test))

print('\npredicted testing prob:')
print(nn_predict_test_y)
print('\npredicted testing labels:')
print(result_test)

training accuracy:
0.547945205479452

testing accuracy:
0.5059760956175299

predicted testing prob:
tensor([[ 2.6308e-02, -2.7420e-02],
        [ 4.1684e-02, -4.3117e-02],
        [ 1.5923e-02, -1.7559e-02],
        [ 3.6360e-02, -3.7242e-02],
        [ 1.9350e-02, -1.9998e-02],
        [ 6.8915e-03, -8.2635e-03],
        [ 1.6368e-02, -1.6536e-02],
        [ 4.8928e-02, -5.0269e-02],
        [ 5.8429e-02, -5.9871e-02],
        [-3.7200e-02,  3.7948e-02],
        [ 7.0324e-02, -7.1183e-02],
        [ 9.6383e-03, -1.0133e-02],
        [ 4.6922e-02, -4.7694e-02],
        [ 8.7643e-02, -8.8980e-02],
        [ 3.8593e-02, -3.9564e-02],
        [ 1.1643e-02, -1.1224e-02],
        [ 1.2811e-02, -1.2681e-02],
        [ 1.0079e-01, -1.0213e-01],
        [-6.7658e-03,  6.4583e-03],
        [ 5.9786e-04, -9.8397e-04],
        [ 1.1464e-02, -1.1231e-02],
        [ 4.1512e-02, -4.3380e-02],
        [-1.0133e-01,  1.0367e-01],
        [-2.3181e-01,  2.3700e-01],
        [ 2.4100e-01, -2.4037e-01],


In [141]:
# Print precision, recall, fbeta-score and confusion matrix

print('\nprecision, recall, fbeta-score:')
print(precision_recall_fscore_support(test_y_df[0], result_test, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(test_y_df[0], result_test).ravel()
print((tn, fp, fn, tp))


precision, recall, fbeta-score:
(0.5154106025991739, 0.5059760956175299, 0.4974081065255934, None)

confusion matrix(tn, fp, fn, tp):
(50, 82, 42, 77)


# Discussion