# HW3 - Stock Movement Prediction - nasdaq100

In [1]:
# Read data

import pandas as pd
import numpy as np

train_data_path = './nasdaq-train.csv'
test_data_path = './nasdaq-test.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

print(train_df.shape)
print(train_df.head())

(2263, 6)
          Date  Open Price  Close Price  High Price  Low Price     Volume
0  02-Jan-2009     1212.74      1263.70     1266.45    1208.91  248576128
1  05-Jan-2009     1254.70      1262.52     1274.11    1244.89  298954208
2  06-Jan-2009     1274.36      1274.49     1286.08    1265.53  343802624
3  07-Jan-2009     1249.98      1238.60     1256.34    1228.32  388103616
4  08-Jan-2009     1231.75      1252.52     1252.52    1223.81  326125920


In [2]:
# Drop unnecessary columns

train_df.drop(columns=['Date', 'Volume'], inplace=True) # , 'Volume', 'High Price', 'Low Price'
test_df.drop(columns=['Date', 'Volume'], inplace=True) # , 'Volume', 'High Price', 'Low Price'

print(train_df.shape)
print(train_df.head())

(2263, 4)
   Open Price  Close Price  High Price  Low Price
0     1212.74      1263.70     1266.45    1208.91
1     1254.70      1262.52     1274.11    1244.89
2     1274.36      1274.49     1286.08    1265.53
3     1249.98      1238.60     1256.34    1228.32
4     1231.75      1252.52     1252.52    1223.81


In [3]:
# Add the column `Tomorrow Movement` by comparing the `Close Price` with previous days as the training target
# Add the column `Tomorrow Open` by shifting the column `Open Price` as one of the new features

train_df['Tomorrow Movement'] = np.where(train_df['Close Price'].diff() >= 0, 1, 0)
test_df['Tomorrow Movement'] = np.where(test_df['Close Price'].diff() >= 0, 1, 0)

train_df['Tomorrow Movement'] = train_df['Tomorrow Movement'].shift(-1)
test_df['Tomorrow Movement'] = test_df['Tomorrow Movement'].shift(-1)

train_df['Tomorrow Open'] = train_df['Open Price'].shift(-1)
test_df['Tomorrow Open'] = test_df['Open Price'].shift(-1)

print(train_df.head())
print(train_df.tail())

   Open Price  Close Price  High Price  Low Price  Tomorrow Movement  \
0     1212.74      1263.70     1266.45    1208.91                0.0   
1     1254.70      1262.52     1274.11    1244.89                1.0   
2     1274.36      1274.49     1286.08    1265.53                0.0   
3     1249.98      1238.60     1256.34    1228.32                1.0   
4     1231.75      1252.52     1252.52    1223.81                0.0   

   Tomorrow Open  
0        1254.70  
1        1274.36  
2        1249.98  
3        1231.75  
4        1252.60  
      Open Price  Close Price  High Price  Low Price  Tomorrow Movement  \
2258     6462.55      6465.17     6467.83    6449.00                0.0   
2259     6427.32      6433.16     6438.24    6407.99                1.0   
2260     6437.06      6435.15     6448.94    6425.92                1.0   
2261     6449.52      6441.42     6452.07    6432.68                0.0   
2262     6439.90      6396.42     6442.53    6396.42                NaN   

  

In [4]:
# Add other new features `S_10`, `Corr`, `Open-Close`, `Open-Open` (explanation is described below)

train_df['S_10'] = train_df['Close Price'].rolling(window=10).mean()
train_df['Corr'] = train_df['Close Price'].rolling(window=10).corr(train_df['S_10'])
train_df['Open-Close'] = train_df['Open Price'] - train_df['Close Price'].shift(1)
train_df['Open-Open'] = train_df['Open Price'] - train_df['Open Price'].shift(1)
train_df = train_df.dropna()
new_train_df = train_df.iloc[:,:10]

print(new_train_df.shape)
print(new_train_df.head())

test_df['S_10'] = test_df['Close Price'].rolling(window=10).mean()
test_df['Corr'] = test_df['Close Price'].rolling(window=10).corr(test_df['S_10'])
test_df['Open-Close'] = test_df['Open Price'] - test_df['Close Price'].shift(1)
test_df['Open-Open'] = test_df['Open Price'] - test_df['Open Price'].shift(1)
test_df = test_df.dropna()
new_test_df = test_df.iloc[:,:10]

print(new_test_df.shape)
print(new_test_df.head())

(2244, 10)
    Open Price  Close Price  High Price  Low Price  Tomorrow Movement  \
18     1217.75      1203.85     1220.72    1198.76                0.0   
19     1212.31      1180.25     1216.46    1176.82                1.0   
20     1168.13      1195.75     1203.53    1167.49                1.0   
21     1198.79      1215.66     1218.91    1180.25                0.0   
22     1215.64      1215.43     1243.68    1210.98                1.0   

    Tomorrow Open      S_10      Corr  Open-Close  Open-Open  
18        1212.31  1186.673 -0.278443      -18.16       3.91  
19        1168.13  1186.346 -0.309854        8.46      -5.44  
20        1198.79  1186.107 -0.665739      -12.12     -44.18  
21        1215.64  1194.012 -0.105050        3.04      30.66  
22        1200.26  1196.979  0.180301       -0.02      16.85  
(233, 10)
    Open Price  Close Price  High Price  Low Price  Tomorrow Movement  \
18     7000.68      6988.32     7020.64    6974.70                0.0   
19     6910.39  

In [5]:
# Divide x and y data

data_train_x = new_train_df.drop(columns=['Tomorrow Movement'])
data_train_y = new_train_df['Tomorrow Movement']

data_test_x = new_test_df.drop(columns=['Tomorrow Movement'])
data_test_y = new_test_df['Tomorrow Movement']

print(data_train_x.shape)
print(data_train_x.head())
print(data_train_y.shape)
print(data_train_y.head())
print('-----')
print(data_test_x.shape)
print(data_test_x.head())
print(data_test_y.shape)
print(data_test_y.head())

(2244, 9)
    Open Price  Close Price  High Price  Low Price  Tomorrow Open      S_10  \
18     1217.75      1203.85     1220.72    1198.76        1212.31  1186.673   
19     1212.31      1180.25     1216.46    1176.82        1168.13  1186.346   
20     1168.13      1195.75     1203.53    1167.49        1198.79  1186.107   
21     1198.79      1215.66     1218.91    1180.25        1215.64  1194.012   
22     1215.64      1215.43     1243.68    1210.98        1200.26  1196.979   

        Corr  Open-Close  Open-Open  
18 -0.278443      -18.16       3.91  
19 -0.309854        8.46      -5.44  
20 -0.665739      -12.12     -44.18  
21 -0.105050        3.04      30.66  
22  0.180301       -0.02      16.85  
(2244,)
18    0.0
19    1.0
20    1.0
21    0.0
22    1.0
Name: Tomorrow Movement, dtype: float64
-----
(233, 9)
    Open Price  Close Price  High Price  Low Price  Tomorrow Open      S_10  \
18     7000.68      6988.32     7020.64    6974.70        6910.39  6890.981   
19     6910.39  

# Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score

lr_model = LogisticRegression(max_iter=500)
#lr_model = SGDClassifier(loss='log',  max_iter=800)
lr_model.fit(data_train_x, data_train_y)

predict_train_y = lr_model.predict(data_train_x)
print('training accuracy:')
print(accuracy_score(data_train_y, predict_train_y))

lr_predict_test_y = lr_model.predict(data_test_x)
print('\ntesting accuracy:')
print(accuracy_score(data_test_y, lr_predict_test_y))

print('\ntesting result prob:')
print(lr_model.predict_proba(data_test_x))

print('\npredicted testing labels:')
print(lr_predict_test_y)

training accuracy:
0.6898395721925134

testing accuracy:
0.6952789699570815

testing result prob:
[[9.83707414e-01 1.62925864e-02]
 [1.11451883e-01 8.88548117e-01]
 [8.54733435e-01 1.45266565e-01]
 [8.92634315e-01 1.07365685e-01]
 [9.87583615e-01 1.24163851e-02]
 [9.99354735e-01 6.45265353e-04]
 [9.86999995e-01 1.30000054e-02]
 [2.26313089e-01 7.73686911e-01]
 [5.94565977e-03 9.94054340e-01]
 [7.61714038e-02 9.23828596e-01]
 [8.97943846e-01 1.02056154e-01]
 [8.87045941e-01 1.12954059e-01]
 [4.43754147e-02 9.55624585e-01]
 [6.01058722e-01 3.98941278e-01]
 [7.66233154e-01 2.33766846e-01]
 [1.46179898e-01 8.53820102e-01]
 [6.33552587e-02 9.36644741e-01]
 [2.87746373e-02 9.71225363e-01]
 [8.40213940e-02 9.15978606e-01]
 [3.88180216e-01 6.11819784e-01]
 [3.89168085e-02 9.61083192e-01]
 [2.50096423e-01 7.49903577e-01]
 [9.94580544e-01 5.41945615e-03]
 [9.61076871e-01 3.89231286e-02]
 [1.14895242e-01 8.85104758e-01]
 [9.38354582e-01 6.16454180e-02]
 [2.51787669e-01 7.48212331e-01]
 [3.8768175

In [7]:
# Print precision, recall, fbeta-score and confusion matrix

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(data_test_y, lr_predict_test_y, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(data_test_y, lr_predict_test_y).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:
(0.6953338947871519, 0.6952789699570815, 0.6942611409862763, None)

confusion matrix(tn, fp, fn, tp):
(71, 40, 31, 91)


# SVM

In [8]:
# Normalize data

from sklearn.preprocessing import MinMaxScaler, RobustScaler

#scaler = MinMaxScaler()
scaler = RobustScaler()
scaler.fit(data_train_x) #scaler.fit(train_df.append(test_df, ignore_index=True))

train_normalize = scaler.transform(data_train_x)
train_normalize = np.transpose(train_normalize)

normalize_train_x = pd.DataFrame({
    'Open Price': train_normalize[0],
    'Close Price': train_normalize[1],
    'High Price': train_normalize[2],
    'Low Price': train_normalize[3],
    'Tomorrow Open': train_normalize[4],
    'S_10': train_normalize[5],
    'Corr': train_normalize[6],
    'Open-Close': train_normalize[7],
    'Open-Open': train_normalize[8],
})

test_normalize = scaler.transform(data_test_x)
test_normalize = np.transpose(test_normalize)
normalize_test_x = pd.DataFrame({
    'Open Price': test_normalize[0],
    'Close Price': test_normalize[1],
    'High Price': test_normalize[2],
    'Low Price': test_normalize[3],
    'Tomorrow Open': test_normalize[4],
    'S_10': test_normalize[5],
    'Corr': test_normalize[6],
    'Open-Close': test_normalize[7],
    'Open-Open': test_normalize[8],
})

data_train_y = np.where(data_train_y == 0, -1, 1)
data_test_y = np.where(data_test_y == 0, -1, 1)

print(normalize_train_x.head())
print(data_train_y[:5])

   Close Price      Corr  High Price  Low Price  Open Price  Open-Close  \
0    -0.855107 -1.025865   -0.850094  -0.855701   -0.851757   -1.067904   
1    -0.866038 -1.065955   -0.852062  -0.865924   -0.854285    0.363856   
2    -0.858859 -1.520172   -0.858036  -0.870271   -0.874822   -0.743042   
3    -0.849637 -0.804563   -0.850930  -0.864326   -0.860570    0.072341   
4    -0.849744 -0.440368   -0.839486  -0.850008   -0.852738   -0.092241   

   Open-Open      S_10  Tomorrow Open  
0  -0.017558 -0.844289      -0.857295  
1  -0.291170 -0.844440      -0.877831  
2  -1.424830 -0.844551      -0.863579  
3   0.765235 -0.840887      -0.855747  
4   0.361109 -0.839511      -0.862896  
[-1  1  1 -1  1]


In [9]:
from sklearn.svm import SVC

svc_model = SVC(kernel='linear', C=3000, tol=1e-5)
svc_model.fit(normalize_train_x, data_train_y)

predict_train_y = svc_model.predict(normalize_train_x)
print('training accuracy:')
print(accuracy_score(data_train_y, predict_train_y))

svc_predict_test_y = svc_model.predict(normalize_test_x)
print('\ntesting accuracy:')
print(accuracy_score(data_test_y, svc_predict_test_y))
print(svc_predict_test_y)

training accuracy:
0.6836007130124777

testing accuracy:
0.6952789699570815
[-1  1 -1 -1 -1 -1 -1  1  1  1 -1 -1  1 -1 -1  1  1  1  1  1  1  1 -1 -1
  1 -1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1  1  1 -1  1 -1  1 -1  1 -1  1
  1 -1  1  1  1  1  1 -1 -1  1  1 -1  1  1  1 -1  1 -1 -1  1  1  1  1 -1
  1 -1  1 -1 -1  1  1 -1 -1  1 -1  1 -1  1  1  1  1  1 -1 -1  1  1  1 -1
 -1 -1  1  1  1 -1  1  1 -1  1 -1  1  1  1  1  1 -1  1  1  1 -1 -1 -1  1
 -1  1  1 -1  1 -1  1  1 -1  1  1  1 -1 -1 -1  1  1 -1  1 -1  1  1 -1 -1
  1  1  1  1 -1 -1 -1 -1  1 -1  1 -1 -1  1  1 -1  1 -1  1  1 -1 -1  1  1
 -1  1 -1  1 -1 -1 -1 -1 -1 -1  1 -1  1  1 -1  1  1 -1 -1  1 -1  1 -1  1
  1 -1 -1 -1  1 -1 -1 -1  1  1 -1 -1 -1 -1  1 -1  1 -1  1 -1  1  1 -1  1
 -1 -1 -1  1  1  1 -1 -1  1 -1 -1  1 -1  1 -1  1  1]


In [10]:
# Print precision, recall, fbeta-score and confusion matrix

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(data_test_y, svc_predict_test_y, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(data_test_y, svc_predict_test_y).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:
(0.695036719122556, 0.6952789699570815, 0.6950423721854327, None)

confusion matrix(tn, fp, fn, tp):
(74, 37, 34, 88)


# Neural Network

In [11]:
left_col = pd.DataFrame(data=np.where(data_train_y == -1, 1, 0)[:])
data_train_y = pd.DataFrame(data=np.where(data_train_y == -1, 0, 1)[:])
data_train_y = pd.concat( [ left_col, data_train_y ], axis=1, ignore_index=True )

left_col = pd.DataFrame(data=np.where(data_test_y == -1, 1, 0)[:])
data_test_y = pd.DataFrame(data=np.where(data_test_y == -1, 0, 1)[:])
data_test_y = pd.concat( [ left_col, data_test_y ], axis=1, ignore_index=True )

print(normalize_train_x.shape)
print(normalize_train_x.head())

print(data_train_y.shape)
print(data_train_y.head())

(2244, 9)
   Close Price      Corr  High Price  Low Price  Open Price  Open-Close  \
0    -0.855107 -1.025865   -0.850094  -0.855701   -0.851757   -1.067904   
1    -0.866038 -1.065955   -0.852062  -0.865924   -0.854285    0.363856   
2    -0.858859 -1.520172   -0.858036  -0.870271   -0.874822   -0.743042   
3    -0.849637 -0.804563   -0.850930  -0.864326   -0.860570    0.072341   
4    -0.849744 -0.440368   -0.839486  -0.850008   -0.852738   -0.092241   

   Open-Open      S_10  Tomorrow Open  
0  -0.017558 -0.844289      -0.857295  
1  -0.291170 -0.844440      -0.877831  
2  -1.424830 -0.844551      -0.863579  
3   0.765235 -0.840887      -0.855747  
4   0.361109 -0.839511      -0.862896  
(2244, 2)
   0  1
0  1  0
1  0  1
2  0  1
3  1  0
4  0  1


In [12]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

class M_NN(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(M_NN, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        h = self.linear1(x)
        acti_out = F.relu(h)
        y_pred = self.linear2(h) #.clamp(0,1)
        return y_pred


# N is batch size
N, D_in, H, D_out = 300, 9, 100, 2

model = M_NN(D_in, H, D_out)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

for t in range(10000):
    for batch_num in range(N, len(normalize_train_x), N):
        
        y_pred = model(torch.FloatTensor(normalize_train_x[batch_num-N:batch_num].values.tolist()))

        loss = criterion(y_pred, torch.FloatTensor(data_train_y[batch_num-N:batch_num].values.tolist()))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (t%100 == 0):
        print(t, loss.item())

0 416.1724548339844
100 405.4530029296875
200 404.59356689453125
300 404.446533203125
400 404.4302978515625
500 404.3185119628906
600 404.11614990234375
700 403.8304443359375
800 403.4712219238281
900 403.052490234375
1000 402.5887756347656
1100 402.091552734375
1200 401.5695495605469
1300 401.02911376953125
1400 400.47540283203125
1500 399.9125061035156
1600 399.34405517578125
1700 398.7726135253906
1800 398.20068359375
1900 397.6304016113281
2000 397.0627746582031
2100 396.4991760253906
2200 395.9407043457031
2300 395.38775634765625
2400 394.84100341796875
2500 394.30072021484375
2600 393.7673034667969
2700 393.2408752441406
2800 392.7214660644531
2900 392.2092590332031
3000 391.70404052734375
3100 391.20611572265625
3200 390.71527099609375
3300 390.23138427734375
3400 389.7544250488281
3500 389.2845458984375
3600 388.8213806152344
3700 388.3648681640625
3800 387.9148864746094
3900 387.4715881347656
4000 387.03466796875
4100 386.6037902832031
4200 386.1792907714844
4300 385.760925292

In [13]:
nn_predict_train_y = model.forward( torch.FloatTensor(normalize_train_x.values.tolist()))
result_train = np.where(nn_predict_train_y[:, 0] > nn_predict_train_y[:, 1], 1, 0)
print('training accuracy:')
print(accuracy_score(data_train_y[0], result_train))

nn_predict_y = model.forward( torch.FloatTensor(normalize_test_x.values.tolist()))
result = np.where(nn_predict_y[:, 0] > nn_predict_y[:, 1], 1, 0)
print('\ntesting accuracy:')
print(accuracy_score(data_test_y[0], result))

print('predicted testing prob:')
print(nn_predict_y)
print('predicted testing labels:')
print(result)


# Print precision, recall, fbeta-score and confusion matrix

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(data_test_y[0], result, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(data_test_y[0], result).ravel()
print((tn, fp, fn, tp))

training accuracy:
0.625222816399287

testing accuracy:
0.6137339055793991
predicted testing prob:
tensor([[ 9.3294e-01, -9.3287e-01],
        [-2.6632e+00,  2.6634e+00],
        [-6.9241e-01,  6.9212e-01],
        [ 2.2451e-01, -2.2288e-01],
        [ 2.0254e+00, -2.0245e+00],
        [ 5.5696e+00, -5.5623e+00],
        [ 1.9845e-01, -1.9040e-01],
        [-1.9294e+00,  1.9334e+00],
        [-2.6506e+00,  2.6534e+00],
        [-4.0797e+00,  4.0795e+00],
        [-7.1704e-01,  7.1970e-01],
        [-6.0857e-01,  6.1023e-01],
        [-4.1312e+00,  4.1361e+00],
        [-2.3793e+00,  2.3811e+00],
        [-1.8488e-01,  1.8753e-01],
        [-2.0095e+00,  2.0129e+00],
        [-2.1268e+00,  2.1307e+00],
        [-3.0324e+00,  3.0334e+00],
        [-3.4640e+00,  3.4661e+00],
        [-2.3237e+00,  2.3252e+00],
        [-2.8294e+00,  2.8307e+00],
        [-1.3304e+00,  1.3317e+00],
        [ 2.3458e+00, -2.3442e+00],
        [-8.5531e-04,  4.5475e-03],
        [-3.5986e+00,  3.6022e+00],
 