# HW3 - Stock Movement Prediction - facebook inc.

In [15]:
# Read data

import pandas as pd
import numpy as np

train_data_path = './fb-train.csv'
test_data_path = './fb-test.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

print(train_df.shape)
print(train_df.head())

(1414, 6)
          Date  Open Price  Close Price  High Price  Low Price     Volume
0  17-May-2012       38.00        38.00       38.00      38.00          0
1  18-May-2012       42.05        38.23       45.00      38.00  580587712
2  21-May-2012       36.53        34.03       36.66      33.00  168309824
3  22-May-2012       32.61        31.00       33.59      30.94  102053824
4  23-May-2012       31.37        32.00       32.50      31.36   73721136


In [16]:
# Drop unnecessary columns

train_df.drop(columns=['Date', 'Volume'], inplace=True) # , 'Volume', 'High Price', 'Low Price'
test_df.drop(columns=['Date', 'Volume'], inplace=True) # , 'Volume', 'High Price', 'Low Price'

print(train_df.shape)
print(train_df.head())

(1414, 4)
   Open Price  Close Price  High Price  Low Price
0       38.00        38.00       38.00      38.00
1       42.05        38.23       45.00      38.00
2       36.53        34.03       36.66      33.00
3       32.61        31.00       33.59      30.94
4       31.37        32.00       32.50      31.36


In [17]:
# Add the column `Tomorrow Movement` by comparing the `Close Price` with previous days as the training target
# Add the column `Tomorrow Open` by shifting the column `Open Price` as one of the new features

train_df['Tomorrow Movement'] = np.where(train_df['Close Price'].diff() >= 0, 1, 0)
test_df['Tomorrow Movement'] = np.where(test_df['Close Price'].diff() >= 0, 1, 0)

train_df['Tomorrow Movement'] = train_df['Tomorrow Movement'].shift(-1)
test_df['Tomorrow Movement'] = test_df['Tomorrow Movement'].shift(-1)

train_df['Tomorrow Open'] = train_df['Open Price'].shift(-1)
test_df['Tomorrow Open'] = test_df['Open Price'].shift(-1)

print(train_df.head())
print(train_df.tail())

   Open Price  Close Price  High Price  Low Price  Tomorrow Movement  \
0       38.00        38.00       38.00      38.00                1.0   
1       42.05        38.23       45.00      38.00                0.0   
2       36.53        34.03       36.66      33.00                0.0   
3       32.61        31.00       33.59      30.94                1.0   
4       31.37        32.00       32.50      31.36                1.0   

   Tomorrow Open  
0          42.05  
1          36.53  
2          32.61  
3          31.37  
4          32.95  
      Open Price  Close Price  High Price  Low Price  Tomorrow Movement  \
1409      177.14       177.20      177.53     176.23                0.0   
1410      176.63       175.99      177.00     174.67                1.0   
1411      176.55       177.62      178.44     176.26                1.0   
1412      177.95       177.92      178.94     177.68                0.0   
1413      178.00       176.46      178.85     176.46                NaN   

  

In [18]:
# Add other new features `S_10`, `Corr`, `Open-Close`, `Open-Open` (explanation is described below)

train_df['S_10'] = train_df['Close Price'].rolling(window=10).mean()
train_df['Corr'] = train_df['Close Price'].rolling(window=10).corr(train_df['S_10'])
train_df['Open-Close'] = train_df['Open Price'] - train_df['Close Price'].shift(1)
train_df['Open-Open'] = train_df['Open Price'] - train_df['Open Price'].shift(1)
train_df = train_df.dropna()
new_train_df = train_df.iloc[:,:10]

print(new_train_df.shape)
print(new_train_df.head())

test_df['S_10'] = test_df['Close Price'].rolling(window=10).mean()
test_df['Corr'] = test_df['Close Price'].rolling(window=10).corr(test_df['S_10'])
test_df['Open-Close'] = test_df['Open Price'] - test_df['Close Price'].shift(1)
test_df['Open-Open'] = test_df['Open Price'] - test_df['Open Price'].shift(1)
test_df = test_df.dropna()
new_test_df = test_df.iloc[:,:10]

print(new_test_df.shape)
print(new_test_df.head())

(1395, 10)
    Open Price  Close Price  High Price  Low Price  Tomorrow Movement  \
18       27.66        27.27       28.10      27.10                1.0   
19       27.65        28.29       28.32      27.38                1.0   
20       28.50        30.01       30.10      28.35                1.0   
21       29.95        31.41       32.08      29.41                1.0   
22       31.53        31.91       32.18      30.70                0.0   

    Tomorrow Open    S_10      Corr  Open-Close  Open-Open  
18          27.65  27.198  0.544501        0.26       0.18  
19          28.50  27.067 -0.230716        0.38      -0.01  
20          29.95  27.296 -0.578315        0.21       0.85  
21          31.53  27.747 -0.494005       -0.06       1.45  
22          31.92  28.351 -0.079347        0.12       1.58  
(233, 10)
    Open Price  Close Price  High Price  Low Price  Tomorrow Movement  \
18      188.75       185.98      188.84     185.63                1.0   
19      183.01       187.12 

In [19]:
# Divide x and y data

data_train_x = new_train_df.drop(columns=['Tomorrow Movement'])
data_train_y = new_train_df['Tomorrow Movement']

data_test_x = new_test_df.drop(columns=['Tomorrow Movement'])
data_test_y = new_test_df['Tomorrow Movement']

print(data_train_x.shape)
print(data_train_x.head())
print(data_train_y.shape)
print(data_train_y.head())
print('-----')
print(data_test_x.shape)
print(data_test_x.head())
print(data_test_y.shape)
print(data_test_y.head())

(1395, 9)
    Open Price  Close Price  High Price  Low Price  Tomorrow Open    S_10  \
18       27.66        27.27       28.10      27.10          27.65  27.198   
19       27.65        28.29       28.32      27.38          28.50  27.067   
20       28.50        30.01       30.10      28.35          29.95  27.296   
21       29.95        31.41       32.08      29.41          31.53  27.747   
22       31.53        31.91       32.18      30.70          31.92  28.351   

        Corr  Open-Close  Open-Open  
18  0.544501        0.26       0.18  
19 -0.230716        0.38      -0.01  
20 -0.578315        0.21       0.85  
21 -0.494005       -0.06       1.45  
22 -0.079347        0.12       1.58  
(1395,)
18    1.0
19    1.0
20    1.0
21    1.0
22    0.0
Name: Tomorrow Movement, dtype: float64
-----
(233, 9)
    Open Price  Close Price  High Price  Low Price  Tomorrow Open     S_10  \
18      188.75       185.98      188.84     185.63         183.01  184.181   
19      183.01       187.12   

# Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score

lr_model = LogisticRegression(max_iter=500)
#lr_model = SGDClassifier(loss='log',  max_iter=800)
lr_model.fit(data_train_x, data_train_y)

predict_train_y = lr_model.predict(data_train_x)
print('training accuracy:')
print(accuracy_score(data_train_y, predict_train_y))

lr_predict_test_y = lr_model.predict(data_test_x)
print('\ntesting accuracy:')
print(accuracy_score(data_test_y, lr_predict_test_y))

print('\ntesting result prob:')
print(lr_model.predict_proba(data_test_x))

print('\npredicted testing labels:')
print(lr_predict_test_y)

training accuracy:
0.6437275985663082

testing accuracy:
0.6824034334763949

testing result prob:
[[9.47158943e-01 5.28410566e-02]
 [2.82498932e-01 7.17501068e-01]
 [1.09052237e-01 8.90947763e-01]
 [8.29468155e-01 1.70531845e-01]
 [9.64185526e-01 3.58144741e-02]
 [8.97035663e-01 1.02964337e-01]
 [8.99641401e-01 1.00358599e-01]
 [1.26007199e-01 8.73992801e-01]
 [4.59912061e-03 9.95400879e-01]
 [3.01534758e-01 6.98465242e-01]
 [6.59958853e-01 3.40041147e-01]
 [2.21393329e-01 7.78606671e-01]
 [3.45070654e-01 6.54929346e-01]
 [7.09113590e-01 2.90886410e-01]
 [7.94337847e-01 2.05662153e-01]
 [2.28572772e-01 7.71427228e-01]
 [2.16521800e-01 7.83478200e-01]
 [2.02203819e-01 7.97796181e-01]
 [2.16278030e-01 7.83721970e-01]
 [5.83435658e-01 4.16564342e-01]
 [1.60501105e-01 8.39498895e-01]
 [1.53062274e-01 8.46937726e-01]
 [9.15266847e-01 8.47331527e-02]
 [6.61083524e-01 3.38916476e-01]
 [2.07255926e-01 7.92744074e-01]
 [6.26808624e-01 3.73191376e-01]
 [6.79557092e-01 3.20442908e-01]
 [8.8864914

In [21]:
# Print precision, recall, fbeta-score and confusion matrix

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(data_test_y, lr_predict_test_y, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(data_test_y, lr_predict_test_y).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:
(0.6849727015269627, 0.6824034334763949, 0.6804216988464749, None)

confusion matrix(tn, fp, fn, tp):
(69, 45, 29, 90)


# SVM

In [22]:
# Normalize data

from sklearn.preprocessing import MinMaxScaler, RobustScaler

#scaler = MinMaxScaler()
scaler = RobustScaler()
scaler.fit(data_train_x) #scaler.fit(train_df.append(test_df, ignore_index=True))

train_normalize = scaler.transform(data_train_x)
train_normalize = np.transpose(train_normalize)

normalize_train_x = pd.DataFrame({
    'Open Price': train_normalize[0],
    'Close Price': train_normalize[1],
    'High Price': train_normalize[2],
    'Low Price': train_normalize[3],
    'Tomorrow Open': train_normalize[4],
    'S_10': train_normalize[5],
    'Corr': train_normalize[6],
    'Open-Close': train_normalize[7],
    'Open-Open': train_normalize[8],
})

test_normalize = scaler.transform(data_test_x)
test_normalize = np.transpose(test_normalize)
normalize_test_x = pd.DataFrame({
    'Open Price': test_normalize[0],
    'Close Price': test_normalize[1],
    'High Price': test_normalize[2],
    'Low Price': test_normalize[3],
    'Tomorrow Open': test_normalize[4],
    'S_10': test_normalize[5],
    'Corr': test_normalize[6],
    'Open-Close': test_normalize[7],
    'Open-Open': test_normalize[8],
})

data_train_y = np.where(data_train_y == 0, -1, 1)
data_test_y = np.where(data_test_y == 0, -1, 1)

print(normalize_train_x.head())
print(data_train_y[:5])

   Close Price      Corr  High Price  Low Price  Open Price  Open-Close  \
0    -0.756711  0.106105   -0.750319  -0.746640   -0.746098    0.300000   
1    -0.742186 -0.833286   -0.747197  -0.742637   -0.746241    0.500000   
2    -0.717693 -1.254500   -0.721930  -0.728768   -0.734127    0.216667   
3    -0.697757 -1.152335   -0.693825  -0.713612   -0.713461   -0.233333   
4    -0.690637 -0.649861   -0.692406  -0.695167   -0.690943    0.066667   

   Open-Open      S_10  Tomorrow Open  
0   0.053691 -0.745188      -0.745710  
1  -0.073826 -0.747052      -0.733604  
2   0.503356 -0.743794      -0.712953  
3   0.906040 -0.737377      -0.690451  
4   0.993289 -0.728785      -0.684896  
[ 1  1  1  1 -1]


In [23]:
from sklearn.svm import SVC

svc_model = SVC(kernel='linear', C=3000, tol=1e-5)
svc_model.fit(normalize_train_x, data_train_y)

predict_train_y = svc_model.predict(normalize_train_x)
print('training accuracy:')
print(accuracy_score(data_train_y, predict_train_y))

svc_predict_test_y = svc_model.predict(normalize_test_x)
print('\ntesting accuracy:')
print(accuracy_score(data_test_y, svc_predict_test_y))
print(svc_predict_test_y)

training accuracy:
0.6379928315412187

testing accuracy:
0.6824034334763949
[-1  1  1 -1 -1 -1 -1  1  1  1 -1  1  1 -1 -1  1  1  1  1 -1  1  1 -1 -1
  1 -1 -1  1  1  1  1 -1  1 -1 -1 -1 -1  1  1 -1 -1  1 -1  1 -1  1 -1  1
  1 -1  1  1  1  1 -1  1 -1  1  1  1  1  1  1  1  1 -1 -1  1  1  1  1 -1
  1 -1 -1 -1  1  1  1 -1 -1  1 -1  1  1  1 -1  1 -1 -1 -1 -1  1  1  1 -1
 -1 -1  1  1  1 -1  1  1 -1  1 -1 -1  1 -1  1 -1 -1  1  1  1 -1 -1 -1  1
  1  1  1 -1  1  1  1  1 -1  1  1 -1  1  1 -1  1  1 -1  1  1  1  1  1 -1
  1  1  1  1  1 -1 -1 -1  1 -1  1  1 -1  1  1 -1 -1  1  1  1 -1 -1 -1  1
 -1 -1 -1  1 -1  1 -1  1 -1 -1  1 -1  1  1 -1  1  1 -1 -1  1 -1  1 -1  1
 -1  1 -1  1  1 -1 -1 -1  1  1 -1 -1 -1 -1  1 -1  1 -1  1 -1 -1  1  1  1
 -1 -1  1  1  1  1 -1 -1  1 -1 -1 -1 -1  1 -1  1  1]


In [24]:
# Print precision, recall, fbeta-score and confusion matrix

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(data_test_y, svc_predict_test_y, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(data_test_y, svc_predict_test_y).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:
(0.6831712054788209, 0.6824034334763949, 0.6815222651392658, None)

confusion matrix(tn, fp, fn, tp):
(72, 42, 32, 87)


# Neural Network

In [25]:
left_col = pd.DataFrame(data=np.where(data_train_y == -1, 1, 0)[:])
data_train_y = pd.DataFrame(data=np.where(data_train_y == -1, 0, 1)[:])
data_train_y = pd.concat( [ left_col, data_train_y ], axis=1, ignore_index=True )

left_col = pd.DataFrame(data=np.where(data_test_y == -1, 1, 0)[:])
data_test_y = pd.DataFrame(data=np.where(data_test_y == -1, 0, 1)[:])
data_test_y = pd.concat( [ left_col, data_test_y ], axis=1, ignore_index=True )

print(normalize_train_x.shape)
print(normalize_train_x.head())

print(data_train_y.shape)
print(data_train_y.head())

(1395, 9)
   Close Price      Corr  High Price  Low Price  Open Price  Open-Close  \
0    -0.756711  0.106105   -0.750319  -0.746640   -0.746098    0.300000   
1    -0.742186 -0.833286   -0.747197  -0.742637   -0.746241    0.500000   
2    -0.717693 -1.254500   -0.721930  -0.728768   -0.734127    0.216667   
3    -0.697757 -1.152335   -0.693825  -0.713612   -0.713461   -0.233333   
4    -0.690637 -0.649861   -0.692406  -0.695167   -0.690943    0.066667   

   Open-Open      S_10  Tomorrow Open  
0   0.053691 -0.745188      -0.745710  
1  -0.073826 -0.747052      -0.733604  
2   0.503356 -0.743794      -0.712953  
3   0.906040 -0.737377      -0.690451  
4   0.993289 -0.728785      -0.684896  
(1395, 2)
   0  1
0  0  1
1  0  1
2  0  1
3  0  1
4  1  0


In [26]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

class M_NN(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(M_NN, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        h = self.linear1(x)
        acti_out = F.relu(h)
        y_pred = self.linear2(h) #.clamp(0,1)
        return y_pred


# N is batch size
N, D_in, H, D_out = 300, 9, 100, 2

model = M_NN(D_in, H, D_out)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

for t in range(10000):
    for batch_num in range(N, len(normalize_train_x), N):
        
        y_pred = model(torch.FloatTensor(normalize_train_x[batch_num-N:batch_num].values.tolist()))

        loss = criterion(y_pred, torch.FloatTensor(data_train_y[batch_num-N:batch_num].values.tolist()))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (t%100 == 0):
        print(t, loss.item())

0 409.946533203125
100 409.06732177734375
200 408.2174987792969
300 407.1663818359375
400 405.8822326660156
500 404.51605224609375
600 404.3182373046875
700 403.8816223144531
800 403.02685546875
900 403.74755859375
1000 402.9700927734375
1100 402.6396484375
1200 402.3293151855469
1300 402.01214599609375
1400 401.6769104003906
1500 401.3251037597656
1600 400.96319580078125
1700 400.59814453125
1800 400.23516845703125
1900 399.8778076171875
2000 399.528076171875
2100 399.18695068359375
2200 398.8548889160156
2300 398.5320129394531
2400 398.2181091308594
2500 397.91265869140625
2600 397.6156005859375
2700 397.32611083984375
2800 397.044189453125
2900 396.7690124511719
3000 396.5002136230469
3100 396.2375793457031
3200 395.9805603027344
3300 395.72857666015625
3400 395.4812927246094
3500 395.23846435546875
3600 394.99969482421875
3700 394.7644958496094
3800 394.5328063964844
3900 394.304443359375
4000 394.07904052734375
4100 393.8563537597656
4200 393.6362609863281
4300 393.4186706542969
4

In [27]:
nn_predict_train_y = model.forward( torch.FloatTensor(normalize_train_x.values.tolist()))
result_train = np.where(nn_predict_train_y[:, 0] > nn_predict_train_y[:, 1], 1, 0)
print('training accuracy:')
print(accuracy_score(data_train_y[0], result_train))

nn_predict_y = model.forward( torch.FloatTensor(normalize_test_x.values.tolist()))
result = np.where(nn_predict_y[:, 0] > nn_predict_y[:, 1], 1, 0)
print('\ntesting accuracy:')
print(accuracy_score(data_test_y[0], result))

print('predicted testing prob:')
print(nn_predict_y)
print('predicted testing labels:')
print(result)


# Print precision, recall, fbeta-score and confusion matrix

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(data_test_y[0], result, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(data_test_y[0], result).ravel()
print((tn, fp, fn, tp))

training accuracy:
0.617921146953405

testing accuracy:
0.6437768240343348
predicted testing prob:
tensor([[ 1.3561e+00, -1.3567e+00],
        [-2.5659e+00,  2.5644e+00],
        [-7.3496e-01,  7.3416e-01],
        [ 1.2218e+00, -1.2242e+00],
        [ 2.0628e+00, -2.0644e+00],
        [ 6.8311e-02, -7.3388e-02],
        [-7.9516e-01,  7.9322e-01],
        [-1.2471e+00,  1.2456e+00],
        [-3.5335e+00,  3.5291e+00],
        [-1.0463e+00,  1.0456e+00],
        [ 2.7909e-01, -2.7901e-01],
        [-1.7643e+00,  1.7639e+00],
        [-1.1007e+00,  1.0994e+00],
        [ 1.0446e+00, -1.0440e+00],
        [ 2.7929e-02, -2.8636e-02],
        [-2.0660e+00,  2.0653e+00],
        [-6.4772e-01,  6.4583e-01],
        [-8.1421e-01,  8.1420e-01],
        [-8.5201e-01,  8.5182e-01],
        [ 7.3509e-01, -7.3477e-01],
        [-1.5227e+00,  1.5220e+00],
        [-1.3399e+00,  1.3383e+00],
        [ 1.1476e+00, -1.1492e+00],
        [-1.5360e+00,  1.5353e+00],
        [-1.1081e+00,  1.1072e+00],
 