# HW3 - Stock Movement Prediction

包含的檔案：
- hw3.ipynb
- README.md

欄位定義：


In [2]:
# Read data

import pandas as pd
import numpy as np

train_data_path = './train.csv'
test_data_path = './test.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

print(train_df.shape)
print(train_df.head())

(2264, 6)
          Date  Open Price  Close Price  High Price  Low Price      Volume
0  02-Jan-2009      902.99       931.80      934.73     899.35  4048270080
1  05-Jan-2009      929.17       927.45      936.63     919.53  5413910016
2  06-Jan-2009      931.17       934.70      943.85     927.28  5392620032
3  07-Jan-2009      927.45       906.65      927.45     902.37  4704940032
4  08-Jan-2009      905.73       909.73      910.00     896.81  4991549952


In [3]:
# Drop unnecessary columns

train_df.drop(columns=['Date'], inplace=True) # , 'Volume', 'High Price', 'Low Price'
test_df.drop(columns=['Date'], inplace=True) # , 'Volume', 'High Price', 'Low Price'

print(train_df.shape)
print(train_df.head())

(2264, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0      902.99       931.80      934.73     899.35  4048270080
1      929.17       927.45      936.63     919.53  5413910016
2      931.17       934.70      943.85     927.28  5392620032
3      927.45       906.65      927.45     902.37  4704940032
4      905.73       909.73      910.00     896.81  4991549952


In [4]:
# Add a column `Movement` as the target

train_df['Movement'] = np.where(train_df['Close Price'].diff() >= 0, 1, 0)
test_df['Movement'] = np.where(test_df['Close Price'].diff() >= 0, 1, 0)

print(train_df.head())

   Open Price  Close Price  High Price  Low Price      Volume  Movement
0      902.99       931.80      934.73     899.35  4048270080         0
1      929.17       927.45      936.63     919.53  5413910016         0
2      931.17       934.70      943.85     927.28  5392620032         1
3      927.45       906.65      927.45     902.37  4704940032         0
4      905.73       909.73      910.00     896.81  4991549952         1


In [5]:
# Normalize data

from sklearn.preprocessing import MinMaxScaler, RobustScaler

scaler = MinMaxScaler()
#scaler = RobustScaler()
scaler.fit(train_df) #scaler.fit(train_df.append(test_df, ignore_index=True))

train_normalize = scaler.transform(train_df)
train_normalize = np.transpose(train_normalize)
normalize_train_df = pd.DataFrame({
    'Open Price': train_normalize[0],
    'Close Price': train_normalize[1],
    'High Price': train_normalize[2],
    'Low Price': train_normalize[3],
    'Volume': train_normalize[4],
})

test_normalize = scaler.transform(test_df)
test_normalize = np.transpose(test_normalize)
normalize_test_df = pd.DataFrame({
    'Open Price': test_normalize[0],
    'Close Price': test_normalize[1],
    'High Price': test_normalize[2],
    'Low Price': test_normalize[3],
    'Volume': test_normalize[4],
})

print(normalize_train_df.head())

   Close Price  High Price  Low Price  Open Price    Volume
0     0.126771    0.119748   0.115178    0.111109  0.410385
1     0.124611    0.120698   0.125173    0.124112  0.569145
2     0.128211    0.124309   0.129011    0.125105  0.566670
3     0.114281    0.116107   0.116674    0.123257  0.486725
4     0.115811    0.107381   0.113920    0.112470  0.520044


In [6]:
# Make input & output training data

data_train_x = normalize_train_df.iloc[0::, :]
data_train_x.reset_index(drop=True, inplace=True)

for i in range(1, 5):
    temp_df = normalize_train_df.iloc[i::, :]
    temp_df.reset_index(drop=True, inplace=True)
    data_train_x = pd.concat([data_train_x, temp_df], axis=1, ignore_index=True)

#data_train_y = train_df['Movement']
left_col = pd.DataFrame(data=np.where(train_df['Movement'] == 0, 1, 0)[:])
data_train_y = pd.concat( [ left_col, train_df['Movement'] ], axis=1, ignore_index=True )

# Make input & output testing data

data_test_x = normalize_test_df.iloc[0::, :]
data_test_x.reset_index(drop=True, inplace=True)

for i in range(1, 5):
    temp_df = normalize_test_df.iloc[i::, :]
    temp_df.reset_index(drop=True, inplace=True)
    data_test_x = pd.concat([data_test_x, temp_df], axis=1, ignore_index=True)

#data_test_y = test_df['Movement']
left_col = pd.DataFrame(data=np.where(test_df['Movement'] == 0, 1, 0)[:])
data_test_y = pd.concat( [ left_col, test_df['Movement'] ], axis=1, ignore_index=True )

# Drop incomplete rows in `data_train_x`, `data_train_y`, `data_test_x`, `data_test_y`

data_train_x.drop(data_train_x.tail(5).index, inplace=True)
data_train_y.drop(data_train_y.head(5).index, inplace=True)
data_train_y.reset_index(drop=True, inplace=True)

data_test_x.drop(data_test_x.tail(5).index, inplace=True)
data_test_y.drop(data_test_y.head(5).index, inplace=True)
data_test_y.reset_index(drop=True, inplace=True)

print(data_train_x.shape)
print(data_train_x.head())
print(data_train_y.shape)
print(data_train_y.head())
print('-----')
print(data_test_x.shape)
print(data_test_x.head())
print(data_test_y.shape)
print(data_test_y.head())

(2259, 25)
         0         1         2         3         4         5         6   \
0  0.126771  0.119748  0.115178  0.111109  0.410385  0.124611  0.120698   
1  0.124611  0.120698  0.125173  0.124112  0.569145  0.128211  0.124309   
2  0.128211  0.124309  0.129011  0.125105  0.566670  0.114281  0.116107   
3  0.114281  0.116107  0.116674  0.123257  0.486725  0.115811  0.107381   
4  0.115811  0.107381  0.113920  0.112470  0.520044  0.106186  0.108346   

         7         8         9     ...           15        16        17  \
0  0.125173  0.124112  0.569145    ...     0.114281  0.116107  0.116674   
1  0.129011  0.125105  0.566670    ...     0.115811  0.107381  0.113920   
2  0.116674  0.123257  0.486725    ...     0.106186  0.108346  0.109711   
3  0.113920  0.112470  0.520044    ...     0.096209  0.097580  0.097829   
4  0.109711  0.114546  0.488069    ...     0.096969  0.090889  0.096690   

         18        19        20        21        22        23        24  
0  0.123257  

# Logistic Regression

In [7]:
"""
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score

#lr_model = LogisticRegression(max_iter=1000000)
lr_model = SGDClassifier(loss='log',  max_iter=800)
lr_model.fit(data_train_x, data_train_y)

predict_train_y = lr_model.predict(data_train_x)
print(accuracy_score(data_train_y, predict_train_y))

predict_test_y = lr_model.predict(data_test_x)
print(accuracy_score(data_test_y, predict_test_y))

print(lr_model.predict_proba(data_test_x))
print(predict_test_y)
"""

"\nfrom sklearn.linear_model import LogisticRegression, SGDClassifier\nfrom sklearn.metrics import accuracy_score\n\n#lr_model = LogisticRegression(max_iter=1000000)\nlr_model = SGDClassifier(loss='log',  max_iter=800)\nlr_model.fit(data_train_x, data_train_y)\n\npredict_train_y = lr_model.predict(data_train_x)\nprint(accuracy_score(data_train_y, predict_train_y))\n\npredict_test_y = lr_model.predict(data_test_x)\nprint(accuracy_score(data_test_y, predict_test_y))\n\nprint(lr_model.predict_proba(data_test_x))\nprint(predict_test_y)\n"

# SVM

In [8]:
"""
from sklearn.svm import SVC

svc_model = SVC()
svc_model.fit(data_train_x.values.tolist(), data_train_y.values.tolist())

predict_train_y = svc_model.predict(data_train_x.values.tolist())
print(accuracy_score(data_train_y, predict_train_y))
predict_test_y = svc_model.predict(data_test_x.values.tolist())
print(accuracy_score(data_test_y, predict_test_y))

print(predict_test_y)
"""

'\nfrom sklearn.svm import SVC\n\nsvc_model = SVC()\nsvc_model.fit(data_train_x.values.tolist(), data_train_y.values.tolist())\n\npredict_train_y = svc_model.predict(data_train_x.values.tolist())\nprint(accuracy_score(data_train_y, predict_train_y))\npredict_test_y = svc_model.predict(data_test_x.values.tolist())\nprint(accuracy_score(data_test_y, predict_test_y))\n\nprint(predict_test_y)\n'

# Neural Network

In [9]:
print(data_train_x[:5])
print(len(data_train_x))
print(data_test_y[0])

         0         1         2         3         4         5         6   \
0  0.126771  0.119748  0.115178  0.111109  0.410385  0.124611  0.120698   
1  0.124611  0.120698  0.125173  0.124112  0.569145  0.128211  0.124309   
2  0.128211  0.124309  0.129011  0.125105  0.566670  0.114281  0.116107   
3  0.114281  0.116107  0.116674  0.123257  0.486725  0.115811  0.107381   
4  0.115811  0.107381  0.113920  0.112470  0.520044  0.106186  0.108346   

         7         8         9     ...           15        16        17  \
0  0.125173  0.124112  0.569145    ...     0.114281  0.116107  0.116674   
1  0.129011  0.125105  0.566670    ...     0.115811  0.107381  0.113920   
2  0.116674  0.123257  0.486725    ...     0.106186  0.108346  0.109711   
3  0.113920  0.112470  0.520044    ...     0.096209  0.097580  0.097829   
4  0.109711  0.114546  0.488069    ...     0.096969  0.090889  0.096690   

         18        19        20        21        22        23        24  
0  0.123257  0.486725  0

In [13]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

class M_NN(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(M_NN, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h = self.linear1(x)
        sigmoid_out = F.sigmoid(h)
        y_pred = self.linear2(sigmoid_out) #.clamp(0,1)
        sigmoid2_out = F.sigmoid(y_pred)
        return sigmoid2_out


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 100, 25, 100, 2 # batch !!!!!

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = M_NN(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(1000):
    for batch_num in range(N, len(data_train_x), N):
        # Forward pass: Compute predicted y by passing x to the model
        y_pred = model(torch.FloatTensor(data_train_x[batch_num-N:batch_num].values.tolist()))

        # Compute and print loss
        loss = criterion(y_pred,  torch.FloatTensor(data_train_y[batch_num-N:batch_num].values.tolist()))
        

        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (t%50 == 0):
        print(t, loss.item())
    
nn_predict_y = model.forward( torch.FloatTensor(data_test_x.values.tolist()))
print(nn_predict_y)
result = np.where(nn_predict_y[:, 0] > 0, 0, 1)
#print(result)
#print(data_test_y)

print(accuracy_score(data_test_y[0], result))

0 143.2679443359375
50 137.3546600341797
100 137.16323852539062
150 137.07626342773438
200 137.0266876220703
250 136.99612426757812
300 136.97616577148438
350 136.9625701904297
400 136.95303344726562
450 136.94627380371094
500 136.94143676757812
550 136.93801879882812
600 136.9356231689453
650 136.93409729003906
700 136.93319702148438
750 136.9327850341797
800 136.9328155517578
850 136.93321228027344
900 136.93382263183594
950 136.93470764160156
tensor([[0.0030, 0.1735],
        [0.0030, 0.1732],
        [0.0030, 0.1731],
        [0.0030, 0.1730],
        [0.0030, 0.1731],
        [0.0030, 0.1729],
        [0.0029, 0.1724],
        [0.0029, 0.1725],
        [0.0030, 0.1724],
        [0.0030, 0.1722],
        [0.0029, 0.1722],
        [0.0029, 0.1720],
        [0.0029, 0.1717],
        [0.0029, 0.1717],
        [0.0029, 0.1718],
        [0.0029, 0.1714],
        [0.0029, 0.1715],
        [0.0029, 0.1716],
        [0.0030, 0.1720],
        [0.0030, 0.1720],
        [0.0030, 0.1721],
    

# Discussion
- How did you preprocess this dataset ?
- Which classifier reaches the highest classification accuracy in this dataset ?
    - Why ?
    - Can this result remain if the dataset is different ?
- How did you improve your classifiers ?
