# Part 1 - Data Preprocessing

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Load data 

In [10]:
dataset = pd.read_csv('DJI_daily.csv', header=0)
dataset = dataset.dropna()
dataset

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2003-12-31,10426.299805,10462.440430,10407.040039,10453.919922,10453.919922,138670000
1,2004-01-02,10452.740234,10527.030273,10384.299805,10409.849609,10409.849609,168890000
2,2004-01-05,10411.849609,10544.070313,10411.849609,10544.070313,10544.070313,221290000
3,2004-01-06,10543.849609,10549.179688,10499.849609,10538.660156,10538.660156,191460000
4,2004-01-07,10535.459961,10539.459961,10466.290039,10529.030273,10529.030273,225490000
5,2004-01-08,10530.070313,10592.589844,10530.070313,10592.440430,10592.440430,237770000
6,2004-01-09,10589.250000,10589.250000,10448.669922,10458.889648,10458.889648,223250000
7,2004-01-12,10461.549805,10491.629883,10444.150391,10485.179688,10485.179688,197960000
8,2004-01-13,10485.179688,10509.849609,10367.410156,10427.179688,10427.179688,197310000
9,2004-01-14,10428.669922,10548.509766,10428.669922,10538.370117,10538.370117,186280000


Technical indicator (KD)

In [39]:
# RSV
RSV = 100* ((dataset['Adj Close'] - dataset['Low'].rolling(window=9).min() ) /
            (dataset['High'].rolling(window=9).max()-dataset['Low'].rolling(window=9).min()))
RSV.fillna(method='bfill', inplace=True)
dataset['rsv']=round(RSV,3)

# K-value
dataset['k'] = 0
# print(file.head())
for idx, row in dataset.iterrows(): 
    # idx是在row裡  row在file裡 用file.interrows() 拆成一列一列
 iRSV = row['rsv']
 if idx == 0:
  K=(RSV[0]*1/3)+(50*2/3)     #將第一天的k值預設為50
 else:
  bk = dataset.loc[idx - 1, 'k'] #取前一個k值 #取某一欄或某一列
  K=(iRSV*1/3)+(bk*2/3)
 dataset.loc[idx, 'k'] = round(K,3)

# D-value
dataset['d'] = 0
K = dataset['k']
for idx, row in dataset.iterrows():
 iK = row['k']
 if idx == 0:
  D=(K[0]*1/3)+(50*2/3)
 else:
  bd = dataset.loc[idx - 1, 'd']
  D=(iK*1/3)+(bd*2/3)
 dataset.loc[idx, 'd'] = round(D,3)

# Golden cross or Death cross
dataset['gx'] = np.where(dataset['k'] > dataset['d'], 1, 0)
dataset

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,rsv,k,d,gx
0,2003-12-31,10426.299805,10462.440430,10407.040039,10453.919922,10453.919922,138670000,26.543,42.181,47.394,0
1,2004-01-02,10452.740234,10527.030273,10384.299805,10409.849609,10409.849609,168890000,26.543,36.968,43.919,0
2,2004-01-05,10411.849609,10544.070313,10411.849609,10544.070313,10544.070313,221290000,26.543,33.493,40.444,0
3,2004-01-06,10543.849609,10549.179688,10499.849609,10538.660156,10538.660156,191460000,26.543,31.176,37.355,0
4,2004-01-07,10535.459961,10539.459961,10466.290039,10529.030273,10529.030273,225490000,26.543,29.632,34.781,0
5,2004-01-08,10530.070313,10592.589844,10530.070313,10592.440430,10592.440430,237770000,26.543,28.602,32.721,0
6,2004-01-09,10589.250000,10589.250000,10448.669922,10458.889648,10458.889648,223250000,26.543,27.916,31.119,0
7,2004-01-12,10461.549805,10491.629883,10444.150391,10485.179688,10485.179688,197960000,26.543,27.458,29.899,0
8,2004-01-13,10485.179688,10509.849609,10367.410156,10427.179688,10427.179688,197310000,26.543,27.153,28.984,0
9,2004-01-14,10428.669922,10548.509766,10428.669922,10538.370117,10538.370117,186280000,75.922,43.409,33.792,1


In [18]:
X_train = dataset.iloc[:3776, 5:-1].values
y_train = dataset.iloc[:3776, -1].values
X_train, y_train = np.array(X_train),  np.array(y_train)

Split dataset into training and test set

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

Feature scaling

In [13]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)

In [15]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4
0,-0.857602,-0.678661,-1.025221,-0.684328,-0.537531
1,-0.867311,-0.395870,-1.025221,-0.893957,-0.695646
2,-0.837743,0.094475,-1.025221,-1.033697,-0.853761
3,-0.838935,-0.184666,-1.025221,-1.126870,-0.994313
4,-0.841056,0.133778,-1.025221,-1.188958,-1.111432
5,-0.827088,0.248691,-1.025221,-1.230378,-1.205164
6,-0.856508,0.112816,-1.025221,-1.257964,-1.278056
7,-0.850716,-0.123841,-1.025221,-1.276381,-1.333567
8,-0.863493,-0.129923,-1.025221,-1.288646,-1.375201
9,-0.838999,-0.233139,0.525069,-0.634946,-1.156433


Creating a data structure with timesteps and one output

In [None]:
#X_train = []
# 60 previous stock prices
#y_train = []
#for i in range(60, 4810):
#    X_train.append(dataset_train_scaled[i-60:i, 0]) 
    # get the 60 previous stock prices
#    y_train.append(dataset_train_scaled[i, 0])
#X_train, y_train = np.array(X_train),  np.array(y_train)

Reshaping - 3 dimensions

In [19]:
X_train = np.reshape(X_train, (X_train.shape[0], 
                               X_train.shape[1], 
                               1))

#X_test = np.reshape(X_test,   (X_test.shape[0], 
#                               X_test.shape[1], 
#                               1))

# Part 2 - Building the RNN

In [20]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Initializing the RNN

In [21]:
regressor = Sequential()

Adding the first LSTM layer and also some Dropout regularisation

In [22]:
regressor.add(LSTM(units = 50, 
                   return_sequences = True, 
                   input_shape = (X_train.shape[1], 1)))
regressor.add(Dropout(0.2))

Adding the second LSTM layer and also some Dropout regularisation

In [23]:
regressor.add(LSTM(units = 50, 
                   return_sequences = True))
regressor.add(Dropout(0.2))

Adding the third LSTM layer and also some Dropout regularisation

In [24]:
regressor.add(LSTM(units = 50, 
                   return_sequences = True))
regressor.add(Dropout(0.2))

Adding the fourth LSTM layer and also some Dropout regularisation

In [25]:
regressor.add(LSTM(units = 50))
regressor.add(Dropout(0.2))

Adding the output layer

In [26]:
regressor.add(Dense(units = 1))

Compiling the RNN

In [27]:
regressor.compile(optimizer = 'adam',
                  loss = 'mean_squared_error')

Fitting the RNN to the training set

In [28]:
regressor.fit(X_train, y_train, epochs = 100, batch_size = 32, verbose =1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1a477de4a8>

Evaluate the regressor

In [None]:
#X_test = np.reshape(X_test,   (X_test.shape[0], 
#                               X_test.shape[1],
#                               1))

In [None]:
#loss, accuracy = regressor.evaluate(X_test, y_test, verbose=0)

# Part 3 - Making the predictions 

In [29]:
X_test = dataset.iloc[3776:, 5:-1].values
y_test = dataset.iloc[3776:, -1].values

X_test = np.reshape(X_test,   (X_test.shape[0], 
                               X_test.shape[1],
                               1))

In [36]:
y_pred = regressor.predict(X_test)
pd.DataFrame(y_pred)

Unnamed: 0,0
0,0.985012
1,0.985442
2,0.979392
3,0.979104
4,0.978256
5,0.978228
6,0.990634
7,1.0024
8,1.00896
9,1.01311


In [37]:
y_pred = (y_pred > 0.5)
pd.DataFrame(y_pred)

Unnamed: 0,0
0,True
1,True
2,True
3,True
4,True
5,True
6,True
7,True
8,True
9,True


In [33]:
pd.DataFrame(y_test)

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,1
8,1
9,1


# TREND

In [42]:
trend = pd.read_csv('trend.csv').dropna()
trend

Unnamed: 0,date,2012_pre,2016_hillary,2016_pre,all_pre,obama,clinton_hillary,hillary,auguration,auguration_obama,...,trump_melania,trump_news,trump_obama,trump_speech_today,trump_tiffany,trump_today,trump_tweets_today,trump,twitter_obama,twitter_trump
0,2004/1/1,0.0,0.0,1.0,28.0,0.0,3.0,4.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0
1,2004/2/1,0.0,0.0,1.0,33.0,0.0,2.0,3.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0
2,2004/3/1,0.0,0.0,1.0,25.0,1.0,2.0,3.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0
3,2004/4/1,0.0,0.0,1.0,23.0,0.0,2.0,3.0,1.0,0.0,...,2.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,0.0
4,2004/5/1,0.0,0.0,1.0,22.0,0.0,2.0,3.0,1.0,0.0,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5,2004/6/1,0.0,0.0,1.0,26.0,1.0,2.0,5.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6,2004/7/1,0.0,0.0,1.0,20.0,8.0,3.0,5.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
7,2004/8/1,0.0,0.0,2.0,19.0,3.0,2.0,3.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,2004/9/1,0.0,0.0,3.0,28.0,1.0,2.0,3.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
9,2004/10/1,0.0,0.0,8.0,46.0,1.0,2.0,3.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
