# Model for Apple
Sentiment is built upon the Reuters titles dataset.
Historical price data is taken from Yahoo Finance.

In [1]:
from tqdm import tqdm
import numpy as np
import pickle
import nltk
import string
import os
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import re
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import time
import sys
import time
from tqdm._tqdm_notebook import tqdm_notebook
from keras.models import Sequential, load_model
from keras import layers
from keras.optimizers import RMSprop
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
from keras import optimizers
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import logging
from datetime import datetime, timedelta
from io import StringIO
import copy

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
Using TensorFlow backend.


## Scraping historical data from yahoo finance

In [2]:
class YahooFinanceHistory:
    """Download daily price history for one ticker from Yahoo Finance as a DataFrame.

    The history page is requested first to obtain the session cookies and the
    "crumb" token, which is then sent along with the CSV download request.
    """
    # Timeout in seconds applied to every HTTP request made by this class.
    timeout = 2
    crumb_link = 'https://finance.yahoo.com/quote/{0}/history?p={0}'
    crumble_regex = r'CrumbStore":{"crumb":"(.*?)"}'
    quote_link = 'https://query1.finance.yahoo.com/v7/finance/download/{quote}?period1={dfrom}&period2={dto}&interval=1d&events=history&crumb={crumb}'

    def __init__(self, symbol, days_back=7):
        """Remember the ticker ``symbol`` and how many days of history to request."""
        self.symbol = symbol
        self.session = requests.Session()
        self.dt = timedelta(days=days_back)

    def get_crumb(self):
        """Request the history page and store the crumb found in it on ``self.crumb``.

        Raises ValueError when the crumb pattern is not present in the page, and
        whatever ``raise_for_status`` raises on an HTTP error status.
        """
        response = self.session.get(self.crumb_link.format(self.symbol), timeout=self.timeout)
        response.raise_for_status()
        match = re.search(self.crumble_regex, response.text)
        if not match:
            raise ValueError('Could not get crumb from Yahoo Finance')
        self.crumb = match.group(1)

    def get_quote(self):
        """Download the CSV for the last ``days_back`` days and return it as a DataFrame.

        The ``Date`` column is parsed to datetimes. The crumb is fetched first when
        it is missing or the session holds no cookies.
        """
        if not hasattr(self, 'crumb') or len(self.session.cookies) == 0:
            self.get_crumb()
        # time.time() is the true epoch time. The previous datetime.utcnow().timestamp()
        # interpreted the naive UTC value as local time and shifted the window by the
        # machine's UTC offset.
        dateto = int(time.time())
        datefrom = dateto - int(self.dt.total_seconds())
        url = self.quote_link.format(quote=self.symbol, dfrom=datefrom, dto=dateto, crumb=self.crumb)
        # Same timeout as the crumb request, so a stalled download cannot hang the notebook.
        response = self.session.get(url, timeout=self.timeout)
        response.raise_for_status()
        return pd.read_csv(StringIO(response.text), parse_dates=['Date'])

In [3]:
#downloading daily price history for Apple (ticker AAPL) covering the last 4000 days
df_v = YahooFinanceHistory('AAPL', days_back=4000).get_quote()

In [4]:
#sorting rows chronologically; sort_values returns a new frame, so the result is assigned back
df_v = df_v.sort_values(by='Date')
df_v

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2009-06-04,20.018572,20.597143,20.005714,20.534286,17.776474,137658500
1,2009-06-05,20.758572,20.914286,20.458570,20.667143,17.891493,158179000
2,2009-06-08,20.545713,20.604286,19.918571,20.549999,17.790079,232913100
3,2009-06-09,20.544285,20.651428,20.078571,20.388571,17.650331,169241100
4,2009-06-10,20.325714,20.335714,19.757143,20.035715,17.344868,172155900
...,...,...,...,...,...,...,...
2752,2020-05-11,308.100006,317.049988,307.239990,315.010010,315.010010,36405900
2753,2020-05-12,317.829987,319.690002,310.910004,311.410004,311.410004,40575300
2754,2020-05-13,312.149994,315.950012,303.209991,307.649994,307.649994,50155600
2755,2020-05-14,304.510010,309.790009,301.529999,309.540009,309.540009,39732300


In [5]:
df_v.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume                int64
dtype: object

In [6]:
df_v

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2009-06-04,20.018572,20.597143,20.005714,20.534286,17.776474,137658500
1,2009-06-05,20.758572,20.914286,20.458570,20.667143,17.891493,158179000
2,2009-06-08,20.545713,20.604286,19.918571,20.549999,17.790079,232913100
3,2009-06-09,20.544285,20.651428,20.078571,20.388571,17.650331,169241100
4,2009-06-10,20.325714,20.335714,19.757143,20.035715,17.344868,172155900
...,...,...,...,...,...,...,...
2752,2020-05-11,308.100006,317.049988,307.239990,315.010010,315.010010,36405900
2753,2020-05-12,317.829987,319.690002,310.910004,311.410004,311.410004,40575300
2754,2020-05-13,312.149994,315.950012,303.209991,307.649994,307.649994,50155600
2755,2020-05-14,304.510010,309.790009,301.529999,309.540009,309.540009,39732300


## Sentiment for all the articles with "Apple" in the body of an article

In [7]:
#reading the precomputed daily sentiment scores from a local CSV file
df2 = pd.read_csv('df_AP.csv')

In [8]:
df2

Unnamed: 0.1,Unnamed: 0,Date,compound_mean
0,0,2011-07-06,0.143650
1,1,2011-07-07,-0.284200
2,2,2011-07-08,-0.102600
3,3,2011-07-11,0.236200
4,4,2011-07-12,-0.096240
...,...,...,...
1341,1341,2017-01-06,0.636900
1342,1342,2017-01-08,0.296000
1343,1343,2017-01-10,0.024698
1344,1344,2017-01-11,0.024698


In [9]:
#removing the leftover 'Unnamed: 0' column, which only repeats the row index
df2 = df2.drop(columns=['Unnamed: 0'])

In [10]:
df2

Unnamed: 0,Date,compound_mean
0,2011-07-06,0.143650
1,2011-07-07,-0.284200
2,2011-07-08,-0.102600
3,2011-07-11,0.236200
4,2011-07-12,-0.096240
...,...,...
1341,2017-01-06,0.636900
1342,2017-01-08,0.296000
1343,2017-01-10,0.024698
1344,2017-01-11,0.024698


In [11]:
df2.dtypes

Date              object
compound_mean    float64
dtype: object

In [12]:
#converting the Date column from strings to datetime so it can be merged with the price data
df2['Date'] = pd.to_datetime(df2['Date'])

In [13]:
#joining the price history with the sentiment scores on Date (default inner join:
#only dates present in both frames are kept)
df3 = df_v.merge(df2, on='Date')

In [14]:
df3

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,compound_mean
0,2011-07-06,49.849998,50.585712,49.529999,50.251427,43.502525,111156500,0.143650
1,2011-07-07,50.667141,51.142857,50.571430,51.028572,44.175301,99915900,-0.284200
2,2011-07-08,50.477142,51.428570,50.314285,51.387142,44.485714,122408300,-0.102600
3,2011-07-11,50.905716,51.395714,50.402859,50.571430,43.779568,110668600,0.236200
4,2011-07-12,50.504284,51.097141,49.802856,50.535713,43.748634,112902300,-0.096240
...,...,...,...,...,...,...,...,...
1160,2017-01-05,115.919998,116.860001,115.809998,116.610001,110.829552,22193600,0.458800
1161,2017-01-06,116.779999,118.160004,116.470001,117.910004,112.065109,31751900,0.636900
1162,2017-01-10,118.769997,119.379997,118.300003,119.110001,113.205620,24462100,0.024698
1163,2017-01-11,118.739998,119.930000,118.599998,119.750000,113.813881,27588600,0.024698


# Machine learning for prediction of label for the next day 

In [15]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import sklearn

In [16]:
#working on a deep copy of the merged frame, so df3 stays intact and nothing has to be re-run from the beginning
df = df3.copy(deep=True)

In [17]:
#adding label of 1(up) or 0(down) for the price of the next day
def add_label(dfi):
    """Append a next-day direction column ``Label`` to ``dfi``, modifying it in place.

    For every row the label is 1.0 when the following row's ``Close`` is greater
    than or equal to this row's ``Close`` and 0.0 otherwise. The last row has no
    following day, so it is dropped. Returns None.

    Raises ValueError (from ``DataFrame.insert``) if ``dfi`` already has a
    ``Label`` column.
    """
    idx = len(dfi.columns)
    # Direction relative to the previous row; the first row compares against NaN and gets 0.
    went_up = np.where(dfi['Close'] >= dfi['Close'].shift(1), 1, 0)
    dfi.insert(loc=idx, column='Label', value=went_up)
    # Move every label one row up so each row carries the direction of the *next* day.
    # The function now works on its own argument instead of the notebook-global ``df``.
    dfi['Label'] = dfi['Label'].shift(-1)
    # Slicing the index (rather than indexing it) keeps an empty frame from raising.
    dfi.drop(dfi.index[-1:], inplace=True)

In [19]:
add_label(df)

In [20]:
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,compound_mean,Label
0,2011-07-06,49.849998,50.585712,49.529999,50.251427,43.502525,111156500,0.143650,1.0
1,2011-07-07,50.667141,51.142857,50.571430,51.028572,44.175301,99915900,-0.284200,1.0
2,2011-07-08,50.477142,51.428570,50.314285,51.387142,44.485714,122408300,-0.102600,0.0
3,2011-07-11,50.905716,51.395714,50.402859,50.571430,43.779568,110668600,0.236200,0.0
4,2011-07-12,50.504284,51.097141,49.802856,50.535713,43.748634,112902300,-0.096240,1.0
...,...,...,...,...,...,...,...,...,...
1159,2017-01-04,115.849998,116.510002,115.750000,116.019997,110.268791,21118100,0.024698,1.0
1160,2017-01-05,115.919998,116.860001,115.809998,116.610001,110.829552,22193600,0.458800,1.0
1161,2017-01-06,116.779999,118.160004,116.470001,117.910004,112.065109,31751900,0.636900,1.0
1162,2017-01-10,118.769997,119.379997,118.300003,119.110001,113.205620,24462100,0.024698,1.0


In [21]:
df.dtypes

Date             datetime64[ns]
Open                    float64
High                    float64
Low                     float64
Close                   float64
Adj Close               float64
Volume                    int64
compound_mean           float64
Label                   float64
dtype: object

In [26]:
array = df.values

In [27]:
#splitting the array into the feature matrix X (columns 1-7: Open ... compound_mean)
#and the target vector Y (column 8: Label)
X, Y = array[:, 1:8], array[:, 8]

In [28]:
#rescaling every feature column with min-max scaling; the scaler is fitted on all rows of X
X = MinMaxScaler().fit_transform(X)

In [29]:
#converting the label vector Y to integers
Y = Y.astype(int)

In [30]:
print(X[1])
print(df.columns[1:8])

[0.00965776 0.00663629 0.01275638 0.00939164 0.00856365 0.23789287
 0.31203994]
Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'compound_mean'], dtype='object')


In [31]:
#choosing best features for the model: score each scaled feature against the label
#with the chi-squared test and keep the 5 highest-scoring columns
test = SelectKBest(score_func=chi2, k=5)
fit = test.fit(X, Y)
np.set_printoptions(precision=3)
#one score per column of X, in the order Open, High, Low, Close, Adj Close, Volume, compound_mean
print(fit.scores_)
features = fit.transform(X)
#preview of the first five rows of the reduced feature matrix
print(features[0:5,:])

[3.626e-01 3.989e-01 4.030e-01 4.352e-01 4.028e-01 2.409e-02 1.947e-04]
[[0.    0.    0.    0.    0.   ]
 [0.01  0.007 0.013 0.009 0.009]
 [0.007 0.01  0.01  0.014 0.013]
 [0.012 0.01  0.011 0.004 0.004]
 [0.008 0.006 0.003 0.003 0.003]]


In [32]:
#According to the chi-squared scores printed above, Open, High, Low, Close and Adj Close score highest and are
#the five columns kept; Volume and compound mean score lowest, compound mean being the least relevant feature.
#We will compare in this case machine learning models with and without compound mean
features

array([[0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.01 , 0.007, 0.013, 0.009, 0.009],
       [0.007, 0.01 , 0.01 , 0.014, 0.013],
       ...,
       [0.791, 0.805, 0.82 , 0.818, 0.873],
       [0.815, 0.819, 0.842, 0.832, 0.887],
       [0.814, 0.826, 0.846, 0.84 , 0.895]])

In [33]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from sklearn.model_selection import train_test_split


#holding out 25% of the rows, then fitting a dense binary classifier on the 5 selected features
X_train, X_test, y_train, y_test = train_test_split(features, Y, test_size=0.25)

#a 16-unit input layer without activation, five ReLU layers doubling in width
#(each followed by 50% dropout) and a single sigmoid output
model = Sequential([
    Dense(16, input_shape=(5,)),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#NOTE(review): patience equals the number of epochs, so early stopping cannot trigger before the last epoch
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=100,
    min_delta=0.0001,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[es],
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 873 samples, validate on 291 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100


Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [34]:
#evaluating the model; the result is [loss, accuracy]
#NOTE(review): X_test/y_test were also passed as validation_data during fit, so this is not an independent hold-out score
model.evaluate(X_test, y_test)



[0.6938442301094737, 0.4467353820800781]

### Now we will also include the less relevant features. After this we will decide which features to use in the LSTM

In [35]:
#keeping all 7 features this time (k equals the number of columns in X), so the
#lower-scoring Volume and compound_mean columns are included as well
test = SelectKBest(score_func=chi2, k=7)
fit = test.fit(X, Y)
np.set_printoptions(precision=3)
features = fit.transform(X)

In [38]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from sklearn.model_selection import train_test_split

#building and training the same dense classifier once more, now on all 7 features
X_train, X_test, y_train, y_test = train_test_split(features, Y, test_size=0.25)

#a 16-unit input layer without activation, five ReLU layers doubling in width
#(each followed by 50% dropout) and a single sigmoid output
model = Sequential([
    Dense(16, input_shape=(7,)),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#NOTE(review): patience equals the number of epochs, so early stopping cannot trigger before the last epoch
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=100,
    min_delta=0.0001,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[es],
)

Train on 873 samples, validate on 291 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [39]:
#It seems that including more features yields slightly better results here - however, in the LSTM it yields worse results
#NOTE(review): both dense models score below 0.5 accuracy and the train/test split is unseeded, so the difference may be noise
model.evaluate(X_test, y_test)



[0.7083165655840713, 0.45704466104507446]

# LSTM AND GRU METHOD

In [139]:
#starting the recurrent-model section from a deep copy of the merged frame, so df3 stays intact
normalized_df1 = df3.copy(deep=True)

In [140]:
#keeping only the price columns and the sentiment score (dropping Date, Adj Close and Volume)
normalized_df1 = normalized_df1[['Open','High','Low','Close','compound_mean']]

In [141]:
def normalized_df(df):
    """Return a z-scored copy of ``df``: each column minus its mean, divided by its std.

    ``df`` itself is not modified.
    """
    centered = df - df.mean()
    return centered / df.std()

In [142]:
#keeping an un-normalized copy of the selected columns; it is reused in the "Without sentiment" section
normalized_df2 = copy.deepcopy(normalized_df1)

In [143]:
#z-scoring every column: subtract the column mean, then divide by the column standard deviation
#(both statistics are computed over all rows)
mean = normalized_df1.mean(axis=0)
normalized_df1 = normalized_df1 - mean
std = normalized_df1.std(axis=0)
normalized_df1 = normalized_df1 / std

In [144]:
#creating function add label: 1 (up or unchanged) or 0 (down) compared to the previous row
def add_label(df):
    """Append an integer ``Label`` column to ``df`` in place; returns None.

    A row gets 1 when its ``Close`` is greater than or equal to the previous
    row's ``Close`` and 0 otherwise; the first row has no predecessor and gets 0.
    """
    previous_close = df['Close'].shift(1)
    went_up = df['Close'] >= previous_close
    df.insert(len(df.columns), 'Label', np.where(went_up, 1, 0))

In [145]:
#using add label
add_label(normalized_df1)

In [146]:
normalized_df1 = normalized_df1.values

In [147]:
normalized_df1

array([[-1.775, -1.768, -1.763, -1.757,  0.371,  0.   ],
       [-1.739, -1.743, -1.716, -1.722, -1.005,  1.   ],
       [-1.747, -1.73 , -1.728, -1.706, -0.421,  1.   ],
       ...,
       [ 1.311,  1.293,  1.341,  1.328, -0.012,  1.   ],
       [ 1.309,  1.317,  1.355,  1.357, -0.012,  1.   ],
       [ 1.317,  1.289,  1.337,  1.334,  1.207,  0.   ]])

In [148]:
#creating a generator
from keras.utils import to_categorical
def generator(data, lookback, delay, min_index, max_index,
              shuffle=False, batch_size=32, step=5, num_classes=2):
    """Endlessly yield ``(samples, one_hot_targets)`` batches of sliding windows over ``data``.

    Parameters
    ----------
    data : 2-D array, one row per time step; the last column holds the class label.
    lookback : how many rows before the anchor row a window reaches back.
    delay : the target label is read ``delay`` rows after the anchor row.
    min_index, max_index : anchor rows are taken from
        ``[min_index + lookback, max_index)``; ``max_index=None`` means
        ``len(data) - delay - 1``.
    shuffle : draw anchor rows at random instead of walking through them in order.
    batch_size : number of windows per batch.
    step : stride between the rows kept inside one window.
    num_classes : width of the one-hot targets. Fixed (default 2) so that a batch
        containing a single class still has the width the model expects; pass
        None to let ``to_categorical`` infer it from each batch.

    Yields
    ------
    samples : array of shape (batch, rows per window, data.shape[-1])
    targets : one-hot array as returned by ``to_categorical``
    """
    if max_index is None:
        max_index = len(data) - delay - 1
    first_row = min_index + lookback
    # Count window rows from the same range used to slice them, so the sample
    # buffer also fits when lookback is not a multiple of step.
    timesteps = len(range(0, lookback, step))
    i = first_row
    while True:
        if shuffle:
            rows = np.random.randint(first_row, max_index, size=batch_size)
        else:
            # Start over once a full batch no longer fits before max_index.
            if i + batch_size >= max_index:
                i = first_row
            rows = np.arange(i, min(i + batch_size, max_index))
            i += len(rows)
        samples = np.zeros((len(rows), timesteps, data.shape[-1]))
        targets = np.zeros((len(rows),))
        for j, row in enumerate(rows):
            samples[j] = data[np.arange(row - lookback, row, step)]
            targets[j] = data[row + delay][-1]
        yield samples, to_categorical(targets, num_classes=num_classes)

In [149]:
lookback = 30
step = 10
delay = 1
batch_size = 32

In [150]:
#splitting data chronologically into training (first 60%), validation (next 20%) and testing (last 20%) sets
train_end = round(0.6*len(normalized_df1))
val_end = round(0.8*len(normalized_df1))

train_gen = generator(normalized_df1,
                      lookback=lookback,
                      delay=delay,
                      min_index=0,
                      max_index=train_end,
                      shuffle=False,
                      step=step,
                      batch_size=batch_size)
val_gen = generator(normalized_df1,
                    lookback=lookback,
                    delay=delay,
                    min_index=train_end+1,
                    max_index=val_end,
                    step=step,
                    batch_size=batch_size)
test_gen = generator(normalized_df1,
                     lookback=lookback,
                     delay=delay,
                     min_index=val_end+1,
                     max_index=None,
                     step=step,
                     batch_size=batch_size)

# Each generator call yields one *batch*, so the number of usable anchor rows is
# divided by batch_size. Without the division the set was cycled through about
# batch_size times per evaluation.
# How many steps to draw from val_gen in order to see the entire validation set
val_steps = max(1, (val_end - (train_end+1) - lookback) // batch_size)
# How many steps to draw from test_gen in order to see the entire test set
# (with max_index=None the generator stops at len(data) - delay - 1)
test_steps = max(1, (len(normalized_df1) - delay - 1 - (val_end+1) - lookback) // batch_size)
# How many steps to draw from test_gen in order to see the entire test set

In [156]:
#creating the LSTM model (two stacked LSTM layers, a small dense layer, 2-way softmax) and training it
#NOTE(review): the loss is mean squared error on softmax outputs and each epoch draws only
#2 batches from train_gen - confirm both are intended
model = Sequential([
    LSTM(100,
         return_sequences=True,
         input_shape=(None, normalized_df1.shape[-1]),
         kernel_initializer='random_uniform'),
    Dropout(0.4),
    LSTM(60, dropout=0.0, activation='relu'),
    Dropout(0.4),
    Dense(20, activation='relu'),
    layers.Dense(2, activation='softmax'),
])
model.compile(loss='mean_squared_error', optimizer=RMSprop(), metrics=['accuracy'])

#stop when val_loss has not improved by at least 0.0001 for 50 epochs and restore the best weights
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=50,
    min_delta=0.0001,
    restore_best_weights=True,
)

history = model.fit_generator(
    train_gen,
    steps_per_epoch=2,
    epochs=100,
    validation_data=val_gen,
    validation_steps=val_steps,
    callbacks=[es],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Restoring model weights from the end of the best epoch
Epoch 00051: early stopping


In [157]:
#evaluating our model on 3 batches drawn from test_gen
#NOTE(review): steps=3 is hard-coded; the test_steps value computed earlier is not used here - confirm this is intended
test_loss, test_acc = model.evaluate_generator(test_gen, steps=3)
print('test acc:', test_acc)
print("test_loss:", test_loss)

test acc: 0.5104166865348816
test_loss: 0.24977262318134308


In [158]:
#building and training the GRU model (two stacked GRU layers with input and recurrent dropout, 2-way softmax)
#NOTE(review): the loss is mean squared error on softmax outputs and each epoch draws only
#2 batches from train_gen - confirm both are intended
model = Sequential([
    layers.GRU(32,
               dropout=0.3,
               recurrent_dropout=0.2,
               return_sequences=True,
               input_shape=(None, normalized_df1.shape[-1])),
    layers.GRU(64,
               activation='relu',
               dropout=0.3,
               recurrent_dropout=0.1),
    layers.Dense(2, activation='softmax'),
])
model.compile(optimizer=RMSprop(), loss='mean_squared_error', metrics=['accuracy'])

#stop when val_loss has not improved by at least 0.0001 for 50 epochs and restore the best weights
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=50,
    min_delta=0.0001,
    restore_best_weights=True,
)

history = model.fit_generator(
    train_gen,
    steps_per_epoch=2,
    epochs=250,
    validation_data=val_gen,
    validation_steps=val_steps,
    callbacks=[es],
)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Restoring model weights from the end of the best epoch
Epoch 00052: early stopping


In [160]:
#evaluating GRU model on 3 batches drawn from test_gen
#NOTE(review): test_gen is the same generator object already advanced by the LSTM evaluation, so these
#batches appear to continue where that evaluation stopped instead of repeating the same samples - confirm
test_loss, test_acc = model.evaluate_generator(test_gen, steps=3)
print('test acc:', test_acc)
print("test_loss:", test_loss)

test acc: 0.46875
test_loss: 0.25039148330688477


## Without sentiment

In [161]:
#normalizing a fresh copy of the backup frame; a plain assignment would only alias normalized_df2,
#so the in-place operations below (and the later Label insert / column delete) would modify the
#backup too and make this cell fail when it is run a second time
normalized_df1 = copy.deepcopy(normalized_df2)

mean = normalized_df1.mean(axis = 0)
normalized_df1 -= mean
std = normalized_df1.std(axis=0)
normalized_df1 /= std

#adding label: 1 (up or steady) or 0 (down) compared to the previous row
def add_label(df):
    """Append an integer ``Label`` column to ``df`` in place; returns None.

    A row gets 1 when its ``Close`` is greater than or equal to the previous
    row's ``Close`` and 0 otherwise; the first row has no predecessor and gets 0.
    """
    previous_close = df['Close'].shift(1)
    went_up = df['Close'] >= previous_close
    df.insert(len(df.columns), 'Label', np.where(went_up, 1, 0))
    
add_label(normalized_df1)

In [162]:
#dropping the sentiment column so the models in this section are trained without it
del normalized_df1['compound_mean']

In [163]:
normalized_df1

Unnamed: 0,Open,High,Low,Close,Label
0,-1.775432,-1.767936,-1.763157,-1.757065,0
1,-1.738841,-1.743148,-1.716146,-1.722247,1
2,-1.747349,-1.730437,-1.727754,-1.706183,1
3,-1.728158,-1.731899,-1.723755,-1.742728,0
4,-1.746134,-1.745182,-1.750840,-1.744328,0
...,...,...,...,...,...
1160,1.183113,1.180657,1.228761,1.215906,1
1161,1.221623,1.238496,1.258554,1.274149,1
1162,1.310733,1.292774,1.341161,1.327910,1
1163,1.309390,1.317244,1.354703,1.356583,1


In [164]:
normalized_df1 = normalized_df1.values

In [165]:
#splitting chronologically into train (first 60%), validation (next 20%) and test (last 20%) sets
train_end = round(0.6*len(normalized_df1))
val_end = round(0.8*len(normalized_df1))

train_gen = generator(normalized_df1,
                      lookback=lookback,
                      delay=delay,
                      min_index=0,
                      max_index=train_end,
                      shuffle=False,
                      step=step,
                      batch_size=batch_size)
val_gen = generator(normalized_df1,
                    lookback=lookback,
                    delay=delay,
                    min_index=train_end+1,
                    max_index=val_end,
                    step=step,
                    batch_size=batch_size)
test_gen = generator(normalized_df1,
                     lookback=lookback,
                     delay=delay,
                     min_index=val_end+1,
                     max_index=None,
                     step=step,
                     batch_size=batch_size)

# Each generator call yields one *batch*, so the number of usable anchor rows is
# divided by batch_size. Without the division the set was cycled through about
# batch_size times per evaluation.
# How many steps to draw from val_gen in order to see the entire validation set
val_steps = max(1, (val_end - (train_end+1) - lookback) // batch_size)
# How many steps to draw from test_gen in order to see the entire test set
# (with max_index=None the generator stops at len(data) - delay - 1)
test_steps = max(1, (len(normalized_df1) - delay - 1 - (val_end+1) - lookback) // batch_size)
# How many steps to draw from test_gen in order to see the entire test set

In [166]:
#building and training the LSTM model on the data without the sentiment column
#NOTE(review): the loss is mean squared error on softmax outputs and each epoch draws only
#2 batches from train_gen - confirm both are intended
model = Sequential([
    LSTM(100,
         return_sequences=True,
         input_shape=(None, normalized_df1.shape[-1]),
         kernel_initializer='random_uniform'),
    Dropout(0.4),
    LSTM(60, dropout=0.0, activation='relu'),
    Dropout(0.3),
    Dense(20, activation='relu'),
    layers.Dense(2, activation='softmax'),
])
model.compile(loss='mean_squared_error', optimizer=RMSprop(), metrics=['accuracy'])

#stop when val_loss has not improved by at least 0.0001 for 50 epochs and restore the best weights
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=50,
    min_delta=0.0001,
    restore_best_weights=True,
)

history = model.fit_generator(
    train_gen,
    steps_per_epoch=2,
    epochs=200,
    validation_data=val_gen,
    validation_steps=val_steps,
    callbacks=[es],
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Restoring model weights from the end of the best epoch
Epoch 00052: early stopping


In [167]:
#evaluating LSTM model on 4 batches drawn from test_gen
#NOTE(review): steps=4 is hard-coded; the test_steps value computed earlier is not used here - confirm this is intended
test_loss, test_acc = model.evaluate_generator(test_gen, steps=4)
print('test acc:', test_acc)
print("test_loss:", test_loss)

test acc: 0.53125
test_loss: 0.24936044216156006


In [168]:
#building and training the GRU model on the data without the sentiment column
#NOTE(review): the loss is mean squared error on softmax outputs and each epoch draws only
#2 batches from train_gen - confirm both are intended
model = Sequential([
    layers.GRU(32,
               dropout=0.3,
               recurrent_dropout=0.2,
               return_sequences=True,
               input_shape=(None, normalized_df1.shape[-1])),
    layers.GRU(64,
               activation='relu',
               dropout=0.3,
               recurrent_dropout=0.2),
    layers.Dense(2, activation='softmax'),
])
model.compile(optimizer=RMSprop(), loss='mean_squared_error', metrics=['accuracy'])

#stop when val_loss has not improved by at least 0.0001 for 100 epochs and restore the best weights
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=100,
    min_delta=0.0001,
    restore_best_weights=True,
)

history = model.fit_generator(
    train_gen,
    steps_per_epoch=2,
    epochs=500,
    validation_data=val_gen,
    validation_steps=val_steps,
    callbacks=[es],
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500


Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch

Epoch 174/500
Epoch 175/500
Restoring model weights from the end of the best epoch
Epoch 00175: early stopping


In [169]:
#evaluating GRU model on 4 batches drawn from test_gen
#NOTE(review): test_gen is the same generator object already advanced by the LSTM evaluation, so these
#batches appear to continue where that evaluation stopped instead of repeating the same samples - confirm
test_loss, test_acc = model.evaluate_generator(test_gen, steps=4)
print('test acc:', test_acc)
print("test_loss:", test_loss)

test acc: 0.5
test_loss: 0.252469539642334
