In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

import pprint
import pandas as pd
import numpy as np

This iteration of the Random Forest Classifier will use the strategies discussed in the cited research paper. As an initial proof-of-concept for the data, we'll run a test on some AAPL indicators that I built elsewhere.

In [79]:
data = pd.read_csv('stock_tech/AAPL_TA.csv')
data

Unnamed: 0,day,spy open,open,RSI,SO,MACD,PROC,OBV,label
0,1,106.870003,6.960000,39.463012,20.758699,-0.195793,-5.253961,1346777600,1
1,2,106.989998,7.075357,47.695466,35.890841,-0.178103,-4.430446,1897123200,1
2,3,108.860001,7.212143,50.437099,43.179896,-0.157343,0.546947,2552592000,1
3,4,110.269997,7.292500,45.342180,64.682819,-0.130680,5.904408,3096329600,-1
4,5,110.080002,7.201071,49.343096,80.655823,-0.110723,4.015826,2659932800,1
...,...,...,...,...,...,...,...,...,...
2967,2968,467.220001,161.119995,68.207924,74.313983,2.757100,6.438396,16148157700,1
2968,2969,466.059998,160.750000,66.835719,76.454484,3.132270,7.291944,16244199600,1
2969,2970,462.339996,159.570007,64.513874,79.363369,3.432791,7.380151,16313663200,-1
2970,2971,464.070007,159.369995,57.442199,51.207471,3.219890,6.010005,16236703400,1


We now have a dataframe representing ~3000 trading days. We have calculated the Relative Strength Index (RSI), Stochastic Oscillator (SO), Moving Average Convergence/Divergence (MACD), Price Rate of Change (PROC) and On Balance Volume (OBV). 

All of these indicators have been calculated using exclusively data prior to the date of record. This means that RSI on the n-day will have been calculated using only data from the (n-1)-day. The prevents any look-ahead bias and allows the model to see only the data that will be available in real-world trading.

Lastly, we have a label column which is based simply on whether or not the price closed higher/lower/equal on the day being evaluated. All together, this means the model will use yesterday's data to determine today's classification.

In [83]:
# list of features to be pulled from data
# PROC excluded
features = ['day', 'spy open', 'open', 'RSI', 'MACD', 'SO', 'OBV']

# assigning features and label to X and y
X = data[features]
y = data['label']

# splitting training data and validation data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [84]:
model_acc = list()
estimators = [5, 10, 20, 50, 100, 200, 500, 1000]

for est in estimators:
    model = RandomForestClassifier(n_estimators=est, random_state=1)
    model.fit(train_X, train_y)
    prediction = model.predict(val_X)
    result = f'n_estimators={est}, accuracy={accuracy_score(val_y, prediction)}'
    model_acc.append(result)

In [85]:
pprint.pprint(model_acc)

['n_estimators=5, accuracy=0.5168236877523553',
 'n_estimators=10, accuracy=0.5181695827725438',
 'n_estimators=20, accuracy=0.4979811574697174',
 'n_estimators=50, accuracy=0.5289367429340511',
 'n_estimators=100, accuracy=0.5087483176312247',
 'n_estimators=200, accuracy=0.5114401076716016',
 'n_estimators=500, accuracy=0.5100942126514132',
 'n_estimators=1000, accuracy=0.5141318977119784']


Conclusion: Technical indicators, as expected, are insufficient for predicting market conditions on their own.