In [2]:
# Compare Algorithms
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection

# Variety of algorithms to test
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor

from util_SR import *

# Tell IPython to render plots inline within the notebook (as opposed to a separate window)
%matplotlib inline

# Print the installed scikit-learn version so the run below is traceable to a specific release.
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.19.1.


In [None]:
# Load SPY prices, build an "Adj Close N rows later" target, and compare a set
# of regressors with 10-fold cross-validation scored by R^2.

# Look-ahead, in rows, used both to build the target and to trim the tail.
horizon = 5
target_col = 'Adj_Close_5_Days_Later'

df = pd.read_csv(symbol_to_path('SPY'), index_col='Date',
            parse_dates=True,
            usecols=['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'],
            na_values=['nan'])

# sort data chronologically
df = df.sort_index(ascending=True, axis=0)

# add new column holding the Adj Close value `horizon` rows later
df[target_col] = df['Adj Close'].shift(-horizon)

# the last `horizon` rows have no future price (NaN after the shift) - drop them
df = df[:-horizon]

# the target was appended last, so it is the final column; everything before it is a feature
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

# prepare configuration for cross validation test harness
seed = 0

# prepare models
# The seed is passed to every estimator that exposes `random_state`, so that
# repeated runs of this cell produce the same scores.
models = [
    ('LR', LinearRegression()),
    ('KR', KernelRidge()),
    ('SVR', SVR()),
    ('KNR', KNeighborsRegressor()),
    ('GPR', GaussianProcessRegressor()),
    ('DTR', DecisionTreeRegressor(random_state=seed)),
    ('MLP', MLPRegressor(random_state=seed)),
    ('GBR', GradientBoostingRegressor(random_state=seed)),
]
# NOTE(review): features are fed in unscaled; scale-sensitive models (SVR, KNR,
# MLP) may score poorly for that reason alone - consider a scaling step. TODO confirm.

# One splitter shared by all models so every model sees the same folds.
# `random_state` is deliberately NOT passed: with the default shuffle=False it
# has no effect, and scikit-learn >= 0.24 raises ValueError for that combination.
# NOTE(review): unshuffled K-fold on chronological data still trains on rows
# that come after the validation fold; TimeSeriesSplit may be more appropriate.
kfold = model_selection.KFold(n_splits=10)

# evaluate each model in turn
results = []
names = []
scoring = 'r2'
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # report mean and standard deviation of the per-fold scores
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison (one box per model, built from its per-fold scores)
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

LR: 0.921644 (0.047912)




KR: 0.918570 (0.049337)
SVR: -25.877974 (31.867182)
KNR: -71.493267 (96.749707)


In [None]:
# Redraw the algorithm comparison on a larger (20x10) canvas: one box per
# model, built from the per-fold scores collected in `results`.
fig, ax = plt.subplots(figsize=(20, 10))
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()