In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Load the price table, using the parsed "Date" column as the index.
df0 = pd.read_csv(
    "../Data/sp500sub.csv",
    parse_dates=True,
    index_col="Date",
)

# Preview the first few rows.
df0.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Name
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,9.22,9.51,9.17,9.5,9.5,1865400.0,INCY
2010-01-05,9.51,10.29,9.45,10.27,10.27,7608900.0,INCY
2010-01-06,10.38,11.09,10.35,11.0,11.0,8046700.0,INCY
2010-01-07,11.0,11.06,10.62,10.82,10.82,3680300.0,INCY
2010-01-08,10.82,11.0,10.75,10.94,10.94,1529300.0,INCY


In [3]:
# Keep only IBM's closing price, then derive the log price and its first
# difference (the one-step log return). Boolean-mask .loc and .assign both
# return new frames, so df0 is left untouched.
df = (
    df0.loc[df0["Name"] == "IBM", ["Close"]]
    .assign(LogClose=lambda frame: np.log(frame["Close"]))
    .assign(LogReturn=lambda frame: frame["LogClose"].diff())
)

In [4]:
# Hold out the final Ntest rows as the test period; everything earlier is
# the training period.
Ntest = 252
train, test = df.iloc[:-Ntest], df.iloc[-Ntest:]

In [7]:
# Build a supervised dataset: each sample is a window of T consecutive log
# returns, and its label says whether the return right after the window is up.

# diff() leaves a NaN in the first row, so drop it
series = df["LogReturn"].to_numpy()[1:]
# 1 for a strictly positive return, 0 otherwise
target = (series > 0).astype(int)

T = 21
# The window starting at `start` is labelled by target[start + T], so only
# len(series) - T windows have a label available.
n_windows = len(series) - T
X = [series[start : start + T] for start in range(n_windows)]
Y = [target[start + T] for start in range(n_windows)]

X = np.array(X).reshape(-1, T)
Y = np.array(Y)
N = len(X)
print("X.shape:", X.shape, "Y.shape:", Y.shape)

X.shape: (2241, 21) Y.shape: (2241,)


In [8]:
# Chronological split: the last Ntest samples are the test set, the rest
# are used for training.
Xtrain, Xtest = X[:-Ntest], X[-Ntest:]
Ytrain, Ytest = Y[:-Ntest], Y[-Ntest:]

In [9]:
# Fit a default logistic regression (fit() returns the estimator itself)
# and report its accuracy on the training samples.
lr = LogisticRegression().fit(Xtrain, Ytrain)
lr_train_score = lr.score(Xtrain, Ytrain)
print("Logistic Regression train score:", lr_train_score)

Logistic Regression train score: 0.5093011563599799


In [10]:
# Accuracy of the logistic regression on the held-out test samples.
lr.score(X=Xtest, y=Ytest)

0.49603174603174605

In [11]:
# Fit a default support vector classifier (fit() returns the estimator
# itself) and report its accuracy on the training samples.
svc = SVC().fit(Xtrain, Ytrain)
svc_train_score = svc.score(Xtrain, Ytrain)
print("SVC train score:", svc_train_score)

SVC train score: 0.7551533433886375


In [12]:
# Accuracy of the support vector classifier on the held-out test samples.
svc.score(X=Xtest, y=Ytest)

0.49603174603174605

In [13]:
# Fit a random forest with default hyperparameters and report its accuracy
# on the training samples.
# The forest is a randomized estimator, so pin random_state: without it the
# fitted model (and the test score in the next cell) changes on every
# Restart & Run All.
rf = RandomForestClassifier(random_state=0)
rf.fit(Xtrain, Ytrain)
print("Random Forest train score:", rf.score(Xtrain, Ytrain))

Random Forest train score: 1.0


In [14]:
# Accuracy of the random forest on the held-out test samples.
rf.score(X=Xtest, y=Ytest)

0.4880952380952381

In [15]:
# Exercise: maybe you believe walk-forward validation will be
# more realistic - will it lead to better results?

In [16]:
# Exercise: do you think using raw (log) prices would have worked?