In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Directory (relative path) that the input CSV is read from.
path = '../data/'

In [3]:
# Read the close-price CSV; its first column becomes the index and is parsed as dates.
csv_file = path + 'sp500_closefull.csv'
df = pd.read_csv(csv_file, parse_dates=True, index_col=0)

In [4]:
# First discard rows in which every value is missing, then discard every
# column that still contains at least one missing value.
df = df.dropna(axis=0, how='all').dropna(axis=1, how='any')
# Sanity check: number of missing values left in the whole frame.
df.isna().to_numpy().sum()

0

In [5]:
# Start from an empty frame; it is filled with one log-return column per column of `df`.
df_returns = pd.DataFrame()

In [6]:
# Log returns for every column of `df`, computed in one vectorized pass:
#     r_t = log(P_t) - log(P_{t-1})
# The first row is NaN because it has no predecessor. Building the frame in a
# single operation (instead of inserting hundreds of columns one at a time)
# is faster, avoids pandas' "highly fragmented" PerformanceWarning, and keeps
# the column order of `df`.
df_returns = np.log(df).diff()

In [7]:
# Prediction target: the NEXT row's SPY log return, stored on the current row
# so that it lines up with the current row's features.
# The column is recomputed from the prices in `df` rather than shifted in
# place, so re-running this cell cannot shift the target a second time and
# silently misalign the labels. The last row has no successor and is NaN.
df_returns['SPY'] = np.log(df['SPY']).diff().shift(-1)
df_returns['SPY'].tail()

Date
2018-12-21   -0.026778
2018-12-24    0.049290
2018-12-26    0.007648
2018-12-27   -0.001291
2018-12-28         NaN
Name: SPY, dtype: float64

In [8]:
# Size of the hold-out window, counted from the end of the frame.
Ntest = 1000
# Row 0 is left out of the training split (the diff has no value there) and
# the final row is left out of the test split (the shifted SPY target is NaN).
train, test = df_returns.iloc[1:-Ntest], df_returns.iloc[-Ntest:-1]

In [9]:
# Show which columns (tickers) are available in the returns frame.
df_returns.columns

Index(['CSCO', 'UAL', 'TROW', 'ISRG', 'PRGO', 'TPR', 'DVN', 'MRO', 'BA',
       'VRTX',
       ...
       'M', 'CRM', 'PGR', 'WAT', 'BWA', 'LRCX', 'NWL', 'UAA', 'BLK', 'PPL'],
      dtype='object', length=429)

In [10]:
# Tickers whose log returns are used as model inputs.
x_cols = [
    'AAPL',
    'MSFT',
    'AMZN',
    'JNJ',
    'V',
    'PG',
    'JPM',
]
x_cols

['AAPL', 'MSFT', 'AMZN', 'JNJ', 'V', 'PG', 'JPM']

In [11]:
# Separate the input columns from the SPY target for each split.
Xtrain, Ytrain = train[x_cols], train['SPY']
Xtest, Ytest = test[x_cols], test['SPY']

In [12]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression from the selected tickers' returns to the SPY target.
model = LinearRegression().fit(Xtrain, Ytrain)
# Model score on the training rows and on the held-out rows.
model.score(Xtrain, Ytrain), model.score(Xtest, Ytest)

(0.008271754178782342, -0.011369618185063102)

In [13]:
# Directional accuracy: fraction of rows where the sign of the prediction
# equals the sign of the realized target, for each split.
Ptrain, Ptest = model.predict(Xtrain), model.predict(Xtest)
(
    np.mean(np.sign(Ptrain) == np.sign(Ytrain)),
    np.mean(np.sign(Ptest) == np.sign(Ytest)),
)

(0.528526148969889, 0.5285285285285285)

In [14]:
# Distinct prediction signs per split (shows whether the model ever predicts each direction).
tuple(set(np.sign(preds)) for preds in (Ptrain, Ptest))

({-1.0, 1.0}, {-1.0, 1.0})

In [15]:
# Trading rule: hold SPY (1) when the predicted return is positive, otherwise
# stay flat (0). Row 0 and the final row belong to neither split and keep the
# default of 0.
df_returns['Position'] = 0  # create new column
# The frame is indexed by date, so positional slices must go through .iloc:
# integer slices passed to .loc are deprecated in pandas 1.x and raise
# TypeError in pandas 2. Casting the boolean mask to int keeps the column
# numeric instead of mixing ints and bools.
position_col = df_returns.columns.get_loc('Position')
df_returns.iloc[1:-Ntest, position_col] = (Ptrain > 0).astype(int)
df_returns.iloc[-Ntest:-1, position_col] = (Ptest > 0).astype(int)

  df_returns.loc[1:-Ntest,'Position'] = (Ptrain > 0)
  df_returns.loc[-Ntest:-1,'Position'] = (Ptest > 0)


In [16]:
# Strategy return per row: the SPY target return scaled by the position held.
df_returns['AlgoReturn'] = df_returns['Position'].mul(df_returns['SPY'])

In [17]:
# Summed strategy log return over the training rows.
df_returns['AlgoReturn'].iloc[1:-Ntest].sum()

0.5477174926058384

In [18]:
# Summed strategy log return over the held-out rows.
df_returns['AlgoReturn'].iloc[-Ntest:-1].sum()

0.3037759916492

In [19]:
# Buy-and-hold baseline: sum of the SPY target log returns over the test rows
# (what the strategy would earn if it were always long).
Ytest.sum()

0.19307543946998518

In [20]:
from sklearn.linear_model import LogisticRegression

# Boolean class labels: True where the SPY target return is positive.
Ctrain, Ctest = Ytrain > 0, Ytest > 0
# Fit a logistic-regression classifier on the up/down labels.
model = LogisticRegression(C=10).fit(Xtrain, Ctrain)
# Model score on the training rows and on the held-out rows.
model.score(Xtrain, Ctrain), model.score(Xtest, Ctest)

(0.5586370839936609, 0.5285285285285285)

In [21]:
# Predicted labels for both splits, followed by the distinct labels seen in each.
Ptrain, Ptest = model.predict(Xtrain), model.predict(Xtest)
set(Ptrain), set(Ptest)

({False, True}, {False, True})

In [22]:
# Trading rule for the classifier: long (1) on rows predicted True, flat (0)
# on rows predicted False.
# The frame is indexed by date, so positional slices must go through .iloc:
# integer slices passed to .loc are deprecated in pandas 1.x and raise
# TypeError in pandas 2. Casting the predictions to int keeps the column
# numeric.
position_col = df_returns.columns.get_loc('Position')
df_returns.iloc[1:-Ntest, position_col] = Ptrain.astype(int)
df_returns.iloc[-Ntest:-1, position_col] = Ptest.astype(int)
# Strategy return per row: the SPY target return scaled by the position held.
df_returns['AlgoReturn'] = df_returns['Position'] * df_returns['SPY']

  df_returns.loc[1:-Ntest,'Position'] = Ptrain
  df_returns.loc[-Ntest:-1,'Position'] = Ptest


In [23]:
# Summed strategy log return over the training rows.
df_returns['AlgoReturn'].iloc[1:-Ntest].sum()

0.5905718759742236

In [24]:
# Summed strategy log return over the held-out rows.
df_returns['AlgoReturn'].iloc[-Ntest:-1].sum()

0.19680747922142316

In [25]:
# Buy-and-hold baseline: summed SPY target log returns for the training rows
# and for the test rows, for comparison with the strategy totals.
Ytrain.sum(), Ytest.sum()

(0.5863972019814705, 0.19307543946998518)

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the same up/down labels, with a fixed random_state.
model = RandomForestClassifier(random_state=2).fit(Xtrain, Ctrain)
# Model score on the training rows and on the held-out rows.
model.score(Xtrain, Ctrain), model.score(Xtest, Ctest)

(1.0, 0.5255255255255256)

In [27]:
# Predicted labels from the current model for both splits.
Ptrain, Ptest = model.predict(Xtrain), model.predict(Xtest)

In [28]:
# Trading rule for the classifier: long (1) on rows predicted True, flat (0)
# on rows predicted False.
# The frame is indexed by date, so positional slices must go through .iloc:
# integer slices passed to .loc are deprecated in pandas 1.x and raise
# TypeError in pandas 2. Casting the predictions to int keeps the column
# numeric.
position_col = df_returns.columns.get_loc('Position')
df_returns.iloc[1:-Ntest, position_col] = Ptrain.astype(int)
df_returns.iloc[-Ntest:-1, position_col] = Ptest.astype(int)

  df_returns.loc[1:-Ntest,'Position'] = Ptrain
  df_returns.loc[-Ntest:-1,'Position'] = Ptest


In [29]:
# Strategy return per row: the SPY target return scaled by the position held.
df_returns['AlgoReturn'] = df_returns['Position'].mul(df_returns['SPY'])

In [30]:
# Summed strategy log return over the training rows.
df_returns['AlgoReturn'].iloc[1:-Ntest].sum()

4.706503557502184

In [31]:
# Summed strategy log return over the held-out rows.
df_returns['AlgoReturn'].iloc[-Ntest:-1].sum()

0.3839643540537523

In [32]:
# Buy-and-hold baseline: summed SPY target log returns for the training rows
# and for the test rows, for comparison with the strategy totals.
Ytrain.sum(), Ytest.sum()

(0.5863972019814705, 0.19307543946998518)