In [1]:
import numpy as np

import pandas as pd
from pandas.api.types import CategoricalDtype

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor

import quandl
quandl.ApiConfig.api_key = "tzt74qzzscPX2KqxS_rD"

# Plot settings
plt.rcParams['figure.figsize'] = (12, 9)
plt.rcParams['font.size'] = 12

In [11]:
# TODO: Adjust dates to get the latest possible!
quandl.get_table('WIKI/PRICES',
                        ticker=['AAPL', 'MSFT', 'FB'],
                        date = { 'gte': '2013-01-01', 'lte': '2018-03-30' },
                        paginate=True).head()

Unnamed: 0_level_0,ticker,date,open,high,low,close,volume,ex-dividend,split_ratio,adj_open,adj_high,adj_low,adj_close,adj_volume
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,AAPL,2013-01-02,553.82,555.0,541.63,549.03,20018500.0,0.0,1.0,71.816894,71.969911,70.236149,71.195748,140129500.0
1,AAPL,2013-01-03,547.88,549.67,541.0,542.0959,12605900.0,0.0,1.0,71.046621,71.27874,70.154453,70.296565,88241300.0
2,AAPL,2013-01-04,536.965,538.6299,525.8286,527.0,21226200.0,0.0,1.0,69.631213,69.847109,68.187094,68.338996,148583400.0
3,AAPL,2013-01-07,522.0,529.3,515.2,523.9,17291300.0,0.0,1.0,67.690619,68.63725,66.808825,67.937002,121039100.0
4,AAPL,2013-01-08,529.21,531.89,521.25,525.31,16382400.0,0.0,1.0,68.625579,68.973109,67.593362,68.119845,114676800.0


In [2]:
# TODO: Adjust dates to get the latest possible!
data = quandl.get_table('WIKI/PRICES',
                        qopts = { 'columns': ['ticker', 'date', 'close'] },
                        ticker=['AAPL', 'MSFT', 'FB'],
                        date = { 'gte': '2013-01-01', 'lte': '2018-03-30' },
                        paginate=True)

In [3]:
data['ticker'] = data['ticker'].astype('category').cat.codes
data['day'] = (data['date'] - data['date'].min()).dt.days

In [4]:
data = data[['ticker', 'day', 'close']]
print(data['day'].min(), data['day'].max())
LAST_DAY = data['day'].max()

0 1910


In [5]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))
def split(high, size):
    t = np.arange(high)
    x = np.random.choice(high, int(high * (size)), replace=False)
    y = np.array([i for i in t if i not in x])
    return x, y

In [6]:
# MODEL 1: A Random Forest Regressor
def build_random_forest(df):
    mean_error = []
    sizes = [1/5, 2/5, 1/2, 3/5, 4/5, 99/100]
    for size in sizes:
        t, v = split(1823, size)
        train = data[data['day'].isin(t)]
        val = data[data['day'].isin(v)]

        xtr, xts = train.drop(['close'], axis=1), val.drop(['close'], axis=1)
        ytr, yts = train['close'].values, val['close'].values

        mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
        mdl.fit(xtr, ytr)

        p = mdl.predict(xts)

        error = rmsle(yts, p)
        print('RMSLE Error: %.5f' % (error))
        mean_error.append(error)
    print('Mean Error = %.5f' % np.mean(mean_error))
    return mdl
forest = build_random_forest(data)

RMSLE Error: 0.08394
RMSLE Error: 0.05014
RMSLE Error: 0.02272
RMSLE Error: 0.01793
RMSLE Error: 0.02237
RMSLE Error: 0.01071
Mean Error = 0.03464


In [7]:
def predict_future_stock_values(mdl, days_out):
    abs_day = LAST_DAY + days_out
    # Warning: Columns must be ordered properly for predictor to work!
    x = pd.DataFrame({'ticker': [0,1,2], 'day': [abs_day] * 3})
    x = x[['ticker', 'day']]
    return mdl.predict(x)
predict_future_stock_values(forest, 360)

array([ 169.92124,  176.88304,   85.59365])

In [8]:
# Featuring lag & diff
data2 = data.copy()
data2['last_day_close'] = data2.groupby(['ticker'])['close'].shift()
data2['last_day_diff'] = data2.groupby(['ticker'])['last_day_close'].diff()
data2 = data2.dropna()
data2

forest2 = build_random_forest(data2)

RMSLE Error: 0.04673
RMSLE Error: 0.07412
RMSLE Error: 0.01818
RMSLE Error: 0.01724
RMSLE Error: 0.01416
RMSLE Error: 0.00756
Mean Error = 0.02966


In [12]:
predict_future_stock_values(forest2, 0)

array([ 169.91251,  176.91611,   85.59108])

In [24]:
# def build_linear_regressor():
X = data2.drop(['close'], axis = 1)
y = data2['close']
linreg = lm.LinearRegression(fit_intercept=True)
linreg.fit(X, y)
y_fitted = linreg.predict(X)
print(y_fitted)

In [10]:
"""
Improvements:
    - Linear Regression (+ Regularization)
    - Recurrent Neural Network (e.g. LSTM)
    - Convert time series to stationary
    - Uniform Scaling for each time series
"""

'\nImprovements:\n    - Linear Regression (+ Regularization)\n    - Recurrent Neural Network (e.g. LSTM)\n    - Convert time series to stationary\n    - Uniform Scaling for each time series\n'