In [1]:
import numpy as np

import pandas as pd
from pandas.api.types import CategoricalDtype

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import quandl
quandl.ApiConfig.api_key = "tzt74qzzscPX2KqxS_rD"

# Plot settings
plt.rcParams['figure.figsize'] = (12, 9)
plt.rcParams['font.size'] = 12

In [2]:
# TODO: Adjust dates to get the latest possible!
data = quandl.get_table('WIKI/PRICES',
                        qopts = { 'columns': ['ticker', 'date', 'close'] },
                        ticker=['AAPL', 'MSFT', 'FB'],
                        date = { 'gte': '2013-01-01', 'lte': '2018-03-30' },
                        paginate=True)

In [3]:
data['ticker'] = data['ticker'].astype('category').cat.codes
data['day'] = (data['date'] - data['date'].min()).dt.days
data = data[['ticker', 'day', 'close']]
data['last_day_close'] = data.groupby(['ticker'])['close'].shift()
data['last_day_diff'] = data.groupby(['ticker'])['last_day_close'].diff()
data = data.dropna()
LAST_DAY = data['day'].max()

In [4]:
def ttsplit(df, train_size):
    X = df.drop(['close'], axis = 1)
    y = df['close']
    return train_test_split(X, y, train_size=train_size, random_state=42)

In [5]:
# MODEL 1: A Random Forest Regressor
def build_random_forest(df):
    mean_error = []
    sizes = [1/4]
    for size in sizes:
        xtr, xts, ytr, yts = ttsplit(df, size)

        mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
        mdl.fit(xtr, ytr)

        p = mdl.predict(xts)

        error = mean_squared_error(yts, p)
        print('RMSE Error: %.5f' % (error))
        mean_error.append(error)
    print('Mean Error = %.5f' % np.mean(mean_error))
    return mdl
forest = build_random_forest(data)



RMSE Error: 112.13585
Mean Error = 112.13585


In [6]:
def build_linear_regressor_test(df):
    X_train, X_test, y_train, y_test = ttsplit(df, 0.25)
    
    # Fit and predict
    model = lm.LinearRegression(fit_intercept=True)
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    
    print(f'The validation RMSE for this model is '
          f'{round(mean_squared_error(y_test, y_predicted), 2)}.')

    return model
linear = build_linear_regressor_test(data)

The validation RMSE for this model is 111.22.




In [7]:
def build_elastic_net_predictor(df):
    X_train, X_test, y_train, y_test = ttsplit(df, 0.25)
    
    l1_ratios = np.arange(0, 1.1, .1)
    alphas = np.arange(0.1, 200.1, .1)
    model = lm.ElasticNetCV(l1_ratio=l1_ratios, alphas=alphas, cv=5, fit_intercept=True, max_iter=1000)

    # Fit and predict
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)

    print(f'The validation RMSE for this model with '
          f'alpha={round(float(model.alpha_), 2)} is '
          f'{round(mean_squared_error(y_test, y_predicted), 2)}.')
    
    return model
elastic = build_elastic_net_predictor(data)

  tol, rng, random, positive)


The validation RMSE for this model with alpha=8.1 is 111.19.


In [8]:
def predict_future_stock_values(mdl, days_out):
    abs_day = LAST_DAY + days_out
    # Warning: Columns must be ordered properly for predictor to work!
    x = pd.DataFrame({'ticker': [0,1,2], 'day': [abs_day] * 3})
    x = x[['ticker', 'day']]
    return mdl.predict(x)

In [9]:
predict_future_stock_values(forest, 10)
predict_future_stock_values(linear, 10)
predict_future_stock_values(elastic, 10)

ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 2 

In [None]:
"""
Improvements:
    - Linear Regression (+ Regularization)
    - Recurrent Neural Network (e.g. LSTM)
    - Convert time series to stationary
    - Uniform Scaling for each time series
"""