In [35]:
import numpy as np

import pandas as pd
from pandas.api.types import CategoricalDtype

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model as lm
from sklearn import preprocessing as pre
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import quandl
quandl.ApiConfig.api_key = "tzt74qzzscPX2KqxS_rD"

# Plot settings
plt.rcParams['figure.figsize'] = (12, 9)
plt.rcParams['font.size'] = 12

In [73]:
# Load info for sp500
sp500_general = pd.read_csv('constituents.csv')
sp500_risks = pd.read_csv('company_risk.csv')
sp500 = sp500_general.merge(sp500_risks, on='Symbol')

# Normalize the TotalRisk Column.
min_max_scaler = pre.MinMaxScaler()
sp500['Risk'] = min_max_scaler.fit_transform(sp500['Risk'].values.reshape(-1, 1)).flatten()

sp500.head()

Unnamed: 0,Symbol,Name,Sector,Risk
0,MMM,3M Company,Industrials,0.0
1,ABT,Abbott Laboratories,Health Care,0.290338
2,ABBV,AbbVie,Health Care,0.374926
3,ACN,Accenture plc,Information Technology,0.258289
4,ATVI,Activision Blizzard,Information Technology,0.395041


In [86]:
data = quandl.get_table('WIKI/PRICES',
                        qopts = { 'columns': ['ticker', 'date', 'close'] },
                        ticker= ['AAPL', 'MSFT', 'FB'],
                        date = { 'gte': '2013-01-01', 'lte': '2018-03-30' },
                        paginate=True)
data['ticker'].value_counts()

MSFT    1317
FB      1317
AAPL    1316
Name: ticker, dtype: int64

In [3]:
data['ticker'] = data['ticker'].astype('category').cat.codes
data['day'] = (data['date'] - data['date'].min()).dt.days
data = data.drop(['date'], axis=1)
data['last_day_close'] = data.groupby(['ticker'])['close'].shift()
data['last_day_diff'] = data.groupby(['ticker'])['last_day_close'].diff()
data = data.dropna()
LAST_DAY = data['day'].max()

In [4]:
data.head()

Unnamed: 0_level_0,ticker,close,day,last_day_close,last_day_diff
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0,527.0,2,542.0959,-6.9341
3,0,523.9,5,527.0,-15.0959
4,0,525.31,6,523.9,-3.1
5,0,517.1,7,525.31,1.41
6,0,523.51,8,517.1,-8.21


In [5]:
def ttsplit(df, train_size):
    X = df.drop(['close'], axis = 1)
    y = df['close']
    return train_test_split(X, y, train_size=train_size, random_state=42)

In [6]:
# MODEL 1: A Random Forest Regressor
def build_random_forest(df):
    mean_error = []
    sizes = [1/4]
    for size in sizes:
        xtr, xts, ytr, yts = ttsplit(df, size)

        mdl = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
        mdl.fit(xtr, ytr)

        p = mdl.predict(xts)

        error = mean_squared_error(yts, p)
        print('RMSE Error: %.5f' % (error))
        mean_error.append(error)
    print('Mean Error = %.5f' % np.mean(mean_error))
    return mdl
forest = build_random_forest(data)



RMSE Error: 112.13585
Mean Error = 112.13585


In [7]:
def build_linear_regressor_test(df):
    X_train, X_test, y_train, y_test = ttsplit(df, 0.25)
    
    # Fit and predict
    model = lm.LinearRegression(fit_intercept=True)
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    
    print(f'The validation RMSE for this model is '
          f'{round(mean_squared_error(y_test, y_predicted), 2)}.')

    return model
linear = build_linear_regressor_test(data)

The validation RMSE for this model is 111.22.




In [25]:
scaler = pre.StandardScaler()

def build_elastic_net_predictor(df):
    scaler = pre.StandardScaler()
    X_train, X_test, y_train, y_test = ttsplit(df, 0.25)
    
    l1_ratios = np.arange(0, 1.1, .1)
    alphas = np.arange(0.1, 200.1, .1)
    model = lm.ElasticNetCV(l1_ratio=l1_ratios,
                            alphas=alphas,
                            cv=5,
                            fit_intercept=True,
                            max_iter=5000)

    # Fit and predict
    model.fit(scaler.fit_transform(X_train), y_train)
    y_predicted = model.predict(scaler.fit_transform(X_test))

    print(f'The validation RMSE for this model with '
          f'alpha={round(float(model.alpha_), 2)} is '
          f'{round(mean_squared_error(y_test, y_predicted), 2)}.')
    
    return model
elastic = build_elastic_net_predictor(data)

  tol, rng, random, positive)


The validation RMSE for this model with alpha=0.2 is 346.61.


In [28]:
def predict_future_stock_values(mdl, source_df, days_out):
    abs_day = LAST_DAY + days_out
    # Warning: Columns must be ordered properly for predictor to work!
    tickers = source_df['ticker'].unique()
    days = np.arange(LAST_DAY + 1, abs_day + 1, 1)

    x = source_df.copy()
    for d in days:
        print("DAY {}".format(d))
        
        # Construct a dataframe for the next day, borrowing appropriate values.
        i = x[x['day'] == x['day'].max()]
        i['day'] += 1
        i['last_day_diff'] = i['close'] - i['last_day_close']
        i['last_day_close'] = i['close']
        i = i.drop(['close'], axis=1)
        
        # Predict new close values
        y = mdl.predict(scaler.fit_transform(i))
        i['close'] = pd.Series(y, index=i.index)
        
        x = x.append(i).sort_values(['ticker', 'day'], ascending=[True, True])
        print(x.tail())
#     print(x.head())
#     return mdl.predict(x)
predict_future_stock_values(elastic, data, 10)

DAY 1911
          close   day  last_day_close  last_day_diff  ticker
None                                                        
3946  89.790000  1905           92.48          -0.65       2
3947  87.180000  1906           89.79          -2.69       2
3948  93.780000  1909           87.18          -2.61       2
3949  89.470000  1910           93.78           6.60       2
3949 -43.873228  1911           89.47          -4.31       2
DAY 1912


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


          close   day  last_day_close  last_day_diff  ticker
None                                                        
3947  87.180000  1906       89.790000      -2.690000       2
3948  93.780000  1909       87.180000      -2.610000       2
3949  89.470000  1910       93.780000       6.600000       2
3949 -43.873228  1911       89.470000      -4.310000       2
3949 -43.843856  1912      -43.873228    -133.343228       2
DAY 1913
           close   day  last_day_close  last_day_diff  ticker
None                                                         
3948   93.780000  1909       87.180000      -2.610000       2
3949   89.470000  1910       93.780000       6.600000       2
3949  -43.873228  1911       89.470000      -4.310000       2
3949  -43.843856  1912      -43.873228    -133.343228       2
3949  281.626601  1913      -43.843856       0.029372       2
DAY 1914
           close   day  last_day_close  last_day_diff  ticker
None                                                       

In [10]:
"""
Improvements:
    - Linear Regression (+ Regularization)
    - Recurrent Neural Network (e.g. LSTM)
    - Convert time series to stationary
    - Uniform Scaling for each time series
"""

'\nImprovements:\n    - Linear Regression (+ Regularization)\n    - Recurrent Neural Network (e.g. LSTM)\n    - Convert time series to stationary\n    - Uniform Scaling for each time series\n'