In [3]:
## Replace this section of imports with your preferred
## data download/access interface.  This calls a 
## proprietary set of methods (ie they won't work for you)

from IPython.core.display import Image

import numpy as np
import pandas as pd
pd.core.common.is_list_like = pd.api.types.is_list_like # remove once updated pandas-datareader issue is fixed
# https://github.com/pydata/pandas-datareader/issues/534
import pandas_datareader.data as web
%matplotlib inline


def get_symbols(symbols,data_source, begin_date=None,end_date=None):
    out = pd.DataFrame()
    for symbol in symbols:
        df = web.DataReader(symbol, data_source,begin_date, end_date, access_key='hQqbsfakqXiqavyb4SV9')\
        [['AdjOpen','AdjHigh','AdjLow','AdjClose','AdjVolume']].reset_index()
        
        df.columns = ['date','open','high','low','close','volume'] #my convention: always lowercase
        df['symbol'] = symbol # add a new column which contains the symbol so we can keep multiple symbols in the same dataframe
        df = df.set_index(['date','symbol'])
        out = pd.concat([out,df],axis=0) #stacks on top of previously collected data
    return out.sort_index()
        
prices = get_symbols(['AAPL','CSCO','AMZN','YHOO','MSFT'],data_source='quandl',begin_date='2012-01-01',end_date=None)

print(prices.sort_index().tail())

  df = web.DataReader(symbol, data_source,begin_date, end_date, access_key='hQqbsfakqXiqavyb4SV9')\
  df = web.DataReader(symbol, data_source,begin_date, end_date, access_key='hQqbsfakqXiqavyb4SV9')\
  df = web.DataReader(symbol, data_source,begin_date, end_date, access_key='hQqbsfakqXiqavyb4SV9')\
  df = web.DataReader(symbol, data_source,begin_date, end_date, access_key='hQqbsfakqXiqavyb4SV9')\
  df = web.DataReader(symbol, data_source,begin_date, end_date, access_key='hQqbsfakqXiqavyb4SV9')\


                      open      high      low    close      volume
date       symbol                                                 
2018-03-26 MSFT      90.61    94.000    90.40    93.78  55031149.0
2018-03-27 AAPL     173.68   175.150   166.92   168.34  38962839.0
           AMZN    1572.40  1575.960  1482.32  1497.05   6793279.0
           CSCO      44.49    44.520    42.24    42.68  30088447.0
           MSFT      94.94    95.139    88.51    89.47  53704562.0


In [4]:
num_obs = prices.close.count()

def add_memory(s,n_days=50,memory_strength=0.1):
    ''' adds autoregressive behavior to series of data'''
    add_ewm = lambda x: (1-memory_strength)*x + memory_strength*x.ewm(n_days).mean()
    out = s.groupby(level='symbol').apply(add_ewm)
    return out

# generate feature data
f01 = pd.Series(np.random.randn(num_obs),index=prices.index)
f01 = add_memory(f01,10,0.1)
f02 = pd.Series(np.random.randn(num_obs),index=prices.index)
f02 = add_memory(f02,10,0.1)
f03 = pd.Series(np.random.randn(num_obs),index=prices.index)
f03 = add_memory(f03,10,0.1)
f04 = pd.Series(np.random.randn(num_obs),index=prices.index)
f04 = f04 # no memory

## now, create response variable such that it is related to features
# f01 becomes increasingly important, f02 becomes decreasingly important,
# f03 oscillates in importance, f04 is stationary, finally a noise component is added

outcome =   f01 * np.linspace(0.5,1.5,num_obs) + \
            f02 * np.linspace(1.5,0.5,num_obs) + \
            f03 * pd.Series(np.sin(2*np.pi*np.linspace(0,1,num_obs)*2)+1,index=f03.index) + \
            f04 + \
            np.random.randn(num_obs) * 3 
outcome.name = 'outcome'

In [6]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
features = pd.concat([f01,f02,f03,f04],axis=1)
features.columns = ['f01','f02','f03','f04']
model.fit(X=features,y=outcome)
print('RSQ: '+str(model.score(X=features,y=outcome)))
print('Regression Coefficients: '+str(model.coef_))

RSQ: 0.2588942710563603
Regression Coefficients: [0.9658428  1.02289272 0.94590309 0.99106727]


In [12]:
recalc_dates = features.resample('Q',level='date').mean().index.values[:-1]
#print('recalc_dates:')
#print(recalc_dates)
#print()

models = pd.Series(index=recalc_dates)
for date in recalc_dates:
    X_train = features.xs(slice(None,date),level='date',drop_level=False)
    print(X_train)
    print('=====')
    y_train = outcome.xs(slice(None,date),level='date',drop_level=False)
    model = LinearRegression()
    model.fit(X_train,y_train)
    models.loc[date] = model
    
    
    print("Training on the first {} records, through {}"\
          .format(len(y_train),y_train.index.get_level_values('date').max()))
    #print("Coefficients: {}".format((model.coef_)))
    #print()

                        f01       f02       f03       f04
date       symbol                                        
2012-01-03 AAPL   -0.256250  0.361949  1.504149 -0.178336
           AMZN   -0.290906  1.657943  0.973844 -1.241853
           CSCO   -1.782431  0.006901  1.734060 -0.090088
           MSFT   -0.147602 -0.782910 -0.244746 -0.216085
           YHOO    0.032195 -0.853457 -1.545315 -0.570299
...                     ...       ...       ...       ...
2012-03-30 AAPL    0.891377 -1.795623  0.217164  0.559722
           AMZN    1.382148  1.361728 -0.618762 -2.873088
           CSCO   -0.258589 -0.954993  0.661155  1.279851
           MSFT    0.161012 -0.240603  1.635416  0.757420
           YHOO   -0.274762 -0.586295 -0.713025 -0.822557

[310 rows x 4 columns]
=====
Training on the first 310 records, through 2012-03-30 00:00:00
                        f01       f02       f03       f04
date       symbol                                        
2012-01-03 AAPL   -0.256250  0.361949

  models = pd.Series(index=recalc_dates)


In [13]:
features

Unnamed: 0_level_0,Unnamed: 1_level_0,f01,f02,f03,f04
date,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-01-03,AAPL,-0.256250,0.361949,1.504149,-0.178336
2012-01-03,AMZN,-0.290906,1.657943,0.973844,-1.241853
2012-01-03,CSCO,-1.782431,0.006901,1.734060,-0.090088
2012-01-03,MSFT,-0.147602,-0.782910,-0.244746,-0.216085
2012-01-03,YHOO,0.032195,-0.853457,-1.545315,-0.570299
...,...,...,...,...,...
2018-03-26,MSFT,-0.463295,-1.000721,-2.026862,-0.677719
2018-03-27,AAPL,-1.216308,1.616945,-0.423595,0.747324
2018-03-27,AMZN,-1.312868,0.815364,0.933150,-0.356543
2018-03-27,CSCO,-0.361431,-0.555833,-0.093145,0.705751
