In [127]:
%matplotlib inline

import warnings
import os
from pathlib import Path
import quandl
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, _tree
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, precision_recall_curve
from sklearn.preprocessing import Imputer
import statsmodels.api as sm
from scipy.interpolate import interp1d, interp2d

In [2]:
# Notebook-wide setup: suppress every warning and switch matplotlib to the
# 'ggplot' style sheet for all figures that follow.
warnings.filterwarnings(action='ignore')
plt.style.use('ggplot')

## Get Data

In [3]:
# Read the raw inputs from the shared HDF5 store:
#   * prices - adjusted closes, with the 'ticker' index level moved to columns
#   * stocks - the equity metadata table
with pd.HDFStore('../data/assets.h5') as store:
    print(store.info())
    prices = store['quandl/wiki/prices']['adj_close'].unstack('ticker')
    stocks = store.get('us_equities/stocks')

<class 'pandas.io.pytables.HDFStore'>
File path: ../data/assets.h5
/fred/assets                   frame        (shape->[4826,5])     
/quandl/wiki/prices            frame        (shape->[15389314,12])
/quandl/wiki/stocks            frame        (shape->[1,2])        
/sp500/prices                  frame        (shape->[37721,5])    
/sp500/stocks                  frame        (shape->[1,7])        
/us_equities/stocks            frame        (shape->[1,6])        


In [4]:
# Keep only tickers that appear both as a price column and in the metadata
# index, limit prices to 2010 through 2018, and keep three metadata columns.
shared = prices.columns.intersection(stocks.index)
prices = prices.loc[slice('2010', '2018'), shared]
stocks = stocks.loc[
    shared,
    ['marketcap', 'ipoyear', 'sector'],
]

In [None]:
# Print a concise summary of the filtered price frame as a sanity check.
prices.info()

### Create monthly returns

Remove outliers: keep only returns that lie between the 5th and 95th percentiles of all observations.

In [None]:
# Monthly returns: reduce the daily prices to the last observation of each
# calendar month, then take the month-over-month percentage change.  The
# explicit `.last()` is required: a Resampler must be aggregated before a
# period-to-period change can be computed on it.
# stack() yields a (date, ticker) index; swaplevel() turns it into (ticker, date).
returns = (prices
           .resample('M')
           .last()
           .pct_change()
           .stack()
           .swaplevel())

# Remove outliers: keep only observations between the 5th and 95th percentiles
# (bounds inclusive), and name the single resulting column 'returns'.
lower, upper = returns.quantile([.05, .95])
returns = returns[returns.between(left=lower, right=upper)].to_frame('returns')

In [None]:
# Add twelve lagged-return columns ('t-1' ... 't-12') computed per ticker, then
# drop every row that lacks a complete set of lags.
# NOTE: shift() moves by row position within each ticker.  Rows removed by the
# earlier outlier filter therefore make a lag refer to the n-th previous
# *retained* observation, not necessarily the n-th previous calendar month.
returns_by_ticker = returns.groupby(level='ticker')['returns']
for lag in range(1, 13):
    returns[f't-{lag}'] = returns_by_ticker.shift(lag)
returns = returns.dropna()

In [None]:
# One-hot encode the calendar year and month of each observation, then remove
# the 'date' level from the index.
dates = returns.index.get_level_values('date')
for part in ('year', 'month'):
    returns[part] = getattr(dates, part)
returns = (pd.get_dummies(returns, columns=['year', 'month'])
           .reset_index(level='date', drop=True))

In [None]:
# Print a concise summary of the returns frame after adding lags and dummies.
returns.info()

In [None]:
stocks.info()
# Replace the raw IPO year with its quintile bucket, labelled 1 to 5.
stocks['ipoyear'] = pd.qcut(stocks['ipoyear'], q=5, labels=[1, 2, 3, 4, 5])

In [None]:
# Parse the textual market cap into a numeric magnitude plus a unit suffix.
# regex=False makes '$' a literal character on every pandas version; as a
# regular expression '$' is an end-of-string anchor and would remove nothing.
stocks.marketcap = stocks.marketcap.str.replace('$', '', regex=False)

# The last character is treated as the unit suffix; only 'B' and 'M' are kept.
stocks['mcap'] = stocks.marketcap.str[-1]

# Filter before converting so that rows with any other suffix cannot make
# pd.to_numeric raise, and take a copy so the assignments below (and in later
# cells) write to an independent frame rather than a slice of the original.
stocks = stocks[stocks.mcap.isin(['B', 'M'])].copy()
stocks.marketcap = pd.to_numeric(stocks.marketcap.str[:-1])
stocks.info()

In [None]:
# Put all market caps on one scale: values flagged 'B' are multiplied by 1000,
# all others are left unchanged.  Vectorised with `mask` instead of a row-wise
# `apply(axis=1)`, which runs a Python lambda once per row.
stocks.marketcap = stocks.marketcap.mask(stocks.mcap == 'B', stocks.marketcap * 1000)

# Replace the scaled value with its decile bucket (1 to 10); the helper unit
# column is no longer needed afterwards.
stocks.marketcap = pd.qcut(stocks.marketcap, q=10, labels=list(range(1, 11)))
stocks = stocks.drop('mcap', axis=1)

In [None]:
# One-hot encode the three metadata columns.  Prefixes and separators are keyed
# by column name: market-cap buckets become 'size_*', IPO-year buckets become
# 'age_*', and sector dummies carry the bare sector name.
stocks = pd.get_dummies(
    stocks,
    prefix={'marketcap': 'size', 'ipoyear': 'age', 'sector': ''},
    prefix_sep={'marketcap': '_', 'ipoyear': '_', 'sector': ''},
)
stocks.info()

In [None]:
# Attach the stock-level dummies to every return observation by joining on the
# index; rows left with missing values after the join are dropped.
data = returns.join(stocks).dropna()

# Columns from position `s` onward are the ones contributed by `stocks`.
s = len(returns.columns)
stock_cols = list(data.columns[s:])

# Downcast the stock dummies to the smallest integer dtype.  Assign by column
# label rather than `data.iloc[:, s:] = ...`: on recent pandas an iloc
# assignment writes into the existing columns in place, which keeps their
# original dtype and silently discards the downcast.
data[stock_cols] = data[stock_cols].astype(int).apply(pd.to_numeric, downcast='integer')
data.info()

In [None]:
# Persist the assembled dataset under the key 'data' in a local HDF5 file.
with pd.HDFStore('data.h5') as store:
    store['data'] = data

In [3]:
# Reload the saved dataset and trim the feature set: lags 7 to 12 and all
# month dummies are removed.
with pd.HDFStore('data.h5') as store:
    data = store['data']

distant_lags = [f't-{lag}' for lag in range(7, 13)]
month_dummies = [col for col in data.columns if col.startswith('month')]
data = data.drop(distant_lags + month_dummies, axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 174551 entries, A to ZUMZ
Data columns (total 43 columns):
returns                  174551 non-null float64
t-1                      174551 non-null float64
t-2                      174551 non-null float64
t-3                      174551 non-null float64
t-4                      174551 non-null float64
t-5                      174551 non-null float64
t-6                      174551 non-null float64
year_2010                174551 non-null uint8
year_2011                174551 non-null uint8
year_2012                174551 non-null uint8
year_2013                174551 non-null uint8
year_2014                174551 non-null uint8
year_2015                174551 non-null uint8
year_2016                174551 non-null uint8
year_2017                174551 non-null uint8
year_2018                174551 non-null uint8
size_1                   174551 non-null int8
size_2                   174551 non-null int8
size_3                   174551 non-nu

### Target and Features

In [4]:
# Separate the target (the 'returns' column) from the remaining feature columns.
X = data.drop(columns=['returns'])
y = data['returns']

## Explore Data

In [5]:
# Summary statistics of the target with deciles from 10% to 90%.  The deciles
# are built from integers because np.arange(.1, .91, .1) accumulates floating
# point error (0.30000000000000004), which can surface as a '30.0%' row label.
y.describe(percentiles=np.arange(1, 10) / 10)

count    174551.000000
mean          0.009913
std           0.055606
min          -0.128560
10%          -0.064080
20%          -0.036451
30.0%        -0.017798
40%          -0.002855
50%           0.010522
60%           0.023904
70%           0.038457
80%           0.056050
90%           0.082772
max           0.146335
Name: returns, dtype: float64

In [6]:
# Binary classification target: 1 where the return is strictly positive, else 0.
y_binary = y.gt(0).astype(int)