In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings emitted by later cells are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import ElasticNetCV

from sklearn.model_selection import GridSearchCV

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Lift the column limit so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)

from sklearn.metrics import r2_score

from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression

from sklearn.feature_selection import RFECV

# Box-Cox power transform; the package name is `scipy` (it was misspelled `scripy`, which fails at import time).
from scipy.stats import boxcox

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.metrics import plot_roc_curve

from sklearn.metrics import roc_auc_score

from sklearn.metrics import roc_curve, auc

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import precision_recall_curve

from sklearn.metrics import plot_precision_recall_curve

from sklearn.metrics import plot_roc_curve

from sklearn.metrics import plot_confusion_matrix

from scipy.stats import spearmanr

from scipy.stats import jarque_bera

from scipy.stats import normaltest

from talib import RSI, BBANDS, MACD, STOCH, ATR, OBV

from sklearn.ensemble import RandomForestClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC

from talib import WMA

In [None]:
# Shorthand used to build MultiIndex slices, e.g. idx[ticker, :].
idx=pd.IndexSlice
# Use seaborn's 'darkgrid' style for plots.
sns.set_style('darkgrid')

In [None]:
def get_data():
    """Load ``data.csv`` sorted by ticker and date, without incomplete rows.

    The first CSV column becomes the index and ``Date`` is parsed as
    datetimes. Rows are ordered by ``Ticker`` then ``Date``; afterwards every
    row that contains at least one missing value is dropped.

    Returns:
        pandas.DataFrame: the cleaned, sorted frame.
    """
    raw = pd.read_csv('data.csv', index_col=[0], parse_dates=['Date'])
    ordered = raw.sort_values(['Ticker', 'Date'])
    return ordered.dropna()

def get_ticker_data(df,ticker):
    """Return the rows of ``df`` whose first index level matches ``ticker``.

    Uses the module-level ``idx`` (``pd.IndexSlice``) to select ``ticker`` on
    the first index level and everything on the second; all columns are kept.
    NOTE(review): this requires a two-level index with the ticker first -
    confirm the caller's frame is indexed that way.
    """
    row_selector = idx[ticker, :]
    return df.loc[row_selector, :]

In [None]:
def rank_features(df,feature):
    """Add ``<feature>_rank`` (1 = largest value) and return a new frame.

    Rows are sorted by ``feature`` in descending order, numbered 1..n in that
    order, and finally re-sorted by ``Date`` then ``Ticker``. The input frame
    itself is not modified.

    Args:
        df: frame containing ``feature``, ``Date`` and ``Ticker``.
        feature: name of the column to rank.
    """
    rank_column = feature + '_rank'
    descending = df.sort_values(feature, ascending=False)
    ranked = descending.assign(**{rank_column: np.arange(1, len(descending) + 1)})
    return ranked.sort_values(['Date', 'Ticker'])

def get_feature_rank(df,feature):
    """Return a one-column DataFrame holding ``<feature>_rank`` from ``df``."""
    wanted = [feature + '_rank']
    return df.loc[:, wanted]

In [None]:
def get_feature(df,feature):
    """Return a one-column DataFrame holding column ``feature`` of ``df``."""
    wanted = [feature]
    return df.loc[:, wanted]

In [None]:
def rank(df,feature):
    """Add a ``rank`` column (1 = largest ``feature``) and return a new frame.

    Rows are sorted by ``feature`` in descending order, numbered 1..n in that
    order, and finally re-sorted by ``Date`` then ``Ticker``. The input frame
    itself is not modified.

    Args:
        df: frame containing ``feature``, ``Date`` and ``Ticker``.
        feature: name of the column to rank.
    """
    descending = df.sort_values(feature, ascending=False)
    ranked = descending.assign(rank=np.arange(1, len(descending) + 1))
    return ranked.sort_values(['Date', 'Ticker'])

def get_rank(df):
    """Return a one-column DataFrame holding the ``rank`` column of ``df``."""
    wanted = ['rank']
    return df.loc[:, wanted]

In [None]:
def scale(df,feature):
    """Add ``<feature>_scaled`` to ``df`` in place and return ``df``.

    A fresh ``StandardScaler`` is fitted on the single column ``feature`` and
    its transformed output is stored in the new column.
    """
    scaler = StandardScaler()
    standardized = scaler.fit_transform(df[[feature]])
    df[feature + '_scaled'] = standardized
    return df

def get_scaled(df,feature):
    """Return a one-column DataFrame holding ``<feature>_scaled`` from ``df``."""
    wanted = [feature + '_scaled']
    return df.loc[:, wanted]

In [None]:
def log(df,feature):
    """Add ``<feature>_log`` (natural log of ``feature``) to ``df`` in place.

    Returns the same ``df`` object that was passed in.
    """
    natural_log = np.log(df[feature])
    df[feature + '_log'] = natural_log
    return df

def get_log(df,feature):
    """Return a one-column DataFrame holding ``<feature>_log`` from ``df``."""
    wanted = [feature + '_log']
    return df.loc[:, wanted]

In [None]:
def signal(df,feature):
    """Add ``<feature>_signal`` to ``df`` in place and return ``df``.

    The new column is 1 where ``feature`` is strictly greater than zero and 0
    everywhere else.
    """
    is_positive = df[feature] > 0
    df[feature + '_signal'] = np.where(is_positive, 1, 0)
    return df

def get_signal(df,feature):
    """Return a one-column DataFrame holding ``<feature>_signal`` from ``df``."""
    wanted = [feature + '_signal']
    return df.loc[:, wanted]

In [None]:
def get_target(df,feature):
    """Return a one-column DataFrame holding column ``feature`` of ``df``."""
    wanted = [feature]
    return df.loc[:, wanted]

def get_features(df,features):
    """Return the columns of ``df`` selected by ``features`` (all rows kept).

    ``features`` is passed straight to ``df.loc[:, ...]`` as the column
    selector.
    """
    selected = df.loc[:, features]
    return selected

In [None]:
def power_transform(df,feature):
    """Add ``<feature>_power`` (Box-Cox transform of ``feature``) to ``df`` in place.

    ``stats.boxcox`` is called without a fixed lambda; only the transformed
    values (the first element of its result) are kept. Returns the same
    ``df`` object.
    NOTE(review): ``power`` writes to the same ``<feature>_power`` column, so
    calling both for one feature overwrites the earlier result.
    """
    transformed, _fitted_lambda = stats.boxcox(df[feature])
    df[feature + '_power'] = transformed
    return df

def get_power(df,feature):
    """Return a one-column DataFrame holding ``<feature>_power`` from ``df``."""
    wanted = [feature + '_power']
    return df.loc[:, wanted]

In [None]:
def power(df,exp,feature):
    """Add ``<feature>_power`` (``feature`` raised to ``exp``) to ``df`` in place.

    Returns the same ``df`` object. Note the argument order: the exponent
    comes before the column name.
    """
    raised = df[feature] ** exp
    df[feature + '_power'] = raised
    return df

In [None]:
def ts(df,feature):
    """Add ``<feature>_ts`` (``feature`` shifted down one row) to ``df`` in place.

    Returns the same ``df`` object; the first row of the new column is NaN.
    NOTE(review): the shift follows the frame's current row order and is not
    grouped, so rows from different tickers in one frame would feed into each
    other - confirm callers pass one series at a time.
    """
    previous_row = df[feature].shift(1)
    df[feature + '_ts'] = previous_row
    return df

def get_ts(df,feature):
    """Return a one-column DataFrame holding ``<feature>_ts`` from ``df``."""
    wanted = [feature + '_ts']
    return df.loc[:, wanted]

In [None]:
def ts_diff(df,feature):
    """Add ``<feature>_ts_diff`` (row-over-row difference) to ``df`` in place.

    Each value is ``feature`` minus its value one row earlier; the first row
    is NaN. Returns the same ``df`` object.
    """
    one_row_change = df[feature].diff(1)
    df[feature + '_ts_diff'] = one_row_change
    return df

def get_ts_diff(df,feature):
    """Return a one-column DataFrame holding ``<feature>_ts_diff`` from ``df``."""
    wanted = [feature + '_ts_diff']
    return df.loc[:, wanted]

def ts_pct(df,feature):
    """Add ``<feature>_ts_pct`` (row-over-row fractional change) to ``df`` in place.

    Computed with ``pct_change(1)``; the first row is NaN. Returns the same
    ``df`` object.
    """
    one_row_return = df[feature].pct_change(1)
    df[feature + '_ts_pct'] = one_row_return
    return df

def get_ts_pct(df,feature):
    """Return a one-column DataFrame holding ``<feature>_ts_pct`` from ``df``."""
    wanted = [feature + '_ts_pct']
    return df.loc[:, wanted]

def ts_log(df,feature):
    """Add ``<feature>_ts_log`` (natural log of ``feature``) to ``df`` in place.

    Returns the same ``df`` object. The computation is the same as in ``log``;
    only the name of the output column differs.
    """
    natural_log = np.log(df[feature])
    df[feature + '_ts_log'] = natural_log
    return df

def get_ts_log(df,feature):
    """Return a one-column DataFrame holding ``<feature>_ts_log`` from ``df``."""
    wanted = [feature + '_ts_log']
    return df.loc[:, wanted]

In [None]:
def ts_delta(df,feature):
    """Add ``<feature>_ts_delta`` = ``feature`` - ``<feature>_ts`` to ``df`` in place.

    The ``<feature>_ts`` column must already exist in ``df`` (``ts`` creates
    it); a missing column raises ``KeyError``. Returns the same ``df`` object.
    """
    current = df[feature]
    lagged = df[feature + '_ts']
    df[feature + '_ts_delta'] = current - lagged
    return df

def get_ts_delta(df,feature):
    """Return a one-column DataFrame holding ``<feature>_ts_delta`` from ``df``."""
    wanted = [feature + '_ts_delta']
    return df.loc[:, wanted]

In [None]:
# Names of the price/volume columns selected by get_ohlcv().
ohlcv = [
    'Open',
    'High',
    'Low',
    'Close',
    'Volume',
]

def get_ohlcv(df):
    """Return only the columns listed in ``ohlcv``, in that order, for all rows."""
    price_volume = df.loc[:, ohlcv]
    return price_volume

# Load the sorted, NaN-free frame produced by get_data().
data=get_data()

# Keep rows whose second index level lies between 2010-01-01 and 2020-12-31 (any value on the first level).
# NOTE(review): this slice needs a two-level index, but get_data() reads the CSV with index_col=[0] only - confirm the frame is indexed by (Ticker, Date) at this point.
data=data.loc[idx[:,'2010-01-01':'2020-12-31'],:]

# get_data() already dropped incomplete rows; this repeats the step after the date filter.
data=data.dropna()

# Remove rows that are exact duplicates across all columns.
data=data.drop_duplicates()

# NOTE(review): 'Volume' is read again further down (data['Volume'].rolling(20)) after being dropped here; that lookup raises KeyError unless the column is restored first.
data=data.drop(['Open','High','Low','Close','Volume'],axis=1)

# Drop a hard-coded set of columns in five steps. DataFrame.drop raises KeyError when any listed label is missing, so every name below must exist in the loaded CSV.
# NOTE(review): several labels differ only by spacing or a trailing footnote digit (e.g. 'Shares Short (prior month )' vs 'Shares Short (prior month)') - confirm they match the real column names.
data=data.drop(['Market Cap','Enterprise Value','Shares Outstanding','Shares Short (prior month )','Float','Shares Short','Shares Short (prior month)','Shares Short (Jan 14, 2021)','Shares Short (prior month Dec 14, 2020)'],axis=1)

# Dividend-related columns.
data=data.drop(['Forward Annual Dividend Rate','Forward Annual Dividend Yield','Trailing Annual Dividend Rate','Trailing Annual Dividend Yield','5 Year Average Dividend Yield','Payout Ratio','Dividend Date','Ex-Dividend Date'],axis=1)

# Fiscal-period, margin, revenue, cash and debt columns.
data=data.drop(['Fiscal Year Ends','Most Recent Quarter (mrq)','Profit Margin','Operating Margin (ttm)','Return on Assets (ttm)','Return on Equity (ttm)','Revenue (ttm)','Revenue Per Share (ttm)','Quarterly Revenue Growth (yoy)','Gross Profit (ttm)','EBITDA','Net Income Avi to Common (ttm)','Diluted EPS (ttm)','Quarterly Earnings Growth (yoy)','Total Cash (mrq)','Total Cash Per Share (mrq)','Total Debt (mrq)','Total Debt/Equity (mrq)','Current Ratio (mrq)','Book Value Per Share (mrq)','Operating Cash Flow (ttm)','Levered Free Cash Flow (ttm)'],axis=1)

# Valuation, price-range, ownership and short-interest columns, including variants whose names end in a digit.
data=data.drop(['Forward P/E','PEG Ratio (5 yr expected)','Enterprise Value/Revenue','Enterprise Value/EBITDA','Beta (5Y Monthly)','52-Week Change','S&P500 52-Week Change','52 Week High 3','52 Week Low 3','50-Day Moving Average 3','200-Day Moving Average 3','Avg Vol (3 month) 3','Avg Vol (10 day) 3','Shares Outstanding 5','Float 5','% Held by Insiders 1','% Held by Institutions 1','% Shorted (previous month ) 4','% of Float Shorted (previous month ) 4','Short Ratio (Dec 14, 2020) 4','Short % of Shares Outstanding (Dec 14, 2020) 4','Short % of Float (Dec 14, 2020) 4','Shares Short (prior month Nov 12, 2020) 4','Forward Annual Dividend Rate 4','Forward Annual Dividend Yield 4','Trailing Annual Dividend Rate 3','Trailing Annual Dividend Yield 3','5 Year Average Dividend Yield 4','Payout Ratio 4','Dividend Date 3','Ex-Dividend Date 4','Last Split Factor (new per old) 2','Last Split Date 3'],axis=1)

# The same fiscal/financial columns as two steps above, under names carrying a ' 3' suffix.
data=data.drop(['Fiscal Year Ends 3','Most Recent Quarter (mrq) 3','Profit Margin 3','Operating Margin (ttm) 3','Return on Assets (ttm) 3','Return on Equity (ttm) 3','Revenue (ttm) 3','Revenue Per Share (ttm) 3','Quarterly Revenue Growth (yoy) 3','Gross Profit (ttm) 3','EBITDA 3','Net Income Avi to Common (ttm) 3','Diluted EPS (ttm) 3','Quarterly Earnings Growth (yoy) 3','Total Cash (mrq) 3','Total Cash Per Share (mrq) 3','Total Debt (mrq) 3','Total Debt/Equity (mrq) 3','Current Ratio (mrq) 3','Book Value Per Share (mrq) 3','Operating Cash Flow (ttm) 3','Levered Free Cash Flow (ttm) 3'],axis=1)

In [None]:
# 20-row rolling mean of the 'Average Daily Volume (10 day)' column.
# NOTE(review): the window runs over the frame's row order without grouping, so it would span ticker boundaries in a multi-ticker frame - confirm this is intended.
adv20=data['Average Daily Volume (10 day)'].rolling(20).mean()

# Fill gaps in that column with the rolling mean.
data['Average Daily Volume (10 day)']=data['Average Daily Volume (10 day)'].fillna(adv20)

data=data.dropna()

# The column is removed straight away, so the fill above only affects which rows survive the dropna().
data=data.drop(['Average Daily Volume (10 day)'],axis=1)

# Copy 'Adj Close' to 'Adj_Close' and drop the original, i.e. a rename.
data=data.assign(Adj_Close=data['Adj Close'])

data=data.drop(['Adj Close'],axis=1)

# NOTE(review): an earlier drop removed 'Volume' from data; unless it is restored this line raises KeyError.
data=data.assign(adv20=data['Volume'].rolling(20).mean())

# Join a second 20-row rolling mean, taken over adv20; rsuffix is needed because the joined series is also named 'adv20' (the new column presumably lands as 'adv20_20' - verify).
data=data.join(data['adv20'].rolling(20).mean(),rsuffix='_20')

# Only the twice-smoothed column is kept.
data=data.drop(['adv20'],axis=1)

data.info()

# Removes the leading rows where the rolling windows were incomplete, along with any other rows containing NaN.
data=data.dropna()

In [None]:
# Persist the prepared frame to 'data.h5' under the key 'data'.
data.to_hdf('data.h5','data')

# Read the same key straight back, replacing the in-memory frame.
data=pd.read_hdf('data.h5','data')

data=data.dropna()

data.info()

In [None]:
# Rebuild `data` as an OHLCV-only frame: one intermediate frame per field, merged on ticker/date.
# NOTE(review): these attribute lookups use lower-case names (open, high, low, close, volume); no code visible in this notebook creates such columns, and the capitalised OHLCV columns were dropped earlier - confirm where they come from.
# NOTE(review): rename_axis('date') names the row axis of the unstacked (wide) frame 'date'; confirm that reset_index() then really yields the 'ticker' and 'date' columns the merges below require.
o = data.open.unstack().rename_axis('date').reset_index()

h = data.high.unstack().rename_axis('date').reset_index()

l = data.low.unstack().rename_axis('date').reset_index()

c = data.close.unstack().rename_axis('date').reset_index()

v = data.volume.unstack().rename_axis('date').reset_index()

# Rename to 'ticker' plus the capitalised field name; rename() leaves labels that are absent from the frame unchanged.
o=o.rename(columns={'level_0':'ticker','open':'Open'})

h=h.rename(columns={'level_0':'ticker','high':'High'})

l=l.rename(columns={'level_0':'ticker','low':'Low'})

c=c.rename(columns={'level_0':'ticker','close':'Close'})

v=v.rename(columns={'level_0':'ticker','volume':'Volume'})  

# Merge the five frames on (ticker, date); merge defaults to an inner join.
data=pd.merge(o,h,on=['ticker','date'])

data=pd.merge(data,l,on=['ticker','date'])

data=pd.merge(data,c,on=['ticker','date'])

data=pd.merge(data,v,on=['ticker','date'])

data=data.dropna()

# From here on `data` is indexed by (ticker, date).
data=data.set_index(['ticker','date'])

data.info()

In [None]:
# Forward-fill gaps in each ticker's volume history.
v=data['Volume']

# With the (ticker, date) index established by the preceding set_index, unstack() moves the dates into the columns and leaves one row per ticker.
v=v.unstack()

# Fill along the date columns (axis=1) so a gap takes the same ticker's previous value; the default axis would copy from the ticker in the row above.
# ffill() also replaces fillna(method='ffill'), which is deprecated in recent pandas releases.
v=v.ffill(axis=1)

# Back to a (ticker, date) Series; stack() restores the original level order, so the assignment below aligns with data's index.
v=v.stack()

data['Volume']=v

data.info()

In [None]:
# Forward-fill gaps in each ticker's volume-weighted average price history.
# NOTE(review): the rebuilt frame above only contains the OHLCV columns, so 'Volume Weighted Average Price' may not exist here - confirm where this column comes from.
vwap=data['Volume Weighted Average Price']

# With a (ticker, date) index, unstack() moves the dates into the columns and leaves one row per ticker.
vwap=vwap.unstack()

# Fill along the date columns (axis=1) so a gap takes the same ticker's previous value; the default axis would copy from the ticker in the row above.
# ffill() also replaces fillna(method='ffill'), which is deprecated in recent pandas releases.
vwap=vwap.ffill(axis=1)

# Back to a (ticker, date) Series so the assignment below aligns with data's index.
vwap=vwap.stack()

data['Volume Weighted Average Price']=vwap

data.info()

In [None]:
# Average of open/high/low/close, weighted by volume and divided by total volume.
# NOTE(review): at this point o, h, l and c are the reset-index frames built in an earlier cell (they include the 'ticker'/'date' columns) while v was rebound to a stacked Series, and .sum() collapses the rows - confirm these operands and the resulting shape are what is intended before it is assigned as a column below.
vwap=o.add(h).add(l).add(c).div(4).mul(v).sum().div(v.sum())

# 20-row rolling mean of volume.
# NOTE(review): rolling() and pct_change() below follow the frame's row order without grouping by ticker, so values near each ticker's first rows mix in the previous ticker's data.
adv20=data['Volume'].rolling(20).mean() 

# Row-over-row fractional change of the close.
r=data['Close'].pct_change()

# 20-row rolling standard deviation of r ...
r20=r.rolling(20).std()

# ... scaled by sqrt(252) (presumably annualising daily volatility - confirm the data frequency).
r20=r20.mul(np.sqrt(252))

data=data.assign(adv20=adv20,r=r,r20=r20,vwap=vwap)

data=data.dropna()

data.info()

In [None]:
# Forward-fill gaps in each ticker's return history: dates as rows, one column per ticker.
# NOTE(review): the cells above create a column named 'r', not 'returns' - confirm that 'returns' exists at this point.
r=data.returns.unstack("ticker")

# Fill down the date rows. ffill() replaces fillna(method='ffill'), which is deprecated in recent pandas releases.
r=r.ffill()

# stack("ticker") makes 'ticker' the innermost level; restore data's own level order so the assignment below aligns on matching index tuples instead of producing NaN.
r=r.stack("ticker").reorder_levels(data.index.names)

data['returns']=r

data.info()

In [None]:
# HYPOTHESIS: If the stock price of a company has increased over the last 2 days, it may decrease in the future (time series delta of closing price today and closing price 2 days ago).

# IMPLEMENTATION: If company A's stock price has increased twice as much as the stock price of another company B, the prices of both stocks may decrease in the future. In this reversion example, stock A may not fall twice as far as stock B, but it may fall more than stock B (hence the rank operator).

# HINT: Can different neutralizations and decay settings improve this signal? Under what neutralization would a reversion idea work best?

# NOTE(review): the expressions below use names (`close`, `sector`) and keyword arguments (`neutralize`, `decay`) that nothing visible in this notebook defines, and the rank()/ts_delta() helpers defined above take (df, feature). They read like alpha expressions written for a separate expression language rather than runnable Python - confirm before executing this cell.
# Base signal: rank of the negated 2-day change in the close.
rank(-ts_delta(close,2))

# Do different neutralization and decay settings improve this signal?

rank(-ts_delta(close,2),neutralize=sector)

rank(-ts_delta(close,2),decay=5)

rank(-ts_delta(close,2),decay=5,neutralize=sector)

# Under which neutralization does the reversion idea work best?

rank(-ts_delta(close,2),neutralize=sector).quantiles(5)


