In [5]:
import pandas as pd
import numpy as np
import talib as ta
import os 
from alphien.utils import get_all_data, _read_output
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [2]:
def add_features(data):
    """Add features to raw data

    Parameters
    ----------
    data: pd.DataFrame
        Raw data

    Returns
    -------
    pd.DataFrame
        Results
    
    NOTE: MACD and SMA are adjusted to cum. corp. actions effect
    
    """
    data = data.copy()
    data['date'] = pd.to_datetime(data['date'])
    data = data.set_index('date')
    data['tradevol'] = (data['volume']) * ((data['high'] + data['low'])/2)
    data['spread'] = data['high'] - data['low']
    data['returns'] = data['adjClose'].pct_change(1)
    data['vol'] = data['returns'].rolling(126).std()
    


    data['MOM5'] = data['adjClose'].pct_change(5)
    data['MOM22'] = data['adjClose'].pct_change(22)
    data['MOM252'] = data['adjClose'].pct_change(252)

    data['MACD'], data['MACDSignal'], data['MACDHist'] = ta.MACD(data['adjClose'], 12, 26, 9)
    #Scaling
    data['MACD'] = data['MACD']/data['adjClose']
    data['MACDSignal'] = data['MACDHist']/data['adjClose']
    data['MACDSignal'] = data['MACDHist']/data['adjClose']

    data['RSI14'] = ta.RSI(data['adjClose'], 14)
    data['RSI28'] = ta.RSI(data['adjClose'], 28)
    data['RSI56'] = ta.RSI(data['adjClose'], 56)

    data['ADX14'] = ta.ADX(data['adjHigh'], data['adjLow'], data['adjClose'], 14)
    data['ADX56'] = ta.ADX(data['adjHigh'], data['adjLow'], data['adjClose'], 56)

    data['SMA14'] = ta.SMA(data['adjClose'], 14)/data['adjClose']
    data['SMA42'] = ta.SMA(data['adjClose'], 42)/data['adjClose']



    for lag in [3,4,14,56,112,224]:
        col = 'C' + str(lag)
        data[col] = data['close'].shift(lag)




    data = data.drop(['adjHigh', 'adjLow', 'adjOpen', 'adjClose','adjVolume','divCash','splitFactor'], axis = 1)


    # data['Forward Return Daily'] = data['adjClose'].shift(-1)/data['adjClose']
    # data['Forward Return Quarterly'] = data['adjClose'].shift(-63)/data['adjClose']
    data = data.replace(np.inf,np.nan)
    data = data.dropna()
    return data

def rebase(arr, idx = -1):
    return arr/arr[idx]

In [3]:
datalist = list(_read_output().values())
datalist = [data for data in datalist if data.shape[0] > 1]
datalist = [add_features(data) for data in datalist]

In [6]:
labels = pd.read_csv(os.path.join(os.getcwd(), "labels60.csv")).drop(["Unnamed: 0"], axis = 1)
labels['target'] = 1

In [7]:
df = pd.concat(datalist)
df = df.reset_index()
df = df.sort_values(by=['date','ticker'])
# df = df.set_index(['date','ticker'])
df = df.set_index('date')
df = df['2000-01-01':'2020-07-17']
df = df.reset_index()
df["date"] = df["date"].astype(str)

In [8]:
df = df.merge(labels, left_on= ["date", "ticker"], right_on= ["date", "ticker"], how="left")
df = df.fillna(0)

#drop ticker
df = df.drop('ticker', axis = 1)
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')

#drop dates
df_train = df['2009-01-01':'2016-01-01'].reset_index(drop = True)
df_past = df['2000-01-01':'2009-01-01'].reset_index(drop=True)
df_future = df['2016-01-01':'2020-07-07'].reset_index(drop=True)

#LB test
trading_days = df.index.unique()
dfq = pd.DataFrame(index = trading_days)
dfq['trading_days'] = trading_days
dfq = dfq.resample('Q').bfill()
lb_idx = pd.to_datetime(dfq['trading_days'].values[:-1])
lb_idx = lb_idx.tz_localize('UTC')
df_lb = df.loc[lb_idx].copy().reset_index(drop=True)

In [9]:
Y_train = df_train['target']
X_train = df_train.drop("target", axis = 1)

Y_past = df_past['target']
X_past = df_past.drop("target", axis = 1)

Y_future = df_future['target']
X_future = df_future.drop("target", axis = 1)

Y_lb = df_lb['target']
X_lb = df_lb.drop("target", axis = 1)

In [13]:
df

Unnamed: 0_level_0,close,high,low,open,volume,tradevol,spread,returns,vol,MOM5,MOM22,MOM252,MACD,MACDSignal,MACDHist,RSI14,RSI28,RSI56,ADX14,ADX56,SMA14,SMA42,C3,C4,C14,C56,C112,C224,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2000-01-03 00:00:00+00:00,111.94,112.50,101.69,104.87,4783900,5.123318e+08,10.81,0.088805,0.038954,0.127178,0.086163,1.713697,0.020690,0.000472,0.000407,64.527080,62.013552,61.015343,23.559854,25.205360,0.898038,0.876111,100.69,98.19,99.00,64.03,50.94,37.690,0.0
2000-01-03 00:00:00+00:00,15.56,15.75,15.25,15.50,696200,1.079110e+07,0.50,0.024358,0.043181,0.121037,0.250804,-0.565485,0.032433,0.024622,0.076232,67.775363,53.111243,45.256718,31.731481,24.477765,0.886293,0.837679,14.69,15.13,12.44,18.00,25.37,75.120,0.0
2000-01-03 00:00:00+00:00,36.50,37.31,36.50,37.03,92800,3.424784e+06,0.81,-0.006803,0.052138,-0.058065,0.186992,3.112676,0.039850,-0.049115,-0.896340,47.207328,54.832295,58.254673,50.202857,28.568606,1.176262,0.935127,36.00,35.50,45.75,17.00,15.88,9.375,0.0
2000-01-03 00:00:00+00:00,35.00,36.00,34.75,35.25,10635000,3.762131e+08,1.25,-0.036078,0.024361,-0.058887,-0.065171,-0.277261,-0.008137,-0.001585,-0.014502,40.787791,43.629951,44.672127,17.448432,10.207186,1.048122,1.057388,36.38,37.06,33.93,38.69,43.13,45.000,0.0
2000-01-03 00:00:00+00:00,17.69,17.75,17.50,17.56,1509900,2.661199e+07,0.25,0.000000,0.034218,-0.024269,-0.006738,-0.081008,-0.003291,0.000168,0.002326,46.279919,47.261755,48.272910,9.159197,4.607502,1.013567,1.011282,18.19,18.45,17.75,21.75,17.81,19.630,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-07-17 00:00:00+00:00,90.57,91.00,89.43,90.05,1293900,1.167292e+08,1.57,0.009812,0.037177,0.038171,-0.014043,-0.184760,0.001739,0.004442,0.400340,58.795319,54.284024,51.200794,14.461969,8.274992,0.968990,0.987778,88.18,87.43,84.79,88.27,103.74,116.970,0.0
2020-07-17 00:00:00+00:00,135.14,136.00,130.88,131.11,1639386,2.187597e+08,5.12,0.035080,0.040159,0.131541,0.033275,0.097705,0.011435,0.013242,1.786272,66.511949,58.203801,53.537474,15.979496,10.423259,0.903682,0.925371,124.17,120.87,115.43,114.74,159.01,134.500,0.0
2020-07-17 00:00:00+00:00,273.10,274.54,269.10,269.10,258700,7.031983e+07,5.44,0.020324,0.039169,0.047685,0.024919,0.494882,0.013850,0.004320,1.179883,61.186521,57.949549,55.504660,15.891036,11.875150,0.952278,0.949440,266.47,255.50,246.42,228.48,252.09,198.590,0.0
2020-07-17 00:00:00+00:00,32.56,33.65,32.46,33.24,2325300,7.686279e+07,1.19,-0.024858,0.040882,0.010866,-0.119286,-0.260190,-0.011230,-0.000161,-0.005185,47.700685,48.928800,48.193604,12.010633,13.522322,0.998508,1.043451,32.02,32.88,32.29,31.33,47.01,39.490,0.0


In [10]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [8]:
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier

param_grid = {"learning_rate": [0.01, 0.001, 0.03]}

model = HistGradientBoostingClassifier(max_iter=1000, learning_rate= 0.01, validation_fraction=0.4, early_stopping=True, verbose=0)
model = hg.fit(X_train, Y_train)

Y_train_pred = model.predict(X_train)
Y_past_pred = model.predict(X_past)
Y_future_pred = model.predict(X_future)
Y_lb_pred = model.predict(X_lb)

train_score = roc_auc_score(Y_train, Y_train_pred)
past_score = roc_auc_score(Y_past, Y_past_pred)
future_score = roc_auc_score(Y_future, Y_future_pred)
lb_score = roc_auc_score(Y_lb, Y_lb_pred)

print(train_score, past_score, future_score, lb_score)

n loss: 0.25771, val loss: 0.26046, in 0.101s
[795/1000] 1 tree, 31 leaves, max depth = 9, train loss: 0.25770, val loss: 0.26045, in 0.105s
[796/1000] 1 tree, 31 leaves, max depth = 13, train loss: 0.25768, val loss: 0.26043, in 0.110s
[797/1000] 1 tree, 31 leaves, max depth = 12, train loss: 0.25766, val loss: 0.26041, in 0.089s
[798/1000] 1 tree, 31 leaves, max depth = 11, train loss: 0.25764, val loss: 0.26040, in 0.096s
[799/1000] 1 tree, 31 leaves, max depth = 10, train loss: 0.25763, val loss: 0.26039, in 0.092s
[800/1000] 1 tree, 31 leaves, max depth = 11, train loss: 0.25762, val loss: 0.26038, in 0.100s
[801/1000] 1 tree, 31 leaves, max depth = 10, train loss: 0.25760, val loss: 0.26037, in 0.105s
[802/1000] 1 tree, 31 leaves, max depth = 15, train loss: 0.25758, val loss: 0.26035, in 0.097s
[803/1000] 1 tree, 31 leaves, max depth = 10, train loss: 0.25756, val loss: 0.26034, in 0.108s
[804/1000] 1 tree, 31 leaves, max depth = 13, train loss: 0.25755, val loss: 0.26033, in 0.

In [14]:
from sklearn.ensemble import RandomForestClassifier


# params_grid = {'n_estimators': [50, 100, 200]}
# model_gs = GridSearchCV(rf, params_grid, cv=5)
# model_gs.fit(X_train, y_train)

params_rf = {'n_estimators': 200}
model = RandomForestClassifier(**params_rf)
model = rf.fit(X_train, Y_train)
model.fit(X_train, Y_train)

Y_train_pred = model.predict(X_train)
Y_past_pred = model.predict(X_past)
Y_future_pred = model.predict(X_future)
Y_lb_pred = model.predict(X_lb)

train_score = roc_auc_score(Y_train, Y_train_pred)
past_score = roc_auc_score(Y_past, Y_past_pred)
future_score = roc_auc_score(Y_future, Y_future_pred)
lb_score = roc_auc_score(Y_lb, Y_lb_pred)

print(train_score, past_score, future_score, lb_score)

ValueError: n_estimators must be an integer, got <class 'list'>.

0.5229484892415796 0.5067157296734613 0.5038306067071535 0.5139969823941288
