In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import neighbors
from sklearn import ensemble
from lightgbm import LGBMRegressor

In [2]:
from warnings import filterwarnings
filterwarnings("ignore")

def calc_spread_return_per_day(df, portfolio_size=200, toprank_weight_ratio=2):
    assert df['Rank'].min() == 0
    assert df['Rank'].max() == len(df['Rank']) - 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
    short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
    return purchase - short

def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size=200, toprank_weight_ratio=2):
    buf = df.groupby('Date').apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio, buf

In [3]:
def prepare_data(df):
    for col in ['Open', 'High', 'Low', 'Close']:
        df[col] = df[col] * df['AdjustmentFactor']
    
    df.ExpectedDividend.fillna(0, inplace=True)
    df.drop(['AdjustmentFactor', 'RowId'], axis=1, inplace=True)
    df['SupervisionFlag'] = df['SupervisionFlag'].astype(int)
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.week
    df['Day'] = df['Date'].dt.day
    
    
    df.sort_values(by=['SecuritiesCode','Date'], inplace=True)
    df['Open'].interpolate(inplace=True)
    df['High'].interpolate(inplace=True)
    df['Low'].interpolate(inplace=True)
    df['Close'].interpolate(inplace=True)
    df.loc[df['Volume'] == 0,"Volume"] = np.nan
    df['Volume'].interpolate(inplace=True)
    df['Target'].interpolate(inplace=True)
    
    return df

def create_diffs(df):
    d1 = lambda df,col: df[col].diff(periods=1)/(df[col] + 1e-8)

    for f in ['Open', 'High', 'Low', 'Close', 'Volume']:
        df['diff' + f] = d1(df, f)
    df['pctDailyChange'] = (df['Close'] - df['Open'])/df['Close']
    df = df.dropna()
    return df

t = pd.read_csv("../jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv",
                parse_dates=["Date"])
t = prepare_data(t)
t = create_diffs(t)

In [4]:
# codes = t.SecuritiesCode.unique()
# new_codes = dict(zip(codes, np.arange(2000)))
# t = t.replace({'SecuritiesCode' : new_codes})

In [5]:
stock = t.loc[t['SecuritiesCode'] == 1376].reset_index(drop=True).copy()
stock = create_diffs(stock)

In [6]:
# fig1, ax1 = plt.subplots(figsize=(50, 20), dpi=30)
# ax1.plot(stock['Close'][-200:],color='green') 
# ax1.plot(stock['High'][-200:], color='red') 
# ax1.plot(stock['Low'][-200:], color='black') 

In [7]:
def normalize(x):
    std = x.std()
    mean = x.mean()
    x = (x - mean) / std
    return x

In [8]:
# for f in features:
#     stock[f] = normalize(stock[f].values)

In [15]:
features = ['diffOpen', 'diffHigh', 'diffLow', 'diffClose', 'diffVolume', 'pctDailyChange',
            'ExpectedDividend',]

ohlc = ['Open', 'High', 'Low', 'Close']
target = 'Target'

In [10]:
# train_df = t.loc[t.Date < t.Date.unique()[100], [*features, target]]
# test_df = t.loc[t.Date == t.Date.unique()[100], [*features, target]]
# X_train = train_df[features].to_numpy()
# y_train = train_df[target].to_numpy()
# X_test = test_df[features].to_numpy()
# y_test = test_df[target].to_numpy()

In [16]:
from sklearn.metrics import mean_absolute_error


X_train, y_train = [], []
history_size = 1 # training history
val_size = 100

for i in range(0, len(stock) - history_size - val_size):
    tmp = []
    for j in range(history_size):
        for f in features:
            tmp.append(stock[f].iloc[i+j])
    X_train.append(tmp)
    y_train.append(stock[target].iloc[i+history_size - 1])

X_test, y_test = [], []
for i in range(len(stock) - history_size - val_size, len(stock) - history_size):
    tmp = []
    for j in range(history_size):
        for f in features:
            tmp.append(stock[f].iloc[i+j])
    X_test.append(tmp)
    y_test.append(stock[target].iloc[i+history_size - 1])

In [17]:
# from sklearn.model_selection import GridSearchCV
              
# lgbm_regr = LGBMRegressor(boosting_type='goss',objective='regression',
#                           metrics='rmse',learning_rate=0.00001, n_estimators=20,
#                           max_depth=10)
# lgbm_regr.fit(X_train,y_train)
# y_pred = lgbm_regr.predict(X_test)
# mean_absolute_error(y_test, y_pred)

In [18]:
y_pred_best = []
errors = [] 
for i in range(10):
    regr = MLPRegressor(activation='identity', solver='adam', alpha=1e-3, tol=1e-5, max_iter=400, #random_state=i,
                         hidden_layer_sizes= tuple([50]*1)).fit(X_train, y_train)

    #y_pred = np.array([np.array(y_train).mean()] * len(y_test))
    #y_pred = np.zeros(len(y_test)) 
    y_pred = regr.predict(X_test)
    error = mean_absolute_error(y_test, y_pred)
    errors.append(error)
    if error < 0.83:
        y_pred_best = y_pred
    print(error)
print(min(errors))

0.009605654182638292
0.009023391928445427
0.00881786668843058
0.008987550158280283
0.008677553812301899
0.008819805640479985
0.008723310371747848
0.009253691990052043
0.009019259291144376
0.009344389447640055
0.008677553812301899


In [None]:
errors.index(min(errors))

In [None]:
errors.index(min(errors))

In [None]:
fig1, ax1 = plt.subplots(figsize=(50, 20), dpi=50)
ax1.tick_params(axis='y', which='both',    
                labeltop='on', labelbottom='off', labelsize=27)
ax1.tick_params(axis='x', labelbottom='off', labelsize=27)
ax1.grid()
ax1.plot(y_pred_best, lw=3, color='orange')
ax1.plot(y_test, lw=3, color='green')

In [None]:
np.corrcoef(y_pred_best, y_test)

In [None]:
y_train_pred1 = regr1.predict(X_train)
y_train_pred2 = regr2.predict(X_train)
y_train_pred3 = regr3.predict(X_train)


ensamble_train_X = [[y_train_pred1[i], y_train_pred2[i], y_train_pred3[i]] for i in range(len(y_train_pred1))]
regr_ens = MLPRegressor().fit(ensamble_train_X, y_train)

In [None]:
y_test_pred1 = regr1.predict(X_test)
y_test_pred2 = regr2.predict(X_test)
y_test_pred3 = regr3.predict(X_test)
ensamble_test_X = [[y_test_pred1[i], y_test_pred2[i], y_test_pred3[i]] for i in range(len(y_test_pred1))]
y_pred = regr_ens.predict(ensamble_test_X)
corr = abs(np.corrcoef(y_pred, y_test)[0][1])
corr

In [None]:
pricesSup = pd.read_csv('./supplemental_files/stock_prices.csv', parse_dates=["Date"])
pricesSup = prepare_data(pricesSup)
pricesSup = pricesSup.replace({'SecuritiesCode' : new_codes})
pricesSup = create_diffs(pricesSup)

with torch.no_grad():
    for date in pricesSup.Date.unique():
        x = torch.Tensor(pricesSup.loc[pricesSup.Date==date, [*features]].to_numpy()).to(device)
        pred = model(x)
        pricesSup.loc[pricesSup.Date==date, "Prediction"] = pred.cpu().detach().numpy()
        pricesSup.loc[pricesSup.Date==date, "Rank"] = pricesSup.loc[pricesSup.Date==date, "Prediction"].rank(ascending=False, method="first") - 1
        pricesSup.loc[pricesSup.Date==date, "Rank"] = pricesSup.loc[pricesSup.Date==date, "Rank"].astype("int")

sharpe_ratio, buf = calc_spread_return_sharpe(pricesSup)
sharpe_ratio