<div style="text-align:center;font-size:150%;background:CornflowerBlue;color:white;">
    <div>Thank you 🙏,</div> 
    <div>Kaggle Community for everything!</div></div>

In [None]:
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction
import xgboost as xgb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Load the training prices and discard the columns this notebook never uses.
prices = pd.read_csv(
    "../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv"
)
prices = prices.drop(
    columns=["RowId", "ExpectedDividend", "SupervisionFlag", "AdjustmentFactor"]
)
# `df` is a second name for the same frame, not a copy.
df = prices

# Technical Indicators
Several technical indicators, added as extra features to improve the model.

In [None]:
def EVM(data, ndays):
    """Append a rolling-mean Ease of Movement column to ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'High', 'Low' and 'Volume' columns.
    ndays : int
        Window length (in rows) of the rolling mean.

    Returns
    -------
    pd.DataFrame
        A new frame equal to ``data`` plus one column named ``'EVM_<ndays>'``.
        ``data`` itself is not modified.
    """
    midpoint = (data['High'] + data['Low']) / 2
    distance_moved = midpoint - midpoint.shift(1)
    box_ratio = (data['Volume'] / 100000000) / (data['High'] - data['Low'])

    # A row with zero volume makes box_ratio 0 and the quotient +/-inf. An inf
    # inside a rolling window poisons the mean and is not removed by a later
    # fillna(), so treat those rows as missing instead.
    ease = (distance_moved / box_ratio).replace([np.inf, -np.inf], np.nan)

    evm_ma = ease.rolling(ndays).mean().rename('EVM_' + str(ndays))
    return data.join(evm_ma)


# Exponentially-weighted Moving Average 
def EWMA(data, ndays):
    """Return ``data`` with an exponentially-weighted mean of 'Close' appended.

    The new column is named ``'EWMA_<ndays>'``; it uses ``span=ndays`` and
    emits values once ``ndays - 1`` observations are available.
    """
    smoother = data['Close'].ewm(span=ndays, min_periods=ndays - 1)
    ema = smoother.mean().rename('EWMA_' + str(ndays))
    return data.join(ema)


# Rate of Change (ROC)
def ROC(data,n):
    """Return ``data`` with the n-row rate of change of 'Close' appended.

    The new column ``'Rate_of_Change_<n>'`` holds
    ``(Close - Close shifted by n) / (Close shifted by n)``.
    """
    close = data['Close']
    change = close.diff(n)
    base = close.shift(n)
    rate = (change / base).rename('Rate_of_Change_' + str(n))
    return data.join(rate)


def ADX(data: pd.DataFrame, period: int):
    """
    Computes the ADX indicator.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'High', 'Low' and 'Close' columns.
    period : int
        Smoothing period; every exponential average below uses
        ``alpha = 1 / period`` with ``adjust=False``.

    Returns
    -------
    pd.DataFrame
        A copy of ``data`` with one extra column ``'ADX<period>'``.

    All intermediate series are kept in local variables rather than written to
    (and later deleted from) the frame, so input columns that happen to be
    called 'TR', 'ATR', 'DX', '+DX', ... are left untouched.
    """

    df = data.copy()
    alpha = 1/period

    high = df['High']
    low = df['Low']
    prev_close = df['Close'].shift(1)

    # True range: the largest of the three candidate ranges per row
    # (row-wise max skips the NaNs produced by the shift on the first row).
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)
    atr = true_range.ewm(alpha=alpha, adjust=False).mean()

    # Directional movement: keep only the dominant, positive move of each row.
    up_move = high - high.shift(1)
    down_move = low.shift(1) - low
    plus_dm = pd.Series(
        np.where((up_move > down_move) & (up_move > 0), up_move, 0.0),
        index=df.index,
    )
    minus_dm = pd.Series(
        np.where((up_move < down_move) & (down_move > 0), down_move, 0.0),
        index=df.index,
    )

    # Directional indicators: smoothed movement relative to ATR, in percent.
    plus_di = (plus_dm.ewm(alpha=alpha, adjust=False).mean() / atr) * 100
    minus_di = (minus_dm.ewm(alpha=alpha, adjust=False).mean() / atr) * 100

    # DX is NaN where both indicators are zero (0/0); ewm carries on from the
    # first valid value.
    dx = ((plus_di - minus_di).abs() / (plus_di + minus_di)) * 100
    df['ADX'+str(period)] = dx.ewm(alpha=alpha, adjust=False).mean()

    return df

# Window lengths (in rows) used for every indicator.
Days15 = 15
Days50 =50


def _add_indicator_columns(frame):
    """Append EVM, EWMA and ADX columns for both windows to one price series."""
    frame = EVM(frame, Days15)
    frame = EVM(frame, Days50)
    frame = EWMA(frame, Days15)
    frame = EWMA(frame, Days50)
    # ROC is defined above but deliberately not used as a feature.
    frame = ADX(frame, Days15)
    frame = ADX(frame, Days50)
    return frame


def add_extras(df, group_col="SecuritiesCode"):
    """Return ``df`` with the technical-indicator columns appended.

    The indicators rely on ``shift``/``rolling``/``ewm``, i.e. on neighbouring
    rows. When ``df`` holds several securities, computing them over the whole
    frame mixes prices of unrelated securities, so they are computed separately
    for each value of ``group_col`` and the original row order is restored.

    Parameters
    ----------
    df : pd.DataFrame
        Price rows with 'High', 'Low', 'Close' and 'Volume' columns.
        Rows of one security are assumed to already be in chronological
        order -- TODO confirm / sort by date upstream.
    group_col : str or None, default "SecuritiesCode"
        Column identifying the security. If None or absent, the whole frame is
        treated as a single series.

    Returns
    -------
    pd.DataFrame
        ``df`` plus EVM_15, EVM_50, EWMA_15, EWMA_50, ADX15 and ADX50.
    """
    if group_col is None or group_col not in df.columns:
        return _add_indicator_columns(df)

    parts = [
        _add_indicator_columns(group)
        for _, group in df.groupby(group_col, sort=False, dropna=False)
    ]
    if not parts:
        # Empty input: nothing to group, but the new columns are still added.
        return _add_indicator_columns(df)

    out = pd.concat(parts)
    # Concatenation orders rows by group; put them back in the input order
    # (only possible when the index identifies rows uniquely).
    return out.loc[df.index] if df.index.is_unique else out

df = add_extras(df)
df = df.fillna(0)

# Baseline

In [None]:
# Pivot the long frame into a wide one: one row per date, and for every
# feature one column per security.
df["Date"] = pd.to_datetime(df["Date"])
best = (
    df.groupby(['Date', 'SecuritiesCode'])
      .mean()
      .unstack("SecuritiesCode")
)
best.head(3)

# Importing models

I select LinearRegression and XGBRegressor because their prediction techniques are different: LinearRegression fits a linear model, while XGBRegressor is tree-based.

Let's see how the HYBRID will work ☢!

In [None]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor


# Stand-alone instances of the two estimator types.
# NOTE(review): in the cells visible here `linear` is never referenced again and
# `model` is rebound to a BoostedHybrid in the modelling cell -- confirm whether
# these two objects are still needed.
linear = LinearRegression(fit_intercept=False)
model = XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=-1)

# Hybrid Model

In [None]:
class BoostedHybrid:
    """Two-stage regressor: ``model_1`` fits the targets, ``model_2`` its residuals.

    ``model_1`` is trained on a wide target frame (one column per series).
    Its in-sample residuals are flattened row by row into one long vector,
    on which ``model_2`` is trained. ``predict`` adds both contributions and
    returns them in the wide layout again.

    Both models only need ``fit(X, y)`` and ``predict(X)``.
    """

    def __init__(self, model_1, model_2):
        self.model_1 = model_1
        self.model_2 = model_2
        self.y_columns = None  # target columns, captured in fit()
        self.y_fit = None      # in-sample predictions of model_1
        self.y_resid = None    # flattened residuals model_2 was trained on

    def fit(self, X_1, X_2, y):
        """Fit ``model_1`` on ``(X_1, y)`` and ``model_2`` on ``(X_2, residuals)``.

        Parameters
        ----------
        X_1 : pd.DataFrame
            Features for ``model_1``, one row per row of ``y``.
        X_2 : array-like
            Features for ``model_2``, one row per (row of ``y``, column of
            ``y``) pair, ordered row by row.
        y : pd.DataFrame
            Wide target frame.

        Returns
        -------
        BoostedHybrid
            ``self``, to allow chaining.

        Raises
        ------
        ValueError
            If ``X_2`` does not have one row per flattened residual.
        """
        self.model_1.fit(X_1,y)
        y_fit = pd.DataFrame(
            self.model_1.predict(X_1),
            index=X_1.index, columns=y.columns)
        print("First Model is ready")

        # Flatten row by row. Doing this on the array (instead of
        # stack().squeeze()) keeps a one-element result a Series rather than a
        # scalar and does not depend on the pandas version's stack() behaviour.
        resid_wide = y - y_fit
        y_resid = pd.Series(
            resid_wide.to_numpy().ravel(),
            index=pd.MultiIndex.from_product([resid_wide.index, resid_wide.columns]),
        ).fillna(0)
        if len(y_resid) != len(X_2):
            raise ValueError(
                "X_2 has %d rows but %d residuals were produced"
                % (len(X_2), len(y_resid))
            )

        print("Second Model is preparing")
        self.model_2.fit(X_2, y_resid)
        print("Second model is ready too")
        self.y_columns = y.columns
        self.y_fit = y_fit
        self.y_resid = y_resid
        return self

    def predict(self, X_1, X_2):
        """Return ``model_1`` predictions plus ``model_2`` residual predictions.

        Parameters
        ----------
        X_1 : pd.DataFrame
            Features for ``model_1``; its index becomes the result's index.
        X_2 : array-like
            Features for ``model_2``, one row per (row of ``X_1``, target
            column) pair, ordered row by row.

        Returns
        -------
        pd.DataFrame
            Shape ``(len(X_1), number of target columns)``, columns as seen in
            ``fit``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If the two models' outputs do not describe the same number of cells.
        """
        if self.y_columns is None:
            raise RuntimeError("BoostedHybrid.predict() called before fit()")

        n_rows, n_cols = len(X_1), len(self.y_columns)
        base = np.asarray(self.model_1.predict(X_1), dtype=float).reshape(n_rows, n_cols)
        resid = np.asarray(self.model_2.predict(X_2), dtype=float).ravel()
        if resid.size != base.size:
            raise ValueError(
                "model_2 produced %d values, expected %d (rows of X_1 x target columns)"
                % (resid.size, base.size)
            )

        return pd.DataFrame(
            base + resid.reshape(n_rows, n_cols),
            index=X_1.index, columns=self.y_columns)


# Target Value

In [None]:
# Wide target frame (dates x securities). Securities that are not present from
# the very beginning have no target on the early dates; those gaps become 0.
y = best['Target'].fillna(0)
y.shape

# X for Linear Regression


In [None]:
from statsmodels.tsa.deterministic import DeterministicProcess

# Deterministic time features over the target's date index, requested with
# order=2 and constant=True.
# NOTE(review): `dp` and `X_1` are reassigned by the following cell (order=1),
# so this version is superseded when the notebook runs top to bottom.
dp = DeterministicProcess(index=y.index, order=2, constant=True)
X_1 = dp.in_sample()
X_1.shape

In [None]:
from statsmodels.tsa.deterministic import DeterministicProcess

# Deterministic time features over the target's date index, requested with
# order=1. This is the X_1 that is later passed to model.fit().
dp = DeterministicProcess(index=y.index, order=1)

X_1 = dp.in_sample()
X_1.shape

# X for XGBRegressor



In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Long feature frame for the second-stage model: one row per (date, security),
# in the same row-by-row order as the stacked targets.
X_2 = best.drop('Target', axis=1).stack(dropna=False)
le = LabelEncoder()
X_2 = X_2.reset_index('SecuritiesCode')
X_2['SecuritiesCode'] = le.fit_transform(X_2['SecuritiesCode'])

# Calendar features taken from the date index.
X_2["Day"] = X_2.index.day
X_2['Year'] =X_2.index.year
X_2['Month'] = X_2.index.month
# DatetimeIndex.week was removed in pandas 2.0; isocalendar().week yields the
# same ISO week numbers. Cast to a plain int64 array so the column dtype
# matches what .week used to produce.
X_2['Week'] = X_2.index.isocalendar().week.astype("int64").to_numpy()
# 'Days' duplicates 'Day'; it is kept so the feature layout stays identical to
# the one built in the inference loop.
X_2['Days'] = X_2.index.day
X_2['quarter'] = X_2.index.quarter

# One blanket fill covers every column (the encoded code and the calendar
# columns cannot contain NaN, so they need no separate fill).
X_2 = X_2.fillna(0.0)
print(X_2.shape)
X_2.head(3)

# Modelling

In [None]:
from time import time
# Train the hybrid (linear trend + boosted trees on the residuals), report the
# wall-clock training time, then predict on the training inputs.
start = time()
model = BoostedHybrid(
    model_1=LinearRegression(),
    model_2=XGBRegressor(
        n_estimators=500,
        max_depth=11,
        learning_rate=0.05,
        subsample=0.9,
        random_state=17,
        tree_method='gpu_hist',
        n_jobs=-1,
    ),
)

model.fit(X_1, X_2, y)
print("Training time: ", time()-start)
y_pred = model.predict(X_1, X_2)


In [None]:
print("Simple Visualization of Hybrid Model Results")
# Targets in green and the hybrid's predictions in red, drawn on the same axes.
ax1 = plt.figure(figsize=(12, 7)).gca()
ax1 = y.plot(ax=ax1, c="g", legend=None);
ax1 = y_pred.plot(ax=ax1, c='r', legend=None);

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
# mean_squared_error returns the MSE, so label it as such (it used to be
# printed as "MAE").
print("MSE of Prediction: ", mean_squared_error(y, y_pred))
# NOTE(review): the gaps in `y` were filled with 0 earlier in the notebook; a
# percentage error is relative to the true value, so this figure is presumably
# dominated by those zero targets -- confirm before quoting it.
print("MAPE: ", mean_absolute_percentage_error(y, y_pred))

In [None]:
# Build the environment object from jpx_tokyo_market_prediction and get its
# test iterator; the submission loop iterates over `iter_test` and sends each
# result back through `env.predict`.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
# (2404000,) (2000,) with dp 3 values for linear regression

# (2404000,) (2000,) 

In [None]:
# Inspect the index of the target frame `y`.
y.index

In [None]:
# Inspect the index of the `prices` frame.
prices.index

In [None]:
# Submission loop: build the same two feature sets as in training for every
# batch served by the API, predict, and submit a rank per security.
#
# The first-stage model was trained on the trend features of `dp` (order=1).
# The test features must have the same columns and must keep counting from the
# end of the training index, so they come from dp.out_of_sample() instead of a
# fresh DeterministicProcess that would restart at 1 on every test day.
seen_test_dates = []

# Encode securities exactly as in training (`le` was fitted on the training
# codes); a code never seen in training gets -1 instead of raising.
code_to_label = {code: label for label, code in enumerate(le.classes_)}

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    prices = prices.drop(
        columns=["RowId", "ExpectedDividend", "SupervisionFlag", "AdjustmentFactor"]
    )
    # NOTE(review): a batch presumably holds a single day of prices, in which
    # case the rolling/EWM indicators have no history to work with -- confirm
    # whether past prices should be carried along here.
    prices = add_extras(prices).fillna(0)
    prices["Date"] = pd.to_datetime(prices["Date"])
    prices = prices.groupby(['Date', 'SecuritiesCode']).mean().unstack("SecuritiesCode")

    # X_1: trend features continuing after the training period. Ask for every
    # test date seen so far and keep only the rows of the current batch.
    seen_test_dates.extend(prices.index)
    X_test_1 = dp.out_of_sample(
        steps=len(seen_test_dates),
        forecast_index=pd.DatetimeIndex(seen_test_dates),
    ).iloc[-len(prices.index):]

    # X_2: long (date, security) frame with the same columns as in training.
    X_test_2 = prices.stack(dropna=False).reset_index('SecuritiesCode')
    X_test_2['SecuritiesCode'] = (
        X_test_2['SecuritiesCode'].map(code_to_label).fillna(-1).astype(int)
    )
    X_test_2["Day"] = X_test_2.index.day
    X_test_2['Year'] = X_test_2.index.year
    X_test_2['Month'] = X_test_2.index.month
    # DatetimeIndex.week was removed in pandas 2.0; same ISO week numbers.
    X_test_2['Week'] = X_test_2.index.isocalendar().week.astype("int64").to_numpy()
    X_test_2['Days'] = X_test_2.index.day
    X_test_2['quarter'] = X_test_2.index.quarter
    X_test_2 = X_test_2.fillna(0.0)

    # Look predictions up by security code rather than relying on the row
    # order of sample_prediction matching the model's column order.
    # Assumes one date per batch -- TODO confirm.
    predicted = model.predict(X_test_1, X_test_2).iloc[-1]
    sample_prediction["Prediction"] = (
        sample_prediction["SecuritiesCode"].map(predicted).to_numpy()
    )

    # Rank 0 = highest prediction; the length follows the batch instead of a
    # hard-coded 2000.
    sample_prediction = sample_prediction.sort_values(by = "Prediction", ascending=False)
    sample_prediction["Rank"] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)

    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
    env.predict(submission)