# はじめに（概要、留意事項、免責事項など）
- Kaggleで開催されたJPX Tokyo Stock Exchange Prediction（ https://www.kaggle.com/competitions/jpx-tokyo-stock-exchange-prediction ）の解法コードとなります。
- コンペ提出時にエラーが発生したため、提出コードに対して一部コードを修正しています。
- Kaggleで提供されたデータセットは本GitHubでは提供できません。Kaggleにてダウンロードをお願いします。
- Kaggleで提供されたデータセットと、JPXが提供するAPI（JQuants API）で取得できるデータとは、一部整合性がありません。必要に応じて本コードに修正が必要です。
- 本コードはコンペで提供されたオプションデータを使います。現在、JPXのオプションデータを取得する手段がないため、本コードはそのままでは将来的に使うことができません。必要に応じて修正が必要です。
- 開発環境により、機械学習モデルの再現性がなくなる場合があります。本GitHubで提供する学習済みモデルは、GCPの16コアで学習しています。シードを指定した場合でもコア数等が異なる場合、モデルの再現性がなくなる場合があります。
- 本コードに対する質問は、マケデコDiscord（リンク： https://t.co/gyU7O50Wb5 ）にお願いします。ただし全てに回答するとは限りませんのでご承知おき願います。
- 本コードの利用により生じたいかなる損害についても、作成者は責任を負いかねます。

In [9]:
import warnings
warnings.simplefilter("ignore")

import time, datetime
import sys, gc, random

import numpy as np
import pandas as pd

random.seed(0)
np.random.seed(0)

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm
from joblib import Parallel, delayed

import lightgbm as lgb
import umap.umap_ as umap

from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression

pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

%matplotlib inline

import pickle

def pickle_dump(obj, path):
    """Pickle ``obj`` into the file at ``path``.

    The file is opened in binary write mode, so an existing file at ``path``
    is overwritten.
    """
    with open(path, "wb") as sink:
        pickle.dump(obj, sink)

def pickle_load(path):
    """Return the object unpickled from the file at ``path``.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files from a trusted source.
    """
    with open(path, "rb") as source:
        return pickle.load(source)

# JPX Class

In [None]:
class JPX(object):
    def __init__(self, isTRAIN, USE_SUPPLEMENT):
        self.isTRAIN        = isTRAIN
        self.use_supplement = USE_SUPPLEMENT
        self.stock_list     = None
        self.univ_codes     = []
        self.topix100_codes = []
        self.stock_prices   = None
        self.stock_fins     = None
        self.markets        = None
        self.options        = None
        self.fundamentals   = []
        self.data_dir          = "input/jpx-tokyo-stock-exchange-prediction/"
        self.external_data_dir = "input/jpx-external-data/"
        
        self.tick           = None
        self.limit          = None
        
    def _load_list(self):
        tmp = pd.read_csv(f"{self.data_dir}stock_list.csv")
        tmp = tmp[tmp["Universe0"]==True]
        tmp["33SectorCode"] = tmp["33SectorCode"].replace("-", 0)
        tmp["33SectorCode"] = tmp["33SectorCode"].astype("int")
        self.stock_list = tmp
        self.univ_codes = [c for c in sorted(tmp["SecuritiesCode"].unique())]
        self.topix100_codes= [c for c in tmp.loc[tmp["NewIndexSeriesSizeCode"].isin(["1", "2"]), "SecuritiesCode"]]
        
    def _load_fins(self):
        tmp = pd.read_csv(f"{self.data_dir}train_files/financials.csv")
        if self.use_supplement:
            tmp2 = pd.read_csv(f"{self.data_dir}supplemental_files/financials.csv")
            tmp = pd.concat([tmp, tmp2])
            tmp = tmp.drop_duplicates(subset=["Date", "SecuritiesCode"], keep="last")
            tmp = tmp.sort_values("Date")
        self.stock_fins = self._correct_fins(tmp)
        
    def _load_markets(self):
        tmp = pd.read_csv(f"{self.data_dir}train_files/secondary_stock_prices.csv")
        if self.use_supplement:
            tmp2 = pd.read_csv(f"{self.data_dir}supplemental_files/secondary_stock_prices.csv")
            tmp = pd.concat([tmp, tmp2])
            tmp = tmp.drop_duplicates(subset=["Date", "SecuritiesCode"], keep="last")
            tmp = tmp.sort_values("Date")
        if not self.isTRAIN:
            tmp = tmp[tmp["Date"]>="2019-12-01"]
        self.markets = self._secondary_to_market(tmp)
        
    def _load_prices(self):
        tmp = pd.read_csv(f"{self.data_dir}train_files/stock_prices.csv")
        if self.use_supplement:
            tmp2 = pd.read_csv(f"{self.data_dir}supplemental_files/stock_prices.csv")
            tmp = pd.concat([tmp, tmp2])
            tmp = tmp.drop_duplicates(subset=["Date", "SecuritiesCode"], keep="last")
            tmp = tmp.sort_values("Date")
        if not self.isTRAIN:
            tmp = tmp[tmp["Date"]>="2019-12-01"]
        self.stock_prices = self._correct_prices(tmp)
    
    def _load_options(self):
        tmp = pd.read_csv(f"{self.data_dir}train_files/options.csv")
        if self.use_supplement:
            tmp2 = pd.read_csv(f"{self.data_dir}supplemental_files/options.csv")
            tmp = pd.concat([tmp, tmp2])
            tmp = tmp.drop_duplicates(subset=["Date", "OptionsCode"], keep="last")
            tmp = tmp.sort_values("Date")
        if not self.isTRAIN:
            tmp = tmp[tmp["Date"]>="2019-12-01"]
        self.options = self._correct_options(tmp)
    
    def _load_tick(self):
        self.tick = np.array(pd.read_csv(f"{self.external_data_dir}tick.csv"))
    
    def _load_limit(self):
        self.limit = np.array(pd.read_csv(f"{self.external_data_dir}limit.csv"))
        
    def initialize(self):
        self._load_list()
        self._load_tick()
        self._load_limit()
        self._load_fins()
        self._load_markets()
        self._load_prices()
        self._load_options()
        
    def _correct_fins(self, tmp_fins):
        tmp_fins = tmp_fins.dropna(subset=["DisclosureNumber"])
        tmp_fins = tmp_fins.replace("－", np.nan)
        tmp_fins = tmp_fins[~tmp_fins["TypeOfDocument"].str.contains("OtherPeriod")]
        
        tmp_fins["Date"] = pd.to_datetime(tmp_fins["Date"])
        
        # simplify document type
        tmp_fins["TypeOfDocument"] = tmp_fins["TypeOfDocument"].apply(lambda x: x.replace("FY", "4Q"))
        tmp_fins["TypeOfDocument"] = tmp_fins["TypeOfDocument"].apply(lambda x: x.replace("_Consolidated", ""))
        tmp_fins["TypeOfDocument"] = tmp_fins["TypeOfDocument"].apply(lambda x: x.replace("_NonConsolidated", ""))
        tmp_fins["TypeOfDocument"] = tmp_fins["TypeOfDocument"].apply(lambda x: x.replace("_JP", ""))
        tmp_fins["TypeOfDocument"] = tmp_fins["TypeOfDocument"].apply(lambda x: x.replace("_US", ""))
        tmp_fins["TypeOfDocument"] = tmp_fins["TypeOfDocument"].apply(lambda x: x.replace("_IFRS", ""))
        tmp_fins["TypeOfDocument"] = tmp_fins["TypeOfDocument"].apply(lambda x: x.replace("FinancialStatements", "_FS"))
        tmp_fins["TypeOfDocument"] = tmp_fins["TypeOfDocument"].apply(lambda x: x.replace("ForecastRevision", "Revision"))
        tmp_fins["TypeOfDocument"] = tmp_fins["TypeOfDocument"].apply(lambda x: x.replace("NumericalCorrection", "Revision"))
        
        # make sequential period
        tmp_fins["FiscalYear"] = tmp_fins["CurrentFiscalYearStartDate"].apply(lambda x: float(x[:4]))
        tmp_fins["Period"] = tmp_fins["TypeOfDocument"].map({"1Q_FS":0, "2Q_FS":0.25, "3Q_FS":0.5, "4Q_FS":0.75, "Revision":np.nan}) + tmp_fins["FiscalYear"]
        tmp_fins["Period"] = tmp_fins.groupby("SecuritiesCode").ffill()["Period"]
        tmp_fins = tmp_fins.dropna(subset=["Period"])
        
        # pick up columns
        tmp_fins = tmp_fins[["Date", "SecuritiesCode", "TypeOfDocument", "NetSales", "OperatingProfit", "OrdinaryProfit", "Profit", 
                         "EarningsPerShare", "TotalAssets", "Equity", "EquityToAssetRatio", "BookValuePerShare",
                         "ForecastNetSales", "ForecastOperatingProfit", "ForecastOrdinaryProfit", "ForecastProfit",
                         "ForecastEarningsPerShare", "AverageNumberOfShares", "FiscalYear", "Period"]]
        
        # rename
        tmp_fins.rename(columns={"EarningsPerShare":"EPS", "EquityToAssetRatio":"EquityRatio", "BookValuePerShare":"BPS",
                             "ForecastEarningsPerShare":"ForecastEPS", "AverageNumberOfShares":"Shares"}, inplace=True)
        
        # data types
        float_n = ["EPS", "EquityRatio", "BPS", "ForecastEPS"]
        float_b = ["NetSales", "OperatingProfit", "OrdinaryProfit", "Profit", "TotalAssets", "Equity",
               "ForecastNetSales", "ForecastOperatingProfit", "ForecastOrdinaryProfit", "ForecastProfit", "Shares"]
        
        for f in float_n:
            tmp_fins[f] = tmp_fins[f].astype("float")
        
        for f in float_b:
            tmp_fins[f] = tmp_fins[f].astype("float")/1000000000
        
        # pick up stock in the universe
        tmp_fins = tmp_fins[tmp_fins["SecuritiesCode"].isin(self.univ_codes)]
        
        return tmp_fins
        
    def _secondary_to_market(self, tmp_secondary):
        tmp_secondary["Date"] = pd.to_datetime(tmp_secondary["Date"])
        tmp_markets = tmp_secondary.pivot(index="Date", columns="SecuritiesCode", values="Close")
        
        topix_etf = [1305, 1306, 1308, 1348, 1473, 1475, 2524, 2557]
        topix_markets = tmp_markets[topix_etf].copy()
        topix_markets["TOPIX"] = topix_markets.mean(axis=1)
        topix_markets.reset_index(inplace=True)
        topix_markets = topix_markets[["Date", "TOPIX"]]
        
        nikkei_etf = [1320, 1321, 1329, 1330, 1346, 1369, 1397, 2525]
        nikkei_markets = tmp_markets[nikkei_etf].copy()
        nikkei_markets["NIKKEI"] = nikkei_markets.mean(axis=1)
        nikkei_markets.reset_index(inplace=True)
        nikkei_markets = nikkei_markets[["Date", "NIKKEI"]]
        
        tmp_markets = pd.merge(topix_markets, nikkei_markets, on="Date")
        
        return tmp_markets
        
    def _correct_prices(self, tmp_prices):
        tmp_prices["Date"] = pd.to_datetime(tmp_prices["Date"])
        
        def get_limit(price):
            try:
                return self.limit[self.limit[:, 0]>price, 1][0]
            except:
                return np.nan
        
        def get_large_stock_tick(price):
            try:
                return self.tick[self.tick[:, 0]>price, 1][0]
            except:
                return np.nan
        
        def get_small_stock_tick(price):
            try:
                return self.tick[self.tick[:, 0]>price, 2][0]
            except:
                return np.nan
        
        tmp_prices["limit"] = tmp_prices["Close"].apply(get_limit)
        tmp_prices["large_stock_tick"] = tmp_prices["Close"].apply(get_large_stock_tick)
        tmp_prices["small_stock_tick"] = tmp_prices["Close"].apply(get_small_stock_tick)
        
        return tmp_prices
    
    def _correct_options(self, tmp_options):
        tmp_options["Date"] = pd.to_datetime(tmp_options["Date"])
        
        tmp_options = tmp_options[tmp_options["StrikePrice"]%500==0]
        tmp_options["month"] = tmp_options["ContractMonth"]%100
        tmp_options = tmp_options[tmp_options["month"].isin([3, 6, 9, 12])]
        
        return tmp_options
    
    def _get_fundas(self, code):
        """Build point-in-time fundamental features for one security.

        The rows of ``self.stock_fins`` for ``code`` are replayed in their
        stored order. After each row the per-period table ``fs`` is updated,
        all features are recomputed from the periods seen so far, and the
        feature row of the latest period is recorded under that row's
        ``Date``. Each output row therefore only uses disclosures that
        precede it in the replay order.

        Args:
            code: value matched against ``SecuritiesCode``.

        Returns:
            DataFrame with one row per ``Date`` (the last row wins when a
            date occurs more than once), holding ``Date``, ``SecuritiesCode``
            and the feature columns assembled in ``feats``.
        """
        tmp = self.stock_fins[self.stock_fins["SecuritiesCode"]==code].copy()
        
        periods = [p for p in sorted(tmp["Period"].unique())]
        cols = ["NetSales", "OperatingProfit", "OrdinaryProfit", "Profit", "EPS", "TotalAssets", "Equity", "EquityRatio", "BPS",
                "ForecastNetSales", "ForecastOperatingProfit", "ForecastOrdinaryProfit", "ForecastProfit", "ForecastEPS", "Shares"]
        
        # fs is the latest and modified financial statement at each date in "financials"
        # (one row per Period; "comp" marks periods for which a non-revision document was seen)
        fs = pd.DataFrame(index=periods, columns=["comp"])
        fs.index.name="Period"
        fs[cols] = np.nan
        
        res = []
        
        for i in range(len(tmp)):
            # update fs
            if tmp["TypeOfDocument"].iloc[i] == "Revision":
                # a revision overwrites only the fields it actually reports
                tmp_ser = tmp.iloc[i]
                tmp_cols = [f for f in tmp_ser[~tmp_ser.isna()].index if f not in ["Date", "SecuritiesCode", "TypeOfDocument", "FiscalYear", "Period"]]
                if len(tmp_cols) != 0:
                    fs.loc[tmp["Period"].iloc[i], tmp_cols] = tmp_ser[tmp_cols]
            else:
                # any other document replaces the whole period row and marks it complete
                fs.loc[tmp["Period"].iloc[i], "comp"] = 1
                fs.loc[tmp["Period"].iloc[i], cols] = tmp[cols].iloc[i]
            
            # make feature based on fs
            # (only completed periods; gaps are forward-filled from earlier periods)
            tmp_fs = fs.dropna(subset=["comp"])
            tmp_fs = tmp_fs.ffill()
            tmp_fs.reset_index(inplace=True)
            tmp_fs["PeriodDiff"] = tmp_fs["Period"].diff()
            # keep only the fractional part: 0, 0.25, 0.5 or 0.75 within the fiscal year
            tmp_fs["Period"] = tmp_fs["Period"]%1
            
            # quarterly adjustment of Profit Loss
            # Rows with Period == 0 keep the reported value; other rows use the
            # change from the previous completed period, scaled to one quarter
            # by 0.25/PeriodDiff. The result is multiplied by 4.
            tmp_cols = ["NetSales", "OperatingProfit", "OrdinaryProfit", "Profit"]
            for f in tmp_cols:
                tmp_fs["Yearly"+f] = tmp_fs[f]
                tmp_fs.loc[tmp_fs["Period"]!=0, "Yearly"+f] = tmp_fs[f].diff() * (0.25/tmp_fs["PeriodDiff"])
                tmp_fs["Yearly"+f] = tmp_fs["Yearly"+f] * 4
            
            # EPS
            tmp_fs["YearlyEPS"] = tmp_fs["YearlyProfit"] / tmp_fs["Shares"]
            
            # raw feature
            tmp_fs["Expense1"] = tmp_fs["YearlyNetSales"] - tmp_fs["YearlyOperatingProfit"]
            tmp_fs["Expense2"] = tmp_fs["YearlyOperatingProfit"] - tmp_fs["YearlyOrdinaryProfit"]
            tmp_fs["Expense3"] = tmp_fs["YearlyOrdinaryProfit"] - tmp_fs["YearlyProfit"]
            
            tmp_fs["ForecastExpense1"] = tmp_fs["ForecastNetSales"] - tmp_fs["ForecastOperatingProfit"]
            tmp_fs["ForecastExpense2"] = tmp_fs["ForecastOperatingProfit"] - tmp_fs["ForecastOrdinaryProfit"]
            tmp_fs["ForecastExpense3"] = tmp_fs["ForecastOrdinaryProfit"] - tmp_fs["ForecastProfit"]
            
            # ratio feature
            # NOTE(review): the non-forecast ratios below use the reported
            # columns (e.g. "Profit"), not the "Yearly*" columns -- confirm this is intended.
            tmp_fs["ProfitMargin1"] = tmp_fs["Profit"] / tmp_fs["NetSales"]
            tmp_fs["ProfitMargin2"] = tmp_fs["OrdinaryProfit"] / tmp_fs["NetSales"]
            tmp_fs["ProfitMargin3"] = tmp_fs["OperatingProfit"] / tmp_fs["NetSales"]
            
            tmp_fs["ForecastProfitMargin1"] = tmp_fs["ForecastProfit"] / tmp_fs["ForecastNetSales"]
            tmp_fs["ForecastProfitMargin2"] = tmp_fs["ForecastOrdinaryProfit"] / tmp_fs["ForecastNetSales"]
            tmp_fs["ForecastProfitMargin3"] = tmp_fs["ForecastOperatingProfit"] / tmp_fs["ForecastNetSales"]
            
            tmp_fs["ROE1"] = tmp_fs["Profit"] / tmp_fs["Equity"]
            tmp_fs["ROE2"] = tmp_fs["OrdinaryProfit"] / tmp_fs["Equity"]
            tmp_fs["ROE3"] = tmp_fs["OperatingProfit"] / tmp_fs["Equity"]
            
            tmp_fs["ForecastROE1"] = tmp_fs["ForecastProfit"] / tmp_fs["Equity"]
            tmp_fs["ForecastROE2"] = tmp_fs["ForecastOrdinaryProfit"] / tmp_fs["Equity"]
            tmp_fs["ForecastROE3"] = tmp_fs["ForecastOperatingProfit"] / tmp_fs["Equity"]
            
            tmp_fs["ROA1"] = tmp_fs["Profit"] / tmp_fs["TotalAssets"]
            tmp_fs["ROA2"] = tmp_fs["OrdinaryProfit"] / tmp_fs["TotalAssets"]
            tmp_fs["ROA3"] = tmp_fs["OperatingProfit"] / tmp_fs["TotalAssets"]
            
            tmp_fs["ForecastROA1"] = tmp_fs["ForecastProfit"] / tmp_fs["TotalAssets"]
            tmp_fs["ForecastROA2"] = tmp_fs["ForecastOrdinaryProfit"] / tmp_fs["TotalAssets"]
            tmp_fs["ForecastROA3"] = tmp_fs["ForecastOperatingProfit"] / tmp_fs["TotalAssets"]
            
            tmp_fs["CostRatio1"] = (tmp_fs["NetSales"] - tmp_fs["OperatingProfit"]) / tmp_fs["NetSales"]
            tmp_fs["CostRatio2"] = (tmp_fs["OperatingProfit"] - tmp_fs["OrdinaryProfit"]) / tmp_fs["NetSales"]
            tmp_fs["CostRatio3"] = (tmp_fs["OrdinaryProfit"] - tmp_fs["Profit"]) / tmp_fs["NetSales"]
            
            tmp_fs["ForecastCostRatio1"] = tmp_fs["ForecastExpense1"] / tmp_fs["ForecastNetSales"]
            tmp_fs["ForecastCostRatio2"] = tmp_fs["ForecastExpense2"] / tmp_fs["ForecastNetSales"]
            tmp_fs["ForecastCostRatio3"] = tmp_fs["ForecastExpense3"] / tmp_fs["ForecastNetSales"]
            
            tmp_fs["TurnOver"] = tmp_fs["NetSales"] / tmp_fs["TotalAssets"]
            tmp_fs["ForecastTurnOver"] = tmp_fs["ForecastNetSales"] / tmp_fs["TotalAssets"]
            
            # divisions by zero above produce +/-inf; treat them as missing
            tmp_fs = tmp_fs.replace([-np.inf, np.inf], np.nan)
            
            # feature lists
            feats1 = ["Period"]
            
            feats2 = ["YearlyNetSales", "YearlyOperatingProfit", "YearlyOrdinaryProfit", "YearlyProfit",
                      "ForecastNetSales", "ForecastOperatingProfit", "ForecastOrdinaryProfit", "ForecastProfit",
                      "Expense1", "Expense2", "Expense3", "ForecastExpense1", "ForecastExpense2", "ForecastExpense3",
                      "TotalAssets", "Equity"]
            
            feats3 = ["ProfitMargin1", "ProfitMargin2", "ProfitMargin3",
                      "ForecastProfitMargin1", "ForecastProfitMargin2", "ForecastProfitMargin3",
                      "ROE1", "ROE2", "ROE3", "ForecastROE1", "ForecastROE2", "ForecastROE3",
                      "ROA1", "ROA2", "ROA3", "ForecastROA1", "ForecastROA2", "ForecastROA3",
                      "CostRatio1", "CostRatio2", "CostRatio3", "ForecastCostRatio1", "ForecastCostRatio2", "ForecastCostRatio3",
                      "TurnOver", "ForecastTurnOver", "EquityRatio"]
            
            feats4 = ["BPS", "YearlyEPS", "ForecastEPS"]
            
            # diff feature: diff means diff(1), dev means Deviation between Results and Past Forecasts
            d_feats1 = ["DevNetSales", "DevOperatingProfit", "DevOrdinaryProfit", "DevProfit",
                        "DevExpense1", "DevExpense2", "DevExpense3", "DevProfitMargin1", "DevProfitMargin2", "DevProfitMargin3",
                        "DevROE1", "DevROE2", "DevROE3", "DevROA1", "DevROA2", "DevROA3",
                        "DevCostRatio1", "DevCostRatio2", "DevCostRatio3"]
            
            d_feats2 = []
            for f in feats2:
                d_feats2.append("Diff"+f)
                tmp_fs["Diff"+f] = tmp_fs[f].diff(1)
            
            d_feats3 = []
            for f in feats3:
                d_feats3.append("Diff"+f)
                tmp_fs["Diff"+f] = tmp_fs[f].diff(1)
            
            # Dev*: current value minus the forecast stored on the previous completed period
            tmp_fs["DevNetSales"] = tmp_fs["YearlyNetSales"] - tmp_fs["ForecastNetSales"].shift(1)
            tmp_fs["DevOperatingProfit"] = tmp_fs["YearlyOperatingProfit"] - tmp_fs["ForecastOperatingProfit"].shift(1)
            tmp_fs["DevOrdinaryProfit"] = tmp_fs["YearlyOrdinaryProfit"] - tmp_fs["ForecastOrdinaryProfit"].shift(1)
            tmp_fs["DevProfit"] = tmp_fs["YearlyProfit"] - tmp_fs["ForecastProfit"].shift(1)
            
            tmp_fs["DevExpense1"] = tmp_fs["Expense1"] - tmp_fs["ForecastExpense1"].shift(1)
            tmp_fs["DevExpense2"] = tmp_fs["Expense2"] - tmp_fs["ForecastExpense2"].shift(1)
            tmp_fs["DevExpense3"] = tmp_fs["Expense3"] - tmp_fs["ForecastExpense3"].shift(1)
            
            tmp_fs["DevProfitMargin1"] = tmp_fs["ProfitMargin1"] - tmp_fs["ForecastProfitMargin1"].shift(1)
            tmp_fs["DevProfitMargin2"] = tmp_fs["ProfitMargin2"] - tmp_fs["ForecastProfitMargin2"].shift(1)
            tmp_fs["DevProfitMargin3"] = tmp_fs["ProfitMargin3"] - tmp_fs["ForecastProfitMargin3"].shift(1)
            
            tmp_fs["DevROE1"] = tmp_fs["ROE1"] - tmp_fs["ForecastROE1"].shift(1)
            tmp_fs["DevROE2"] = tmp_fs["ROE2"] - tmp_fs["ForecastROE2"].shift(1)
            tmp_fs["DevROE3"] = tmp_fs["ROE3"] - tmp_fs["ForecastROE3"].shift(1)
            
            tmp_fs["DevROA1"] = tmp_fs["ROA1"] - tmp_fs["ForecastROA1"].shift(1)
            tmp_fs["DevROA2"] = tmp_fs["ROA2"] - tmp_fs["ForecastROA2"].shift(1)
            tmp_fs["DevROA3"] = tmp_fs["ROA3"] - tmp_fs["ForecastROA3"].shift(1)
            
            tmp_fs["DevCostRatio1"] = tmp_fs["CostRatio1"] - tmp_fs["ForecastCostRatio1"].shift(1)
            tmp_fs["DevCostRatio2"] = tmp_fs["CostRatio2"] - tmp_fs["ForecastCostRatio2"].shift(1)
            tmp_fs["DevCostRatio3"] = tmp_fs["CostRatio3"] - tmp_fs["ForecastCostRatio3"].shift(1)
            
            # pick up columns
            # (stamp every row with the date and code of the disclosure being replayed)
            tmp_fs["Date"] = tmp["Date"].iloc[i]
            tmp_fs["SecuritiesCode"] = tmp["SecuritiesCode"].iloc[i]
            
            feats = ["Date", "SecuritiesCode"]
            feats.extend(feats1)
            feats.extend(feats2)
            feats.extend(feats3)
            feats.extend(feats4)
            feats.extend(d_feats1)
            feats.extend(d_feats2)
            feats.extend(d_feats3)
            
            # only the latest completed period is kept as the snapshot for this disclosure
            res.append(tmp_fs[feats].iloc[-1])
        
        res = pd.DataFrame(res)
        # several disclosures on the same date: keep the state after the last one
        res = res.drop_duplicates(keep="last", subset=["Date"])
        
        return res
    
    def _get_techs(self, code):
        tmp = self.stock_prices[self.stock_prices["SecuritiesCode"]==code].copy()
        
        # adjust price data
        adj = tmp[tmp["AdjustmentFactor"]!=1]
        if len(adj) >= 2:
            for i in reversed(range(len(adj))):
                if i == 0:
                    continue
                adj["AdjustmentFactor"].iloc[i-1] *= adj["AdjustmentFactor"].iloc[i]
        
        tmp.loc[tmp["RowId"].isin(adj["RowId"]), "AdjustmentFactor"] = adj["AdjustmentFactor"]
        tmp.loc[tmp["AdjustmentFactor"]==1, "AdjustmentFactor"] = np.nan
        tmp["AdjustmentFactor"] = tmp["AdjustmentFactor"].bfill()
        
        tmp.loc[np.isnan(tmp["AdjustmentFactor"]), "AdjustmentFactor"] = 1
        
        for f in ["Open", "High", "Low", "Close", "Volume"]:
            if f == "Volume":
                tmp[f] = tmp[f] / tmp["AdjustmentFactor"]
            else:
                tmp[f] = tmp[f] * tmp["AdjustmentFactor"]
        
        # merge market data
        tmp = tmp.merge(self.markets, on="Date", how="left")
        
        # technical feature
        tmp["ror1"] = tmp["Close"].pct_change(1)
        tmp["ror22"] = tmp["Close"].pct_change(21).shift(1)
        tmp["ror252"] = tmp["Close"].pct_change(230).shift(22)
        tmp["topix_ror1"] = tmp["TOPIX"].pct_change(1)
        tmp["nikkei_ror1"] = tmp["NIKKEI"].pct_change(1)
        
        tmp["volatility252"] = tmp["ror1"].rolling(252).std()
        tmp["topix_corr"] = tmp["ror1"].rolling(252).corr(tmp["topix_ror1"])
        tmp["nikkei_corr"] = tmp["ror1"].rolling(252).corr(tmp["nikkei_ror1"])
        
        tmp["Volume"] = tmp["Volume"].fillna(0)
        tmp["Volume5"] = tmp["Volume"].rolling(5).mean()
        tmp["Volume20"] = tmp["Volume"].rolling(20).mean()
        tmp["Volume60"] = tmp["Volume"].rolling(60).mean()
        
        tmp["trading_value"] = tmp["Volume"]*tmp["Close"]
        tmp["trading_value5"] = tmp["trading_value"].rolling(5).mean()
        tmp["trading_value20"] = tmp["trading_value"].rolling(20).mean()
        tmp["trading_value60"] = tmp["trading_value"].rolling(60).mean()
        
        tmp["range_rt"] = np.abs(tmp["ror1"])
        tmp["prev_close"] = tmp["Close"].shift(1)
        tmp["range_hl"] = np.abs( (tmp[["High", "prev_close"]].max(axis=1) - tmp[["Low", "prev_close"]].min(axis=1)) / tmp["prev_close"])
        
        tmp["marketImpact_rt"] = (tmp["range_rt"] / tmp["trading_value"]).replace([-np.inf, np.inf], np.nan)
        tmp["marketImpact_rt5"] = tmp["marketImpact_rt"].rolling(5).mean()
        tmp["marketImpact_rt20"] = tmp["marketImpact_rt"].rolling(20).mean()
        tmp["marketImpact_rt60"] = tmp["marketImpact_rt"].rolling(60).mean()
        
        tmp["marketImpact_hl"] = (tmp["range_hl"] / tmp["trading_value"]).replace([-np.inf, np.inf], np.nan)
        tmp["marketImpact_hl5"] = tmp["marketImpact_hl"].rolling(5).mean()
        tmp["marketImpact_hl20"] = tmp["marketImpact_hl"].rolling(20).mean()
        tmp["marketImpact_hl60"] = tmp["marketImpact_hl"].rolling(60).mean()
        
        tmp = tmp[["RowId", "Date", "SecuritiesCode", "Open", "High", "Low", "Close", "AdjustmentFactor", "Target", "limit", "large_stock_tick", "small_stock_tick",
                   "prev_close", "ror1", "ror22", "ror252", "volatility252", "topix_corr", "nikkei_corr",
                   "Volume", "Volume5", "Volume20", "Volume60",
                   "trading_value", "trading_value5", "trading_value20", "trading_value60",
                   "marketImpact_rt", "marketImpact_rt5", "marketImpact_rt20", "marketImpact_rt60",
                   "marketImpact_hl", "marketImpact_hl5", "marketImpact_hl20", "marketImpact_hl60"]]
        
        # merge fundamentals
        tmp_fins = self.fundamentals[self.fundamentals["SecuritiesCode"]==code]
        tmp = tmp.merge(tmp_fins, on=["Date", "SecuritiesCode"], how="left")
        tmp = tmp.ffill()
        
        # fundamentals feature
        shares = self.stock_list.loc[self.stock_list["SecuritiesCode"]==code, "IssuedShares"].values[0]
        
        tmp["market_cap"] = tmp["Close"] * (shares/1000000)
        
        tmp["CalcBPS"] = tmp["Equity"] / (shares/1000000000)
        tmp["CalcYearlyEPS"] = tmp["YearlyProfit"] / (shares/1000000000)
        tmp["CalcForecastEPS"] = tmp["ForecastProfit"] / (shares/1000000000)
        
        tmp["PBR"] = tmp["Close"] / tmp["CalcBPS"]
        tmp["PER"] = tmp["Close"] / tmp["CalcYearlyEPS"]
        tmp["ForecastPER"] = tmp["Close"] / tmp["CalcForecastEPS"]
        
        tmp["volumeTurnover"] = tmp["Volume"] / (shares/1000)
        tmp["volumeTurnover5"] = tmp["volumeTurnover"].rolling(5).mean()
        tmp["volumeTurnover20"] = tmp["volumeTurnover"].rolling(20).mean()
        tmp["volumeTurnover60"] = tmp["volumeTurnover"].rolling(60).mean()
        
        # ticks and limitations
        tmp["sdaka"]  = tmp["prev_close"] + tmp["limit"].shift(1)
        tmp["syasu"]  = tmp["prev_close"] - tmp["limit"].shift(1)
        
        tmp["stop_close"] = 1
        tmp.loc[(tmp["Close"]==tmp["sdaka"]) & (tmp["Open"]!=tmp["sdaka"]), "stop_close"] = 2
        tmp.loc[(tmp["Close"]==tmp["syasu"]) & (tmp["Open"]!=tmp["syasu"]), "stop_close"] = 0
        
        tmp["stop_open"] = 1
        tmp.loc[(tmp["Close"]==tmp["sdaka"]) & (tmp["Open"]==tmp["sdaka"]), "stop_open"] = 2
        tmp.loc[(tmp["Close"]==tmp["syasu"]) & (tmp["Open"]==tmp["syasu"]), "stop_open"] = 0
        
        tmp["limit_rate"] = tmp["limit"] / tmp["Close"]
        
        tmp["tick"] = np.nan
        tmp.loc[tmp["SecuritiesCode"].isin(self.topix100_codes), "tick"] = tmp["large_stock_tick"]
        tmp.loc[~tmp["SecuritiesCode"].isin(self.topix100_codes), "tick"] = tmp["small_stock_tick"]
        
        tmp["tick_rate"] = tmp["tick"] / tmp["Close"]
        
        # uptick rule
        tmp["uptick"] = 0
        tmp["down_rate"] = tmp["Low"] / tmp["prev_close"]
        tmp.loc[tmp["down_rate"] <= 0.9, "uptick"] = 1
        
        # pick up feature
        feats = ["RowId", "Date", "SecuritiesCode", "Target",
                 "Close", "ror1", "ror22", "ror252", "volatility252", "topix_corr", "nikkei_corr",
                 "Volume", "Volume5", "Volume20", "Volume60",
                 "trading_value", "trading_value5", "trading_value20", "trading_value60",
                 "marketImpact_rt", "marketImpact_rt5", "marketImpact_rt20", "marketImpact_rt60",
                 "marketImpact_hl", "marketImpact_hl5", "marketImpact_hl20", "marketImpact_hl60",
                 'market_cap', 'PBR', 'PER', 'ForecastPER',
                 "volumeTurnover", "volumeTurnover5", "volumeTurnover20", "volumeTurnover60",
                 "tick_rate", "limit_rate", "stop_close", "stop_open", "uptick"]
        remove_feats = ["Date", "SecuritiesCode", "Period", "BPS", "YearlyEPS", "ForecastEPS"]
        fins_feats   = [f for f in self.fundamentals.columns if f not in remove_feats]
        feats.extend(fins_feats)
        
        tmp = tmp[feats]
        
        return tmp

    def create_fundamentals(self):
        """Compute fundamentals for every code in ``self.univ_codes`` and store them.

        The per-code frames from ``_get_fundas`` are concatenated into
        ``self.fundamentals``. Codes are processed serially with a progress
        bar; the joblib-parallel variant that used to sit here was disabled.
        """
        per_code = [self._get_fundas(code) for code in tqdm(self.univ_codes)]
        self.fundamentals = pd.concat(per_code)
        
    def update_fins(self, iter_data):
        new_data = self._correct_fins(iter_data)
        if len(new_data) != 0:
            self.stock_fins = pd.concat([self.stock_fins, new_data])
            self.stock_fins = self.stock_fins.drop_duplicates(subset=["Date", "SecuritiesCode"], keep="last")
            self.stock_fins = self.stock_fins.sort_values("Date")
    
    def update_markets(self, iter_data):
        new_data = self._secondary_to_market(iter_data)
        self.markets = pd.concat([self.markets, new_data])
        self.markets = self.markets.drop_duplicates(subset=["Date"], keep="last")
        self.markets = self.markets.sort_values("Date")
    
    def update_prices(self, iter_data):
        new_data = self._correct_prices(iter_data)
        self.stock_prices = pd.concat([self.stock_prices, new_data])
        self.stock_prices = self.stock_prices.drop_duplicates(subset=["Date", "SecuritiesCode"], keep="last")
        self.stock_prices = self.stock_prices.sort_values("Date")
    
    def update_options(self, iter_data):
        new_data = self._correct_options(iter_data)
        self.options = pd.concat([self.options, new_data])
        self.options = self.options.drop_duplicates(subset=["Date", "OptionsCode"], keep="last")
        self.options = self.options.sort_values("Date")
        
    def update_fundamentals(self, codes):
        try:
            new_fundamentals = []
            for code in codes:
                new_fundamentals.append(self._get_fundas(code))
            
            new_fundamentals = pd.concat(new_fundamentals)
            
            self.fundamentals = self.fundamentals[~self.fundamentals["SecuritiesCode"].isin(codes)]   # remove updated stocks
            self.fundamentals = pd.concat([self.fundamentals, new_fundamentals])                      # add updated stocks
        except:
            pass
    
    def get_feature(self):
        df_feats = []
        
        for code in self.univ_codes:
            df_feats.append(self._get_techs(code))
        
        df_feats = pd.concat(df_feats)
        
        # sectors
        df_feats = df_feats.merge(self.stock_list[["SecuritiesCode", "33SectorCode"]], on="SecuritiesCode", how="left")
        
        sector_dict = {}
        for i, c in enumerate(sorted(df_feats["33SectorCode"].unique())):
            sector_dict[c] = i

        df_feats["33SectorCode"] = df_feats["33SectorCode"].map(sector_dict)
        
        # weekday
        df_feats["weekday"] = df_feats["Date"].dt.weekday
        
        df_feats = df_feats.replace([-np.inf, np.inf], np.nan)
        
        return df_feats
    
    def reduce_data(self, date):
        self.markets      = self.markets[self.markets["Date"]>=date]
        self.stock_prices = self.stock_prices[self.stock_prices["Date"]>=date]
        self.options      = self.options[self.options["Date"]>=date]

In [None]:
# Run configuration.
# isTRAIN=False: later cells load the pre-trained UMAP models instead of fitting them.
# USE_SUPPLEMENT=True: the loaders also read the supplemental_files CSVs.
isTRAIN = False
USE_SUPPLEMENT = True

# Load every input table, then build the per-security fundamentals.
jpx = JPX(isTRAIN=isTRAIN, USE_SUPPLEMENT=USE_SUPPLEMENT)
jpx.initialize()
jpx.create_fundamentals()

# Feature Engineering of Stock-based Feature and Time-based Feature

In [None]:
# Input for the UMAP models: a Date x SecuritiesCode table of one-day
# close-to-close returns up to UMAP_PERIOD, with missing returns set to 0.
UMAP_PERIOD = "2020-12-31"

umap_df = pd.read_csv(
    "input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv",
    usecols=["Date", "SecuritiesCode", "Close"],
)
umap_df["Date"] = pd.to_datetime(umap_df["Date"])
umap_df = umap_df[umap_df["Date"] <= UMAP_PERIOD]

umap_df = (
    umap_df.pivot(index="Date", columns="SecuritiesCode", values="Close")
    .pct_change(1)
    .fillna(0)
)

# UMAP hyper-parameters shared by all models below.
umap_dim = 100
n_neighbors = 30

## Stock UMAP Feature

In [None]:
# Per-stock UMAP embeddings.
#   *_rt   : each stock is described by its daily-return history (umap_df.T)
#   *_corr : each stock is described by its row of the return-correlation matrix
# Training fits new models; otherwise the pickled pre-trained models are applied.
if isTRAIN:
    # stock UMAP model based on daily return
    stock_umap_model_rt = umap.UMAP(n_components=umap_dim, n_neighbors=n_neighbors, random_state=0, n_jobs=-1)
    _stock_rt_embedding = stock_umap_model_rt.fit_transform(umap_df.T)

    # stock UMAP model based on return correlation
    stock_umap_model_corr = umap.UMAP(n_components=umap_dim, n_neighbors=n_neighbors, random_state=0, n_jobs=-1)
    _stock_corr_embedding = stock_umap_model_corr.fit_transform(umap_df.corr().fillna(0))
else:
    # stock UMAP model based on daily return
    stock_umap_model_rt = pickle_load("input/jpx-pre-trained-model/stock_umap_model_rt.pickle")
    _stock_rt_embedding = stock_umap_model_rt.transform(umap_df.T)

    # stock UMAP model based on return correlation
    stock_umap_model_corr = pickle_load("input/jpx-pre-trained-model/stock_umap_model_corr.pickle")
    _stock_corr_embedding = stock_umap_model_corr.transform(umap_df.corr().fillna(0))

# Wrap both embeddings as frames indexed by securities code.
stock_umap_res_rt = pd.DataFrame(
    _stock_rt_embedding,
    index=umap_df.columns,
    columns=[f"StockUmapRt{i}" for i in range(umap_dim)],
)
stock_umap_res_corr = pd.DataFrame(
    _stock_corr_embedding,
    index=umap_df.columns,
    columns=[f"StockUmapCorr{i}" for i in range(umap_dim)],
)

In [None]:
# Notebook display: per-stock embedding based on daily returns.
stock_umap_res_rt

In [None]:
# Notebook display: per-stock embedding based on return correlations.
stock_umap_res_corr

## Time UMAP Feature

In [None]:
# Per-date UMAP embedding: each trading day is described by the cross-section
# of daily returns (one row of umap_df). Training fits a new model; otherwise
# the pickled pre-trained model is applied.
if isTRAIN:
    # time UMAP model based on daily return
    time_umap_model_rt = umap.UMAP(n_components=umap_dim, n_neighbors=n_neighbors, random_state=0)
    _time_rt_embedding = time_umap_model_rt.fit_transform(umap_df)
else:
    # time UMAP model based on daily return
    time_umap_model_rt = pickle_load("input/jpx-pre-trained-model/time_umap_model_rt.pickle")
    _time_rt_embedding = time_umap_model_rt.transform(umap_df)

time_umap_res_rt = pd.DataFrame(
    _time_rt_embedding,
    index=umap_df.index,
    columns=[f"TimeUmapRt{i}" for i in range(umap_dim)],
)

def get_time_umap_feats(df, dates=None):
    """Project per-date cross-sections of daily returns with the time UMAP model.

    Args:
        df: Long-format price frame with "Date", "SecuritiesCode" and "Close"
            columns.
        dates: Optional collection of dates to keep. When None or empty,
            every date found in `df` is transformed.

    Returns:
        DataFrame with a "Date" column followed by TimeUmapRt0..TimeUmapRt{umap_dim-1}.

    Relies on the module-level `time_umap_model_rt` and `umap_dim`.
    """
    # Wide frame: one row per date, one column per security.
    returns = df.pivot(index="Date", columns="SecuritiesCode", values="Close")
    # One-day percentage change; missing values (e.g. the first row) become 0.
    returns = returns.pct_change(1).fillna(0)

    # The returns are computed on the full history first, so a kept date still
    # has its previous day available; only then is the frame restricted.
    if dates is not None and len(dates) != 0:
        returns = returns[returns.index.isin(dates)]

    embedding = pd.DataFrame(
        time_umap_model_rt.transform(returns),
        index=returns.index,
        columns=["TimeUmapRt" + str(i) for i in range(umap_dim)],
    )

    # Expose "Date" as a regular column so callers can merge on it.
    return embedding.reset_index()

In [None]:
# Notebook display: time UMAP embedding of umap_df rows (columns TimeUmapRt0..).
time_umap_res_rt

## Option Feature

In [None]:
def get_contracts(x):
    """Return the three nearest quarterly contract months for a date string.

    Contract months are March, June, September and December. The front
    contract rolls on the first day of its own month, so a date in March
    already maps to the June contract and a date in December maps to March
    of the following year.

    The result is computed arithmetically, so it stays correct for any year
    instead of being limited to a fixed lookup table.

    Args:
        x: Date string starting with "YYYY-MM" (e.g. "2021-04-15" or
            "2021-04-15 00:00:00").

    Returns:
        List of three ints in YYYYMM form, nearest contract first.

    Raises:
        ValueError: If `x` does not start with a parseable "YYYY-MM".
    """
    year, month = int(x[:4]), int(x[5:7])
    if not 1 <= month <= 12:
        raise ValueError(f"invalid month in date string: {x!r}")

    # Quarter slot of the front contract, counted from March of `year`:
    # Jan-Feb -> 0 (Mar), Mar-May -> 1 (Jun), Jun-Aug -> 2 (Sep),
    # Sep-Nov -> 3 (Dec), Dec -> 4 (Mar of the next year).
    front = month // 3

    contracts = []
    for slot in range(front, front + 3):
        year_offset, quarter = divmod(slot, 4)
        contracts.append((year + year_offset) * 100 + 3 * (quarter + 1))
    return contracts

def get_option_feats(tmp_markets, tmp_options):
    """Build daily at-the-money implied-volatility features from option data.

    For every date in `tmp_markets`, the options of the three nearest
    contract months are narrowed down to the strike closest to that day's
    NIKKEI value, and the mean implied volatility of calls and puts at that
    strike is recorded.

    Args:
        tmp_markets: DataFrame with "Date" and "NIKKEI" columns. It is
            modified in place (helper columns are added), so pass a copy.
            NOTE(review): the loop yields one value per unique date, so one
            row per date is assumed — confirm.
        tmp_options: DataFrame with "Date", "ContractMonth", "StrikePrice",
            "Putcall" and "ImpliedVolatility" columns.

    Returns:
        DataFrame with columns "Date", "iv_spread", "diff_civ", "diff_piv"
        and "diff_iv_spread"; missing values are filled with 0.
    """
    tmp_markets["DateStr"]  = tmp_markets["Date"].astype("str")
    tmp_markets["contract"] = tmp_markets["DateStr"].apply(get_contracts)

    civs = []   # Implied Volatility of call options
    pivs = []   # Implied Volatility of put options

    for dt in tmp_markets["Date"].unique():
        try:
            # Look up the market row once and reuse it for both values.
            market_row = tmp_markets.loc[tmp_markets["Date"] == dt]
            contracts = market_row["contract"].values[0]
            nikkei = market_row["NIKKEI"].values[0]

            day_options = tmp_options[tmp_options["Date"] == dt]
            day_options = day_options[day_options["ContractMonth"].isin(contracts)]

            # At-the-money strike: the listed strike closest to the index level.
            strikeprices = day_options["StrikePrice"].unique()
            atm = strikeprices[np.argmin(abs(strikeprices - nikkei))]

            atm_options = day_options[day_options["StrikePrice"] == atm]
            civ = atm_options.loc[atm_options["Putcall"] == 2, "ImpliedVolatility"].mean()
            piv = atm_options.loc[atm_options["Putcall"] == 1, "ImpliedVolatility"].mean()
        except Exception:
            # Best effort: a day without usable option data yields NaN.
            # Catching Exception (not a bare except) keeps KeyboardInterrupt
            # and SystemExit working during this long loop.
            civ = np.nan
            piv = np.nan

        civs.append(civ)
        pivs.append(piv)

    tmp_markets["civ"] = civs
    tmp_markets["piv"] = pivs
    tmp_markets["diff_civ"] = tmp_markets["civ"].diff(1)
    tmp_markets["diff_piv"] = tmp_markets["piv"].diff(1)
    tmp_markets["iv_spread"] = tmp_markets["civ"] - tmp_markets["piv"]
    tmp_markets["diff_iv_spread"] = tmp_markets["iv_spread"].diff(1)

    tmp_markets = tmp_markets[["Date", "iv_spread", "diff_civ", "diff_piv", "diff_iv_spread"]].fillna(0)

    return tmp_markets

## Time Regime Feature

In [1]:
def get_regime_feats(df):
    """Append per-date market regime features to `df`.

    Three families of columns are merged onto `df` by "Date":
      * "<feat>_timemean_regime" / "<feat>_timestd_regime": mean and standard
        deviation of each aggregate feature across all rows of the date.
      * "<sector>_sector_regime_<ror>": mean of ror1 / ror22 per
        33SectorCode and date, one column per sector code.
      * "<feat>_factor_regime1" / "<feat>_factor_regime22": per-date
        correlation between each factor column and ror1 / ror22.

    Args:
        df: Feature frame containing "Date", "33SectorCode" and the feature
            columns listed below.

    Returns:
        A new DataFrame: `df` left-merged with the regime columns.
    """
    agg_feats = ["ror1", "ror22", "ror252", "trading_value", "trading_value5", "trading_value20", "volatility252", "topix_corr", "nikkei_corr"]
    fr_feats = ["Equity", "EquityRatio", "PBR", "ror252", "topix_corr", "volatility252"]

    # Work on a private copy of only the columns needed for the aggregations.
    base = df[["Date", "33SectorCode"] + agg_feats + ["Equity", "EquityRatio", "PBR"]].copy()
    by_date = base.groupby("Date")

    # time aggregate feature
    date_stats = (
        (by_date[agg_feats].mean(), "_timemean_regime"),
        (by_date[agg_feats].std(), "_timestd_regime"),
    )
    for stats, suffix in date_stats:
        df = df.merge(stats.add_suffix(suffix).reset_index(), on="Date", how="left")

    # sector regime feature
    for col in ("ror1", "ror22"):
        sector_mean = base.groupby(["Date", "33SectorCode"])[col].mean().reset_index()
        wide = sector_mean.pivot(index="Date", columns="33SectorCode", values=col)
        wide = wide.add_suffix("_sector_regime_" + col).reset_index()
        df = df.merge(wide, on="Date", how="left")

    # factor regime feature
    corr_ror1 = [by_date[f].corr(base["ror1"]) for f in fr_feats]
    corr_ror22 = [by_date[f].corr(base["ror22"]) for f in fr_feats]

    factor = pd.concat(
        [
            pd.concat(corr_ror1, axis=1).add_suffix("_factor_regime1"),
            pd.concat(corr_ror22, axis=1).add_suffix("_factor_regime22"),
        ],
        axis=1,
    ).reset_index()

    return df.merge(factor, on="Date", how="left")

# Training

In [None]:
def train_lgbm(train_df, x_feats, cat_feats, target, seed=0):
    """Fit one LightGBM regressor with fixed hyper-parameters.

    Args:
        train_df: Training frame holding the feature and target columns.
        x_feats: Names of the feature columns.
        cat_feats: Names of the features LightGBM should treat as categorical.
        target: Target column selector passed to `train_df[...]`.
        seed: Random state of the model.

    Returns:
        The fitted LGBMRegressor.
    """
    hyper_params = dict(
        max_depth=4,
        learning_rate=0.01,
        num_leaves=20,
        n_estimators=2000,
        n_jobs=-1,
        colsample_bytree=0.1,
        random_state=seed,
    )
    model = LGBMRegressor(**hyper_params)

    features = train_df[x_feats]
    labels = train_df[target]
    model.fit(features, labels, categorical_feature=cat_feats, verbose=False)

    return model

def get_feature_list(df):
    """Derive the model, categorical and neutralization feature lists.

    Args:
        df: Feature frame; only its column names are read.

    Returns:
        Tuple (x_feats, cat_feats, neut_feats):
          * x_feats: all columns except identifiers, the target and the raw
            financial statement columns, in column order.
          * cat_feats: fixed list of categorical feature names.
          * neut_feats: x_feats minus categorical, regime, option and time
            UMAP features.
    """
    id_and_target = {"RowId", "Date", "SecuritiesCode", "Target"}
    raw_financials = {
        'YearlyNetSales', 'YearlyOperatingProfit', 'YearlyOrdinaryProfit', 'YearlyProfit',
        'ForecastNetSales', 'ForecastOperatingProfit', 'ForecastOrdinaryProfit', 'ForecastProfit',
        'Expense1', 'Expense2', 'Expense3',
        'ForecastExpense1', 'ForecastExpense2', 'ForecastExpense3',
    }
    x_feats = [
        col for col in df.columns
        if col not in id_and_target and col not in raw_financials
    ]

    cat_feats = ["33SectorCode", "stop_close", "stop_open", "uptick", "weekday"]
    option_feats = {"iv_spread", "diff_civ", "diff_piv", "diff_iv_spread"}

    # Keep only the features that are neither categorical nor market-wide
    # (regime / option / time UMAP) features.
    neut_feats = [
        col for col in x_feats
        if col not in cat_feats
        and "regime" not in col
        and col not in option_feats
        and "TimeUmap" not in col
    ]

    return x_feats, cat_feats, neut_feats


# Build the full feature frame, then split it into training and validation sets.
TRAIN_PERIOD = "2020-12-31"

df_feats = get_regime_feats(jpx.get_feature())
df_feats = df_feats.merge(
    get_option_feats(jpx.markets.copy(), jpx.options.copy()),
    on="Date",
    how="left",
)
df_feats = df_feats.merge(
    get_time_umap_feats(jpx.stock_prices),
    on="Date",
    how="left",
)

# Validation: every row after TRAIN_PERIOD.
# Training: every row up to TRAIN_PERIOD, with calendar year 2020 left out.
df_valid = df_feats[df_feats["Date"] > TRAIN_PERIOD].copy()
df_train = df_feats[(df_feats["Date"].dt.year != 2020) & (df_feats["Date"] <= TRAIN_PERIOD)].copy()

# feature lists
target = ["Target"]
x_feats, cat_feats, neut_feats = get_feature_list(df_feats)

# Five models that differ only in their random seed.
if isTRAIN:
    # trained in GCP, region:us-west1-b, machine type:N1 high memory (16vCPUs, 104GB RAM)
    model1, model2, model3, model4, model5 = (
        train_lgbm(df_train, x_feats, cat_feats, target, seed=s)
        for s in (1, 24, 39, 114, 251)
    )
else:
    model1, model2, model3, model4, model5 = (
        pickle_load(path)
        for path in (
            "input/jpx-pre-trained-model/jpx_lgbm_01.pickle",
            "input/jpx-pre-trained-model/jpx_lgbm_02.pickle",
            "input/jpx-pre-trained-model/jpx_lgbm_03.pickle",
            "input/jpx-pre-trained-model/jpx_lgbm_04.pickle",
            "input/jpx-pre-trained-model/jpx_lgbm_05.pickle",
        )
    )

# Neutralization

In [None]:
def neutralize_pred(df, neut_feats):
    """Neutralize predictions against features, sectors and stock UMAP axes.

    For each date, "pred" is regressed on the neutralization features with an
    ordinary linear regression, and the residual is stored as "pred_nt".

    Args:
        df: Frame with "Date", "SecuritiesCode", "33SectorCode", "pred" and
            the columns named in `neut_feats`.
        neut_feats: List of feature names to neutralize against.

    Returns:
        A new DataFrame (sector column one-hot encoded, stock UMAP columns
        merged in) with the additional "pred_nt" column.

    Relies on the module-level `stock_umap_res_rt` and `stock_umap_res_corr`.
    """
    # Infinite values are treated like missing ones and zero-filled.
    df = df.replace([-np.inf, np.inf], np.nan).fillna(0)
    df = pd.get_dummies(df, columns=["33SectorCode"])
    for stock_embedding in (stock_umap_res_rt, stock_umap_res_corr):
        df = df.merge(stock_embedding.reset_index(), on="SecuritiesCode", how="left")

    all_columns = list(df.columns)
    # Every column whose name contains "Sector", minus the last one.
    sector_feats = [c for c in all_columns if "Sector" in c][:-1]
    umap_feats_rt = [c for c in all_columns if "UmapRt" in c]
    umap_feats_corr = [c for c in all_columns if "UmapCorr" in c]

    regressors = neut_feats + sector_feats + umap_feats_rt + umap_feats_corr

    df["pred_nt"] = np.nan

    for dt in sorted(df["Date"].unique()):
        on_day = df["Date"] == dt
        day = df[on_day]

        lm = LinearRegression()
        lm.fit(day[regressors], day["pred"])

        # Residual of the cross-sectional regression for this date.
        df.loc[on_day, "pred_nt"] = day["pred"] - lm.predict(day[regressors])

    return df

# Evaluation

In [None]:
def calc_spread_return_per_day(df, portfolio_size=200, toprank_weight_ratio=2.0, obj="Rank_nt"):
    """Compute the weighted long/short spread return for one day.

    Args:
        df: Rows of a single date with a "Target" column and the rank column `obj`.
        portfolio_size: Number of names taken on each side.
        toprank_weight_ratio: Weight of the first-picked name relative to the
            last-picked one (weights fall linearly to 1).
        obj: Column to sort by; the smallest values are bought and the
            largest are sold.

    Returns:
        Weighted long return minus weighted short return.
    """
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    weight_norm = weights.mean()

    def leg_return(ascending):
        # Targets of the first `portfolio_size` rows in the requested order.
        picked = df.sort_values(by=obj, ascending=ascending)["Target"][:portfolio_size]
        return (picked * weights).sum() / weight_norm

    long_leg = leg_return(True)
    short_leg = leg_return(False)
    return long_leg - short_leg

def calc_spread_return_sharpe(df, portfolio_size=200, toprank_weight_ratio=2.0, obj="Rank_nt"):
    """Compute the Sharpe ratio of the daily spread returns.

    Args:
        df: Frame with "Date", "Target" and the rank column `obj`.
        portfolio_size: Number of names per side, forwarded to the daily calculation.
        toprank_weight_ratio: Top-rank weight ratio, forwarded to the daily calculation.
        obj: Rank column to sort by.

    Returns:
        Tuple (sharpe_ratio, mean, std, daily_spread_returns).
    """
    daily_spread = df.groupby("Date").apply(
        lambda day: calc_spread_return_per_day(day, portfolio_size, toprank_weight_ratio, obj)
    )
    avg = daily_spread.mean()
    vol = daily_spread.std()
    return avg / vol, avg, vol, daily_spread

def plot_results(df):
    """Print validation Sharpe ratios and plot cumulative and quarterly results.

    Adds "Rank" (from "pred") and "Rank_nt" (from "pred_nt") columns to `df`
    in place, where the highest prediction of a date gets the smallest rank.
    The top panel shows NIKKEI against cumulative pnl with and without
    neutralization; the bottom panels show quarterly Sharpe ratio, mean and
    standard deviation of the neutralized pnl.

    Args:
        df: Validation frame with "Date", "Target", "pred" and "pred_nt".

    Reads NIKKEI levels from the module-level `jpx.markets`.
    """
    df["Rank"]    = df.groupby("Date")["pred"].rank(ascending=False)
    df["Rank_nt"] = df.groupby("Date")["pred_nt"].rank(ascending=False)

    def sharpe_between(start, end):
        # Neutralized-rank statistics for start <= Date < end.
        window = df[(df["Date"] >= start) & (df["Date"] < end)]
        return calc_spread_return_sharpe(window, 200, 2.0, "Rank_nt")

    _, _, _, res = calc_spread_return_sharpe(df, 200, 2.0, "Rank")
    sr_nt, _, _, res_nt = calc_spread_return_sharpe(df, 200, 2.0, "Rank_nt")

    quarters = [
        ("21-1Q", "2021-01-01", "2021-04-01"),
        ("21-2Q", "2021-04-01", "2021-07-01"),
        ("21-3Q", "2021-07-01", "2021-10-01"),
        ("21-4Q", "2021-10-01", "2022-01-01"),
        ("22-1Q", "2022-01-01", "2022-04-01"),
        ("22-2Q", "2022-04-01", "2022-07-01"),
    ]
    quarter_labels = [label for label, _, _ in quarters]
    quarter_stats = [sharpe_between(start, end)[:3] for _, start, end in quarters]
    quarter_sr = [stats[0] for stats in quarter_stats]
    quarter_mean = [stats[1] for stats in quarter_stats]
    quarter_std = [stats[2] for stats in quarter_stats]

    # Window reported below as the public leaderboard period.
    sr7 = sharpe_between("2021-12-06", "2022-03-01")[0]

    print(f"validation total sharpe : {sr_nt}")
    print(f"public LB sharpe : {sr7}")

    res = pd.DataFrame(res)
    res.columns = ["pnl"]
    res_nt = pd.DataFrame(res_nt)
    res_nt.columns = ["pnl_nt"]
    res = res.join(res_nt)
    res = res.join(jpx.markets[["Date", "NIKKEI"]].set_index("Date"))

    fig = plt.figure(figsize=(15, 8))
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(234)
    ax3 = fig.add_subplot(235)
    ax4 = fig.add_subplot(236)

    # plot balance curve
    ax1.plot(res["NIKKEI"])
    ax1.legend(["NIKKEI"], loc='upper left')

    ax5 = ax1.twinx()
    ax5.plot(res["pnl"].cumsum(), color="red")
    ax5.plot(res["pnl_nt"].cumsum(), color="green")
    ax5.legend(["pnl without neutralization", "pnl with neutralization"], loc='lower right')

    # plot quarterly sharpe ratio / mean / std
    panels = (
        (ax2, quarter_sr, "quarterly sharpe ratio"),
        (ax3, quarter_mean, "quarterly pnl mean"),
        (ax4, quarter_std, "quarterly pnl std"),
    )
    for axis, values, title in panels:
        axis.bar(quarter_labels, values)
        axis.set_title(title)

    fig.show()

In [None]:
# Predict the validation set with each of the five seed models.
for _k, _model in enumerate((model1, model2, model3, model4, model5), start=1):
    df_valid[f"pred{_k}"] = _model.predict(df_valid[x_feats])
del _k, _model

# Ensemble: plain average of the five model outputs.
df_valid["pred"] = df_valid[["pred1", "pred2", "pred3", "pred4", "pred5"]].mean(axis=1)

# Neutralize, then zero out any non-finite residuals.
df_valid = neutralize_pred(df_valid, neut_feats)
df_valid["pred_nt"] = df_valid["pred_nt"].replace([-np.inf, np.inf], np.nan).fillna(0)

plot_results(df_valid)

# Submission

In [None]:
# Release memory before the submission loop.
# NOTE(review): reduce_data presumably drops stored history older than `date` — confirm in the JPX class.
jpx.reduce_data(date="2020-11-15")
# The training/validation frames are not needed any more; drop them and force a collection.
del df_feats, df_train, df_valid
gc.collect()

In [None]:
%%time
import jpx_tokyo_market_prediction

# Competition API: create the environment and the iterator over test batches.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# Each iteration delivers the new data frames plus the sample_prediction frame
# that has to be filled with ranks and handed back via env.predict.
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # update data
    jpx.update_prices(prices)
    jpx.update_markets(secondary_prices)
    jpx.update_options(options)
    
    # Financial data is only processed when it is non-empty and covers at
    # least one security code listed in jpx.univ_codes.
    if len(financials) != 0:
        update_fins_codes = [c for c in financials["SecuritiesCode"].unique() if c in jpx.univ_codes]
        if len(update_fins_codes) != 0:
            jpx.update_fins(financials)
            jpx.update_fundamentals(update_fins_codes)
    
    # make feature
    # Only the rows of the date being predicted are kept before the regime
    # features are computed; the option and time UMAP features are merged by Date.
    df_feats = jpx.get_feature()
    df_feats = df_feats[df_feats["Date"]==sample_prediction["Date"].iloc[0]]
    df_feats = get_regime_feats(df_feats)
    df_feats = df_feats.merge(get_option_feats(jpx.markets.copy(), jpx.options.copy()), on="Date", how="left")
    df_feats = df_feats.merge(get_time_umap_feats(jpx.stock_prices, dates=[sample_prediction["Date"].iloc[0]]), on="Date", how="left")
    
    # feature lists
    x_feats, cat_feats, neut_feats = get_feature_list(df_feats)
    
    # prediction & neutralization
    df_feats["pred1"] = model1.predict(df_feats[x_feats])
    df_feats["pred2"] = model2.predict(df_feats[x_feats])
    df_feats["pred3"] = model3.predict(df_feats[x_feats])
    df_feats["pred4"] = model4.predict(df_feats[x_feats])
    df_feats["pred5"] = model5.predict(df_feats[x_feats])
    
    # Ensemble: plain average of the five model outputs.
    df_feats["pred"] = df_feats[["pred1", "pred2", "pred3", "pred4", "pred5"]].mean(axis=1)
    
    df_feats = neutralize_pred(df_feats, neut_feats)
    # Non-finite neutralized predictions are replaced by 0.
    df_feats["pred_nt"] = df_feats["pred_nt"].replace([-np.inf, np.inf], np.nan).fillna(0)
    
    # submission
    # Rank 0 goes to the highest neutralized prediction. The left merge keeps
    # every requested security even if it has no prediction.
    sample_prediction = sample_prediction.merge(df_feats[["SecuritiesCode", "pred_nt"]], on="SecuritiesCode", how="left")
    sample_prediction.sort_values("pred_nt", ascending=False, inplace=True)
    # Ranks span the actual number of rows rather than a hard-coded 2000.
    sample_prediction["Rank"] = np.arange(0,len(sample_prediction))
    sample_prediction.sort_values("SecuritiesCode", ascending=True, inplace=True)
    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
    env.predict(submission)