In [1]:
# imports
%matplotlib inline
import pandas as pd
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import jpx_tokyo_market_prediction
import numpy as np

In [2]:
import os
from decimal import ROUND_HALF_UP, Decimal
from typing import Tuple

import numpy as np
import pandas as pd

""" 
Methods for preprocessing the dataset 
"""


def data_pipeline(dir_path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load data and merge datasets

    Args:
        dir_path (str): path to data dir

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: Train/Test set
    """

    train = pd.read_csv(os.path.join(dir_path, "train_files/stock_prices.csv"))
    test = pd.read_csv(os.path.join(dir_path, "supplemental_files/stock_prices.csv"))
    stock_list = pd.read_csv(os.path.join(dir_path, "stock_list.csv"))
    target_stock_list = stock_list[stock_list["Universe0"]]

    train = train.drop(["ExpectedDividend", "RowId"], axis=1)
    train = train.fillna(0)
    test = test.drop(["ExpectedDividend", "RowId"], axis=1)
    test = test.fillna(0)

    # merge stock categories as additional features
    sec_info = target_stock_list[["SecuritiesCode", "33SectorName", "17SectorName"]]
    train = pd.merge(train, sec_info, on="SecuritiesCode")
    train["33SectorName"] = train["33SectorName"].astype("category")
    train["17SectorName"] = train["17SectorName"].astype("category")

    # use supplemental stock prices as test set to evaluate performance of classifiers
    test = pd.merge(test, sec_info, on="SecuritiesCode")
    test["33SectorName"] = test["33SectorName"].astype("category")
    test["17SectorName"] = test["17SectorName"].astype("category")

    train.update(train.groupby("SecuritiesCode")["Target"].ffill().fillna(0))
    test.update(test.groupby("SecuritiesCode")["Target"].ffill().fillna(0))


    train["SupervisionFlag"] = train["SupervisionFlag"].map({True: 1, False: 0})
    test["SupervisionFlag"] = test["SupervisionFlag"].map({True: 1, False: 0})

    # cut timeframe where not all targets are present
    time_config = {"train_split_date": "2020-12-23"}
    train = train[train.Date >= time_config["train_split_date"]]

    return train, test, sec_info

In [3]:
# Load the prepared tables, then fit on the official history plus the
# supplemental prices (fresh 0..n-1 index instead of duplicated labels).
train, test, sec_info = data_pipeline("../input/jpx-tokyo-stock-exchange-prediction")
train = pd.concat([train, test], ignore_index=True)

# pd.concat falls back to object dtype when the two frames' category sets
# differ; re-cast so the sector columns are categorical again either way.
train = train.astype({"33SectorName": "category", "17SectorName": "category"})

In [4]:
import optuna
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, ndcg_score
from sklearn.preprocessing import LabelEncoder
# from utils import GroupTimeSeriesSplit, calc_spread_return_sharpe


class LGBM():
    """One LightGBM regressor per ``33SectorName`` group.

    ``train`` fits a separate ``LGBMRegressor`` for every sector present in the
    training frame; ``predict`` routes each row to the model of its sector.
    """

    # Label / identifier columns that are never passed to a model as features.
    _NON_FEATURE_COLUMNS = ("Target", "Date", "SecuritiesCode")

    def __init__(self, device=None, seed=42):
        """Store the seed and the fixed hyper-parameters.

        Args:
            device: accepted for interface compatibility; not used by this class.
            seed: forwarded to every regressor as ``random_state``.
        """
        self.seed = seed
        self._best_found_params = {
            "num_leaves": 17,
            "learning_rate": 0.02,
            "n_estimators": 1000,
            "max_depth": -1,
        }
        # sector name -> fitted regressor
        self.models = {}

    def train(self, train: pd.DataFrame, use_params=False):
        """Fit one regressor per sector on ``train``.

        Args:
            train: frame with ``Target``, ``Date``, ``SecuritiesCode`` and
                ``33SectorName`` columns; every remaining column is a feature.
            use_params: accepted for interface compatibility; the stored
                parameters are always used.
        """
        # observed=True: a categorical key would otherwise also yield empty
        # groups for categories without rows, and fitting on those fails.
        for name, group in train.groupby("33SectorName", observed=True):
            y = group["Target"].to_numpy()
            X = group.drop(columns=list(self._NON_FEATURE_COLUMNS))
            # Logging is silenced through the `verbosity` model parameter;
            # the `verbose` keyword of fit() is not accepted by LightGBM >= 4.
            model = LGBMRegressor(
                random_state=self.seed, verbosity=-1, **self._best_found_params
            )
            model.fit(X, y)
            self.models[name] = model

    def predict(self, test: pd.DataFrame):
        """Predict every row of ``test`` with the model of its sector.

        Args:
            test: frame with ``SecuritiesCode`` and ``33SectorName`` columns plus
                the training features; ``Date`` and ``Target`` are ignored when
                present.

        Returns:
            numpy array of predictions ordered by ascending ``SecuritiesCode``
            (not in the row order of ``test``).

        Raises:
            KeyError: if ``test`` contains a sector no model was trained for.
        """
        codes, preds = [], []
        for name, group in test.groupby("33SectorName", observed=True):
            if name not in self.models:
                raise KeyError(f"no model was trained for sector {name!r}")
            # errors="ignore" lets labelled frames (with Target) be scored too.
            X_test = group.drop(columns=list(self._NON_FEATURE_COLUMNS), errors="ignore")
            codes.extend(group["SecuritiesCode"].tolist())
            preds.extend(self.models[name].predict(X_test))
        df = pd.DataFrame({"codes": codes, "pred": preds})
        # stable sort keeps the result deterministic should a code repeat
        return df.sort_values("codes", ascending=True, kind="mergesort")["pred"].values

In [5]:
import torch 

# Resolve the torch device (CUDA when available, CPU otherwise), then fit the
# per-sector models on a copy so the `train` frame itself is left untouched.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = LGBM(seed=42, device=device)
model.train(train.copy(), use_params=True)



In [6]:
# Create the competition environment and the iterator that the test loop
# consumes; predictions are handed back through `env.predict`.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [7]:
# test loop
for (df_test, options, financials, trades, secondary_prices, df_pred) in iter_test:
    # same cleaning as for the training prices: drop unused columns, zero-fill gaps
    x_test = df_test.drop(["ExpectedDividend", "RowId"], axis=1)
    x_test = x_test.fillna(0)

    # merge stock categories as additional features
    x_test = pd.merge(x_test, sec_info, on="SecuritiesCode")
    x_test["33SectorName"] = x_test["33SectorName"].astype("category")
    x_test["17SectorName"] = x_test["17SectorName"].astype("category")

    x_test["SupervisionFlag"] = x_test["SupervisionFlag"].map({True: 1, False: 0})

    # model.predict returns one value per x_test row, ordered by ascending
    # SecuritiesCode. Key the values by code instead of assigning positionally,
    # so df_pred's row order and rows dropped by the inner merge cannot shift
    # predictions onto the wrong security.
    y_pred = model.predict(x_test)
    pred_by_code = pd.Series(y_pred, index=np.sort(x_test["SecuritiesCode"].to_numpy()))
    # keep one value per code so the lookup below stays valid if a code repeats
    pred_by_code = pred_by_code.groupby(level=0).first()

    ranked = df_pred.assign(Target=df_pred["SecuritiesCode"].map(pred_by_code))
    # rank 0 = highest predicted target; securities without a prediction (NaN)
    # are sorted last and therefore receive the highest rank numbers
    ranked = ranked.sort_values(by="Target", ascending=False)
    ranked["Rank"] = np.arange(len(ranked.index))
    ranked = ranked.sort_values(by="SecuritiesCode", ascending=True)

    # only these three columns are submitted, so Target needs no explicit drop
    submission = ranked[["Date", "SecuritiesCode", "Rank"]]
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
