In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the supplemental daily price table; later cells enrich and relabel this frame.
prices_csv_path = "/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv"
stock_prices = pd.read_csv(filepath_or_buffer=prices_csv_path)

In [None]:
# Attach sector information to every price row.
stock_list = pd.read_csv("/kaggle/input/jpx-tokyo-stock-exchange-prediction/stock_list.csv")
# 'Universe0' is used as a boolean mask: only securities flagged there are kept.
target_stock_list = stock_list[stock_list['Universe0']]
# To inspect how securities are distributed, call .value_counts() on
# target_stock_list['33SectorName'] or target_stock_list['17SectorName'].
sector_cols = ['33SectorName', '17SectorName']
sec_info = target_stock_list[['SecuritiesCode'] + sector_cols]
# Drop sector columns left over from a previous execution of this cell; without this a
# re-run would merge them a second time, create '<name>_x'/'<name>_y' duplicates and
# break the dtype conversion below.
stock_prices = pd.merge(stock_prices.drop(columns=sector_cols, errors='ignore'),
                        sec_info, on='SecuritiesCode')
# Store the sector names as pandas categoricals.
stock_prices = stock_prices.astype({col: "category" for col in sector_cols})

In [None]:
# Fill missing Target values: forward-fill within each security, then set whatever
# is still missing (no earlier value for that security) to 0.
# The two boolean prints show whether any NaN exists before and after the fill.
print(stock_prices["Target"].isnull().values.any())
target_filled = stock_prices.groupby('SecuritiesCode')["Target"].ffill()
target_filled = target_filled.fillna(0)
stock_prices["Target"] = target_filled
print(stock_prices["Target"].isnull().values.any())
print(stock_prices.columns)

In [None]:
# Turn the raw Target into ordinal labels:
#   1. dense-rank Target within each Date in descending order (largest Target -> rank 1),
#   2. cut those ranks into 30 quantile bins and keep the bin code as the new Target.
daily_rank = stock_prices.groupby("Date")["Target"].rank(method="dense", ascending=False)
stock_prices["Target"] = daily_rank.astype(int, errors='ignore')
# Sanity output: any NaN / any infinite value left in the ranked Target?
print(stock_prices["Target"].isnull().values.any(), np.isinf(stock_prices["Target"]).values.any())
print(stock_prices.columns)
rank_bins = pd.qcut(stock_prices["Target"], q=30)
stock_prices["Target"] = rank_bins.cat.codes

In [None]:
# Chronological split boundaries (arbitrarily chosen dates). Date values are compared
# as-is against these strings.
time_config = {'train_split_date': '2021-12-06',
               'val_split_date'  : '2022-04-15',
               'test_split_date' : '2022-04-25'} # after 25th of April there are missing entries!

train_start = time_config['train_split_date']
val_start = time_config['val_split_date']
test_start = time_config['test_split_date']

# Half-open windows: [train_start, val_start), [val_start, test_start), [test_start, end].
row_dates = stock_prices["Date"]
train = stock_prices.loc[(row_dates >= train_start) & (row_dates < val_start)]
val = stock_prices.loc[(row_dates >= val_start) & (row_dates < test_start)]
test = stock_prices.loc[row_dates >= test_start]

for split_frame in (train, val, test):
    print(split_frame.shape)

# Feature columns: every column except the identifier, the date and the label.
non_feature_cols = {"RowId", "Date", "Target"}
col_use = [col for col in stock_prices.columns if col not in non_feature_cols]

In [None]:
def _query_group_sizes(frame):
    """Return the number of rows per Date, in row order, for LightGBM's ``group`` argument.

    LightGBM expects ``group`` to be a list of integer sizes of consecutive row
    blocks whose sum equals the number of rows. One ranking query here is one
    trading day, so the sizes are the per-Date row counts. Deriving them from the
    data (instead of assuming exactly 2000 rows per day) keeps the sizes integral
    and correct when some days have missing entries.

    Raises:
        ValueError: if rows sharing a Date are not stored contiguously, because
            the positional group sizes would then describe the wrong rows.
    """
    dates = frame["Date"]
    # Count the runs of equal consecutive dates; with contiguous dates this equals
    # the number of distinct dates.
    n_runs = int((dates != dates.shift()).sum())
    if n_runs != dates.nunique():
        raise ValueError("rows of the same Date must be contiguous to build ranking groups")
    # sort=False keeps the groups in order of first appearance, i.e. in row order.
    return frame.groupby("Date", sort=False).size().tolist()


# One query group per trading day for each split.
query_train = _query_group_sizes(train)
query_val = _query_group_sizes(val)
query_test = _query_group_sizes(test)

In [None]:
from lightgbm import LGBMRanker
import optuna

def objective(trial):
    """Optuna objective: fit a DART ``LGBMRanker`` and score it on the validation split.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The NDCG@1 recorded on the validation set after the last boosting
        iteration (to be maximised by the study).

    Reads the notebook globals ``train``, ``val``, ``col_use``, ``query_train``
    and ``query_val``.
    """
    param = {
        'boosting_type': 'dart',
        'num_leaves': trial.suggest_categorical('num_leaves', [15, 31, 63, 127, 255, 511, 1023]),
        # suggest_float replaces the deprecated suggest_uniform (same uniform range).
        'learning_rate': trial.suggest_float('learning_rate', 5e-5, 1e-3),
        'random_state': 42,
        # Number of boosting rounds. 'n_estimators' is an alias of this parameter and
        # was overridden by it, so the former separate 'n_estimators' search dimension
        # had no effect and is no longer sampled.
        'num_iterations': trial.suggest_categorical('num_iterations', [100, 200, 300, 400]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.5,0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.5,0.6,0.7,0.8,0.9,1.0]),
        # LightGBM only performs row subsampling when the bagging frequency is non-zero;
        # without this the sampled 'subsample' value would be ignored.
        'subsample_freq': 1,
        'max_bin': trial.suggest_categorical('max_bin', [127,255,511])
    }

    model = LGBMRanker(**param)
    # NOTE(review): the `verbose` argument of fit() depends on the installed LightGBM
    # version (newer releases use logging callbacks instead) — confirm against the environment.
    model.fit(train[col_use], train['Target'], group = query_train, eval_set=[(val[col_use], val['Target'])], eval_group=[query_val], eval_at=[1], verbose=100)

    # Last recorded validation NDCG@1.
    return model.evals_result_['valid_0']['ndcg@1'][-1]

In [None]:
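# Hyperparameter optimisation (HPO). Keep this block commented out when submitting.
#study = optuna.create_study(direction='maximize')
#study.optimize(objective, n_trials=10)

#print(study.best_trial.params)
# best_trial.params only holds the values sampled through `trial.suggest_*`; the fixed
# entries of objective()'s `param` dict (e.g. 'boosting_type') must be added by hand.
#model = LGBMRanker(**study.best_trial.params)
#model.fit(train[col_use], train['Target'], group = query_train, eval_set=[(val[col_use], val['Target'])], eval_group=[query_val], eval_at=[1], verbose=100)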

In [None]:
# Hard-coded model parameters (enter the best ones found by the search here) so the
# leaderboard run does not have to repeat the hyperparameter optimisation.
# The number of boosting rounds is given once, via `num_iterations`: `n_estimators`
# is an alias of the same setting and was overridden by it, so passing both
# (previously n_estimators=1000) was contradictory and had no effect.
# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0, which is
# not set here — confirm whether row subsampling was intended for this model.
best_model = LGBMRanker(boosting_type='dart',
                        num_leaves=511,
                        learning_rate=0.0040828871812121775,
                        random_state=42,
                        num_iterations=100,
                        colsample_bytree=1.0,
                        subsample=0.8,
                        max_bin=127,
                        n_jobs=2)
# Fit on the training split and report NDCG@1 on the validation split while training.
best_model.fit(train[col_use], train['Target'],
               group = query_train,
               verbose=100,
               eval_set=[(val[col_use], val['Target'])],
               eval_group=[query_val],
               eval_at=[1])

In [None]:
# load Time Series API
import jpx_tokyo_market_prediction
# Create the Time Series API environment. make_env() can be called only once in a
# session, so restart the kernel before executing this cell a second time.
env = jpx_tokyo_market_prediction.make_env()
# Iterator that delivers the test data day by day; it is consumed by the
# prediction loop that follows.
iter_test = env.iter_test()

In [None]:
# Score every batch handed out by the API iterator and submit a ranking for it.
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # Left join: keep every row of the price table, in its original order, even if a
    # security has no sector entry. An inner join would drop such rows and leave the
    # predictions misaligned with sample_prediction.
    prices = pd.merge(prices, sec_info, on='SecuritiesCode', how='left')
    prices['33SectorName'] = prices['33SectorName'].astype("category")
    prices['17SectorName'] = prices['17SectorName'].astype("category")
    try:
        scores = best_model.predict(prices[col_use])
        if len(scores) != len(sample_prediction):
            raise ValueError(
                f"got {len(scores)} predictions for {len(sample_prediction)} submission rows")
        # Scores are assigned by position.
        # NOTE(review): assumes prices and sample_prediction list the securities in the same order — confirm.
        sample_prediction['Rank'] = scores
        # Distinct zero-based ranks per Date: lowest score -> rank 0, ties broken by row
        # order. This equals the former sequence "negate, dense-rank descending, then
        # rank 'first'" in a single step.
        sample_prediction['Rank'] = sample_prediction.groupby("Date")["Rank"].rank(method="first").astype(int) - 1
    except Exception as err:
        # Do not hide the failure, but still submit something usable: distinct ranks in
        # row order instead of the same rank 0 for every row.
        # NOTE(review): assumes each batch holds a single date (data is fetched day by day) — confirm.
        print(f"Prediction failed, falling back to row-order ranks: {err!r}")
        sample_prediction['Rank'] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.replace([-np.inf, np.inf], np.nan).fillna(0.0)
    # register your predictions
    env.predict(sample_prediction)
    display(sample_prediction)