In [7]:
import numpy as np
import pandas as pd 
import optuna
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import lightgbm as lgb

import os
# Walk the 'input' directory tree and print the path of every file found.
for root, _subdirs, files in os.walk('input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

input/jpx-tokyo-stock-exchange-prediction/stock_list.csv
input/jpx-tokyo-stock-exchange-prediction/data_specifications/options_spec.csv
input/jpx-tokyo-stock-exchange-prediction/data_specifications/trades_spec.csv
input/jpx-tokyo-stock-exchange-prediction/data_specifications/stock_list_spec.csv
input/jpx-tokyo-stock-exchange-prediction/data_specifications/stock_fin_spec.csv
input/jpx-tokyo-stock-exchange-prediction/data_specifications/stock_price_spec.csv
input/jpx-tokyo-stock-exchange-prediction/example_test_files/financials.csv
input/jpx-tokyo-stock-exchange-prediction/example_test_files/options.csv
input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv
input/jpx-tokyo-stock-exchange-prediction/example_test_files/secondary_stock_prices.csv
input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv
input/jpx-tokyo-stock-exchange-prediction/example_test_files/trades.csv
input/jpx-tokyo-stock-exchange-prediction/jpx_tokyo_market_prediction/

In [8]:
# Read the stock list and the training price table.
stock_list = pd.read_csv("input/jpx-tokyo-stock-exchange-prediction/stock_list.csv")
stock_prices = pd.read_csv("input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")

# Keep only the rows selected by the 'Universe0' mask, then take the sector
# labels that get attached to every price row.
target_stock_list = stock_list.loc[stock_list['Universe0']]
sec_info = target_stock_list.loc[:, ['SecuritiesCode', '33SectorName', '17SectorName']]

# Join the sector labels onto the prices and store them as categoricals.
stock_prices = stock_prices.merge(sec_info, on='SecuritiesCode').astype(
    {'33SectorName': 'category', '17SectorName': 'category'}
)

In [9]:
# preprocess dataset
def preprocess(df, cols):
    """Add price/calendar features, order rows by date and keep ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Date' (parseable with the "%Y-%m-%d" format), 'High',
        'Low' and 'Close', plus every entry of ``cols`` that is not one of the
        derived columns listed below. The frame is not modified.
    cols : list of str
        Columns to return, in this order.

    Returns
    -------
    pandas.DataFrame
        ``cols`` of the enriched frame, sorted by ascending 'Date' with a
        fresh 0..n-1 index. Rows sharing a date keep their input order.

    Derived columns: 'average' (mean of High, Low and Close), 'dayofweek',
    and the 0/1 flags 'is_quater_start', 'is_month_start', 'is_month_end'.
    """
    df_stock = df.copy()
    df_stock['Date'] = pd.to_datetime(df_stock['Date'], format="%Y-%m-%d")

    # One stable sort is enough; the earlier descending sort was immediately
    # undone, and a stable sort keeps same-day rows in a deterministic order.
    df_stock = df_stock.sort_values(by="Date", kind="mergesort").reset_index(drop=True)

    df_stock['average'] = (df_stock['High'] + df_stock['Low'] + df_stock['Close']) / 3

    dates = df_stock['Date'].dt
    df_stock['dayofweek'] = dates.dayofweek
    # The 'is_quater_start' spelling is kept: callers select the column by this name.
    df_stock['is_quater_start'] = dates.is_quarter_start.astype("int64")
    df_stock['is_month_start'] = dates.is_month_start.astype("int64")
    df_stock['is_month_end'] = dates.is_month_end.astype("int64")

    return df_stock[cols]

# Columns kept for training: raw prices, derived features, the 'Target'
# column, the security identifier and the two sector labels.
cols = [
    'Open', 'High', 'Low', 'Close', 'average',
    'dayofweek', 'is_quater_start', 'is_month_start', 'is_month_end',
    'Target', 'SecuritiesCode', 'Volume',
    '17SectorName', '33SectorName',
]
stock_prices = preprocess(df=stock_prices, cols=cols)

In [10]:
# Parameter dictionary passed to lgb.train for every model in this notebook.
lgbm_params = dict(
    # task / objective
    task='train',
    boosting_type='dart',
    objective='regression',
    metric='rmse',
    # step size and L1/L2 regularisation
    learning_rate=0.05,
    lambda_l1=0.5,
    lambda_l2=0.5,
    # tree size and row/column subsampling
    num_leaves=10,
    feature_fraction=0.5,
    bagging_fraction=0.5,
    bagging_freq=5,
    min_child_samples=10,
    # reproducibility
    seed=42,
)

In [12]:
# Train one LightGBM regressor per '17SectorName' group and store it in
# `models`, keyed by sector name.
# observed=True iterates only over sectors that actually have rows.
stock_cats = stock_prices.groupby("17SectorName", observed=True)
models = {}
for name, prices in stock_cats:
    # Split label and features without mutating the group frame.
    features = prices.drop(columns='Target')
    target = prices['Target']

    # Seed the split from the LightGBM seed so reruns produce the same models.
    train_f, valid_f = train_test_split(
        features, test_size=0.2, random_state=lgbm_params['seed']
    )
    lgb_train = lgb.Dataset(train_f, target.loc[train_f.index])
    lgb_valid = lgb.Dataset(valid_f, target.loc[valid_f.index], reference=lgb_train)

    models[name] = lgb.train(
        lgbm_params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['Train', 'Valid'],
        # NOTE(review): a single boosting round looks like a debugging
        # leftover; kept as-is here, confirm the intended value.
        num_boost_round=1,
        # `verbose_eval` was removed from lgb.train in LightGBM 4.0; the
        # log_evaluation callback with period=0 silences per-round logging.
        callbacks=[lgb.log_evaluation(period=0)],
    )




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1609
[LightGBM] [Info] Number of data points in the train set: 61542, number of used features: 12




[LightGBM] [Info] Start training from score 0.000106




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1606
[LightGBM] [Info] Number of data points in the train set: 60223, number of used features: 11




[LightGBM] [Info] Start training from score -0.000374




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1693
[LightGBM] [Info] Number of data points in the train set: 143656, number of used features: 11




[LightGBM] [Info] Start training from score 0.000464




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1703
[LightGBM] [Info] Number of data points in the train set: 149104, number of used features: 12




[LightGBM] [Info] Start training from score 0.000337




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1734
[LightGBM] [Info] Number of data points in the train set: 180792, number of used features: 12




[LightGBM] [Info] Start training from score 0.000622




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1565
[LightGBM] [Info] Number of data points in the train set: 22088, number of used features: 11




[LightGBM] [Info] Start training from score 0.000151




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1559
[LightGBM] [Info] Number of data points in the train set: 13462, number of used features: 12




[LightGBM] [Info] Start training from score 0.000245




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1603
[LightGBM] [Info] Number of data points in the train set: 52583, number of used features: 12




[LightGBM] [Info] Start training from score 0.000292




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1639
[LightGBM] [Info] Number of data points in the train set: 88939, number of used features: 12




[LightGBM] [Info] Start training from score 0.000180




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1796
[LightGBM] [Info] Number of data points in the train set: 422433, number of used features: 12




[LightGBM] [Info] Start training from score 0.000840




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 122504, number of used features: 11




[LightGBM] [Info] Start training from score 0.000396




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1589
[LightGBM] [Info] Number of data points in the train set: 43436, number of used features: 11




[LightGBM] [Info] Start training from score 0.000507




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1718
[LightGBM] [Info] Number of data points in the train set: 161965, number of used features: 12




[LightGBM] [Info] Start training from score 0.000398




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1611
[LightGBM] [Info] Number of data points in the train set: 62899, number of used features: 11




[LightGBM] [Info] Start training from score 0.000500




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1721
[LightGBM] [Info] Number of data points in the train set: 170037, number of used features: 11




[LightGBM] [Info] Start training from score 0.000281




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1589
[LightGBM] [Info] Number of data points in the train set: 42268, number of used features: 12




[LightGBM] [Info] Start training from score 0.000226




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1618
[LightGBM] [Info] Number of data points in the train set: 68086, number of used features: 12




[LightGBM] [Info] Start training from score 0.000338


In [None]:
# Load the competition's Time Series API module.
import jpx_tokyo_market_prediction
# Create the Time Series API environment.
# make_env() can be called only once per session, so re-running this cell
# requires a kernel restart.
env = jpx_tokyo_market_prediction.make_env()
# Iterator that delivers the test data one day at a time.
iter_test = env.iter_test()


In [None]:
# Feature columns passed to the per-sector models (the training columns
# without 'Target'). Defined once; it does not change between iterations.
cols = ['Open', 'High', 'Low', 'Close', 'average', 'dayofweek', 'is_quater_start', 'is_month_start', 'is_month_end', 'SecuritiesCode', 'Volume', '17SectorName', '33SectorName']

# For every batch from the API: build features, score each sector with its
# own model, rank the securities and submit the ranks.
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    prices = pd.merge(prices, sec_info, on='SecuritiesCode')
    prices['33SectorName'] = prices['33SectorName'].astype("category")
    prices['17SectorName'] = prices['17SectorName'].astype("category")
    features = preprocess(prices, cols)

    # Key each sector's predictions by SecuritiesCode. Grouping reorders the
    # rows by sector, so a positional copy into sample_prediction would assign
    # ranks to the wrong securities.
    sector_preds = [
        pd.Series(models[name].predict(sec_prices), index=sec_prices["SecuritiesCode"].values)
        for name, sec_prices in features.groupby("17SectorName", observed=True)
    ]
    pred_by_code = pd.concat(sector_preds) if sector_preds else pd.Series(dtype="float64")
    # Series.map needs a unique index; keep the last prediction per code.
    pred_by_code = pred_by_code[~pred_by_code.index.duplicated(keep="last")]

    # Align predictions to the submission rows. Non-finite or missing
    # predictions become NaN and are ranked last, so the ranks are always a
    # permutation of 0..n-1.
    preds = sample_prediction["SecuritiesCode"].map(pred_by_code)
    preds = preds.replace([-np.inf, np.inf], np.nan)
    ranks = preds.rank(method="first", ascending=False, na_option="bottom") - 1
    sample_prediction["Rank"] = ranks.astype(int)

    assert sample_prediction["Rank"].notna().all()
    assert sample_prediction["Rank"].min() == 0
    assert sample_prediction["Rank"].max() == len(sample_prediction["Rank"]) - 1

    env.predict(sample_prediction)