In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import torch
import math
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
/kaggle/input/pytorchtabnet/pytorch_tabnet-3.1.0-py3-none-any.whl
/kaggle/input/pytorchtabnet/pytorch_tabnet-4.0-py3-none-any.whl
/kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.1-py3-none-any.whl
/kaggle/input/pytorchtabnet/pytorch_tabnet-1.2.0-py3-none-any.whl
/kaggle/input/pytorchtabnet/pytorch_tabnet-3.1.1-py3-none-any.whl
/kaggle/input/pytorchtabnet/pytorch_tabnet-3.0.0-py3-none-any.whl
/kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl
/kaggle/input/jpx-tokyo-stock-exchange-prediction/stock_list.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/options.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/financials.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/secondary_stock_prices.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example

In [2]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

[0m

In [3]:
from sklearn.preprocessing import StandardScaler
from decimal import ROUND_HALF_UP, Decimal

In [4]:
stand = StandardScaler()

In [5]:
def read_data(data):
    data.sort_values(by=['SecuritiesCode', 'Date'], inplace=True)
    data.drop("RowId", axis=1, inplace=True)
    data.reset_index(drop=True, inplace=True)
    data.replace('－', np.nan, inplace=True)
    cols = ['Open', 'High', 'Low', 'Close','Volume']
    #data.set_index(['Date','SecuritiesCode'], inplace=True)
    for col in cols:
        data[col] = data.groupby(['SecuritiesCode'])[col].ffill()  # 分组前向填充
        data[col] = data.groupby(['SecuritiesCode'])[col].bfill()  # 分组后向填充
        data[col] = data.groupby('SecuritiesCode')[col].transform(lambda x: stand.fit_transform(x.values.reshape(-1, 1)).flatten())
    data['Target'] = data['Target'].fillna(0)
    data['AdjustmentFactor'] = data['AdjustmentFactor'].fillna(1)
    data['ExpectedDividend'] = data['ExpectedDividend'].fillna(0)
    data['SupervisionFlag'] = data['SupervisionFlag'].fillna(False)
    
    #data.reset_index(inplace=True)
    data['SecuritiesCode'] = data['SecuritiesCode'].astype('int')
    data['SupervisionFlag'] = data['SupervisionFlag'].astype('int')
    data.loc[: ,"Date"] = pd.to_datetime(data.loc[: ,"Date"], format="%Y-%m-%d")
    
    return data

In [6]:
def qround(x):
    return float(Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))

In [7]:
def adjust_prices(df):
    df = df.sort_values("Date", ascending=True)
    df.loc[:, "CumAdjust"] = df["AdjustmentFactor"].cumprod()

    # generate adjusted prices
    pcols = ["Open", "High", "Low", "Close"]
    for p in pcols:
        df.loc[:, p] = (df["CumAdjust"] * df[p]).apply(qround)
    df.loc[:, "Volume"] = df["Volume"] / df["CumAdjust"]
    return df

In [8]:
data_df = pd.read_csv('/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
data2_df = pd.read_csv('/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv')
#data = pd.concat([data_df,data2_df], axis=0)

#Rawdata = read_data(data)
train = read_data(data_df)
valid = read_data(data2_df)

In [9]:
#Rawdata = Rawdata.groupby("SecuritiesCode").apply(adjust_prices)
train = train.groupby("SecuritiesCode").apply(adjust_prices)
valid = valid.groupby("SecuritiesCode").apply(adjust_prices)

In [10]:
from pytorch_tabnet.tab_model import TabNetRegressor
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

In [11]:
# 计算分割点
split_point = pd.to_datetime('2020-12-23 00:00:00')

# 分割数据集
# train_data = Rawdata[Rawdata['Date'] < split_point]
# valid_data = Rawdata[Rawdata['Date'] >= split_point]
train_data = train[train['Date'] >= split_point]
valid_data = valid

In [12]:
print('train_size:',len(train_data)/(len(train_data)+len(valid_data))*100,'%')
print('valid_size:',len(valid_data)/(len(train_data)+len(valid_data))*100,'%')

train_size: 63.22550931281774 %
valid_size: 36.77449068718225 %


In [13]:
features = ['Open', 'High', 'Low', 'Close', 'Volume','ExpectedDividend','SupervisionFlag']

In [14]:
tabnet_params=dict(
    n_d = 8, 
    n_a = 8, 
    n_steps = 5,
    gamma = 1.3,
    n_independent = 2, 
    n_shared = 2, 
    optimizer_fn = torch.optim.Adam,
    mask_type = "entmax",
    cat_idxs = [6],
    cat_dims = [2],
    cat_emb_dim = [1],
    lambda_sparse = 1e-1,
    scheduler_params = dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
    scheduler_fn = CosineAnnealingWarmRestarts,
    seed = 42,
    verbose = 1, 
)

In [15]:
clf=TabNetRegressor(**tabnet_params)



In [16]:
x_train = train_data[features]
y_train = train_data['Target']

x_valid = valid_data[features]
y_valid = valid_data['Target']

In [17]:
from pytorch_tabnet.augmentations import RegressionSMOTE
aug = RegressionSMOTE(p=0.2)

In [18]:
clf.fit(
    X_train=x_train.values, y_train=y_train.values.reshape(-1, 1),
    eval_set=[(x_train.values, y_train.values.reshape(-1, 1)), (x_valid.values, y_valid.values.reshape(-1,1))],
    eval_name=['train', 'valid'],
    eval_metric=['mae', 'rmse', 'mse'],
    max_epochs=100,
    patience=10,
    batch_size=1024*20, virtual_batch_size=128*20,
    num_workers=4,
    drop_last=False,
    #augmentations=aug, #aug
) 

epoch 0  | loss: 0.4612  | train_mae: 0.07937 | train_rmse: 0.22175 | train_mse: 0.04918 | valid_mae: 0.06027 | valid_rmse: 0.18406 | valid_mse: 0.03388 |  0:00:09s
epoch 1  | loss: 0.10513 | train_mae: 0.0363  | train_rmse: 0.11305 | train_mse: 0.01278 | valid_mae: 0.02813 | valid_rmse: 0.06736 | valid_mse: 0.00454 |  0:00:19s
epoch 2  | loss: 0.06224 | train_mae: 0.02332 | train_rmse: 0.04213 | train_mse: 0.00178 | valid_mae: 0.02018 | valid_rmse: 0.02867 | valid_mse: 0.00082 |  0:00:28s
epoch 3  | loss: 0.03491 | train_mae: 0.02478 | train_rmse: 0.03797 | train_mse: 0.00144 | valid_mae: 0.02179 | valid_rmse: 0.03136 | valid_mse: 0.00098 |  0:00:37s
epoch 4  | loss: 0.02388 | train_mae: 0.01697 | train_rmse: 0.02665 | train_mse: 0.00071 | valid_mae: 0.01745 | valid_rmse: 0.02769 | valid_mse: 0.00077 |  0:00:46s
epoch 5  | loss: 0.0123  | train_mae: 0.01673 | train_rmse: 0.02397 | train_mse: 0.00057 | valid_mae: 0.01761 | valid_rmse: 0.02551 | valid_mse: 0.00065 |  0:00:55s
epoch 6  |



In [19]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

In [20]:
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:  
    prices.sort_values(by=['SecuritiesCode', 'Date'], inplace=True)
    prices.drop("RowId", axis=1, inplace=True)
    prices.reset_index(drop=True, inplace=True)
    prices.replace('－', np.nan, inplace=True)
    cols = ['Open', 'High', 'Low', 'Close','Volume']
    for col in cols:
        prices[col] = prices[col].fillna(0)
        prices[col] = prices.groupby('SecuritiesCode')[col].transform(lambda x: stand.fit_transform(x.values.reshape(-1, 1)).flatten())
    prices['AdjustmentFactor'] = prices['AdjustmentFactor'].fillna(1)
    prices['ExpectedDividend'] = prices['ExpectedDividend'].fillna(0)
    prices['SupervisionFlag'] = prices['SupervisionFlag'].fillna(False)
    
    prices['SecuritiesCode'] = prices['SecuritiesCode'].astype('int')
    prices['SupervisionFlag'] = prices['SupervisionFlag'].astype('int')
    prices.loc[: ,"Date"] = pd.to_datetime(prices.loc[: ,"Date"], format="%Y-%m-%d")
    
    prices = prices.groupby("SecuritiesCode").apply(adjust_prices)
    
    pred = clf.predict(prices[features].values)
    prices['Target'] = pred
    # sort in descending order by Target
    prices = prices.sort_values(by = "Target", ascending = False)
    # add Rank
    prices['Rank'] = np.arange(len(prices.index))
    prices = prices.sort_values(by = "SecuritiesCode", ascending = True)
    prices.drop(["Target"], axis = 1)
    submission = prices[["Date", "SecuritiesCode", "Rank"]]
    # register your predictions
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
