In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import torch
import math
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jpx-tokyo-stock-exchange-prediction/stock_list.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/options.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/financials.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/secondary_stock_prices.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/trades.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/jpx_tokyo_market_prediction/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/jpx-tokyo-stock-exchange-prediction/jpx_tokyo_market_prediction/__init__.py
/kaggle/input/jpx-tokyo-stock-exchange-prediction/data_specifications/stock_fin_spec.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/data_specifications/trades_spec.csv
/kaggle/input/jpx-tokyo-stock-

In [2]:
!pip -q install ../input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

[0m

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from decimal import ROUND_HALF_UP, Decimal

In [4]:
stand = StandardScaler()
le = LabelEncoder()

In [5]:
def read_data(data):
    stock_list = pd.read_csv(os.path.join('/kaggle/input/jpx-tokyo-stock-exchange-prediction/', "stock_list.csv"))
    target_stock_list = stock_list[stock_list["Universe0"]]
    
    data.sort_values(by=['SecuritiesCode', 'Date'], inplace=True)
    data.drop("RowId", axis=1, inplace=True)
    data.reset_index(drop=True, inplace=True)
    data.replace('－', np.nan, inplace=True)
    cols = ['Open', 'High', 'Low', 'Close','Volume']
    #data.set_index(['Date','SecuritiesCode'], inplace=True)
    for col in cols:
        data[col] = data.groupby(['SecuritiesCode'])[col].ffill()  # 分组前向填充
        data[col] = data.groupby(['SecuritiesCode'])[col].bfill()  # 分组后向填充
        data[col] = data.groupby('SecuritiesCode')[col].transform(lambda x: stand.fit_transform(x.values.reshape(-1, 1)).flatten())
    data['Target'] = data['Target'].fillna(0)
    data['AdjustmentFactor'] = data['AdjustmentFactor'].fillna(1)
    data['ExpectedDividend'] = data['ExpectedDividend'].fillna(0)
    data['SupervisionFlag'] = data['SupervisionFlag'].fillna(False)
    
    sec_info = target_stock_list[["SecuritiesCode", "33SectorName"]]
    data = pd.merge(data, sec_info, on="SecuritiesCode")
    data["33SectorName"] = data["33SectorName"].astype("category")
    data.loc[:, 'sector'] = le.fit_transform(data['33SectorName'])
    
    #data.reset_index(inplace=True)
    data['SecuritiesCode'] = data['SecuritiesCode'].astype('int')
    data['SupervisionFlag'] = data['SupervisionFlag'].astype('int')
    data.loc[: ,"Date"] = pd.to_datetime(data.loc[: ,"Date"], format="%Y-%m-%d")
    
    return data

In [6]:
def qround(x):
    return float(Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))

In [7]:
def adjust_prices(df):
    df = df.sort_values("Date", ascending=True)
    df.loc[:, "CumAdjust"] = df["AdjustmentFactor"].cumprod()

    # generate adjusted prices
    pcols = ["Open", "High", "Low", "Close"]
    for p in pcols:
        df.loc[:, p] = (df["CumAdjust"] * df[p]).apply(qround)
    df.loc[:, "Volume"] = df["Volume"] / df["CumAdjust"]
    return df

In [8]:
data_df = pd.read_csv('/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
data2_df = pd.read_csv('/kaggle/input/jpx-tokyo-stock-exchange-prediction/supplemental_files/stock_prices.csv')
#data = pd.concat([data_df,data2_df], axis=0)

#Rawdata = read_data(data)
train = read_data(data_df)
valid = read_data(data2_df)

In [9]:
#Rawdata = Rawdata.groupby("SecuritiesCode").apply(adjust_prices)
train = train.groupby("SecuritiesCode").apply(adjust_prices)
valid = valid.groupby("SecuritiesCode").apply(adjust_prices)

In [10]:
from pytorch_tabnet.tab_model import TabNetRegressor
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

In [11]:
# 计算分割点
split_point = pd.to_datetime('2020-12-23 00:00:00')

# 分割数据集
# train_data = Rawdata[Rawdata['Date'] < split_point]
# valid_data = Rawdata[Rawdata['Date'] >= split_point]
# train_data = train[train['Date'] >= split_point]
train_data = train
valid_data = valid

In [12]:
print('train_size:',len(train_data)/(len(train_data)+len(valid_data))*100,'%')
print('valid_size:',len(valid_data)/(len(train_data)+len(valid_data))*100,'%')

train_size: 89.6295820953792 %
valid_size: 10.37041790462079 %


In [13]:
features = ['Open', 'High', 'Low', 'Close', 'Volume','ExpectedDividend','SupervisionFlag','sector']

In [14]:
# 训练每个行业的模型
sectors = train['sector'].unique()
models = {}

In [15]:
tabnet_params=dict(
    n_d = 8, 
    n_a = 8, 
    n_steps = 3,
    gamma = 1.3,
    n_independent = 2, 
    n_shared = 2, 
    optimizer_fn = torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR, 
    mask_type = "entmax",
    cat_idxs = [6],
    cat_dims = [2],
    cat_emb_dim = [1],
    lambda_sparse = 1e-3,
    #scheduler_params = dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
    #scheduler_fn = CosineAnnealingWarmRestarts,
    seed = 42,
    verbose = 0, 
)

In [16]:
clf=TabNetRegressor(**tabnet_params)

In [17]:
x_train = train_data[features]
y_train = train_data['Target']

x_valid = valid_data[features]
y_valid = valid_data['Target']

In [18]:
from pytorch_tabnet.augmentations import RegressionSMOTE
aug = RegressionSMOTE(p=0.2)

In [19]:
for sector in sectors:
    print(f"Training model for sector {sector}")
    # 数据划分
    train_indices = x_train['sector'] == sector
    valid_indices = x_valid['sector'] == sector

    x_train_sector = x_train[train_indices].drop(columns=['sector'])
    y_train_sector = y_train[train_indices]
    x_valid_sector = x_valid[valid_indices].drop(columns=['sector'])
    y_valid_sector = y_valid[valid_indices]

    # 初始化 TabNet 模型
    model = TabNetRegressor(**tabnet_params)

    # 训练模型
    model.fit(
        X_train=x_train_sector.values, y_train=y_train_sector.values.reshape(-1, 1),
        eval_set=[(x_train_sector.values, y_train_sector.values.reshape(-1, 1)), (x_valid_sector.values, y_valid_sector.values.reshape(-1, 1))],
        eval_name=['train','valid'], eval_metric=['mae', 'rmse', 'mse'],
        max_epochs=100, patience=10,
        batch_size=1024, virtual_batch_size=128,
        num_workers=4,drop_last=False,
        augmentations=aug, #aug
    )

    # 保存模型
    models[sector] = model

Training model for sector 6

Early stopping occurred at epoch 98 with best_epoch = 88 and best_valid_mse = 0.00023




Training model for sector 3

Early stopping occurred at epoch 15 with best_epoch = 5 and best_valid_mse = 0.00032




Training model for sector 16

Early stopping occurred at epoch 21 with best_epoch = 11 and best_valid_mse = 0.00202
Training model for sector 28





Early stopping occurred at epoch 11 with best_epoch = 1 and best_valid_mse = 0.0008




Training model for sector 9

Early stopping occurred at epoch 22 with best_epoch = 12 and best_valid_mse = 0.00102




Training model for sector 7

Early stopping occurred at epoch 22 with best_epoch = 12 and best_valid_mse = 0.00023




Training model for sector 21

Early stopping occurred at epoch 19 with best_epoch = 9 and best_valid_mse = 0.00099




Training model for sector 24

Early stopping occurred at epoch 10 with best_epoch = 0 and best_valid_mse = 0.00201




Training model for sector 12

Early stopping occurred at epoch 34 with best_epoch = 24 and best_valid_mse = 0.00029




Training model for sector 25

Early stopping occurred at epoch 18 with best_epoch = 8 and best_valid_mse = 0.00036




Training model for sector 32

Early stopping occurred at epoch 11 with best_epoch = 1 and best_valid_mse = 0.0004




Training model for sector 2

Early stopping occurred at epoch 25 with best_epoch = 15 and best_valid_mse = 0.00049




Training model for sector 29

Early stopping occurred at epoch 26 with best_epoch = 16 and best_valid_mse = 0.00041




Training model for sector 4

Early stopping occurred at epoch 37 with best_epoch = 27 and best_valid_mse = 0.00066




Training model for sector 8

Early stopping occurred at epoch 13 with best_epoch = 3 and best_valid_mse = 0.0011




Training model for sector 30

Early stopping occurred at epoch 18 with best_epoch = 8 and best_valid_mse = 0.00055




Training model for sector 18

Early stopping occurred at epoch 40 with best_epoch = 30 and best_valid_mse = 0.00051




Training model for sector 15

Early stopping occurred at epoch 17 with best_epoch = 7 and best_valid_mse = 0.00035




Training model for sector 23

Early stopping occurred at epoch 37 with best_epoch = 27 and best_valid_mse = 0.00037




Training model for sector 22

Early stopping occurred at epoch 42 with best_epoch = 32 and best_valid_mse = 0.00067




Training model for sector 26

Early stopping occurred at epoch 75 with best_epoch = 65 and best_valid_mse = 0.00038




Training model for sector 11

Early stopping occurred at epoch 33 with best_epoch = 23 and best_valid_mse = 0.00059




Training model for sector 13

Early stopping occurred at epoch 25 with best_epoch = 15 and best_valid_mse = 0.00051




Training model for sector 17

Early stopping occurred at epoch 50 with best_epoch = 40 and best_valid_mse = 0.00068




Training model for sector 27

Early stopping occurred at epoch 27 with best_epoch = 17 and best_valid_mse = 0.00053




Training model for sector 10

Early stopping occurred at epoch 43 with best_epoch = 33 and best_valid_mse = 0.00046




Training model for sector 19

Early stopping occurred at epoch 48 with best_epoch = 38 and best_valid_mse = 0.00042




Training model for sector 1

Early stopping occurred at epoch 22 with best_epoch = 12 and best_valid_mse = 0.0003




Training model for sector 20

Early stopping occurred at epoch 25 with best_epoch = 15 and best_valid_mse = 0.0005




Training model for sector 31

Early stopping occurred at epoch 38 with best_epoch = 28 and best_valid_mse = 0.0004




Training model for sector 14

Early stopping occurred at epoch 83 with best_epoch = 73 and best_valid_mse = 0.00135
Training model for sector 0





Early stopping occurred at epoch 42 with best_epoch = 32 and best_valid_mse = 0.00052
Training model for sector 5





Early stopping occurred at epoch 14 with best_epoch = 4 and best_valid_mse = 0.00121




In [20]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

In [21]:
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:  
    prices.sort_values(by=['SecuritiesCode', 'Date'], inplace=True)
    prices.drop("RowId", axis=1, inplace=True)
    prices.reset_index(drop=True, inplace=True)
    prices.replace('－', np.nan, inplace=True)
    cols = ['Open', 'High', 'Low', 'Close','Volume']
    for col in cols:
        prices[col] = prices[col].fillna(0)
        prices[col] = prices.groupby('SecuritiesCode')[col].transform(lambda x: stand.fit_transform(x.values.reshape(-1, 1)).flatten())
    prices['AdjustmentFactor'] = prices['AdjustmentFactor'].fillna(1)
    prices['ExpectedDividend'] = prices['ExpectedDividend'].fillna(0)
    prices['SupervisionFlag'] = prices['SupervisionFlag'].fillna(False)
    
    prices['SecuritiesCode'] = prices['SecuritiesCode'].astype('int')
    prices['SupervisionFlag'] = prices['SupervisionFlag'].astype('int')
    prices.loc[: ,"Date"] = pd.to_datetime(prices.loc[: ,"Date"], format="%Y-%m-%d")
    
    prices = prices.groupby("SecuritiesCode").apply(adjust_prices)
    
    stock_list = pd.read_csv(os.path.join('/kaggle/input/jpx-tokyo-stock-exchange-prediction/', "stock_list.csv"))
    target_stock_list = stock_list[stock_list["Universe0"]]
    sec_info = target_stock_list[["SecuritiesCode", "33SectorName"]]
    prices = pd.merge(prices, sec_info, on="SecuritiesCode")
    prices["33SectorName"] = prices["33SectorName"].astype("category")
    prices.loc[:, 'sector'] = le.fit_transform(prices['33SectorName'])
    
    for row in prices.itertuples():
        sector = getattr(row, 'sector')
        feature = np.array([getattr(row, 'Open'), getattr(row, 'High'), getattr(row, 'Low'), getattr(row, 'Close'), getattr(row, 'Volume'), getattr(row, 'ExpectedDividend'), getattr(row, 'SupervisionFlag')]).reshape(1,-1)
        pred = models[sector].predict(feature)
        prices.at[row.Index, 'Target'] = pred


    # sort in descending order by Target
    prices = prices.sort_values(by = "Target", ascending = False)
    # add Rank
    prices['Rank'] = np.arange(len(prices.index))
    prices = prices.sort_values(by = "SecuritiesCode", ascending = True)
    prices.drop(["Target"], axis = 1)
    submission = prices[["Date", "SecuritiesCode", "Rank"]]
    # register your predictions
    env.predict(submission)
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:  
    prices.sort_values(by=['SecuritiesCode', 'Date'], inplace=True)
    prices.drop("RowId", axis=1, inplace=True)
    prices.reset_index(drop=True, inplace=True)
    prices.replace('－', np.nan, inplace=True)
    cols = ['Open', 'High', 'Low', 'Close','Volume']
    for col in cols:
        prices[col] = prices[col].fillna(0)
        prices[col] = prices.groupby('SecuritiesCode')[col].transform(lambda x: stand.fit_transform(x.values.reshape(-1, 1)).flatten())
    prices['AdjustmentFactor'] = prices['AdjustmentFactor'].fillna(1)
    prices['ExpectedDividend'] = prices['ExpectedDividend'].fillna(0)
    prices['SupervisionFlag'] = prices['SupervisionFlag'].fillna(False)
    
    prices['SecuritiesCode'] = prices['SecuritiesCode'].astype('int')
    prices['SupervisionFlag'] = prices['SupervisionFlag'].astype('int')
    prices.loc[: ,"Date"] = pd.to_datetime(prices.loc[: ,"Date"], format="%Y-%m-%d")
    
    prices = prices.groupby("SecuritiesCode").apply(adjust_prices)
    
    stock_list = pd.read_csv(os.path.join('/kaggle/input/jpx-tokyo-stock-exchange-prediction/', "stock_list.csv"))
    target_stock_list = stock_list[stock_list["Universe0"]]
    sec_info = target_stock_list[["SecuritiesCode", "33SectorName"]]
    prices = pd.merge(prices, sec_info, on="SecuritiesCode")
    prices["33SectorName"] = prices["33SectorName"].astype("category")
    prices.loc[:, 'sector'] = le.fit_transform(prices['33SectorName'])
    
    for row in prices.itertuples():
        sector = getattr(row, 'sector')
        features = np.array([getattr(row, 'Open'), getattr(row, 'High'), getattr(row, 'Low'), getattr(row, 'Close'), getattr(row, 'Volume'), getattr(row, 'ExpectedDividend'), getattr(row, 'SupervisionFlag')]).reshape(1, -1)
        pred = models[sector].predict(features)
        prices.at[row.Index, 'Target'] = pred
    
    # sort in descending order by Target
    prices = prices.sort_values(by = "Target", ascending = False)
    # add Rank
    prices['Rank'] = np.arange(len(prices.index))
    prices = prices.sort_values(by = "SecuritiesCode", ascending = True)
    prices.drop(["Target"], axis = 1)
    submission = prices[["Date", "SecuritiesCode", "Rank"]]
    # register your predictions
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
