# Imports & Load Data
作業に必要なライブラリをインポートして、 以下のデータを読み込みます。

* stock_price : 株価情報
* stock_list : 銘柄情報
* stock_fin : 財務諸表
* stock_labels : 目的変数

In [1]:
# shap用にg++とgccをインストールします
! apt-get update
! apt-get install -y --no-install-recommends g++ gcc

# 必要なライブラリをインストールします
! pip install shap==0.37.0 slicer==0.0.3 xgboost==1.3.0.post0

Ign:1 http://deb.debian.org/debian stretch InRelease
Get:2 http://security.debian.org/debian-security stretch/updates InRelease [53.0 kB]
Get:3 http://deb.debian.org/debian stretch-updates InRelease [93.6 kB]
Hit:4 http://deb.debian.org/debian stretch Release                  
Get:5 http://security.debian.org/debian-security stretch/updates/main amd64 Packages [656 kB]
Fetched 803 kB in 0s (1615 kB/s)                          
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
g++ is already the newest version (4:6.3.0-4).
gcc is already the newest version (4:6.3.0-4).
0 upgraded, 0 newly installed, 0 to remove and 60 not upgraded.


In [2]:
import os
import pickle
import sys
import warnings
from glob import glob

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from xgboost import XGBRegressor
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.auto import tqdm


# 表示用の設定を変更します
%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

In [3]:
# python 3.7.3であることを確認します
import sys
print(sys.version)

3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]


In [4]:
# データセット保存先ディレクトリ（""の中身はご自身の環境に合わせて定義してください。）
dataset_dir="/path/to"

In [5]:
# 読み込むファイルを定義します。
inputs = {
    "stock_list": f"{dataset_dir}/stock_list.csv.gz",
    "stock_price": f"{dataset_dir}/stock_price.csv.gz",
    "stock_fin": f"{dataset_dir}/stock_fin.csv.gz",
    # 本チュートリアルでは使用しないため、コメントアウトしています。
    # "stock_fin_price": f"{dataset_dir}/stock_fin_price.csv.gz",
    "stock_labels": f"{dataset_dir}/stock_labels.csv.gz",
}

# ファイルを読み込みます
dfs = {}
for k, v in inputs.items():
    print(k)
    dfs[k] = pd.read_csv(v)

stock_list
stock_price
stock_fin
stock_labels


In [6]:
SELECT_FIN_DATA_COLUMNS = ['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales',
       'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome',
       'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets',
       'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities',
       'Result_FinancialStatement CashFlowsFromFinancingActivities',
       'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear',
       'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome',
       'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome',
       'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare',
       'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear',
       'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDividendPerShare',
       'IssuedShareEquityQuote IssuedShare','Section/Products', '33 Sector(Code)', '17 Sector(Code)']

In [7]:
section_products = {
    "First Section (Domestic)" : 1,
    "JASDAQ(Standard / Domestic)" :2,
    "Second Section(Domestic)" :3,
    "Mothers (Domestic)" : 4,
    "JASDAQ(Growth/Domestic)" :5
}

In [8]:
def calculate_glossary_of_financial_analysis(row):
    operating_profit_margin = 0
    ordinary_profit_margin = 0
    net_profit_margin = 0
    total_asset_turnover = 0
    net_sales_growth_rate = 0
    ordinary_income_growth_rate = 0
    operationg_income_growth_rate = 0
    total_assets_growth_rate = 0
    net_assets_growth_rate = 0
    eps = 0
    bps = 0
    roe = 0

    # 売上高営業利益率 売上高営業利益率（％）＝営業利益÷売上高×100
    if row['Result_FinancialStatement NetSales'] != 0:
        operating_profit_margin = \
            row['Result_FinancialStatement OperatingIncome'] / \
            row['Result_FinancialStatement NetSales'] * 100
    # 売上高経常利益率　売上高経常利益率（％）＝経常利益÷売上高×100
    if row['Result_FinancialStatement NetSales'] != 0:
        ordinary_profit_margin = \
            row['Result_FinancialStatement OrdinaryIncome'] / \
            row['Result_FinancialStatement NetSales'] * 100
    # 売上高純履歴率　売上高純利益率（％）＝当期純利益÷売上高×100
    if row['Result_FinancialStatement NetSales'] != 0:
        net_profit_margin = row['Result_FinancialStatement NetIncome'] / \
                            row['Result_FinancialStatement NetSales'] * 100
    # 総資本回転率 総資本回転率（％）＝売上高÷総資本（自己資本＋他人資本）×100
    if row['Result_FinancialStatement NetAssets'] != 0:
        total_asset_turnover = row['Result_FinancialStatement NetSales'] / \
                            row['Result_FinancialStatement NetAssets'] * 100
    # 売上高増加率
    if row['Previous_FinancialStatement NetSales'] != 0:
        net_sales_growth_rate = \
            (row['Result_FinancialStatement NetSales'] -
            row['Previous_FinancialStatement NetSales']) / \
            row['Previous_FinancialStatement NetSales'] * 100
    # 経常利益増加率
    if row['Previous_FinancialStatement OrdinaryIncome'] != 0:
        ordinary_income_growth_rate = \
            (row['Result_FinancialStatement OrdinaryIncome'] -
            row['Previous_FinancialStatement OrdinaryIncome']) / \
            row['Previous_FinancialStatement OrdinaryIncome'] * 100

    # 営業利益増加率
    if row['Previous_FinancialStatement OperatingIncome'] != 0:
        operationg_income_growth_rate = \
            (row['Result_FinancialStatement OperatingIncome'] -
            row['Previous_FinancialStatement OperatingIncome']) / \
            row['Previous_FinancialStatement OperatingIncome'] * 100
    # 総資本増加率
    if row['Previous_FinancialStatement TotalAssets'] != 0:
        total_assets_growth_rate = \
            (row['Result_FinancialStatement TotalAssets'] -
            row['Previous_FinancialStatement TotalAssets']) / \
            row['Previous_FinancialStatement TotalAssets'] * 100
    # 純資本増加率
    if row['Previous_FinancialStatement NetAssets'] != 0:
        net_assets_growth_rate = \
            (row['Result_FinancialStatement NetAssets'] -
            row['Previous_FinancialStatement NetAssets']) / \
            row['Previous_FinancialStatement NetAssets'] * 100
    # 一株当たり当期純利益（EPS）
    if row['IssuedShareEquityQuote IssuedShare'] != 0:
        eps = row['Result_FinancialStatement NetIncome'] / \
            row['IssuedShareEquityQuote IssuedShare']
        # BPS 一株当たり純資産（円） ＝ 純資産 ÷ 発行済株式総数
        bps = row['Result_FinancialStatement NetAssets'] / \
            row['IssuedShareEquityQuote IssuedShare']
        # ROE EPS（一株当たり利益）÷ BPS（一株当たり純資産）× 100
        if bps > 0:
            roe = eps / bps * 100
    return pd.Series(
        [operating_profit_margin, ordinary_profit_margin,
            net_profit_margin, total_asset_turnover,
            net_sales_growth_rate, ordinary_income_growth_rate,
            operationg_income_growth_rate, total_assets_growth_rate,
            net_assets_growth_rate, eps, bps, roe])

# 特徴量の生成

In [9]:
TRAIN_END = "2017-12-31"
VAL_START = "2018-02-01"
VAL_END = "2018-12-01"
TEST_START = "2019-01-01"

In [10]:
def get_features_for_predict(dfs, code):
    """
    Args:
        dfs (dict)  : dict of pd.DataFrame include stock_fin, stock_price
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame)
    """
    # おおまかな手順の1つ目
    # stock_finデータを読み込み
    stock_fin = dfs["stock_fin"].copy()
    
    stock_list = dfs["stock_list"].copy()
    stock_fin = pd.merge(stock_fin, stock_list, on=["Local Code"] )

    # 特定の銘柄コードのデータに絞る
    fin_data = stock_fin[stock_fin["Local Code"] == code].copy()
    # 日付列をpd.Timestamp型に変換してindexに設定
    fin_data["datetime"] = pd.to_datetime(fin_data["base_date"])
    fin_data.set_index("datetime", inplace=True)
    # fin_dataを選択
    fin_data = fin_data[SELECT_FIN_DATA_COLUMNS]
    fin_data = fin_data.join(fin_data[['Result_FinancialStatement NetSales', 'Result_FinancialStatement OperatingIncome', 
                                   'Result_FinancialStatement OrdinaryIncome', 'Result_FinancialStatement NetIncome', 
                                   'Result_FinancialStatement TotalAssets', 'Result_FinancialStatement NetAssets',
                                   'Result_FinancialStatement CashFlowsFromOperatingActivities', 
                                   'Result_FinancialStatement CashFlowsFromFinancingActivities',
                                   'Result_FinancialStatement CashFlowsFromInvestingActivities']].rename(columns =
                                                                                                         {'Result_FinancialStatement NetSales': 'Previous_FinancialStatement NetSales',
                                                                                                          'Result_FinancialStatement OperatingIncome': 'Previous_FinancialStatement OperatingIncome', 
                                                                                                          'Result_FinancialStatement OrdinaryIncome': 'Previous_FinancialStatement OrdinaryIncome', 
                                                                                                          'Result_FinancialStatement NetIncome':'Previous_FinancialStatement NetIncome', 
                                                                                                          'Result_FinancialStatement TotalAssets': 'Previous_FinancialStatement TotalAssets', 
                                                                                                          'Result_FinancialStatement NetAssets':'Previous_FinancialStatement NetAssets',
                                                                                                          'Result_FinancialStatement CashFlowsFromOperatingActivities': 'Previous_FinancialStatement CashFlowsFromOperatingActivities', 
                                                                                                          'Result_FinancialStatement CashFlowsFromFinancingActivities':'Previous_FinancialStatement CashFlowsFromFinancingActivities',
                                                                                                          'Result_FinancialStatement CashFlowsFromInvestingActivities':'Previous_FinancialStatement CashFlowsFromInvestingActivities'}).shift(-1))
    fin_data[['operating_profit_margin', 'ordinary_profit_margin', 'net_profit_margin', 'total_asset_turnover',
         'net_sales_growth_rate', 'ordinary_income_growth_rate', 'operationg_income_growth_rate',
          'total_assets_growth_rate', 'net_assets_growth_rate', 'eps', 'bps', 'roe']] = fin_data.apply(calculate_glossary_of_financial_analysis, axis=1)

    # 欠損値処理
    fin_feats = fin_data.fillna(0)

    # おおまかな手順の2つ目
    # stock_priceデータを読み込む
    price = dfs["stock_price"].copy()
    # 特定の銘柄コードのデータに絞る
    price_data = price[price["Local Code"] == code].copy()
    # 日付列をpd.Timestamp型に変換してindexに設定
    price_data["datetime"] = pd.to_datetime(price_data["EndOfDayQuote Date"])
    price_data.set_index("datetime", inplace=True)
    # 終値のみに絞る
    feats = price_data[["EndOfDayQuote ExchangeOfficialClose"]].copy()
    # 終値の20営業日リターン
    feats["return_1month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(20)
    # 終値の40営業日リターン
    feats["return_2month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(40)
    # 終値の60営業日リターン
    feats["return_3month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(60)
    # 終値の20営業日ボラティリティ
    feats["volatility_1month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(20).std()
    )
    # 終値の40営業日ボラティリティ
    feats["volatility_2month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(40).std()
    )
    # 終値の60営業日ボラティリティ
    feats["volatility_3month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(60).std()
    )
    # 終値と20営業日の単純移動平均線の乖離
    feats["MA_gap_1month"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote ExchangeOfficialClose"].rolling(20).mean()
    )
    # 終値と40営業日の単純移動平均線の乖離
    feats["MA_gap_2month"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote ExchangeOfficialClose"].rolling(40).mean()
    )
    # 終値と60営業日の単純移動平均線の乖離
    feats["MA_gap_3month"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote ExchangeOfficialClose"].rolling(60).mean()
    )
    
    # EWMA
    ALPHA = 0.25
    feats["EWMA"] = feats["EndOfDayQuote ExchangeOfficialClose"]

    for t in zip(feats.index, feats.index[1:]):
        feats.loc[t[1], "EWMA"] = ALPHA * feats.loc[t[1], "EndOfDayQuote ExchangeOfficialClose"] + (1 - ALPHA) * feats.loc[t[0], "EWMA"]
    
    # EMA 10日
    feats["ema_10"] = feats["EndOfDayQuote ExchangeOfficialClose"].ewm(span=10).mean()
    
    # MACD 
    # EMA12
    feats["ema_12"] = feats["EndOfDayQuote ExchangeOfficialClose"].ewm(span=12).mean()
    # EMA 26
    feats["ema_26"] = feats["EndOfDayQuote ExchangeOfficialClose"].ewm(span=26).mean()
    feats["macd"] = feats["ema_12"] - feats["ema_26"]
    feats["signal"] = feats["macd"].ewm(span=9).mean()
    
    # PBR 株価 ÷ BPS（1株あたり純資産）
    feats["pbr"] = feats["EndOfDayQuote ExchangeOfficialClose"] / fin_data["bps"]
    # PER 株価 ÷ 1株当たり利益（EPS）
    feats["per"] = feats["EndOfDayQuote ExchangeOfficialClose"] / fin_data["eps"]

    # おおまかな手順の3つ目
    # 欠損値処理
    feats = feats.fillna(0)
    # 元データのカラムを削除
    feats = feats.drop(["EndOfDayQuote ExchangeOfficialClose"], axis=1)

    # 財務データの特徴量とマーケットデータの特徴量のインデックスを合わせる
    feats = feats.loc[feats.index.isin(fin_feats.index)]
    fin_feats = fin_feats.loc[fin_feats.index.isin(feats.index)]

    # データを結合
    feats = pd.concat([feats, fin_feats], axis=1).dropna()

    # 欠損値処理を行います。
    feats = feats.replace([np.inf, -np.inf], 0)
    
    # 市場・商品区分を数値に変換
    feats["Section/Products"] = section_products[feats["Section/Products"][0]]
    # 銘柄コードを設定
    feats["code"] = code

    return feats

In [11]:
def get_features_and_label(dfs, codes, feature, label):
    """
    Args:
        dfs (dict[pd.DataFrame]): loaded data
        codes  (array) : target codes
        feature (pd.DataFrame): features
        label (str) : label column name
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        val_X (pd.DataFrame): validation data
        val_y (pd.DataFrame): label for val_X
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    # 分割データ用の変数を定義
    trains_X, vals_X, tests_X = [], [], []
    trains_y, vals_y, tests_y = [], [], []

    # 銘柄コード毎に特徴量を作成
    for code in tqdm(codes):
        # 特徴量取得
        feats = feature[feature["code"] == code]

        # stock_labelデータを読み込み
        stock_labels = dfs["stock_labels"].copy()
        # 特定の銘柄コードのデータに絞る
        stock_labels = stock_labels[stock_labels["Local Code"] == code]
        # 日付列をpd.Timestamp型に変換してindexに設定
        stock_labels["datetime"] = pd.to_datetime(stock_labels["base_date"])
        stock_labels.set_index("datetime", inplace=True)

        # 特定の目的変数に絞る
        labels = stock_labels[label]
        # nanを削除
        labels.dropna(inplace=True)

        if feats.shape[0] > 0 and labels.shape[0] > 0:
            # 特徴量と目的変数のインデックスを合わせる
            labels = labels.loc[labels.index.isin(feats.index)]
            feats = feats.loc[feats.index.isin(labels.index)]
            labels.index = feats.index

            # データを分割（ホールドアウト法）
            _train_X = feats[: TRAIN_END].copy()
            _val_X = feats[VAL_START : VAL_END].copy()
            _test_X = feats[TEST_START :].copy()

            _train_y = labels[: TRAIN_END].copy()
            _val_y = labels[VAL_START : VAL_END].copy()
            _test_y = labels[TEST_START :].copy()

            # データを配列に格納 (後ほど結合するため)
            trains_X.append(_train_X)
            vals_X.append(_val_X)
            tests_X.append(_test_X)

            trains_y.append(_train_y)
            vals_y.append(_val_y)
            tests_y.append(_test_y)

    # 銘柄毎に作成した説明変数データを結合します。
    train_X = pd.concat(trains_X)
    val_X = pd.concat(vals_X)
    test_X = pd.concat(tests_X)
    # 銘柄毎に作成した目的変数データを結合します。
    train_y = pd.concat(trains_y)
    val_y = pd.concat(vals_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, val_X, val_y, test_X, test_y

In [12]:
# 対象銘柄コードを定義
codes = [9984]
# 対象の目的変数を定義
label = "label_high_20"
# 特徴量を取得
feat = get_features_for_predict(dfs, codes[0])
# 特徴量と目的変数を入力し、分割データを取得
ret = get_features_and_label(dfs, codes, feat, label)
for v in ret:
    print(v.T)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


datetime                                              2016-02-10    2016-04-21    2016-05-10    2016-07-28  \
return_1month                                      -1.913212e-01  8.908766e-02  8.688554e-02 -7.610193e-02   
return_2month                                       0.000000e+00  1.441458e-01  2.409844e-02 -1.417147e-01   
return_3month                                       0.000000e+00  2.555166e-01  1.739812e-01 -1.038744e-01   
volatility_1month                                   4.402584e-02  1.717726e-02  2.271106e-02  3.175834e-02   
volatility_2month                                   0.000000e+00  1.705030e-02  1.818629e-02  3.062359e-02   
volatility_3month                                   0.000000e+00  3.428422e-02  3.271347e-02  2.758085e-02   
MA_gap_1month                                       9.107187e-01  1.071322e+00  1.008618e+00  9.588478e-01   
MA_gap_2month                                       0.000000e+00  1.067462e+00  1.037082e+00  9.375874e-01   
MA_gap_3m

In [13]:
def get_codes(dfs):
    """
    Args:
        dfs (dict[pd.DataFrame]): loaded data
    Returns:
        array: list of stock codes
    """
    stock_list = dfs["stock_list"].copy()
    # 予測対象の銘柄コードを取得
    codes = stock_list[stock_list["prediction_target"] == True][
        "Local Code"
    ].values
    return codes

In [14]:
model_path = os.path.join(os.path.dirname("__file__"), "../model")

with open(os.path.join(model_path, "test_X"), "rb") as f:
    test_X = pickle.load(f)
with open(os.path.join(model_path, "test_y"), "rb") as f:
    test_y = pickle.load(f)

In [15]:
with open(os.path.join(model_path, "train_X"), "rb") as f:
    train_X = pickle.load(f)
with open(os.path.join(model_path, "train_y"), "rb") as f:
    train_y = pickle.load(f)

In [16]:
with open(os.path.join(model_path, "val_X"), "rb") as f:
    val_X = pickle.load(f)
with open(os.path.join(model_path, "val_y"), "rb") as f:
    val_y = pickle.load(f)

In [17]:
# 目的変数を指定
label = "label_high_20"

# 学習用データセット定義
# ファンダメンタル情報
#fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
fundamental_cols = pd.Index(SELECT_FIN_DATA_COLUMNS)
fundamental_cols = fundamental_cols[fundamental_cols != "Result_Dividend DividendPayableDate"]
fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
# 価格変化率
returns_cols = [x for x in train_X[label].columns if "return" in x]
# テクニカル
technical_cols = [x for x in train_X[label].columns if (x not in fundamental_cols) and (x != "code")]

columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}
# 学習用データセットを指定
col = "fundamental+technical"

In [None]:
'''reg_cv = GridSearchCV(xgb_model, {
    "eta": [0.1], 
    "gamma": [0.1,0.2,0.3,0.4,0.5],
    "n_estimators": [50, 100, 200], 
    "max_depth": [5, 7, 9,10,20,30],
    "subsample":[0.6,0.8,1],
    "colsample_bytree": [0.5,0.7,0.9],
}, verbose=1)
'''

In [None]:
'''
{'colsample_bytree': 0.5, 'eta': 0.1, 'gamma': 0.5, 'max_depth': 5, 'n_estimators': 50, 'subsample': 1}
0.07780464612358796
'''

In [18]:
# ライブラリインポート
from sklearn.model_selection import GridSearchCV

# モデル定義
model = XGBRegressor()

# ハイパーパラメータ探索
reg_cv = GridSearchCV(model, {
    "eta": [0.1], 
    "gamma": [0.4, 0.5],
    "max_depth": [5],
    "n_estimators": [50], 
    "subsample":[1],
    "colsample_bytree": [0.5],
}, verbose=1)

# 訓練実施
reg_cv.fit(train_X[label][columns[col]].values, train_y[label])

You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.2s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
       colsample_bynode=None, colsample_bytree=None, gamma=None,
       gpu_id=None, importance_type='gain', interaction_constraints=None,
       learning_rate=None, max_delta_step=None, max_depth=None,
       min_child_we..._pos_weight=None, subsample=None,
       tree_method=None, validate_parameters=None, verbosity=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'eta': [0.1], 'gamma': [0.4, 0.5], 'max_depth': [5], 'n_estimators': [50], 'subsample': [1], 'colsample_bytree': [0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [19]:
# 結果を表示
print(reg_cv.best_params_)
print(reg_cv.best_score_)

{'colsample_bytree': 0.5, 'eta': 0.1, 'gamma': 0.5, 'max_depth': 5, 'n_estimators': 50, 'subsample': 1}
0.07780464612358796


In [20]:
best_model = reg_cv.best_estimator_

In [21]:
# 目的変数を指定
label = "label_high_20"

# 学習用データセット定義
# ファンダメンタル情報
#fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
fundamental_cols = pd.Index(SELECT_FIN_DATA_COLUMNS)
fundamental_cols = fundamental_cols[fundamental_cols != "Result_Dividend DividendPayableDate"]
fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
# 価格変化率
returns_cols = [x for x in train_X[label].columns if "return" in x]
# テクニカル
technical_cols = [x for x in train_X[label].columns if (x not in fundamental_cols) and (x != "code")]

columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}
# 学習用データセットを指定
col = "fundamental+technical"

# 学習
#pred_model = models[model](reg_cv.best_estimator_, random_state=0)
best_model.fit(train_X[label][columns[col]].values, train_y[label])


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, eta=0.1, gamma=0.5,
       gpu_id=-1, importance_type='gain', interaction_constraints='',
       learning_rate=0.100000001, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=50, n_jobs=6, num_parallel_tree=1,
       objective='reg:squarederror', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [22]:
# 予測
result = {}
result[label] = pd.DataFrame(
    best_model.predict(val_X[label][columns[col]]), columns=["predict"]
)

# 予測結果に日付と銘柄コードを追加
result[label]["datetime"] = val_X[label][columns[col]].index
result[label]["code"] = val_X[label]["code"].values

# 予測の符号を取得
result[label]["predict_dir"] = np.sign(result[label]["predict"])

# 実際の値を追加
result[label]["actual"] = val_y[label].values

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62'] ['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales', 'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome', 'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets', 'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities', 'Result_FinancialStatement CashFlowsFromFinancingActivities', 'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear', 'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome', 'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome', 'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare', 'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear', 'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDividendPerShare', 'IssuedShareEquityQuote IssuedShare', 'Section/Products', '33 Sector(Code)', '17 Sector(Code)', 'return_1month', 'return_2month', 'return_3month', 'volatility_1month', 'volatility_2month', 'volatility_3month', 'MA_gap_1month', 'MA_gap_2month', 'MA_gap_3month', 'EWMA', 'ema_10', 'ema_12', 'ema_26', 'macd', 'signal', 'pbr', 'per', 'Previous_FinancialStatement NetSales', 'Previous_FinancialStatement OperatingIncome', 'Previous_FinancialStatement OrdinaryIncome', 'Previous_FinancialStatement NetIncome', 'Previous_FinancialStatement TotalAssets', 'Previous_FinancialStatement NetAssets', 'Previous_FinancialStatement CashFlowsFromOperatingActivities', 'Previous_FinancialStatement CashFlowsFromFinancingActivities', 'Previous_FinancialStatement CashFlowsFromInvestingActivities', 'operating_profit_margin', 'ordinary_profit_margin', 'net_profit_margin', 'total_asset_turnover', 'net_sales_growth_rate', 'ordinary_income_growth_rate', 'operationg_income_growth_rate', 'total_assets_growth_rate', 'net_assets_growth_rate', 'eps', 'bps', 'roe']
expected f17, f49, f15, f1, f37, f12, f60, f53, f29, f45, f6, f42, f41, f18, f30, f22, f28, f9, f56, f44, f51, f2, f35, f5, f58, f47, f3, f11, f50, f20, f21, f25, f55, f39, f19, f43, f0, f8, f27, f4, f31, f48, f36, f57, f61, f33, f10, f40, f16, f32, f46, f54, f23, f52, f7, f34, f13, f24, f62, f14, f26, f38, f59 in input data
training data did not have the following fields: return_2month, return_3month, Result_Dividend QuarterlyDividendPerShare, Previous_FinancialStatement CashFlowsFromFinancingActivities, Previous_FinancialStatement CashFlowsFromOperatingActivities, Result_FinancialStatement OrdinaryIncome, eps, net_profit_margin, Forecast_Dividend AnnualDividendPerShare, MA_gap_1month, Forecast_FinancialStatement NetIncome, Previous_FinancialStatement NetAssets, MA_gap_2month, bps, ordinary_income_growth_rate, Result_FinancialStatement NetSales, 33 Sector(Code), roe, Result_FinancialStatement NetIncome, Result_FinancialStatement CashFlowsFromInvestingActivities, net_assets_growth_rate, Forecast_Dividend FiscalYear, Previous_FinancialStatement NetSales, EWMA, Forecast_Dividend QuarterlyDividendPerShare, Previous_FinancialStatement TotalAssets, signal, Result_FinancialStatement OperatingIncome, total_asset_turnover, volatility_1month, Forecast_FinancialStatement NetSales, pbr, macd, Forecast_FinancialStatement OrdinaryIncome, Previous_FinancialStatement OperatingIncome, operating_profit_margin, Forecast_FinancialStatement FiscalYear, Result_Dividend FiscalYear, volatility_3month, MA_gap_3month, Previous_FinancialStatement CashFlowsFromInvestingActivities, Result_FinancialStatement CashFlowsFromFinancingActivities, Previous_FinancialStatement OrdinaryIncome, ema_12, Forecast_FinancialStatement OperatingIncome, per, Result_Dividend AnnualDividendPerShare, IssuedShareEquityQuote IssuedShare, ema_26, Result_FinancialStatement TotalAssets, Result_FinancialStatement NetAssets, return_1month, volatility_2month, ema_10, Result_FinancialStatement CashFlowsFromOperatingActivities, Previous_FinancialStatement NetIncome, ordinary_profit_margin, 17 Sector(Code), operationg_income_growth_rate, net_sales_growth_rate, total_assets_growth_rate, Result_FinancialStatement FiscalYear, Section/Products

In [None]:
pred_model.fit(train_X[label][columns[col]].values, train_y[label])

In [None]:
pred_model.feature_importances_

# Submit Model

In [None]:
model_path = os.path.join(os.path.dirname("__file__"), "../model")
# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
with open(os.path.join(model_path, f"my_model_{label}.pkl"), "wb") as f:
    # モデルをpickle形式で保存
    pickle.dump(pred_model, f)


In [None]:
sns.jointplot(data=result[label], x="predict", y="actual")

In [None]:
pred_model

In [None]:
# 学習済みモデルを指定
rf = pred_model

# 重要度順を取得
sorted_idx = rf.feature_importances_.argsort()
# プロット
fig, ax = plt.subplots(figsize=(8, 8))
ax.barh(fundamental_cols[sorted_idx], rf.feature_importances_[sorted_idx])
ax.set_xlabel("Random Forest Feature Importance")

In [None]:
# モデルを定義します
sample_model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(train_X["label_high_20"], label=train_y["label_high_20"]), 100)

In [None]:
shap.initjs()
explainer = shap.TreeExplainer(model=sample_model, feature_perturbation='tree_path_dependent', model_output='margin')
# SHAP値
shap_values = explainer.shap_values(X=train_X["label_high_20"])
# プロット
shap.summary_plot(shap_values, train_X["label_high_20"], plot_type="bar")

In [None]:
shap.summary_plot(shap_values, train_X["label_high_20"])

In [24]:
# 対象の目的変数を定義
labels = {
    "label_high_5",
    "label_high_10",
    "label_high_20",
    "label_low_5",
    "label_low_10",
    "label_low_20",
}

In [25]:
# モデルを定義
models = {
    "rf": RandomForestRegressor,
    "extraTree": ExtraTreesRegressor,
    "gbr": GradientBoostingRegressor,
}

# 学習用データセット定義
columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}

# 結果保存用
all_results = dict()
# モデル毎に処理
for model in tqdm(models.keys()):
    all_results[model] = dict()
    # データセット毎に処理
    for col in columns.keys():
        result = dict()
        # 目的変数毎に処理
        for label in tqdm(labels):
            if len(test_X[label][columns[col]]) > 0:
                # モデル取得
                pred_model = models[model](random_state=0)
                # 学習
                pred_model.fit(train_X[label][columns[col]].values, train_y[label])
                # 結果データ作成
                result[label] = test_X[label][["code"]].copy()
                result[label]["datetime"] = test_X[label][columns[col]].index
                # 予測
                result[label]["predict"] = pred_model.predict(test_X[label][columns[col]])
                result[label]["predict_dir"] = np.sign(result[label]["predict"])
                # 実際の結果
                result[label]["actual"] = test_y[label].values
                result[label]["actual_dir"] = np.sign(result[label]["actual"])
                result[label].dropna(inplace=True)

        all_results[model][col] = result

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [26]:
results = []
for model in all_results.keys():
    for col in all_results[model]:
        tmp = pd.concat(all_results[model][col])
        tmp["model"] = model
        tmp["feature"] = col
        results.append(tmp)
results = pd.concat(results)
results["label"] = [x[0] for x in results.index]
results.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,code,datetime,predict,predict_dir,actual,actual_dir,model,feature,label
Unnamed: 0_level_1,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
label_high_10,2019-02-08,1301,2019-02-08,0.009904,1.0,0.07143,1.0,rf,fundamental_only,label_high_10
label_high_10,2019-05-13,1301,2019-05-13,0.02806,1.0,0.04379,1.0,rf,fundamental_only,label_high_10
label_high_10,2019-08-02,1301,2019-08-02,0.026464,1.0,0.00498,1.0,rf,fundamental_only,label_high_10
label_high_10,2019-11-05,1301,2019-11-05,0.045721,1.0,0.00841,1.0,rf,fundamental_only,label_high_10
label_high_10,2020-02-07,1301,2020-02-07,0.025525,1.0,0.01212,1.0,rf,fundamental_only,label_high_10


In [28]:
[columns['fundamental_only']]

[Index(['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales',
        'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome',
        'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets',
        'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities',
        'Result_FinancialStatement CashFlowsFromFinancingActivities',
        'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear',
        'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome',
        'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome',
        'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare',
        'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear',
        'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDiv

In [112]:
[columns['return_only']]

[['return_1month', 'return_2month', 'return_3month']]

In [29]:
[columns['technical_only']]

[['return_1month',
  'return_2month',
  'return_3month',
  'volatility_1month',
  'volatility_2month',
  'volatility_3month',
  'MA_gap_1month',
  'MA_gap_2month',
  'MA_gap_3month',
  'EWMA',
  'ema_10',
  'ema_12',
  'ema_26',
  'macd',
  'signal',
  'pbr',
  'per',
  'Previous_FinancialStatement NetSales',
  'Previous_FinancialStatement OperatingIncome',
  'Previous_FinancialStatement OrdinaryIncome',
  'Previous_FinancialStatement NetIncome',
  'Previous_FinancialStatement TotalAssets',
  'Previous_FinancialStatement NetAssets',
  'Previous_FinancialStatement CashFlowsFromOperatingActivities',
  'Previous_FinancialStatement CashFlowsFromFinancingActivities',
  'Previous_FinancialStatement CashFlowsFromInvestingActivities',
  'operating_profit_margin',
  'ordinary_profit_margin',
  'net_profit_margin',
  'total_asset_turnover',
  'net_sales_growth_rate',
  'ordinary_income_growth_rate',
  'operationg_income_growth_rate',
  'total_assets_growth_rate',
  'net_assets_growth_rate',
  'e

In [30]:
# 結果保存用変数
all_metrics = []

# データセット毎に処理
for feature in columns:
    matrix = dict()
    # モデル毎に処理
    for model in models:
        # 目的変数毎に処理
        for label in labels:
            # 処理対象データに絞り込み
            tmp_df = results[(results["model"] == model) & (results["label"] == label) & (results["feature"] == feature)]
            # RMSE
            rmse = np.sqrt(mean_squared_error(tmp_df["predict"], tmp_df["actual"]))
            # 精度
            accuracy = accuracy_score(tmp_df["predict_dir"], tmp_df["actual_dir"])
            # 相関係数
            corr = np.corrcoef(tmp_df["actual"], tmp_df["predict"])[0, 1]
            # 順位相関
            spearman_corr = spearmanr(tmp_df["actual"], tmp_df["predict"])[0]
            # 結果を保存
            matrix[label] = [rmse, accuracy, spearman_corr,corr, corr**2, feature, model, tmp_df.shape[0]]
        res = pd.DataFrame.from_dict(matrix).T
        res.columns = ["RMSE","accuracy","spearman_corr","corr","R^2 score","feature", "model", "# of samples"]
        all_metrics.append(res)
all_metrics = pd.concat(all_metrics)
all_metrics.reset_index()

Unnamed: 0,index,RMSE,accuracy,spearman_corr,corr,R^2 score,feature,model,# of samples
0,label_low_10,0.0783953,0.829051,0.0899332,0.0955458,0.00912899,fundamental_only,rf,32729
1,label_high_20,0.185307,0.826142,0.10654,0.14112,0.0199149,fundamental_only,rf,32515
2,label_low_5,0.0693281,0.79192,0.0988698,0.0993797,0.00987633,fundamental_only,rf,32747
3,label_high_5,0.104058,0.717837,0.0853308,0.115034,0.0132328,fundamental_only,rf,32747
4,label_low_20,0.106349,0.858558,0.0840678,0.0887552,0.00787749,fundamental_only,rf,32515
5,label_high_10,0.131406,0.772006,0.0867679,0.122264,0.0149484,fundamental_only,rf,32729
6,label_low_10,0.081348,0.814049,0.0837957,0.0821204,0.00674376,fundamental_only,extraTree,32729
7,label_high_20,0.184969,0.818022,0.0970288,0.115024,0.0132306,fundamental_only,extraTree,32515
8,label_low_5,0.072517,0.773262,0.0677515,0.0656421,0.00430889,fundamental_only,extraTree,32747
9,label_high_5,0.109392,0.699484,0.0548793,0.0626485,0.00392484,fundamental_only,extraTree,32747


In [104]:
tmp.to_csv('result.csv', index=None)

In [109]:
tmp = pd.read_csv('result.csv')

In [111]:
tmp.groupby(['feature', 'model']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE,accuracy,spearman_corr,corr,R^2 score
feature,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fundamental+technical,extraTree,0.111176,0.803839,0.16217,0.165913,0.028339
fundamental+technical,gbr,0.109173,0.813302,0.215872,0.178325,0.034918
fundamental+technical,rf,0.112864,0.803816,0.170462,0.171932,0.030275
fundamental_only,extraTree,0.115085,0.785385,0.076433,0.082718,0.007136
fundamental_only,gbr,0.104176,0.814057,0.145075,0.167051,0.028122
fundamental_only,rf,0.112474,0.799252,0.091918,0.11035,0.012496
return_only,extraTree,0.116527,0.788572,0.076523,0.081358,0.007041
return_only,gbr,0.105486,0.814476,0.163846,0.140811,0.022925
return_only,rf,0.115588,0.79364,0.082825,0.090202,0.008963
technical_only,extraTree,0.108655,0.803408,0.166644,0.17395,0.030838


In [31]:
# 学習用データセット定義
columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}

# 結果保存用
all_results[model] = dict()
# データセット毎に処理
for col in columns.keys():
    result = dict()
    # 目的変数毎に処理
    for label in tqdm(labels):
        if len(test_X[label][columns[col]]) > 0:
            # モデル取得
            pred_model = best_model
            # 学習
            pred_model.fit(train_X[label][columns[col]].values, train_y[label])
            # 結果データ作成
            result[label] = test_X[label][["code"]].copy()
            result[label]["datetime"] = test_X[label][columns[col]].index
            # 予測
            result[label]["predict"] = pred_model.predict(test_X[label][columns[col]])
            result[label]["predict_dir"] = np.sign(result[label]["predict"])
            # 実際の結果
            result[label]["actual"] = test_y[label].values
            result[label]["actual_dir"] = np.sign(result[label]["actual"])
            result[label].dropna(inplace=True)

    all_results[model][col] = result

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24'] ['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales', 'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome', 'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets', 'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities', 'Result_FinancialStatement CashFlowsFromFinancingActivities', 'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear', 'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome', 'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome', 'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare', 'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear', 'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDividendPerShare', 'IssuedShareEquityQuote IssuedShare', 'Section/Products', '33 Sector(Code)', '17 Sector(Code)']
expected f17, f15, f1, f12, f6, f18, f22, f9, f2, f3, f11, f20, f21, f19, f0, f8, f4, f10, f16, f23, f7, f5, f13, f24, f14 in input data
training data did not have the following fields: Result_Dividend QuarterlyDividendPerShare, Result_FinancialStatement OrdinaryIncome, Forecast_Dividend AnnualDividendPerShare, Forecast_FinancialStatement NetIncome, Result_FinancialStatement NetSales, Result_FinancialStatement NetIncome, 33 Sector(Code), Result_FinancialStatement CashFlowsFromInvestingActivities, Forecast_Dividend FiscalYear, Forecast_Dividend QuarterlyDividendPerShare, Result_FinancialStatement OperatingIncome, Forecast_FinancialStatement NetSales, Forecast_FinancialStatement OrdinaryIncome, Forecast_FinancialStatement FiscalYear, Result_Dividend FiscalYear, Result_FinancialStatement CashFlowsFromFinancingActivities, Forecast_FinancialStatement OperatingIncome, Result_Dividend AnnualDividendPerShare, IssuedShareEquityQuote IssuedShare, Result_FinancialStatement TotalAssets, Result_FinancialStatement NetAssets, Result_FinancialStatement CashFlowsFromOperatingActivities, 17 Sector(Code), Result_FinancialStatement FiscalYear, Section/Products