# Imports & Load Data
作業に必要なライブラリをインポートして、 以下のデータを読み込みます。

* stock_price : 株価情報
* stock_list : 銘柄情報
* stock_fin : 財務諸表
* stock_labels : 目的変数

In [1]:
!pip install catboost==0.24.4 



In [2]:
import os
import pickle
import sys
import warnings
from glob import glob

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import  xgboost as xgb
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.auto import tqdm
from catboost import CatBoostRegressor


# 表示用の設定を変更します
%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

In [3]:
# python 3.7.3であることを確認します
import sys
print(sys.version)

3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]


In [4]:
# データセット保存先ディレクトリ（""の中身はご自身の環境に合わせて定義してください。）
dataset_dir="/path/to"

In [5]:
# 読み込むファイルを定義します。
inputs = {
    "stock_list": f"{dataset_dir}/stock_list.csv.gz",
    "stock_price": f"{dataset_dir}/stock_price.csv.gz",
    "stock_fin": f"{dataset_dir}/stock_fin.csv.gz",
    # 本チュートリアルでは使用しないため、コメントアウトしています。
    # "stock_fin_price": f"{dataset_dir}/stock_fin_price.csv.gz",
    "stock_labels": f"{dataset_dir}/stock_labels.csv.gz",
}

# ファイルを読み込みます
dfs = {}
for k, v in inputs.items():
    print(k)
    dfs[k] = pd.read_csv(v)

stock_list
stock_price
stock_fin
stock_labels


# 特徴量の生成

In [6]:
def get_feature_columns(dfs, train_X, column_group="fundamental+technical"):
    # 特徴量グループを定義
    # ファンダメンタル
    fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
    fundamental_cols = fundamental_cols[
        fundamental_cols != "Result_Dividend DividendPayableDate"
        ]
    fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
    # 価格変化率
    returns_cols = [x for x in train_X.columns if "return" in x]
    # テクニカル
    technical_cols = [
        x for x in train_X.columns if
        (x not in fundamental_cols) and (x != "code")
    ]
    columns = {
        "fundamental_only": fundamental_cols,
        "return_only": returns_cols,
        "technical_only": technical_cols,
        "fundamental+technical": list(fundamental_cols) + list(
            technical_cols),
    }
    return columns[column_group]

In [7]:
model_path = os.path.join(os.path.dirname("__file__"), "../model")
test_X_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/test_X")
test_y_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/test_y")
val_X_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/val_X")
val_y_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/val_y")
train_X_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/train_X")
train_y_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/train_y")

In [8]:
# 対象の目的変数を定義
labels = {
#    "label_high_5",
#    "label_high_10",
    "label_high_20",
#    "label_low_5",
#    "label_low_10",
    "label_low_20",
}

In [9]:
# 特徴量追加済みデータ
proceed_datas = {
    "train_X",
    "train_y",
    "val_X",
    "val_y",
    "test_X",
    "test_y"
}

In [None]:
                     {'iterations': [222], 
                       'depth': [9], 'learning_rate': 
                       [0.18831273426065617],               
                       'random_strength': [33], 
                       'bagging_temperature': [0.06584346890760226], 
                       'od_type': ['Iter'], 'od_wait': [21]

In [None]:
params = {
    'iterations', 50, 300),                         
    depth', 4, 10),                                       
    'learning_rate', 0.01, 0.3),               
    random_strength', 0, 100),                       
   'bagging_temperature', 0.01, 100.00), 
    ('od_type', ['IncToDec', 'Iter']),
    ('od_wait', 10, 50)
}


In [10]:
# ライブラリインポート
from sklearn.model_selection import GridSearchCV

'''
reg_cv = GridSearchCV(xgb, {
    "eta": [0.01, 0.05, 0.1], 
    "gamma": [0.1,0.2,0.3,0.4,0.5],
    "n_estimators": [50, 100, 200], 
    "max_depth": [5, 7, 9,10,20,30],
    "subsample":[0.6,0.8,1],
    "colsample_bytree": [0.5,0.7,0.9],
}, verbose=1)
'''
reg_cv = GridSearchCV(CatBoostRegressor(), 
                      {'iterations': [50], 
                       'depth': [9], 
                       'learning_rate': [0.18831273426065617],               
                       'random_strength': [10], 
                       'bagging_temperature': [0.01], 
                       'od_type': ['IncToDec'], 
                       'od_wait': [10],
                       'random_seed':[0],
    }, verbose=1)


In [11]:
label = 'label_high_20'

In [12]:
data_X = os.path.join(train_X_path, f"train_X_{label}.pkl")
with open(data_X , "rb") as f:
    train_X = pickle.load(f)
data_y = os.path.join(train_y_path, f"train_y_{label}.pkl")
with open(data_y , "rb") as f:
    train_y = pickle.load(f)

feature_columns = get_feature_columns(dfs, train_X, column_group='fundamental+technical')
# 訓練実施
reg_cv.fit(train_X[feature_columns].values, train_y.values)

You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
0:	learn: 0.1595924	total: 105ms	remaining: 5.14s
1:	learn: 0.1585786	total: 138ms	remaining: 3.3s
2:	learn: 0.1574360	total: 169ms	remaining: 2.65s
3:	learn: 0.1565074	total: 196ms	remaining: 2.25s
4:	learn: 0.1558120	total: 221ms	remaining: 1.99s
5:	learn: 0.1549336	total: 244ms	remaining: 1.79s
6:	learn: 0.1537988	total: 268ms	remaining: 1.65s
7:	learn: 0.1533792	total: 290ms	remaining: 1.52s
8:	learn: 0.1524131	total: 315ms	remaining: 1.43s
9:	learn: 0.1519081	total: 340ms	remaining: 1.36s
10:	learn: 0.1515575	total: 364ms	remaining: 1.29s
11:	learn: 0.1510704	total: 386ms	remaining: 1.22s
12:	learn: 0.1504726	total: 408ms	remaining: 1.16s
13:	learn: 0.1502805	total: 430ms	remaining: 1.11s
14:	learn: 0.1495041	total: 452ms	remaining: 1.05s
15:	learn: 0.1488290	total: 475ms	remaining: 1.01s
16:	learn: 0.1485055	total: 497ms	remaining: 966ms
17:	learn: 0.1481925	total: 523ms	remaining: 929ms
18:	learn: 0.1478771	total: 548ms	

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.4s finished


0:	learn: 0.1793123	total: 37.7ms	remaining: 1.85s
1:	learn: 0.1778489	total: 70ms	remaining: 1.68s
2:	learn: 0.1766594	total: 95.5ms	remaining: 1.5s
3:	learn: 0.1753568	total: 122ms	remaining: 1.41s
4:	learn: 0.1744981	total: 149ms	remaining: 1.34s
5:	learn: 0.1735522	total: 174ms	remaining: 1.27s
6:	learn: 0.1722608	total: 201ms	remaining: 1.24s
7:	learn: 0.1717815	total: 231ms	remaining: 1.21s
8:	learn: 0.1710893	total: 264ms	remaining: 1.2s
9:	learn: 0.1705112	total: 294ms	remaining: 1.18s
10:	learn: 0.1698499	total: 329ms	remaining: 1.17s
11:	learn: 0.1694820	total: 363ms	remaining: 1.15s
12:	learn: 0.1691689	total: 387ms	remaining: 1.1s
13:	learn: 0.1689332	total: 410ms	remaining: 1.05s
14:	learn: 0.1681805	total: 436ms	remaining: 1.02s
15:	learn: 0.1678040	total: 461ms	remaining: 979ms
16:	learn: 0.1675349	total: 483ms	remaining: 938ms
17:	learn: 0.1672257	total: 507ms	remaining: 901ms
18:	learn: 0.1669469	total: 531ms	remaining: 866ms
19:	learn: 0.1668159	total: 554ms	remaining

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=<catboost.core.CatBoostRegressor object at 0x7f978a8ba668>,
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'iterations': [50], 'depth': [9], 'learning_rate': [0.18831273426065617], 'random_strength': [10], 'bagging_temperature': [0.01], 'od_type': ['IncToDec'], 'od_wait': [10], 'random_seed': [0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [13]:
# 結果を表示
print(reg_cv.best_params_)
print(reg_cv.best_score_)

{'bagging_temperature': 0.01, 'depth': 9, 'iterations': 50, 'learning_rate': 0.18831273426065617, 'od_type': 'IncToDec', 'od_wait': 10, 'random_seed': 0, 'random_strength': 10}
0.08507998294617632


In [14]:
best_model = reg_cv.best_estimator_

In [15]:
SELECT_FIN_DATA_COLUMNS = ['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales',
       'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome',
       'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets',
       'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities',
       'Result_FinancialStatement CashFlowsFromFinancingActivities',
       'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear',
       'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome',
       'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome',
       'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare',
       'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear',
       'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDividendPerShare',
       'IssuedShareEquityQuote IssuedShare','Section/Products', '33 Sector(Code)', '17 Sector(Code)']

In [16]:
# 学習用データセット定義
# ファンダメンタル情報
#fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
fundamental_cols = pd.Index(SELECT_FIN_DATA_COLUMNS)
fundamental_cols = fundamental_cols[fundamental_cols != "Result_Dividend DividendPayableDate"]
fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
# 価格変化率
returns_cols = [x for x in train_X.columns if "return" in x]
# テクニカル
technical_cols = [x for x in train_X.columns if (x not in fundamental_cols) and (x != "code")]

In [17]:
columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}

In [18]:
# 結果保存用
all_results = dict()
all_results['catboost'] = dict()

In [19]:
data_X = os.path.join(test_X_path, f"test_X_{label}.pkl")
with open(data_X , "rb") as f:
    test_X = pickle.load(f)
data_y = os.path.join(test_y_path, f"test_y_{label}.pkl")
with open(data_y , "rb") as f:
    test_y = pickle.load(f)
# データセット毎に処理
for col in columns.keys():
    result = dict()
    # 目的変数毎に処理
    for label in tqdm(labels):
        if len(test_X[columns[col]]) > 0:
            # モデル取得
            pred_model = best_model
            # 学習
            pred_model.fit(train_X[columns[col]].values, train_y)
            # 結果データ作成
            result[label] = test_X[["code"]].copy()
            result[label]["datetime"] = test_X[columns[col]].index
            # 予測
            result[label]["predict"] = pred_model.predict(test_X[columns[col]].values)
            result[label]["predict_dir"] = np.sign(result[label]["predict"])
            # 実際の結果
            result[label]["actual"] = test_y.values
            result[label]["actual_dir"] = np.sign(result[label]["actual"])
            result[label].dropna(inplace=True)

    all_results['catboost'][col] = result

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

0:	learn: 0.1797742	total: 27.6ms	remaining: 1.35s
1:	learn: 0.1787536	total: 48.4ms	remaining: 1.16s
2:	learn: 0.1781023	total: 70.1ms	remaining: 1.1s
3:	learn: 0.1775683	total: 91.7ms	remaining: 1.05s
4:	learn: 0.1769668	total: 107ms	remaining: 962ms
5:	learn: 0.1764093	total: 121ms	remaining: 885ms
6:	learn: 0.1759422	total: 136ms	remaining: 833ms
7:	learn: 0.1751852	total: 148ms	remaining: 775ms
8:	learn: 0.1749958	total: 159ms	remaining: 723ms
9:	learn: 0.1748891	total: 170ms	remaining: 680ms
10:	learn: 0.1745638	total: 181ms	remaining: 643ms
11:	learn: 0.1744248	total: 193ms	remaining: 611ms
12:	learn: 0.1741359	total: 204ms	remaining: 582ms
13:	learn: 0.1740691	total: 210ms	remaining: 540ms
14:	learn: 0.1738100	total: 224ms	remaining: 523ms
15:	learn: 0.1733493	total: 239ms	remaining: 508ms
16:	learn: 0.1732538	total: 251ms	remaining: 488ms
17:	learn: 0.1731559	total: 263ms	remaining: 467ms
18:	learn: 0.1730360	total: 274ms	remaining: 447ms
19:	learn: 0.1728974	total: 285ms	rema

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

0:	learn: 0.1804122	total: 8.75ms	remaining: 429ms
1:	learn: 0.1797402	total: 20ms	remaining: 480ms
2:	learn: 0.1790438	total: 29.6ms	remaining: 463ms
3:	learn: 0.1787244	total: 38ms	remaining: 437ms
4:	learn: 0.1782779	total: 46.6ms	remaining: 419ms
5:	learn: 0.1779408	total: 54.1ms	remaining: 397ms
6:	learn: 0.1777340	total: 61.8ms	remaining: 379ms
7:	learn: 0.1775904	total: 67.9ms	remaining: 356ms
8:	learn: 0.1773925	total: 74.3ms	remaining: 339ms
9:	learn: 0.1772442	total: 80.8ms	remaining: 323ms
10:	learn: 0.1770753	total: 86.5ms	remaining: 307ms
11:	learn: 0.1770022	total: 92.3ms	remaining: 292ms
12:	learn: 0.1769018	total: 97.6ms	remaining: 278ms
13:	learn: 0.1768293	total: 104ms	remaining: 267ms
14:	learn: 0.1767170	total: 110ms	remaining: 257ms
15:	learn: 0.1766182	total: 116ms	remaining: 246ms
16:	learn: 0.1765372	total: 122ms	remaining: 236ms
17:	learn: 0.1764895	total: 128ms	remaining: 228ms
18:	learn: 0.1763953	total: 134ms	remaining: 219ms
19:	learn: 0.1763394	total: 140m

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

0:	learn: 0.1791174	total: 20.4ms	remaining: 1000ms
1:	learn: 0.1777391	total: 42.2ms	remaining: 1.01s
2:	learn: 0.1766095	total: 62.5ms	remaining: 979ms
3:	learn: 0.1747344	total: 82.3ms	remaining: 946ms
4:	learn: 0.1732361	total: 103ms	remaining: 930ms
5:	learn: 0.1719805	total: 133ms	remaining: 976ms
6:	learn: 0.1713355	total: 156ms	remaining: 960ms
7:	learn: 0.1710202	total: 173ms	remaining: 908ms
8:	learn: 0.1702010	total: 190ms	remaining: 865ms
9:	learn: 0.1692684	total: 207ms	remaining: 828ms
10:	learn: 0.1690018	total: 226ms	remaining: 801ms
11:	learn: 0.1684063	total: 246ms	remaining: 780ms
12:	learn: 0.1674708	total: 265ms	remaining: 753ms
13:	learn: 0.1665129	total: 284ms	remaining: 729ms
14:	learn: 0.1660687	total: 301ms	remaining: 703ms
15:	learn: 0.1656388	total: 321ms	remaining: 683ms
16:	learn: 0.1653220	total: 343ms	remaining: 665ms
17:	learn: 0.1650735	total: 362ms	remaining: 644ms
18:	learn: 0.1649163	total: 383ms	remaining: 624ms
19:	learn: 0.1643217	total: 404ms	re

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

0:	learn: 0.1788496	total: 27.4ms	remaining: 1.34s
1:	learn: 0.1776900	total: 53.8ms	remaining: 1.29s
2:	learn: 0.1762702	total: 80.3ms	remaining: 1.26s
3:	learn: 0.1752692	total: 107ms	remaining: 1.23s
4:	learn: 0.1744710	total: 133ms	remaining: 1.2s
5:	learn: 0.1734452	total: 158ms	remaining: 1.16s
6:	learn: 0.1721373	total: 182ms	remaining: 1.12s
7:	learn: 0.1718186	total: 208ms	remaining: 1.09s
8:	learn: 0.1711996	total: 236ms	remaining: 1.07s
9:	learn: 0.1705718	total: 263ms	remaining: 1.05s
10:	learn: 0.1701135	total: 287ms	remaining: 1.01s
11:	learn: 0.1698086	total: 311ms	remaining: 985ms
12:	learn: 0.1696401	total: 334ms	remaining: 951ms
13:	learn: 0.1694437	total: 357ms	remaining: 918ms
14:	learn: 0.1684216	total: 380ms	remaining: 886ms
15:	learn: 0.1680899	total: 404ms	remaining: 858ms
16:	learn: 0.1675032	total: 428ms	remaining: 831ms
17:	learn: 0.1670833	total: 453ms	remaining: 805ms
18:	learn: 0.1668480	total: 481ms	remaining: 785ms
19:	learn: 0.1666977	total: 512ms	remai

In [20]:
results = []
for model in all_results.keys():
    for col in all_results[model]:
        tmp = pd.concat(all_results[model][col])
        tmp["model"] = model
        tmp["feature"] = col
        results.append(tmp)
results = pd.concat(results)
results["label"] = [x[0] for x in results.index]
results.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,code,datetime,predict,predict_dir,actual,actual_dir,model,feature,label
Unnamed: 0_level_1,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
label_high_20,2019-02-08,1301,2019-02-08,0.060852,1.0,0.07854,1.0,catboost,fundamental_only,label_high_20
label_high_20,2019-05-13,1301,2019-05-13,0.091461,1.0,0.04379,1.0,catboost,fundamental_only,label_high_20
label_high_20,2019-08-02,1301,2019-08-02,0.048003,1.0,0.00498,1.0,catboost,fundamental_only,label_high_20
label_high_20,2019-11-05,1301,2019-11-05,0.052313,1.0,0.00841,1.0,catboost,fundamental_only,label_high_20
label_high_20,2020-02-07,1301,2020-02-07,0.052894,1.0,0.01212,1.0,catboost,fundamental_only,label_high_20


In [21]:
# 結果保存用変数
all_metrics = []

# データセット毎に処理
for feature in columns:
    matrix = dict()
    # 処理対象データに絞り込み
    tmp_df = results[(results["model"] == "catboost") & (results["label"] == label) & (results["feature"] == feature)]
    # RMSE
    rmse = np.sqrt(mean_squared_error(tmp_df["predict"], tmp_df["actual"]))
    # 精度
    accuracy = accuracy_score(tmp_df["predict_dir"], tmp_df["actual_dir"])
    # 相関係数
    corr = np.corrcoef(tmp_df["actual"], tmp_df["predict"])[0, 1]
    # 順位相関
    spearman_corr = spearmanr(tmp_df["actual"], tmp_df["predict"])[0]
    # 結果を保存
    matrix[label] = [rmse, accuracy, spearman_corr,corr, corr**2, feature, model, tmp_df.shape[0]]
    res = pd.DataFrame.from_dict(matrix).T
    res.columns = ["RMSE","accuracy","spearman_corr","corr","R^2 score","feature", "model", "# of samples"]
    all_metrics.append(res)
all_metrics = pd.concat(all_metrics)
all_metrics.reset_index()

Unnamed: 0,index,RMSE,accuracy,spearman_corr,corr,R^2 score,feature,model,# of samples
0,label_low_20,0.164907,0.829648,0.104323,0.186488,0.0347776,fundamental_only,catboost,32515
1,label_low_20,0.163601,0.832293,0.222053,0.211547,0.044752,return_only,catboost,32515
2,label_low_20,0.160823,0.831801,0.254584,0.276099,0.0762304,technical_only,catboost,32515
3,label_low_20,0.160771,0.830878,0.246701,0.276321,0.0763536,fundamental+technical,catboost,32515


In [22]:
tmp = all_metrics.drop(columns=["# of samples"])

In [23]:
tmp

Unnamed: 0,RMSE,accuracy,spearman_corr,corr,R^2 score,feature,model
label_low_20,0.164907,0.829648,0.104323,0.186488,0.0347776,fundamental_only,catboost
label_low_20,0.163601,0.832293,0.222053,0.211547,0.044752,return_only,catboost
label_low_20,0.160823,0.831801,0.254584,0.276099,0.0762304,technical_only,catboost
label_low_20,0.160771,0.830878,0.246701,0.276321,0.0763536,fundamental+technical,catboost


In [24]:
label = 'label_low_20'

In [None]:
params = {
    'iterations', 50, 300),                         
    depth', 4, 10),                                       
    'learning_rate', 0.01, 0.3),               
    random_strength', 0, 100),                       
   'bagging_temperature', 0.01, 100.00), 
    ('od_type', ['IncToDec', 'Iter']),
    ('od_wait', 10, 50)
}


In [25]:

'''
reg_cv = GridSearchCV(xgb, {
    "eta": [0.01, 0.05, 0.1], 
    "gamma": [0.1,0.2,0.3,0.4,0.5],
    "n_estimators": [50, 100, 200], 
    "max_depth": [5, 7, 9,10,20,30],
    "subsample":[0.6,0.8,1],
    "colsample_bytree": [0.5,0.7,0.9],
}, verbose=1)
'''
reg_cv = GridSearchCV(CatBoostRegressor(), 
                      {'iterations': [100], 
                       'depth': [9], 
                       'learning_rate': [0.18831273426065617],               
                       'random_strength': [5], 
                       'bagging_temperature': [0.01], 
                       'od_type': ['IncToDec'], 
                       'od_wait': [10],
                       'random_seed':[0],
    }, verbose=1)


In [26]:
data_X = os.path.join(train_X_path, f"train_X_{label}.pkl")
with open(data_X , "rb") as f:
    train_X = pickle.load(f)
data_y = os.path.join(train_y_path, f"train_y_{label}.pkl")
with open(data_y , "rb") as f:
    train_y = pickle.load(f)

feature_columns = get_feature_columns(dfs, train_X, column_group='fundamental+technical')
# 訓練実施
reg_cv.fit(train_X[feature_columns].values, train_y.values)

You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits
0:	learn: 0.0706714	total: 34.8ms	remaining: 3.45s
1:	learn: 0.0692777	total: 64.3ms	remaining: 3.15s
2:	learn: 0.0680506	total: 91.8ms	remaining: 2.97s
3:	learn: 0.0670988	total: 119ms	remaining: 2.85s
4:	learn: 0.0662908	total: 145ms	remaining: 2.75s
5:	learn: 0.0656692	total: 171ms	remaining: 2.67s
6:	learn: 0.0650590	total: 199ms	remaining: 2.64s
7:	learn: 0.0647967	total: 224ms	remaining: 2.58s
8:	learn: 0.0645232	total: 253ms	remaining: 2.55s
9:	learn: 0.0641808	total: 279ms	remaining: 2.51s
10:	learn: 0.0638506	total: 304ms	remaining: 2.46s
11:	learn: 0.0635958	total: 326ms	remaining: 2.39s
12:	learn: 0.0634180	total: 348ms	remaining: 2.33s
13:	learn: 0.0632535	total: 373ms	remaining: 2.29s
14:	learn: 0.0630128	total: 394ms	remaining: 2.23s
15:	learn: 0.0627816	total: 419ms	remaining: 2.2s
16:	learn: 0.0626160	total: 446ms	remaining: 2.18s
17:	learn: 0.0625534	total: 469ms	remaining: 2.14s
18:	learn: 0.0623644	total: 492

68:	learn: 0.0570965	total: 1.66s	remaining: 747ms
69:	learn: 0.0570048	total: 1.69s	remaining: 723ms
70:	learn: 0.0568788	total: 1.71s	remaining: 698ms
71:	learn: 0.0566919	total: 1.73s	remaining: 673ms
72:	learn: 0.0564930	total: 1.75s	remaining: 649ms
73:	learn: 0.0562979	total: 1.78s	remaining: 624ms
74:	learn: 0.0561131	total: 1.8s	remaining: 600ms
75:	learn: 0.0559219	total: 1.82s	remaining: 575ms
76:	learn: 0.0558066	total: 1.84s	remaining: 551ms
77:	learn: 0.0556884	total: 1.87s	remaining: 528ms
78:	learn: 0.0555087	total: 1.89s	remaining: 503ms
79:	learn: 0.0553688	total: 1.91s	remaining: 479ms
80:	learn: 0.0552396	total: 1.94s	remaining: 455ms
81:	learn: 0.0551789	total: 1.96s	remaining: 430ms
82:	learn: 0.0550449	total: 1.98s	remaining: 406ms
83:	learn: 0.0549123	total: 2s	remaining: 381ms
84:	learn: 0.0548064	total: 2.02s	remaining: 357ms
85:	learn: 0.0546548	total: 2.05s	remaining: 333ms
86:	learn: 0.0544842	total: 2.07s	remaining: 309ms
87:	learn: 0.0542456	total: 2.09s	r

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   10.4s finished


0:	learn: 0.0742017	total: 33.6ms	remaining: 3.33s
1:	learn: 0.0727757	total: 66.6ms	remaining: 3.26s
2:	learn: 0.0714891	total: 92.2ms	remaining: 2.98s
3:	learn: 0.0705730	total: 119ms	remaining: 2.85s
4:	learn: 0.0698907	total: 148ms	remaining: 2.82s
5:	learn: 0.0693211	total: 174ms	remaining: 2.72s
6:	learn: 0.0688453	total: 200ms	remaining: 2.65s
7:	learn: 0.0686014	total: 226ms	remaining: 2.6s
8:	learn: 0.0681693	total: 257ms	remaining: 2.6s
9:	learn: 0.0678879	total: 284ms	remaining: 2.56s
10:	learn: 0.0676333	total: 311ms	remaining: 2.51s
11:	learn: 0.0674294	total: 336ms	remaining: 2.46s
12:	learn: 0.0672768	total: 361ms	remaining: 2.42s
13:	learn: 0.0671665	total: 387ms	remaining: 2.38s
14:	learn: 0.0669753	total: 410ms	remaining: 2.32s
15:	learn: 0.0667267	total: 440ms	remaining: 2.31s
16:	learn: 0.0666044	total: 464ms	remaining: 2.26s
17:	learn: 0.0665100	total: 486ms	remaining: 2.21s
18:	learn: 0.0663304	total: 511ms	remaining: 2.18s
19:	learn: 0.0662063	total: 534ms	remain

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=<catboost.core.CatBoostRegressor object at 0x7f978a721d68>,
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'iterations': [100], 'depth': [9], 'learning_rate': [0.18831273426065617], 'random_strength': [5], 'bagging_temperature': [0.01], 'od_type': ['IncToDec'], 'od_wait': [10], 'random_seed': [0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [27]:
# 結果を表示
print(reg_cv.best_params_)
print(reg_cv.best_score_)

{'bagging_temperature': 0.01, 'depth': 9, 'iterations': 100, 'learning_rate': 0.18831273426065617, 'od_type': 'IncToDec', 'od_wait': 10, 'random_seed': 0, 'random_strength': 5}
0.21894696562556767


In [28]:
best_model = reg_cv.best_estimator_

In [29]:
# 結果保存用
all_results = dict()
all_results['catboost'] = dict()

In [30]:
data_X = os.path.join(test_X_path, f"test_X_{label}.pkl")
with open(data_X , "rb") as f:
    test_X = pickle.load(f)
data_y = os.path.join(test_y_path, f"test_y_{label}.pkl")
with open(data_y , "rb") as f:
    test_y = pickle.load(f)
# データセット毎に処理
for col in columns.keys():
    result = dict()
    # 目的変数毎に処理
    for label in tqdm(labels):
        if len(test_X[columns[col]]) > 0:
            # モデル取得
            pred_model = best_model
            # 学習
            pred_model.fit(train_X[columns[col]].values, train_y)
            # 結果データ作成
            result[label] = test_X[["code"]].copy()
            result[label]["datetime"] = test_X[columns[col]].index
            # 予測
            result[label]["predict"] = pred_model.predict(test_X[columns[col]].values)
            result[label]["predict_dir"] = np.sign(result[label]["predict"])
            # 実際の結果
            result[label]["actual"] = test_y.values
            result[label]["actual_dir"] = np.sign(result[label]["actual"])
            result[label].dropna(inplace=True)

    all_results['catboost'][col] = result

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

0:	learn: 0.0752071	total: 8.11ms	remaining: 802ms
1:	learn: 0.0742069	total: 36.4ms	remaining: 1.78s
2:	learn: 0.0735541	total: 59.9ms	remaining: 1.94s
3:	learn: 0.0732139	total: 82.5ms	remaining: 1.98s
4:	learn: 0.0727903	total: 100ms	remaining: 1.91s
5:	learn: 0.0725205	total: 114ms	remaining: 1.78s
6:	learn: 0.0722796	total: 128ms	remaining: 1.7s
7:	learn: 0.0720692	total: 138ms	remaining: 1.59s
8:	learn: 0.0718802	total: 148ms	remaining: 1.49s
9:	learn: 0.0717793	total: 158ms	remaining: 1.42s
10:	learn: 0.0716031	total: 168ms	remaining: 1.36s
11:	learn: 0.0714437	total: 178ms	remaining: 1.3s
12:	learn: 0.0713506	total: 188ms	remaining: 1.26s
13:	learn: 0.0712865	total: 201ms	remaining: 1.23s
14:	learn: 0.0710881	total: 213ms	remaining: 1.21s
15:	learn: 0.0710098	total: 225ms	remaining: 1.18s
16:	learn: 0.0708968	total: 237ms	remaining: 1.16s
17:	learn: 0.0708515	total: 247ms	remaining: 1.12s
18:	learn: 0.0707723	total: 257ms	remaining: 1.09s
19:	learn: 0.0706892	total: 267ms	remai

63:	learn: 0.0678763	total: 920ms	remaining: 517ms
64:	learn: 0.0677655	total: 932ms	remaining: 502ms
65:	learn: 0.0676996	total: 945ms	remaining: 487ms
66:	learn: 0.0675939	total: 956ms	remaining: 471ms
67:	learn: 0.0675318	total: 968ms	remaining: 455ms
68:	learn: 0.0674643	total: 979ms	remaining: 440ms
69:	learn: 0.0673508	total: 991ms	remaining: 425ms
70:	learn: 0.0672498	total: 1s	remaining: 410ms
71:	learn: 0.0671999	total: 1.02s	remaining: 396ms
72:	learn: 0.0671526	total: 1.03s	remaining: 381ms
73:	learn: 0.0671168	total: 1.04s	remaining: 366ms
74:	learn: 0.0669902	total: 1.05s	remaining: 351ms
75:	learn: 0.0668576	total: 1.07s	remaining: 337ms
76:	learn: 0.0667478	total: 1.08s	remaining: 323ms
77:	learn: 0.0666923	total: 1.09s	remaining: 309ms
78:	learn: 0.0665906	total: 1.11s	remaining: 295ms
79:	learn: 0.0664866	total: 1.13s	remaining: 281ms
80:	learn: 0.0664203	total: 1.14s	remaining: 267ms
81:	learn: 0.0663261	total: 1.15s	remaining: 253ms
82:	learn: 0.0662842	total: 1.16s	

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

0:	learn: 0.0747108	total: 21.3ms	remaining: 2.11s
1:	learn: 0.0737721	total: 41ms	remaining: 2.01s
2:	learn: 0.0729751	total: 63.6ms	remaining: 2.05s
3:	learn: 0.0725129	total: 84.8ms	remaining: 2.04s
4:	learn: 0.0721090	total: 95.9ms	remaining: 1.82s
5:	learn: 0.0717857	total: 106ms	remaining: 1.66s
6:	learn: 0.0714823	total: 118ms	remaining: 1.57s
7:	learn: 0.0713662	total: 127ms	remaining: 1.46s
8:	learn: 0.0711733	total: 135ms	remaining: 1.36s
9:	learn: 0.0710167	total: 143ms	remaining: 1.29s
10:	learn: 0.0708553	total: 151ms	remaining: 1.22s
11:	learn: 0.0706293	total: 159ms	remaining: 1.17s
12:	learn: 0.0704844	total: 167ms	remaining: 1.11s
13:	learn: 0.0704501	total: 175ms	remaining: 1.07s
14:	learn: 0.0703712	total: 183ms	remaining: 1.04s
15:	learn: 0.0703015	total: 192ms	remaining: 1s
16:	learn: 0.0702513	total: 199ms	remaining: 973ms
17:	learn: 0.0702083	total: 213ms	remaining: 969ms
18:	learn: 0.0701510	total: 225ms	remaining: 960ms
19:	learn: 0.0700885	total: 234ms	remaini

77:	learn: 0.0686951	total: 571ms	remaining: 161ms
78:	learn: 0.0686600	total: 578ms	remaining: 154ms
79:	learn: 0.0686294	total: 585ms	remaining: 146ms
80:	learn: 0.0685949	total: 592ms	remaining: 139ms
81:	learn: 0.0685657	total: 599ms	remaining: 132ms
82:	learn: 0.0685405	total: 606ms	remaining: 124ms
83:	learn: 0.0685099	total: 613ms	remaining: 117ms
84:	learn: 0.0684846	total: 619ms	remaining: 109ms
85:	learn: 0.0684525	total: 625ms	remaining: 102ms
86:	learn: 0.0684292	total: 632ms	remaining: 94.4ms
87:	learn: 0.0683931	total: 638ms	remaining: 87ms
88:	learn: 0.0683632	total: 645ms	remaining: 79.7ms
89:	learn: 0.0683364	total: 651ms	remaining: 72.4ms
90:	learn: 0.0683134	total: 659ms	remaining: 65.1ms
91:	learn: 0.0682777	total: 666ms	remaining: 57.9ms
92:	learn: 0.0682459	total: 674ms	remaining: 50.7ms
93:	learn: 0.0682197	total: 681ms	remaining: 43.5ms
94:	learn: 0.0682025	total: 688ms	remaining: 36.2ms
95:	learn: 0.0681772	total: 694ms	remaining: 28.9ms
96:	learn: 0.0681466	to

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

0:	learn: 0.0742864	total: 34.7ms	remaining: 3.43s
1:	learn: 0.0728963	total: 61ms	remaining: 2.99s
2:	learn: 0.0718062	total: 88.4ms	remaining: 2.86s
3:	learn: 0.0710069	total: 110ms	remaining: 2.63s
4:	learn: 0.0702928	total: 129ms	remaining: 2.46s
5:	learn: 0.0696765	total: 149ms	remaining: 2.33s
6:	learn: 0.0692007	total: 174ms	remaining: 2.31s
7:	learn: 0.0689604	total: 192ms	remaining: 2.21s
8:	learn: 0.0685830	total: 211ms	remaining: 2.14s
9:	learn: 0.0682294	total: 234ms	remaining: 2.11s
10:	learn: 0.0680162	total: 254ms	remaining: 2.06s
11:	learn: 0.0677565	total: 273ms	remaining: 2s
12:	learn: 0.0675568	total: 294ms	remaining: 1.97s
13:	learn: 0.0672880	total: 313ms	remaining: 1.92s
14:	learn: 0.0671633	total: 330ms	remaining: 1.87s
15:	learn: 0.0670032	total: 349ms	remaining: 1.83s
16:	learn: 0.0668092	total: 369ms	remaining: 1.8s
17:	learn: 0.0666410	total: 388ms	remaining: 1.77s
18:	learn: 0.0664940	total: 407ms	remaining: 1.73s
19:	learn: 0.0663670	total: 429ms	remaining:

67:	learn: 0.0609450	total: 1.34s	remaining: 630ms
68:	learn: 0.0608350	total: 1.36s	remaining: 611ms
69:	learn: 0.0607556	total: 1.38s	remaining: 591ms
70:	learn: 0.0606090	total: 1.4s	remaining: 570ms
71:	learn: 0.0604989	total: 1.41s	remaining: 550ms
72:	learn: 0.0603165	total: 1.43s	remaining: 530ms
73:	learn: 0.0602134	total: 1.45s	remaining: 510ms
74:	learn: 0.0601114	total: 1.47s	remaining: 489ms
75:	learn: 0.0600360	total: 1.48s	remaining: 469ms
76:	learn: 0.0598978	total: 1.5s	remaining: 449ms
77:	learn: 0.0597918	total: 1.52s	remaining: 429ms
78:	learn: 0.0596644	total: 1.54s	remaining: 410ms
79:	learn: 0.0595586	total: 1.56s	remaining: 390ms
80:	learn: 0.0594430	total: 1.58s	remaining: 370ms
81:	learn: 0.0593389	total: 1.6s	remaining: 351ms
82:	learn: 0.0591901	total: 1.62s	remaining: 331ms
83:	learn: 0.0591406	total: 1.64s	remaining: 312ms
84:	learn: 0.0589724	total: 1.66s	remaining: 293ms
85:	learn: 0.0588628	total: 1.68s	remaining: 273ms
86:	learn: 0.0587649	total: 1.7s	r

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

0:	learn: 0.0743081	total: 26.2ms	remaining: 2.59s
1:	learn: 0.0728551	total: 52.5ms	remaining: 2.57s
2:	learn: 0.0718058	total: 78.8ms	remaining: 2.55s
3:	learn: 0.0709211	total: 104ms	remaining: 2.49s
4:	learn: 0.0702732	total: 129ms	remaining: 2.44s
5:	learn: 0.0695727	total: 154ms	remaining: 2.41s
6:	learn: 0.0690074	total: 180ms	remaining: 2.39s
7:	learn: 0.0686151	total: 203ms	remaining: 2.33s
8:	learn: 0.0682941	total: 229ms	remaining: 2.32s
9:	learn: 0.0680241	total: 252ms	remaining: 2.27s
10:	learn: 0.0677090	total: 275ms	remaining: 2.23s
11:	learn: 0.0675275	total: 298ms	remaining: 2.19s
12:	learn: 0.0673822	total: 321ms	remaining: 2.15s
13:	learn: 0.0672237	total: 344ms	remaining: 2.12s
14:	learn: 0.0669051	total: 368ms	remaining: 2.08s
15:	learn: 0.0667387	total: 391ms	remaining: 2.05s
16:	learn: 0.0666140	total: 418ms	remaining: 2.04s
17:	learn: 0.0663254	total: 444ms	remaining: 2.02s
18:	learn: 0.0661928	total: 468ms	remaining: 1.99s
19:	learn: 0.0660388	total: 491ms	rema

62:	learn: 0.0610453	total: 1.66s	remaining: 973ms
63:	learn: 0.0608974	total: 1.68s	remaining: 947ms
64:	learn: 0.0607040	total: 1.71s	remaining: 920ms
65:	learn: 0.0606187	total: 1.73s	remaining: 893ms
66:	learn: 0.0604855	total: 1.76s	remaining: 866ms
67:	learn: 0.0603448	total: 1.78s	remaining: 839ms
68:	learn: 0.0602601	total: 1.8s	remaining: 810ms
69:	learn: 0.0600837	total: 1.83s	remaining: 783ms
70:	learn: 0.0599220	total: 1.85s	remaining: 756ms
71:	learn: 0.0597639	total: 1.88s	remaining: 729ms
72:	learn: 0.0596284	total: 1.9s	remaining: 702ms
73:	learn: 0.0595228	total: 1.92s	remaining: 675ms
74:	learn: 0.0594848	total: 1.94s	remaining: 648ms
75:	learn: 0.0593472	total: 1.97s	remaining: 621ms
76:	learn: 0.0592716	total: 1.99s	remaining: 594ms
77:	learn: 0.0592351	total: 2.01s	remaining: 567ms
78:	learn: 0.0591711	total: 2.04s	remaining: 541ms
79:	learn: 0.0591099	total: 2.06s	remaining: 515ms
80:	learn: 0.0590214	total: 2.08s	remaining: 488ms
81:	learn: 0.0588412	total: 2.11s

In [31]:
results = []
for model in all_results.keys():
    for col in all_results[model]:
        tmp = pd.concat(all_results[model][col])
        tmp["model"] = model
        tmp["feature"] = col
        results.append(tmp)
results = pd.concat(results)
results["label"] = [x[0] for x in results.index]
results.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,code,datetime,predict,predict_dir,actual,actual_dir,model,feature,label
Unnamed: 0_level_1,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
label_high_20,2019-02-08,1301,2019-02-08,-0.045663,-1.0,0.00604,1.0,catboost,fundamental_only,label_high_20
label_high_20,2019-05-13,1301,2019-05-13,-0.043657,-1.0,-0.06317,-1.0,catboost,fundamental_only,label_high_20
label_high_20,2019-08-02,1301,2019-08-02,-0.072964,-1.0,-0.08723,-1.0,catboost,fundamental_only,label_high_20
label_high_20,2019-11-05,1301,2019-11-05,-0.0732,-1.0,-0.038,-1.0,catboost,fundamental_only,label_high_20
label_high_20,2020-02-07,1301,2020-02-07,-0.050807,-1.0,-0.17832,-1.0,catboost,fundamental_only,label_high_20


In [32]:
# 結果保存用変数
all_metrics = []

# データセット毎に処理
for feature in columns:
    matrix = dict()
    # 処理対象データに絞り込み
    tmp_df = results[(results["model"] == "catboost") & (results["label"] == label) & (results["feature"] == feature)]
    # RMSE
    rmse = np.sqrt(mean_squared_error(tmp_df["predict"], tmp_df["actual"]))
    # 精度
    accuracy = accuracy_score(tmp_df["predict_dir"], tmp_df["actual_dir"])
    # 相関係数
    corr = np.corrcoef(tmp_df["actual"], tmp_df["predict"])[0, 1]
    # 順位相関
    spearman_corr = spearmanr(tmp_df["actual"], tmp_df["predict"])[0]
    # 結果を保存
    matrix[label] = [rmse, accuracy, spearman_corr,corr, corr**2, feature, model, tmp_df.shape[0]]
    res = pd.DataFrame.from_dict(matrix).T
    res.columns = ["RMSE","accuracy","spearman_corr","corr","R^2 score","feature", "model", "# of samples"]
    all_metrics.append(res)
all_metrics = pd.concat(all_metrics)
all_metrics.reset_index()

Unnamed: 0,index,RMSE,accuracy,spearman_corr,corr,R^2 score,feature,model,# of samples
0,label_low_20,0.103721,0.867815,0.145102,0.152647,0.0233012,fundamental_only,catboost,32515
1,label_low_20,0.106608,0.867907,0.0867729,0.0619561,0.00383855,return_only,catboost,32515
2,label_low_20,0.104659,0.866216,0.164263,0.156261,0.0244174,technical_only,catboost,32515
3,label_low_20,0.104301,0.866523,0.155202,0.152032,0.0231136,fundamental+technical,catboost,32515


In [33]:
tmp = all_metrics.drop(columns=["# of samples"])

In [34]:
tmp

Unnamed: 0,RMSE,accuracy,spearman_corr,corr,R^2 score,feature,model
label_low_20,0.103721,0.867815,0.145102,0.152647,0.0233012,fundamental_only,catboost
label_low_20,0.106608,0.867907,0.0867729,0.0619561,0.00383855,return_only,catboost
label_low_20,0.104659,0.866216,0.164263,0.156261,0.0244174,technical_only,catboost
label_low_20,0.104301,0.866523,0.155202,0.152032,0.0231136,fundamental+technical,catboost
