# Imports & Load Data
作業に必要なライブラリをインポートして、 以下のデータを読み込みます。

* stock_price : 株価情報
* stock_list : 銘柄情報
* stock_fin : 財務諸表
* stock_labels : 目的変数

In [2]:
import os
import pickle
import sys
import warnings
from glob import glob

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import  xgboost as xgb
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.auto import tqdm


# 表示用の設定を変更します
%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

In [3]:
# python 3.7.3であることを確認します
import sys
print(sys.version)

3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]


In [4]:
# データセット保存先ディレクトリ（""の中身はご自身の環境に合わせて定義してください。）
dataset_dir="/path/to"

In [5]:
# 読み込むファイルを定義します。
inputs = {
    "stock_list": f"{dataset_dir}/stock_list.csv.gz",
    "stock_price": f"{dataset_dir}/stock_price.csv.gz",
    "stock_fin": f"{dataset_dir}/stock_fin.csv.gz",
    # 本チュートリアルでは使用しないため、コメントアウトしています。
    # "stock_fin_price": f"{dataset_dir}/stock_fin_price.csv.gz",
    "stock_labels": f"{dataset_dir}/stock_labels.csv.gz",
}

# ファイルを読み込みます
dfs = {}
for k, v in inputs.items():
    print(k)
    dfs[k] = pd.read_csv(v)

stock_list
stock_price
stock_fin
stock_labels


# 特徴量の生成

In [None]:
'''
FEATURES = ['MA_gap_2month',
            'MA_gap_3month',
            'volatility_2month',
            'volatility_3month',
            'Result_Dividend FiscalYear',
            'return_3month',
            'Forecast_Dividend FiscalYear',
            'volatility_1month',
            'Forecast_FinancialStatement FiscalYear',
            'MA_gap_1month',
            'pbr',
            'Result_FinancialStatement FiscalYear',
            'return_1month',
            'ema_12',
            'Result_FinancialStatement TotalAssets',
            'signal',
            'Previous_FinancialStatement NetIncome',
            'per',
            'Result_FinancialStatement CashFlowsFromOperatingActivities',
            'Result_FinancialStatement CashFlowsFromInvestingActivities',
            'ema_10',
            'Forecast_FinancialStatement NetIncome',
            'macd',
            'Previous_FinancialStatement CashFlowsFromFinancingActivities',
            'bps',
            'Result_Dividend AnnualDividendPerShare',
            'Forecast_FinancialStatement OperatingIncome',
            'Previous_FinancialStatement CashFlowsFromOperatingActivities',
            'Result_FinancialStatement CashFlowsFromFinancingActivities',
            'Forecast_FinancialStatement NetSales',
            'roe',
            'Forecast_FinancialStatement OrdinaryIncome',
            'Previous_FinancialStatement OperatingIncome',
            'Result_FinancialStatement NetAssets',
            'EWMA',
            'IssuedShareEquityQuote IssuedShare',
            'Previous_FinancialStatement NetSales',
            'Previous_FinancialStatement OrdinaryIncome',
            'Section/Products',
            'net_profit_margin',
            'total_assets_growth_rate', 'operating_profit_margin']
'''

In [43]:
FEATURES = ['MA_gap_2month',
            'MA_gap_3month',
            'volatility_2month',
            'volatility_3month',
            'Result_Dividend FiscalYear',
            'return_3month',
            'Forecast_Dividend FiscalYear',
            'volatility_1month',
            'Forecast_FinancialStatement FiscalYear',
            'MA_gap_1month',
            'pbr',
            'Result_FinancialStatement FiscalYear',
            'return_1month',
            'ema_12',
            'Result_FinancialStatement TotalAssets',
            'signal',
            'Previous_FinancialStatement NetIncome',
            'per',
            'Result_FinancialStatement CashFlowsFromOperatingActivities',
            'Result_FinancialStatement CashFlowsFromInvestingActivities',
            'ema_10']

In [44]:
def get_feature_columns(dfs, train_X, column_group="fundamental+technical"):
    # 特徴量グループを定義
    # ファンダメンタル
    fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
    fundamental_cols = fundamental_cols[
        fundamental_cols != "Result_Dividend DividendPayableDate"
        ]
    fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
    # 価格変化率
    returns_cols = [x for x in train_X.columns if "return" in x]
    # テクニカル
    technical_cols = [
        x for x in train_X.columns if
        (x not in fundamental_cols) and (x != "code")
    ]
    columns = {
        "fundamental_only": fundamental_cols,
        "return_only": returns_cols,
        "technical_only": technical_cols,
        "fundamental+technical": list(fundamental_cols) + list(
            technical_cols),
        "selected_columns":FEATURES,
    }
    return columns[column_group]

In [45]:
model_path = os.path.join(os.path.dirname("__file__"), "../model")
test_X_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/test_X")
test_y_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/test_y")
val_X_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/val_X")
val_y_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/val_y")
train_X_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/train_X")
train_y_path = os.path.join(os.path.dirname("__file__"), "../model/proceed_datas/train_y")

In [46]:
# 対象の目的変数を定義
labels = {
#    "label_high_5",
#    "label_high_10",
    "label_high_20",
#    "label_low_5",
#    "label_low_10",
    "label_low_20",
}

In [47]:
# 特徴量追加済みデータ
proceed_datas = {
    "train_X",
    "train_y",
    "val_X",
    "val_y",
    "test_X",
    "test_y"
}

In [48]:
# ライブラリインポート
from sklearn.model_selection import GridSearchCV

'''
reg_cv = GridSearchCV(xgb, {
    "eta": [0.01, 0.05, 0.1], 
    "gamma": [0.1,0.2,0.3,0.4,0.5],
    "n_estimators": [50, 100, 200], 
    "max_depth": [5, 7, 9,10,20,30],
    "subsample":[0.6,0.8,1],
    "colsample_bytree": [0.5,0.7,0.9],
}, verbose=1)
'''
reg_cv = GridSearchCV(xgb.XGBRegressor(), {
    "objective" : ['reg:pseudohubererror'],
    "max_depth": [5],
    "min_child_weight": [3],
    "gamma": [0],
    "subsample": [0.9],
    "colsample_bytree": [0.9],
    "alpha":[0],
    "learning_rate": [0.01],
    "n_estimators": [700],
    "random_state": [0],
}, verbose=1)


In [None]:
{'booster': 'dart', 'colsample_bytree': 0.5, 'eta': 0.11, 
 'gamma': 1.0, 'max_depth': 5, 'min_chile_wight': 0, '
 n_estimators': 50, 'random_seed': 0, 'subsample': 1.0}
 
 
 0.07587034814005862

In [49]:
for label in tqdm(labels):
    data_X = os.path.join(train_X_path, f"train_X_{label}.pkl")
    with open(data_X , "rb") as f:
        train_X = pickle.load(f)
    data_y = os.path.join(train_y_path, f"train_y_{label}.pkl")
    with open(data_y , "rb") as f:
        train_y = pickle.load(f)
        
    feature_columns = get_feature_columns(dfs, train_X, column_group='selected_columns')
    # 訓練実施
    reg_cv.fit(train_X[feature_columns].values, train_y.values)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   33.8s finished
You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   33.9s finished





In [50]:
# 結果を表示
print(reg_cv.best_params_)
print(reg_cv.best_score_)

{'alpha': 0, 'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 700, 'objective': 'reg:pseudohubererror', 'random_state': 0, 'subsample': 0.9}
0.22772703486591486


In [51]:
best_model = reg_cv.best_estimator_

In [52]:
SELECT_FIN_DATA_COLUMNS = ['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales',
       'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome',
       'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets',
       'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities',
       'Result_FinancialStatement CashFlowsFromFinancingActivities',
       'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear',
       'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome',
       'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome',
       'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare',
       'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear',
       'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDividendPerShare',
       'IssuedShareEquityQuote IssuedShare','Section/Products', '33 Sector(Code)', '17 Sector(Code)']

In [53]:
# 学習用データセット定義
# ファンダメンタル情報
#fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
fundamental_cols = pd.Index(SELECT_FIN_DATA_COLUMNS)
fundamental_cols = fundamental_cols[fundamental_cols != "Result_Dividend DividendPayableDate"]
fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
# 価格変化率
returns_cols = [x for x in train_X.columns if "return" in x]
# テクニカル
technical_cols = [x for x in train_X.columns if (x not in fundamental_cols) and (x != "code")]

In [54]:
columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
     "selected_columns":FEATURES,
}

In [55]:
# 結果保存用
all_results = dict()
all_results['XGB'] = dict()

In [56]:
for label in labels:
    data_X = os.path.join(test_X_path, f"test_X_{label}.pkl")
    with open(data_X , "rb") as f:
        test_X = pickle.load(f)
    data_y = os.path.join(test_y_path, f"test_y_{label}.pkl")
    with open(data_y , "rb") as f:
        test_y = pickle.load(f)
    # データセット毎に処理
    for col in columns.keys():
        result = dict()
        # 目的変数毎に処理
        for label in tqdm(labels):
            if len(test_X[columns[col]]) > 0:
                # モデル取得
                pred_model = best_model
                # 学習
                pred_model.fit(train_X[columns[col]].values, train_y)
                # 結果データ作成
                result[label] = test_X[["code"]].copy()
                result[label]["datetime"] = test_X[columns[col]].index
                # 予測
                result[label]["predict"] = pred_model.predict(test_X[columns[col]].values)
                result[label]["predict_dir"] = np.sign(result[label]["predict"])
                # 実際の結果
                result[label]["actual"] = test_y.values
                result[label]["actual_dir"] = np.sign(result[label]["actual"])
                result[label].dropna(inplace=True)

        all_results['XGB'][col] = result

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [57]:
results = []
for model in all_results.keys():
    for col in all_results[model]:
        tmp = pd.concat(all_results[model][col])
        tmp["model"] = model
        tmp["feature"] = col
        results.append(tmp)
results = pd.concat(results)
results["label"] = [x[0] for x in results.index]
results.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,code,datetime,predict,predict_dir,actual,actual_dir,model,feature,label
Unnamed: 0_level_1,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
label_high_20,2019-02-08,1301,2019-02-08,-0.04258,-1.0,0.00604,1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-05-13,1301,2019-05-13,-0.039166,-1.0,-0.06317,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-08-02,1301,2019-08-02,-0.050355,-1.0,-0.08723,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-11-05,1301,2019-11-05,-0.050498,-1.0,-0.038,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2020-02-07,1301,2020-02-07,-0.040737,-1.0,-0.17832,-1.0,XGB,fundamental_only,label_high_20


In [58]:
# 結果保存用変数
all_metrics = []

# データセット毎に処理
for feature in columns:
    matrix = dict()
    # 目的変数毎に処理
    for label in labels:
        # 処理対象データに絞り込み
        tmp_df = results[(results["model"] == "XGB") & (results["label"] == label) & (results["feature"] == feature)]
        # RMSE
        rmse = np.sqrt(mean_squared_error(tmp_df["predict"], tmp_df["actual"]))
        # 精度
        accuracy = accuracy_score(tmp_df["predict_dir"], tmp_df["actual_dir"])
        # 相関係数
        corr = np.corrcoef(tmp_df["actual"], tmp_df["predict"])[0, 1]
        # 順位相関
        spearman_corr = spearmanr(tmp_df["actual"], tmp_df["predict"])[0]
        # 結果を保存
        matrix[label] = [rmse, accuracy, spearman_corr,corr, corr**2, feature, model, tmp_df.shape[0]]
    res = pd.DataFrame.from_dict(matrix).T
    res.columns = ["RMSE","accuracy","spearman_corr","corr","R^2 score","feature", "model", "# of samples"]
    all_metrics.append(res)
all_metrics = pd.concat(all_metrics)
all_metrics.reset_index()

Unnamed: 0,index,RMSE,accuracy,spearman_corr,corr,R^2 score,feature,model,# of samples
0,label_high_20,0.103733,0.867876,0.154843,0.166222,0.0276296,fundamental_only,XGB,32515
1,label_low_20,0.103733,0.867876,0.154843,0.166222,0.0276296,fundamental_only,XGB,32515
2,label_high_20,0.105796,0.867938,0.0933089,0.0787919,0.00620816,return_only,XGB,32515
3,label_low_20,0.105796,0.867938,0.0933089,0.0787919,0.00620816,return_only,XGB,32515
4,label_high_20,0.10431,0.867907,0.170704,0.155467,0.02417,technical_only,XGB,32515
5,label_low_20,0.10431,0.867907,0.170704,0.155467,0.02417,technical_only,XGB,32515
6,label_high_20,0.103339,0.867938,0.170204,0.157641,0.0248508,fundamental+technical,XGB,32515
7,label_low_20,0.103339,0.867938,0.170204,0.157641,0.0248508,fundamental+technical,XGB,32515
8,label_high_20,0.103166,0.867938,0.160004,0.147382,0.0217214,selected_columns,XGB,32515
9,label_low_20,0.103166,0.867938,0.160004,0.147382,0.0217214,selected_columns,XGB,32515


In [59]:
tmp = all_metrics.drop(columns=["# of samples"])

In [60]:
tmp.to_csv('result_XGB.csv', index=None)

In [61]:
tmp = pd.read_csv('result_XGB.csv')

In [62]:
tmp.groupby(['feature', 'model']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE,accuracy,spearman_corr,corr,R^2 score
feature,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fundamental+technical,XGB,0.103339,0.867938,0.170204,0.157641,0.024851
fundamental_only,XGB,0.103733,0.867876,0.154843,0.166222,0.02763
return_only,XGB,0.105796,0.867938,0.093309,0.078792,0.006208
selected_columns,XGB,0.103166,0.867938,0.160004,0.147382,0.021721
technical_only,XGB,0.10431,0.867907,0.170704,0.155467,0.02417


In [63]:
results

Unnamed: 0_level_0,Unnamed: 1_level_0,code,datetime,predict,predict_dir,actual,actual_dir,model,feature,label
Unnamed: 0_level_1,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
label_high_20,2019-02-08,1301,2019-02-08,-0.042580,-1.0,0.00604,1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-05-13,1301,2019-05-13,-0.039166,-1.0,-0.06317,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-08-02,1301,2019-08-02,-0.050355,-1.0,-0.08723,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-11-05,1301,2019-11-05,-0.050498,-1.0,-0.03800,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2020-02-07,1301,2020-02-07,-0.040737,-1.0,-0.17832,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2020-05-12,1301,2020-05-12,-0.036375,-1.0,0.00574,1.0,XGB,fundamental_only,label_high_20
label_high_20,2020-08-07,1301,2020-08-07,-0.050213,-1.0,-0.02542,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2020-11-06,1301,2020-11-06,-0.045319,-1.0,-0.00899,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-02-05,1332,2019-02-05,-0.050055,-1.0,-0.02078,-1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-05-14,1332,2019-05-14,-0.045625,-1.0,-0.08223,-1.0,XGB,fundamental_only,label_high_20
