# Imports & Load Data
作業に必要なライブラリをインポートして、 以下のデータを読み込みます。

* stock_price : 株価情報
* stock_list : 銘柄情報
* stock_fin : 財務諸表
* stock_labels : 目的変数

In [1]:
!pip install lightgbm

Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/6e/1f/b2263713cd0a894ce5e856699bbc3e7e7efc97a300d51b7c787878cc8831/lightgbm-3.2.0-py3-none-manylinux1_x86_64.whl (2.0MB)
[K    100% |████████████████████████████████| 2.0MB 2.7MB/s ta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.0


In [2]:
import os
import pickle
import sys
import warnings
from glob import glob

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import  xgboost as xgb
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.auto import tqdm
#from catboost import CatBoostRegressor
import lightgbm as lgb

# 表示用の設定を変更します
%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.


In [3]:
# python 3.7.3であることを確認します
import sys
print(sys.version)

3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]


In [4]:
# データセット保存先ディレクトリ（""の中身はご自身の環境に合わせて定義してください。）
dataset_dir="/path/to"

In [5]:
# 読み込むファイルを定義します。
inputs = {
    "stock_list": f"{dataset_dir}/stock_list.csv.gz",
    "stock_price": f"{dataset_dir}/stock_price.csv.gz",
    "stock_fin": f"{dataset_dir}/stock_fin.csv.gz",
    # 本チュートリアルでは使用しないため、コメントアウトしています。
    # "stock_fin_price": f"{dataset_dir}/stock_fin_price.csv.gz",
    "stock_labels": f"{dataset_dir}/stock_labels.csv.gz",
}

# ファイルを読み込みます
dfs = {}
for k, v in inputs.items():
    print(k)
    dfs[k] = pd.read_csv(v)

stock_list
stock_price
stock_fin
stock_labels


# 特徴量の生成

In [8]:
FEATURES = ['MA_gap_2month',
            'MA_gap_3month',
            'volatility_2month',
            'volatility_3month',
            'Result_Dividend FiscalYear',
            'return_3month',
            'Forecast_Dividend FiscalYear',
            'volatility_1month',
            'Forecast_FinancialStatement FiscalYear',
            'MA_gap_1month',
            'pbr',
            'Result_FinancialStatement FiscalYear',
            'return_1month',
            'ema_12',
            'Result_FinancialStatement TotalAssets',
            'signal',
            'Previous_FinancialStatement NetIncome',
            'per',
            'Result_FinancialStatement CashFlowsFromOperatingActivities',
            'Result_FinancialStatement CashFlowsFromInvestingActivities',
            'ema_10']

FEATURES_HIGH = ['MA_gap_2month_high',
                 'MA_gap_3month_high',
                 'volatility_2month_high',
                 'volatility_3month_high',
                 'Result_Dividend FiscalYear',
                 'return_3month_high',
                 'Forecast_Dividend FiscalYear',
                 'volatility_1month_high',
                 'Forecast_FinancialStatement FiscalYear',
                 'MA_gap_1month_high',
                 'pbr',
                 'Result_FinancialStatement FiscalYear',
                 'return_1month_high',
                 'ema_12',
                 'Result_FinancialStatement TotalAssets',
                 'signal',
                 'Previous_FinancialStatement NetIncome',
                 'per',
                 'Result_FinancialStatement CashFlowsFromOperatingActivities',
                 'Result_FinancialStatement CashFlowsFromInvestingActivities',
                 'ema_10']

FEATURES_LOW = ['MA_gap_2month_low',
                'MA_gap_3month_low',
                'volatility_2month_low',
                'volatility_3month_low',
                'Result_Dividend FiscalYear',
                'return_3month_low',
                'Forecast_Dividend FiscalYear',
                'volatility_1month_low',
                'Forecast_FinancialStatement FiscalYear',
                'MA_gap_1month_low',
                'pbr',
                'Result_FinancialStatement FiscalYear',
                'return_1month_low',
                'ema_12',
                'Result_FinancialStatement TotalAssets',
                'signal',
                'Previous_FinancialStatement NetIncome',
                'per',
                'Result_FinancialStatement CashFlowsFromOperatingActivities',
                'Result_FinancialStatement CashFlowsFromInvestingActivities',
                'ema_10']


In [9]:
def get_feature_columns(dfs, train_X, column_group="fundamental+technical"):
    # 特徴量グループを定義
    # ファンダメンタル
    fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
    fundamental_cols = fundamental_cols[
        fundamental_cols != "Result_Dividend DividendPayableDate"
        ]
    fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
    # 価格変化率
    returns_cols = [x for x in train_X.columns if "return" in x]
    # テクニカル
    technical_cols = [
        x for x in train_X.columns if
        (x not in fundamental_cols) and (x != "code")
    ]
    columns = {
        "fundamental_only": fundamental_cols,
        "return_only": returns_cols,
        "technical_only": technical_cols,
        "fundamental+technical": list(fundamental_cols) + list(
            technical_cols),
        "selected_columns":FEATURES,
        "selected_high_columns":FEATURES_HIGH,
        "selected_low_columns":FEATURES_LOW,
    }
    return columns[column_group]

In [10]:
model_path = os.path.join(os.path.dirname("__file__"), "../../model")
test_X_path = os.path.join(os.path.dirname("__file__"), "../../../high_low_datas/test_X")
test_y_path = os.path.join(os.path.dirname("__file__"), "../../../high_low_datas/test_y")
val_X_path = os.path.join(os.path.dirname("__file__"), "../../../high_low_datas/val_X")
val_y_path = os.path.join(os.path.dirname("__file__"), "../../../high_low_datas/val_y")
train_X_path = os.path.join(os.path.dirname("__file__"), "../../../high_low_datas/train_X")
train_y_path = os.path.join(os.path.dirname("__file__"), "../../../high_low_datas/train_y")

In [11]:
# 対象の目的変数を定義
labels = {
#    "label_high_5",
#    "label_high_10",
    "label_high_20",
#    "label_low_5",
#    "label_low_10",
    "label_low_20",
}

In [12]:
# 特徴量追加済みデータ
proceed_datas = {
    "train_X",
    "train_y",
    "val_X",
    "val_y",
    "test_X",
    "test_y"
}

In [13]:
# ライブラリインポート
from sklearn.model_selection import GridSearchCV


reg_cv = GridSearchCV(lgb.LGBMRegressor(), {
    "learning_rate": [0.001, 0.01, 0.05, 0.1], 
    "n_estimators": [50, 100, 200], 
    "max_depth": [5, 7, 10],
    "num_leaves": [100],
}, verbose=1)




# reg_cv = GridSearchCV(ExtraTreesRegressor(), 
#                       {"n_estimators": [700],
#                        "max_depth": [5],
#                        "min_samples_split": [2],
#                        "min_samples_leaf": [1],
#                        "min_weight_fraction_leaf":[0.1],
#                        "random_state":[0],
#     }, verbose=1)

In [14]:
label = "label_high_20"

In [15]:
data_X = os.path.join(train_X_path, f"train_X_{label}.pkl")
with open(data_X , "rb") as f:
    train_X = pickle.load(f)
data_y = os.path.join(train_y_path, f"train_y_{label}.pkl")
with open(data_y , "rb") as f:
    train_y = pickle.load(f)

feature_columns = get_feature_columns(dfs, train_X, column_group='selected_high_columns')
# 訓練実施
reg_cv.fit(train_X[feature_columns].values, train_y.values)

You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   45.3s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.001, 0.01, 0.05, 0.1], 'n_estimators': [50, 100, 200], 'max_depth': [5, 7, 10], 'num_leaves': [100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [16]:
# 結果を表示
print(reg_cv.best_params_)
print(reg_cv.best_score_)

{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'num_leaves': 100}
0.0821379400155062


In [15]:
label = "label_low_20"

In [16]:
data_X = os.path.join(train_X_path, f"train_X_{label}.pkl")
with open(data_X , "rb") as f:
    train_X = pickle.load(f)
data_y = os.path.join(train_y_path, f"train_y_{label}.pkl")
with open(data_y , "rb") as f:
    train_y = pickle.load(f)

feature_columns = get_feature_columns(dfs, train_X, column_group='selected_columns')
# 訓練実施
reg_cv.fit(train_X[feature_columns].values, train_y.values)

You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  4.8min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.001, 0.01, 0.05, 0.1], 'n_estimators': [50, 100, 200], 'max_depth': [5, 7, 10], 'num_leaves': [100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [17]:
# 結果を表示
print(reg_cv.best_params_)
print(reg_cv.best_score_)

{'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 100, 'num_leaves': 100}
0.23304218600961987


In [53]:
for label in tqdm(labels):
    data_X = os.path.join(train_X_path, f"train_X_{label}.pkl")
    with open(data_X , "rb") as f:
        train_X = pickle.load(f)
    data_y = os.path.join(train_y_path, f"train_y_{label}.pkl")
    with open(data_y , "rb") as f:
        train_y = pickle.load(f)
        
    feature_columns = get_feature_columns(dfs, train_X, column_group='selected_columns')
    # 訓練実施
    reg_cv.fit(train_X[feature_columns].values, train_y.values)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    1.4s finished
You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 4 candidates, totalling 12 fits



[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    1.4s finished


In [54]:
# 結果を表示
print(reg_cv.best_params_)
print(reg_cv.best_score_)

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'num_leaves': 100}
0.07809252182735595


In [39]:
best_model = reg_cv.best_estimator_

In [40]:
SELECT_FIN_DATA_COLUMNS = ['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales',
       'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome',
       'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets',
       'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities',
       'Result_FinancialStatement CashFlowsFromFinancingActivities',
       'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear',
       'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome',
       'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome',
       'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare',
       'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear',
       'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDividendPerShare',
       'IssuedShareEquityQuote IssuedShare','Section/Products', '33 Sector(Code)', '17 Sector(Code)']

In [41]:
# 学習用データセット定義
# ファンダメンタル情報
#fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
fundamental_cols = pd.Index(SELECT_FIN_DATA_COLUMNS)
fundamental_cols = fundamental_cols[fundamental_cols != "Result_Dividend DividendPayableDate"]
fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
# 価格変化率
returns_cols = [x for x in train_X.columns if "return" in x]
# テクニカル
technical_cols = [x for x in train_X.columns if (x not in fundamental_cols) and (x != "code")]

In [42]:
columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
    "selected_columns":FEATURES,
}

In [44]:
# 結果保存用
all_results = dict()
all_results['XGB'] = dict()

In [45]:
for label in labels:
    data_X = os.path.join(test_X_path, f"test_X_{label}.pkl")
    with open(data_X , "rb") as f:
        test_X = pickle.load(f)
    data_y = os.path.join(test_y_path, f"test_y_{label}.pkl")
    with open(data_y , "rb") as f:
        test_y = pickle.load(f)
    # データセット毎に処理
    for col in columns.keys():
        result = dict()
        # 目的変数毎に処理
        for label in tqdm(labels):
            if len(test_X[columns[col]]) > 0:
                # モデル取得
                pred_model = best_model
                # 学習
                pred_model.fit(train_X[columns[col]].values, train_y)
                # 結果データ作成
                result[label] = test_X[["code"]].copy()
                result[label]["datetime"] = test_X[columns[col]].index
                # 予測
                result[label]["predict"] = pred_model.predict(test_X[columns[col]].values)
                result[label]["predict_dir"] = np.sign(result[label]["predict"])
                # 実際の結果
                result[label]["actual"] = test_y.values
                result[label]["actual_dir"] = np.sign(result[label]["actual"])
                result[label].dropna(inplace=True)

        all_results['XGB'][col] = result

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [47]:
results = []
for model in all_results.keys():
    for col in all_results[model]:
        tmp = pd.concat(all_results[model][col])
        tmp["model"] = model
        tmp["feature"] = col
        results.append(tmp)
results = pd.concat(results)
results["label"] = [x[0] for x in results.index]
results.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,code,datetime,predict,predict_dir,actual,actual_dir,model,feature,label
Unnamed: 0_level_1,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
label_high_20,2019-02-08,1301,2019-02-08,0.064676,1.0,0.07854,1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-05-13,1301,2019-05-13,0.060717,1.0,0.04379,1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-08-02,1301,2019-08-02,0.050649,1.0,0.00498,1.0,XGB,fundamental_only,label_high_20
label_high_20,2019-11-05,1301,2019-11-05,0.047678,1.0,0.00841,1.0,XGB,fundamental_only,label_high_20
label_high_20,2020-02-07,1301,2020-02-07,0.062961,1.0,0.01212,1.0,XGB,fundamental_only,label_high_20


In [48]:
# 結果保存用変数
all_metrics = []

# データセット毎に処理
for feature in columns:
    matrix = dict()
    # 目的変数毎に処理
    for label in labels:
        # 処理対象データに絞り込み
        tmp_df = results[(results["model"] == "XGB") & (results["label"] == label) & (results["feature"] == feature)]
        # RMSE
        rmse = np.sqrt(mean_squared_error(tmp_df["predict"], tmp_df["actual"]))
        # 精度
        accuracy = accuracy_score(tmp_df["predict_dir"], tmp_df["actual_dir"])
        # 相関係数
        corr = np.corrcoef(tmp_df["actual"], tmp_df["predict"])[0, 1]
        # 順位相関
        spearman_corr = spearmanr(tmp_df["actual"], tmp_df["predict"])[0]
        # 結果を保存
        matrix[label] = [rmse, accuracy, spearman_corr,corr, corr**2, feature, model, tmp_df.shape[0]]
    res = pd.DataFrame.from_dict(matrix).T
    res.columns = ["RMSE","accuracy","spearman_corr","corr","R^2 score","feature", "model", "# of samples"]
    all_metrics.append(res)
all_metrics = pd.concat(all_metrics)
all_metrics.reset_index()

Unnamed: 0,index,RMSE,accuracy,spearman_corr,corr,R^2 score,feature,model,# of samples
0,label_low_20,0.164028,0.832231,0.134492,0.203962,0.0416004,fundamental_only,XGB,32515
1,label_high_20,0.164028,0.832231,0.134492,0.203962,0.0416004,fundamental_only,XGB,32515
2,label_low_20,0.164327,0.832293,0.242639,0.211186,0.0445996,return_only,XGB,32515
3,label_high_20,0.164327,0.832293,0.242639,0.211186,0.0445996,return_only,XGB,32515
4,label_low_20,0.160718,0.832262,0.255127,0.280388,0.0786172,technical_only,XGB,32515
5,label_high_20,0.160718,0.832262,0.255127,0.280388,0.0786172,technical_only,XGB,32515
6,label_low_20,0.160898,0.832293,0.252184,0.278138,0.0773606,fundamental+technical,XGB,32515
7,label_high_20,0.160898,0.832293,0.252184,0.278138,0.0773606,fundamental+technical,XGB,32515
8,label_low_20,0.161093,0.832293,0.250238,0.275418,0.0758552,selected_columns,XGB,32515
9,label_high_20,0.161093,0.832293,0.250238,0.275418,0.0758552,selected_columns,XGB,32515


In [49]:
tmp = all_metrics.drop(columns=["# of samples"])

In [50]:
tmp.to_csv('result_XGB.csv', index=None)

In [51]:
tmp = pd.read_csv('result_XGB.csv')

In [25]:
tmp.groupby(['feature', 'model']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE,accuracy,spearman_corr,corr,R^2 score
feature,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fundamental+technical,XGB,0.104644,0.867938,0.145044,0.114148,0.01303
fundamental_only,XGB,0.105916,0.867938,0.098257,0.116753,0.013631
return_only,XGB,0.104488,0.867938,0.132414,0.10627,0.011293
selected_columns,XGB,0.104905,0.867938,0.130836,0.097465,0.009499
technical_only,XGB,0.104483,0.867938,0.139428,0.109766,0.012049
