# Imports & Load Data
作業に必要なライブラリをインポートして、 以下のデータを読み込みます。

* stock_price : 株価情報
* stock_list : 銘柄情報
* stock_fin : 財務諸表
* stock_labels : 目的変数

In [None]:
# shap用にg++とgccをインストールします
! apt-get update
! apt-get install -y --no-install-recommends g++ gcc

# 必要なライブラリをインストールします
! pip install shap==0.37.0 slicer==0.0.3 xgboost==1.3.0.post0

In [1]:
import os
import pickle
import sys
import warnings
from glob import glob

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from xgboost import XGBRegressor
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.auto import tqdm


# 表示用の設定を変更します
%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

In [2]:
# python 3.7.3であることを確認します
import sys
print(sys.version)

3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]


In [3]:
# データセット保存先ディレクトリ（""の中身はご自身の環境に合わせて定義してください。）
dataset_dir="/path/to"

In [4]:
# 読み込むファイルを定義します。
inputs = {
    "stock_list": f"{dataset_dir}/stock_list.csv.gz",
    "stock_price": f"{dataset_dir}/stock_price.csv.gz",
    "stock_fin": f"{dataset_dir}/stock_fin.csv.gz",
    # 本チュートリアルでは使用しないため、コメントアウトしています。
    # "stock_fin_price": f"{dataset_dir}/stock_fin_price.csv.gz",
    "stock_labels": f"{dataset_dir}/stock_labels.csv.gz",
}

# ファイルを読み込みます
dfs = {}
for k, v in inputs.items():
    print(k)
    dfs[k] = pd.read_csv(v)

stock_list
stock_price
stock_fin
stock_labels


# Stock List

In [5]:
model_path = os.path.join(os.path.dirname("__file__"), "../../get_data")

with open(os.path.join(model_path, "stock_list.pkl"), "rb") as f:
    new_stock_list = pickle.load(f)

In [7]:
type_lists = dfs['stock_list'].dtypes

In [8]:
new_stock_list = new_stock_list[new_stock_list['Effective Date'] == '20210129']

In [9]:
for index, item in type_lists.iteritems():
    if index == 'IssuedShareEquityQuote IssuedShare':
        new_stock_list[index] = pd.to_numeric(new_stock_list[index])
    else:
        new_stock_list[index] = new_stock_list[index].astype(item)

In [10]:
new_stock_list = new_stock_list.reset_index(drop=True)

In [11]:
new_stock_list = new_stock_list.drop(columns=['universe_comp2'])

In [12]:
new_stock_list.shape

(3707, 14)

In [13]:
new_stock_list[new_stock_list['Local Code'] == 9984]

Unnamed: 0,17 Sector(Code),17 Sector(name),33 Sector(Code),33 Sector(name),Effective Date,IssuedShareEquityQuote AccountingStandard,IssuedShareEquityQuote IssuedShare,IssuedShareEquityQuote ModifyDate,Local Code,Name (English),Section/Products,Size (New Index Series),Size Code (New Index Series),prediction_target
2704,10,"IT & SERVICES, OTHERS",5250,Information & Communication,20210129,ConsolidatedIFRS,2089814000.0,2020/11/09,9984,SoftBank Group Corp.,First Section (Domestic),TOPIX Core30,1,True


# Stock Price

In [14]:
model_path = os.path.join(os.path.dirname("__file__"), "../../get_data")

with open(os.path.join(model_path, "stock_price.pkl"), "rb") as f:
    tmp_stock_price = pickle.load(f)

In [15]:
type_lists = dfs['stock_price'].dtypes

In [16]:
for index, item in type_lists.iteritems():
    tmp_stock_price[index] = tmp_stock_price[index].astype(item)

In [17]:
stock_price_columns = dfs['stock_price'].columns

In [18]:
new_stock_price = dfs['stock_price']

In [19]:
new_stock_price = new_stock_price.append(tmp_stock_price[stock_price_columns])

In [20]:
new_stock_price = new_stock_price.reset_index(drop=True)

In [21]:
new_stock_price[new_stock_price['Local Code'] == 9984]

Unnamed: 0,Local Code,EndOfDayQuote Date,EndOfDayQuote Open,EndOfDayQuote High,EndOfDayQuote Low,EndOfDayQuote Close,EndOfDayQuote ExchangeOfficialClose,EndOfDayQuote Volume,EndOfDayQuote CumulativeAdjustmentFactor,EndOfDayQuote PreviousClose,EndOfDayQuote PreviousCloseDate,EndOfDayQuote PreviousExchangeOfficialClose,EndOfDayQuote PreviousExchangeOfficialCloseDate,EndOfDayQuote ChangeFromPreviousClose,EndOfDayQuote PercentChangeFromPreviousClose,EndOfDayQuote VWAP
4212010,9984,2016/01/04,3025.0,3072.5,2984.5,2996.5,2996.5,12751600.0,2.0,3069.5,2015/12/30,3069.5,2015/12/30,-73.0,-2.378,3011.845
4212011,9984,2016/01/05,2975.0,3023.0,2965.0,2984.5,2984.5,9739400.0,2.0,2996.5,2016/01/04,2996.5,2016/01/04,-12.0,-0.400,2990.394
4212012,9984,2016/01/06,2985.0,3000.0,2911.0,2944.5,2944.5,12891800.0,2.0,2984.5,2016/01/05,2984.5,2016/01/05,-40.0,-1.340,2950.130
4212013,9984,2016/01/07,2944.5,2954.5,2869.5,2875.5,2875.5,11946000.0,2.0,2944.5,2016/01/06,2944.5,2016/01/06,-69.0,-2.343,2902.300
4212014,9984,2016/01/08,2817.5,2921.5,2790.0,2847.5,2847.5,18098200.0,2.0,2875.5,2016/01/07,2875.5,2016/01/07,-28.0,-0.974,2850.308
4212015,9984,2016/01/12,2777.0,2831.0,2763.0,2764.0,2764.0,14905800.0,2.0,2847.5,2016/01/08,2847.5,2016/01/08,-83.5,-2.932,2787.057
4212016,9984,2016/01/13,2800.0,2862.5,2795.5,2846.0,2846.0,11749000.0,2.0,2764.0,2016/01/12,2764.0,2016/01/12,82.0,2.967,2841.036
4212017,9984,2016/01/14,2765.0,2772.5,2695.0,2760.0,2760.0,14822600.0,2.0,2846.0,2016/01/13,2846.0,2016/01/13,-86.0,-3.022,2737.202
4212018,9984,2016/01/15,2804.5,2854.5,2749.0,2774.0,2774.0,13434200.0,2.0,2760.0,2016/01/14,2760.0,2016/01/14,14.0,0.507,2796.880
4212019,9984,2016/01/18,2650.0,2675.0,2547.5,2555.5,2555.5,25528000.0,2.0,2774.0,2016/01/15,2774.0,2016/01/15,-218.5,-7.877,2590.897


# Stock Fin

In [22]:
model_path = os.path.join(os.path.dirname("__file__"), "../../get_data")

with open(os.path.join(model_path, "stock_fin.pkl"), "rb") as f:
    tmp_stock_fin = pickle.load(f)

In [23]:
type_lists = dfs['stock_fin'].dtypes

In [24]:
for index, item in type_lists.iteritems():
    if item == "float64":
        tmp_stock_fin[index] = pd.to_numeric(tmp_stock_fin[index])
    else:
        tmp_stock_fin[index] = tmp_stock_fin[index].astype(item)

In [25]:
stock_fin_columns = dfs['stock_fin'].columns

In [26]:
new_stock_fin = dfs['stock_fin']

In [27]:
new_stock_fin = new_stock_fin.append(tmp_stock_fin[stock_fin_columns])

In [28]:
new_stock_fin = new_stock_fin.reset_index(drop=True)

In [29]:
new_stock_fin[new_stock_fin['Local Code'] == 9984]

Unnamed: 0,base_date,Local Code,Result_FinancialStatement AccountingStandard,Result_FinancialStatement FiscalPeriodEnd,Result_FinancialStatement ReportType,Result_FinancialStatement FiscalYear,Result_FinancialStatement ModifyDate,Result_FinancialStatement CompanyType,Result_FinancialStatement ChangeOfFiscalYearEnd,Result_FinancialStatement NetSales,Result_FinancialStatement OperatingIncome,Result_FinancialStatement OrdinaryIncome,Result_FinancialStatement NetIncome,Result_FinancialStatement TotalAssets,Result_FinancialStatement NetAssets,Result_FinancialStatement CashFlowsFromOperatingActivities,Result_FinancialStatement CashFlowsFromFinancingActivities,Result_FinancialStatement CashFlowsFromInvestingActivities,Forecast_FinancialStatement AccountingStandard,Forecast_FinancialStatement FiscalPeriodEnd,Forecast_FinancialStatement ReportType,Forecast_FinancialStatement FiscalYear,Forecast_FinancialStatement ModifyDate,Forecast_FinancialStatement CompanyType,Forecast_FinancialStatement ChangeOfFiscalYearEnd,Forecast_FinancialStatement NetSales,Forecast_FinancialStatement OperatingIncome,Forecast_FinancialStatement OrdinaryIncome,Forecast_FinancialStatement NetIncome,Result_Dividend FiscalPeriodEnd,Result_Dividend ReportType,Result_Dividend FiscalYear,Result_Dividend ModifyDate,Result_Dividend RecordDate,Result_Dividend DividendPayableDate,Result_Dividend QuarterlyDividendPerShare,Result_Dividend AnnualDividendPerShare,Forecast_Dividend FiscalPeriodEnd,Forecast_Dividend ReportType,Forecast_Dividend FiscalYear,Forecast_Dividend ModifyDate,Forecast_Dividend RecordDate,Forecast_Dividend QuarterlyDividendPerShare,Forecast_Dividend AnnualDividendPerShare
2800,2016/02/10,9984,ConsolidatedIFRS,2015/12,Q3,2016.0,2016/02/10,GB,False,6810206.0,875322.0,918214.0,428972.0,21497648.0,3855507.0,,,,,,,,,,,,,,,,,,,,,,,2016/03,Annual,2016.0,2016/02/10,2016/03/31,20.0,40.0
4300,2016/04/21,9984,ConsolidatedIFRS,2015/12,Q3,2016.0,2016/02/10,GB,False,6810206.0,875322.0,918214.0,428972.0,21497648.0,3855507.0,,,,,,,,,,,,,,,,,,,,,,,2016/03,Annual,2016.0,2016/04/21,2016/03/31,21.0,41.0
5798,2016/05/10,9984,ConsolidatedIFRS,2016/03,Annual,2016.0,2016/05/10,GB,False,9153549.0,999488.0,1005764.0,474172.0,20707192.0,3505271.0,940186.0,43270.0,-1651682.0,,,,,,,,,,,,2016/03,Annual,2016.0,2016/05/10,2016/03/31,2016/06/23,21.0,41.0,2016/09,Q2,2017.0,2016/05/10,,22.0,
8735,2016/07/28,9984,ConsolidatedIFRS,2016/06,Q1,2017.0,2016/07/28,GB,False,2126521.0,319236.0,356361.0,254157.0,20622229.0,3271996.0,,,,,,,,,,,,,,,2016/03,Annual,2016.0,2016/07/28,2016/03/31,2016/06/23,21.0,41.0,2016/09,Q2,2017.0,2016/05/10,,22.0,
13977,2016/11/07,9984,ConsolidatedIFRS,2016/09,Q2,2017.0,2016/11/07,GB,False,4271834.0,653944.0,665625.0,766249.0,22896077.0,3126322.0,,,,,,,,,,,,,,,2016/09,Q2,2017.0,2016/11/07,2016/09/30,2016/12/12,22.0,,2017/03,Annual,2017.0,2016/11/07,2017/03/31,22.0,44.0
17920,2017/02/08,9984,ConsolidatedIFRS,2016/12,Q3,2017.0,2017/02/08,GB,False,6581466.0,949660.0,975257.0,857431.0,24867038.0,3981711.0,,,,,,,,,,,,,,,2016/09,Q2,2017.0,2017/02/08,2016/09/30,2016/12/12,22.0,,2017/03,Annual,2017.0,2017/02/08,2017/03/31,22.0,44.0
21731,2017/05/10,9984,ConsolidatedIFRS,2017/03,Annual,2017.0,2017/05/10,GB,False,8901004.0,1025999.0,712526.0,1426308.0,24634212.0,4469730.0,1500728.0,2380746.0,-4213597.0,,,,,,,,,,,,2017/03,Annual,2017.0,2017/05/10,2017/03/31,2017/06/22,22.0,44.0,2017/09,Q2,2018.0,2017/05/10,,22.0,
26044,2017/08/07,9984,ConsolidatedIFRS,2017/06,Q1,2018.0,2017/08/07,GB,False,2186059.0,479273.0,77568.0,5521.0,24883131.0,4654749.0,,,,,,,,,,,,,,,2017/03,Annual,2017.0,2017/08/07,2017/03/31,2017/06/22,22.0,44.0,2017/09,Q2,2018.0,2017/05/10,,22.0,
29785,2017/11/06,9984,ConsolidatedIFRS,2017/09,Q2,2018.0,2017/11/06,GB,False,4411135.0,874839.0,219021.0,102622.0,27973483.0,5468373.0,,,,,,,,,,,,,,,2017/09,Q2,2018.0,2017/11/06,2017/09/30,2017/12/11,22.0,,2018/03,Annual,2018.0,2017/05/10,2018/03/31,22.0,44.0
33852,2018/02/07,9984,ConsolidatedIFRS,2017/12,Q3,2018.0,2018/02/07,GB,False,6811274.0,1148829.0,563804.0,1014944.0,29412718.0,6568877.0,,,,,,,,,,,,,,,2017/09,Q2,2018.0,2017/11/06,2017/09/30,2017/12/11,22.0,,2018/03,Annual,2018.0,2017/05/10,2018/03/31,22.0,44.0


# Stock Labals

In [30]:
model_path = os.path.join(os.path.dirname("__file__"), "../../get_data")

with open(os.path.join(model_path, "stock_labels.pkl"), "rb") as f:
    tmp_stock_labels = pickle.load(f)

In [31]:
tmp_stock_labels[tmp_stock_labels['Local Code'] == '9984']

Unnamed: 0,Local Code,base_date,label_date_10,label_date_20,label_date_5,label_high_10,label_high_20,label_high_5,label_low_10,label_low_20,label_low_5
3700,9984,2021-01-04,2021-01-19,2021-02-02,2021-01-12,0.09528,0.12875,0.02585,-0.02073,-0.02073,-0.02073
3700,9984,2021-01-05,2021-01-20,2021-02-03,2021-01-13,0.1081,0.12706,0.05549,-0.01646,-0.01646,-0.01646
3699,9984,2021-01-06,2021-01-21,2021-02-04,2021-01-14,0.11318,0.11318,0.07192,-0.02857,-0.02857,-0.02857
3699,9984,2021-01-07,2021-01-22,2021-02-05,2021-01-15,0.13086,0.13612,0.09734,-0.01314,-0.01314,-0.01314
3699,9984,2021-01-08,2021-01-25,2021-02-08,2021-01-18,0.13157,0.20669,0.09802,-0.01252,-0.01252,-0.01252
3699,9984,2021-01-12,2021-01-26,2021-02-09,2021-01-19,0.11634,0.25232,0.08324,0.00235,0.00111,0.00235
3698,9984,2021-01-13,2021-01-27,2021-02-10,2021-01-20,0.08577,0.21802,0.06751,0.003,-0.02631,0.003
3698,9984,2021-01-14,2021-01-28,2021-02-12,2021-01-21,0.05485,0.18334,0.05485,-0.0454,-0.05403,-0.02556
3698,9984,2021-01-15,2021-01-29,2021-02-15,2021-01-22,0.06116,0.19042,0.06116,-0.04837,-0.04837,-0.01972
3698,9984,2021-01-18,2021-02-01,2021-02-16,2021-01-25,0.06304,0.23486,0.06304,-0.04669,-0.04669,0.008


In [32]:
columns = dfs['stock_labels'].columns

In [33]:
tmp_stock_labels = tmp_stock_labels[columns]

In [34]:
type_lists = dfs['stock_labels'].dtypes

In [35]:
for index, item in type_lists.iteritems():
    if item == "float64":
        tmp_stock_labels[index] = pd.to_numeric(tmp_stock_labels[index])
    else:
        tmp_stock_labels[index] = tmp_stock_labels[index].astype(item)

In [36]:
stock_labels_columns = dfs['stock_labels'].columns

In [37]:
new_stock_labels = dfs['stock_labels']

In [38]:
new_stock_labels = new_stock_labels.append(tmp_stock_labels[stock_labels_columns])

In [39]:
new_stock_labels = new_stock_labels.reset_index(drop=True)

In [40]:
new_stock_labels[new_stock_labels['Local Code'] == 9984]

Unnamed: 0,base_date,Local Code,label_date_5,label_high_5,label_low_5,label_date_10,label_high_10,label_low_10,label_date_20,label_high_20,label_low_20
4212010,2016-01-04,9984,2016-01-12,0.00884,-0.07792,2016-01-19,0.00884,-0.17170,2016-02-02,0.00884,-0.24245
4212011,2016-01-05,9984,2016-01-13,0.00519,-0.07422,2016-01-20,0.00519,-0.21595,2016-02-03,0.00519,-0.23940
4212012,2016-01-06,9984,2016-01-14,0.00340,-0.08473,2016-01-21,0.00340,-0.22907,2016-02-04,0.00340,-0.22907
4212013,2016-01-07,9984,2016-01-15,0.01600,-0.06277,2016-01-22,0.01600,-0.21057,2016-02-05,0.01600,-0.21057
4212014,2016-01-08,9984,2016-01-18,0.00527,-0.10536,2016-01-25,0.00527,-0.20281,2016-02-08,0.00527,-0.20281
4212015,2016-01-12,9984,2016-01-19,0.03564,-0.10203,2016-01-26,0.03564,-0.17873,2016-02-09,0.03564,-0.17873
4212016,2016-01-13,9984,2016-01-20,0.00299,-0.17779,2016-01-27,0.00299,-0.20239,2016-02-10,0.00299,-0.21592
4212017,2016-01-14,9984,2016-01-21,0.03424,-0.17754,2016-01-28,0.03424,-0.17754,2016-02-12,0.03424,-0.25127
4212018,2016-01-15,9984,2016-01-22,-0.03569,-0.18169,2016-01-29,-0.03569,-0.18169,2016-02-15,-0.00955,-0.25505
4212019,2016-01-18,9984,2016-01-25,0.00313,-0.11172,2016-02-01,0.07513,-0.11172,2016-02-16,0.07513,-0.19135


# データの整形

In [41]:
SELECT_FIN_DATA_COLUMNS = ['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales',
       'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome',
       'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets',
       'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities',
       'Result_FinancialStatement CashFlowsFromFinancingActivities',
       'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear',
       'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome',
       'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome',
       'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare',
       'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear',
       'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDividendPerShare',
       'IssuedShareEquityQuote IssuedShare','Section/Products', '33 Sector(Code)', '17 Sector(Code)']

In [42]:
section_products = {
    "First Section (Domestic)" : 1,
    "JASDAQ(Standard / Domestic)" :2,
    "Second Section(Domestic)" :3,
    "Mothers (Domestic)" : 4,
    "JASDAQ(Growth/Domestic)" :5
}

In [43]:
def calculate_glossary_of_financial_analysis(row):
    operating_profit_margin = 0
    ordinary_profit_margin = 0
    net_profit_margin = 0
    total_asset_turnover = 0
    net_sales_growth_rate = 0
    ordinary_income_growth_rate = 0
    operationg_income_growth_rate = 0
    total_assets_growth_rate = 0
    net_assets_growth_rate = 0
    eps = 0
    bps = 0
    roe = 0

    # 売上高営業利益率 売上高営業利益率（％）＝営業利益÷売上高×100
    if row['Result_FinancialStatement NetSales'] != 0:
        operating_profit_margin = \
            row['Result_FinancialStatement OperatingIncome'] / \
            row['Result_FinancialStatement NetSales'] * 100
    # 売上高経常利益率　売上高経常利益率（％）＝経常利益÷売上高×100
    if row['Result_FinancialStatement NetSales'] != 0:
        ordinary_profit_margin = \
            row['Result_FinancialStatement OrdinaryIncome'] / \
            row['Result_FinancialStatement NetSales'] * 100
    # 売上高純履歴率　売上高純利益率（％）＝当期純利益÷売上高×100
    if row['Result_FinancialStatement NetSales'] != 0:
        net_profit_margin = row['Result_FinancialStatement NetIncome'] / \
                            row['Result_FinancialStatement NetSales'] * 100
    # 総資本回転率 総資本回転率（％）＝売上高÷総資本（自己資本＋他人資本）×100
    if row['Result_FinancialStatement NetAssets'] != 0:
        total_asset_turnover = row['Result_FinancialStatement NetSales'] / \
                            row['Result_FinancialStatement NetAssets'] * 100
    # 売上高増加率
    if row['Previous_FinancialStatement NetSales'] != 0:
        net_sales_growth_rate = \
            (row['Result_FinancialStatement NetSales'] -
            row['Previous_FinancialStatement NetSales']) / \
            row['Previous_FinancialStatement NetSales'] * 100
    # 経常利益増加率
    if row['Previous_FinancialStatement OrdinaryIncome'] != 0:
        ordinary_income_growth_rate = \
            (row['Result_FinancialStatement OrdinaryIncome'] -
            row['Previous_FinancialStatement OrdinaryIncome']) / \
            row['Previous_FinancialStatement OrdinaryIncome'] * 100

    # 営業利益増加率
    if row['Previous_FinancialStatement OperatingIncome'] != 0:
        operationg_income_growth_rate = \
            (row['Result_FinancialStatement OperatingIncome'] -
            row['Previous_FinancialStatement OperatingIncome']) / \
            row['Previous_FinancialStatement OperatingIncome'] * 100
    # 総資本増加率
    if row['Previous_FinancialStatement TotalAssets'] != 0:
        total_assets_growth_rate = \
            (row['Result_FinancialStatement TotalAssets'] -
            row['Previous_FinancialStatement TotalAssets']) / \
            row['Previous_FinancialStatement TotalAssets'] * 100
    # 純資本増加率
    if row['Previous_FinancialStatement NetAssets'] != 0:
        net_assets_growth_rate = \
            (row['Result_FinancialStatement NetAssets'] -
            row['Previous_FinancialStatement NetAssets']) / \
            row['Previous_FinancialStatement NetAssets'] * 100
    # 一株当たり当期純利益（EPS）
    if row['IssuedShareEquityQuote IssuedShare'] != 0:
        eps = row['Result_FinancialStatement NetIncome'] / \
            row['IssuedShareEquityQuote IssuedShare']
        # BPS 一株当たり純資産（円） ＝ 純資産 ÷ 発行済株式総数
        bps = row['Result_FinancialStatement NetAssets'] / \
            row['IssuedShareEquityQuote IssuedShare']
        # ROE EPS（一株当たり利益）÷ BPS（一株当たり純資産）× 100
        if bps > 0:
            roe = eps / bps * 100
    return pd.Series(
        [operating_profit_margin, ordinary_profit_margin,
            net_profit_margin, total_asset_turnover,
            net_sales_growth_rate, ordinary_income_growth_rate,
            operationg_income_growth_rate, total_assets_growth_rate,
            net_assets_growth_rate, eps, bps, roe])

# 特徴量の生成

In [44]:
#TRAIN_END = "2017-12-31"
#VAL_START = "2018-02-01"
#VAL_END = "2018-12-01"
#TEST_START = "2019-01-01"

In [45]:
TRAIN_END = "2018-12-31"
VAL_START = "2019-02-01"
VAL_END = "2019-12-01"
TEST_START = "2021-01-29"

In [46]:
def get_features_for_predict(dfs,code):
    """
    Args:
        dfs (dict)  : dict of pd.DataFrame include stock_fin, stock_price
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame)
    """
    # おおまかな手順の1つ目
    # stock_finデータを読み込み
    stock_fin = dfs["stock_fin"].copy()
    
    stock_list = dfs["stock_list"].copy()
    stock_fin = pd.merge(stock_fin, stock_list, on=["Local Code"] )

    # 特定の銘柄コードのデータに絞る
    fin_data = stock_fin[stock_fin["Local Code"] == code].copy()
    # 日付列をpd.Timestamp型に変換してindexに設定
    fin_data["datetime"] = pd.to_datetime(fin_data["base_date"])
    fin_data.set_index("datetime", inplace=True)
    # fin_dataを選択
    fin_data = fin_data[SELECT_FIN_DATA_COLUMNS]
    fin_data = fin_data.join(fin_data[['Result_FinancialStatement NetSales', 'Result_FinancialStatement OperatingIncome', 
                                   'Result_FinancialStatement OrdinaryIncome', 'Result_FinancialStatement NetIncome', 
                                   'Result_FinancialStatement TotalAssets', 'Result_FinancialStatement NetAssets',
                                   'Result_FinancialStatement CashFlowsFromOperatingActivities', 
                                   'Result_FinancialStatement CashFlowsFromFinancingActivities',
                                   'Result_FinancialStatement CashFlowsFromInvestingActivities']].rename(columns =
                                                                                                         {'Result_FinancialStatement NetSales': 'Previous_FinancialStatement NetSales',
                                                                                                          'Result_FinancialStatement OperatingIncome': 'Previous_FinancialStatement OperatingIncome', 
                                                                                                          'Result_FinancialStatement OrdinaryIncome': 'Previous_FinancialStatement OrdinaryIncome', 
                                                                                                          'Result_FinancialStatement NetIncome':'Previous_FinancialStatement NetIncome', 
                                                                                                          'Result_FinancialStatement TotalAssets': 'Previous_FinancialStatement TotalAssets', 
                                                                                                          'Result_FinancialStatement NetAssets':'Previous_FinancialStatement NetAssets',
                                                                                                          'Result_FinancialStatement CashFlowsFromOperatingActivities': 'Previous_FinancialStatement CashFlowsFromOperatingActivities', 
                                                                                                          'Result_FinancialStatement CashFlowsFromFinancingActivities':'Previous_FinancialStatement CashFlowsFromFinancingActivities',
                                                                                                          'Result_FinancialStatement CashFlowsFromInvestingActivities':'Previous_FinancialStatement CashFlowsFromInvestingActivities'}).shift(-1))
    fin_data[['operating_profit_margin', 'ordinary_profit_margin', 'net_profit_margin', 'total_asset_turnover',
         'net_sales_growth_rate', 'ordinary_income_growth_rate', 'operationg_income_growth_rate',
          'total_assets_growth_rate', 'net_assets_growth_rate', 'eps', 'bps', 'roe']] = fin_data.apply(calculate_glossary_of_financial_analysis, axis=1)

    
    # 欠損値処理
    fin_feats = fin_data.fillna(0)

    # おおまかな手順の2つ目
    # stock_priceデータを読み込む
    price = dfs["stock_price"].copy()
    # 特定の銘柄コードのデータに絞る
    price_data = price[price["Local Code"] == code].copy()
    # 日付列をpd.Timestamp型に変換してindexに設定
    price_data["datetime"] = pd.to_datetime(price_data["EndOfDayQuote Date"])
    price_data.set_index("datetime", inplace=True)
    # 終値のみに絞る
    feats = price_data[["EndOfDayQuote ExchangeOfficialClose"]].copy()
    # 終値の20営業日リターン
    feats["return_1month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(20)
    # 終値の40営業日リターン
    feats["return_2month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(40)
    # 終値の60営業日リターン
    feats["return_3month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(60)
    # 終値の20営業日ボラティリティ
    feats["volatility_1month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(20).std()
    )
    # 終値の40営業日ボラティリティ
    feats["volatility_2month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(40).std()
    )
    # 終値の60営業日ボラティリティ
    feats["volatility_3month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(60).std()
    )
    # 終値と20営業日の単純移動平均線の乖離
    feats["MA_gap_1month"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote ExchangeOfficialClose"].rolling(20).mean()
    )
    # 終値と40営業日の単純移動平均線の乖離
    feats["MA_gap_2month"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote ExchangeOfficialClose"].rolling(40).mean()
    )
    # 終値と60営業日の単純移動平均線の乖離
    feats["MA_gap_3month"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote ExchangeOfficialClose"].rolling(60).mean()
    )
    
    # EWMA
    ALPHA = 0.25
    feats["EWMA"] = feats["EndOfDayQuote ExchangeOfficialClose"]

    for t in zip(feats.index, feats.index[1:]):
        feats.loc[t[1], "EWMA"] = ALPHA * feats.loc[t[1], "EndOfDayQuote ExchangeOfficialClose"] + (1 - ALPHA) * feats.loc[t[0], "EWMA"]
    
    # EMA 10日
    feats["ema_10"] = feats["EndOfDayQuote ExchangeOfficialClose"].ewm(span=10).mean()
    
    # MACD 
    # EMA12
    feats["ema_12"] = feats["EndOfDayQuote ExchangeOfficialClose"].ewm(span=12).mean()
    # EMA 26
    feats["ema_26"] = feats["EndOfDayQuote ExchangeOfficialClose"].ewm(span=26).mean()
    feats["macd"] = feats["ema_12"] - feats["ema_26"]
    feats["signal"] = feats["macd"].ewm(span=9).mean()
    
    # PBR 株価 ÷ BPS（1株あたり純資産）
    feats["pbr"] = feats["EndOfDayQuote ExchangeOfficialClose"] / fin_data["bps"]
    # PER 株価 ÷ 1株当たり利益（EPS）
    feats["per"] = feats["EndOfDayQuote ExchangeOfficialClose"] / fin_data["eps"]

    # おおまかな手順の3つ目
    # 欠損値処理
    feats = feats.fillna(0)
    # 元データのカラムを削除
    feats = feats.drop(["EndOfDayQuote ExchangeOfficialClose"], axis=1)

    # 財務データの特徴量とマーケットデータの特徴量のインデックスを合わせる
    feats = feats.loc[feats.index.isin(fin_feats.index)]
    fin_feats = fin_feats.loc[fin_feats.index.isin(feats.index)]

    # データを結合
    feats = pd.concat([feats, fin_feats], axis=1).dropna()

    # 欠損値処理を行います。
    feats = feats.replace([np.inf, -np.inf], 0)
    
    # 市場・商品区分を数値に変換
    feats["Section/Products"] = section_products[feats["Section/Products"][0]]
    # 銘柄コードを設定
    feats["code"] = code

    return feats

In [47]:
def get_features_and_label(dfs, codes, feature, label):
    """
    Args:
        dfs (dict[pd.DataFrame]): loaded data
        codes  (array) : target codes
        feature (pd.DataFrame): features
        label (str) : label column name
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        val_X (pd.DataFrame): validation data
        val_y (pd.DataFrame): label for val_X
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    # 分割データ用の変数を定義
    trains_X, vals_X, tests_X = [], [], []
    trains_y, vals_y, tests_y = [], [], []

    # 銘柄コード毎に特徴量を作成
    for code in tqdm(codes):
        # 特徴量取得
        feats = feature[feature["code"] == code]

        # stock_labelデータを読み込み
        stock_labels = dfs["stock_labels"].copy()
        # 特定の銘柄コードのデータに絞る
        stock_labels = stock_labels[stock_labels["Local Code"] == code]
        # 日付列をpd.Timestamp型に変換してindexに設定
        stock_labels["datetime"] = pd.to_datetime(stock_labels["base_date"])
        stock_labels.set_index("datetime", inplace=True)

        # 特定の目的変数に絞る
        labels = stock_labels[label]
        # nanを削除
        labels.dropna(inplace=True)

        if feats.shape[0] > 0 and labels.shape[0] > 0:
            # 特徴量と目的変数のインデックスを合わせる
            labels = labels.loc[labels.index.isin(feats.index)]
            feats = feats.loc[feats.index.isin(labels.index)]
            labels.index = feats.index

            # データを分割（ホールドアウト法）
            _train_X = feats[: TRAIN_END].copy()
            _val_X = feats[VAL_START : VAL_END].copy()
            _test_X = feats[TEST_START :].copy()

            _train_y = labels[: TRAIN_END].copy()
            _val_y = labels[VAL_START : VAL_END].copy()
            _test_y = labels[TEST_START :].copy()

            # データを配列に格納 (後ほど結合するため)
            trains_X.append(_train_X)
            vals_X.append(_val_X)
            tests_X.append(_test_X)

            trains_y.append(_train_y)
            vals_y.append(_val_y)
            tests_y.append(_test_y)

    # 銘柄毎に作成した説明変数データを結合します。
    train_X = pd.concat(trains_X)
    val_X = pd.concat(vals_X)
    test_X = pd.concat(tests_X)
    # 銘柄毎に作成した目的変数データを結合します。
    train_y = pd.concat(trains_y)
    val_y = pd.concat(vals_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, val_X, val_y, test_X, test_y

In [48]:
del dfs

In [49]:
import gc
gc.collect()

60

In [50]:
dfs = {}
dfs['stock_list'] = new_stock_list
dfs['stock_price'] = new_stock_price
dfs['stock_fin'] = new_stock_fin
dfs['stock_labels'] = new_stock_labels

In [51]:
# 対象銘柄コードを定義
codes = [9984]
# 対象の目的変数を定義
label = "label_high_20"
# 特徴量を取得
feat = get_features_for_predict(dfs, codes[0])
# 特徴量と目的変数を入力し、分割データを取得
ret = get_features_and_label(dfs, codes, feat, label)
for v in ret:
    print(v.T)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


datetime                                              2016-02-10    2016-04-21    2016-05-10    2016-07-28  \
return_1month                                      -1.913212e-01  8.908766e-02  8.688554e-02 -7.610193e-02   
return_2month                                       0.000000e+00  1.441458e-01  2.409844e-02 -1.417147e-01   
return_3month                                       0.000000e+00  2.555166e-01  1.739812e-01 -1.038744e-01   
volatility_1month                                   4.402584e-02  1.717726e-02  2.271106e-02  3.175834e-02   
volatility_2month                                   0.000000e+00  1.705030e-02  1.818629e-02  3.062359e-02   
volatility_3month                                   0.000000e+00  3.428422e-02  3.271347e-02  2.758085e-02   
MA_gap_1month                                       9.107187e-01  1.071322e+00  1.008618e+00  9.588478e-01   
MA_gap_2month                                       0.000000e+00  1.067462e+00  1.037082e+00  9.375874e-01   
MA_gap_3m

In [58]:
def get_codes(dfs):
    """
    Args:
        dfs (dict[pd.DataFrame]): loaded data
    Returns:
        array: list of stock codes
    """
    stock_list = dfs["stock_list"].copy()
    # 予測対象の銘柄コードを取得
    codes = stock_list[stock_list["prediction_target"] == True][
        "Local Code"
    ].values
    return codes

In [60]:
# 対象の目的変数を定義
labels = {
    "label_high_5",
    "label_high_10",
    "label_high_20",
    "label_low_5",
    "label_low_10",
    "label_low_20",
}
# 目的変数毎にデータを保存するための変数
train_X, val_X, test_X = {}, {}, {}
train_y, val_y, test_y = {}, {}, {}

# 予測対象銘柄を取得
codes = get_codes(dfs)

# 特徴量を作成
buff = []
for code in tqdm(codes):
    feat = get_features_for_predict(dfs, code)
    buff.append(feat)
feature = pd.concat(buff)

# 目的変数毎に処理
for label in tqdm(labels):
    # 特徴量と目的変数を取得
    _train_X, _train_y, _val_X, _val_y, _test_X, _test_y = get_features_and_label(dfs, codes, feature, label)
    # 目的変数をキーとして値を保存
    train_X[label] = _train_X
    val_X[label] = _val_X
    test_X[label] = _test_X
    train_y[label] = _train_y
    val_y[label] = _val_y
    test_y[label] = _test_y

HBox(children=(IntProgress(value=0, max=3707), HTML(value='')))

divide by zero encountered in log
divide by zero encountered in log
divide by zero encountered in log





HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3707), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3707), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3707), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3707), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3707), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3707), HTML(value='')))




In [66]:
test_X['label_high_20']

Unnamed: 0_level_0,return_1month,return_2month,return_3month,volatility_1month,volatility_2month,volatility_3month,MA_gap_1month,MA_gap_2month,MA_gap_3month,EWMA,ema_10,ema_12,ema_26,macd,signal,pbr,per,Result_FinancialStatement FiscalYear,Result_FinancialStatement NetSales,Result_FinancialStatement OperatingIncome,Result_FinancialStatement OrdinaryIncome,Result_FinancialStatement NetIncome,Result_FinancialStatement TotalAssets,Result_FinancialStatement NetAssets,Result_FinancialStatement CashFlowsFromOperatingActivities,Result_FinancialStatement CashFlowsFromFinancingActivities,Result_FinancialStatement CashFlowsFromInvestingActivities,Forecast_FinancialStatement FiscalYear,Forecast_FinancialStatement NetSales,Forecast_FinancialStatement OperatingIncome,Forecast_FinancialStatement OrdinaryIncome,Forecast_FinancialStatement NetIncome,Result_Dividend FiscalYear,Result_Dividend QuarterlyDividendPerShare,Result_Dividend AnnualDividendPerShare,Forecast_Dividend FiscalYear,Forecast_Dividend QuarterlyDividendPerShare,Forecast_Dividend AnnualDividendPerShare,IssuedShareEquityQuote IssuedShare,Section/Products,33 Sector(Code),17 Sector(Code),Previous_FinancialStatement NetSales,Previous_FinancialStatement OperatingIncome,Previous_FinancialStatement OrdinaryIncome,Previous_FinancialStatement NetIncome,Previous_FinancialStatement TotalAssets,Previous_FinancialStatement NetAssets,Previous_FinancialStatement CashFlowsFromOperatingActivities,Previous_FinancialStatement CashFlowsFromFinancingActivities,Previous_FinancialStatement CashFlowsFromInvestingActivities,operating_profit_margin,ordinary_profit_margin,net_profit_margin,total_asset_turnover,net_sales_growth_rate,ordinary_income_growth_rate,operationg_income_growth_rate,total_assets_growth_rate,net_assets_growth_rate,eps,bps,roe,code
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
2021-02-01,-0.040394,-0.005107,-0.014503,0.008851,0.010718,0.012429,0.979337,0.975170,0.987418,2952.559252,2959.795413,2963.902389,2977.527836,-13.625447,-6.027200,4.728769e+05,-6.656347e+07,2021.0,82968.0,-169.0,108.0,-215.0,56273.0,30264.0,0.0,0.0,0.0,2021.0,110000.0,-200.0,100.0,-200.0,2020.0,100.0,100.0,0.0,0.00,0.00,4.897723e+06,3,6050,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.203693,0.130171,-0.259136,274.147502,0.000000,0.000000,0.000000,0.000000,0.000000,-4.389795e-05,0.006179,-0.710415,8006
2021-02-10,0.002259,0.145439,0.359551,0.036656,0.064657,0.054063,0.996071,0.953080,1.045438,1333.981181,1335.066466,1336.728857,1341.331179,-4.602322,-1.836860,2.053019e+06,7.061305e+07,2021.0,926.0,106.0,110.0,76.0,3897.0,2614.0,0.0,0.0,0.0,2021.0,1564.0,145.0,144.0,100.0,2020.0,6.0,6.0,2021.0,6.00,6.00,4.032000e+06,2,3200,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.447084,11.879050,8.207343,35.424637,0.000000,0.000000,0.000000,0.000000,0.000000,1.884921e-05,0.000648,2.907422,4274
2021-02-10,0.178737,0.095989,0.086648,0.027799,0.025220,0.024088,1.090909,1.117972,1.103312,716.047642,710.258291,707.531151,697.575581,9.955570,5.072609,1.543168e+06,1.671289e+07,2021.0,2897.0,453.0,491.0,324.0,6070.0,3509.0,0.0,0.0,0.0,2021.0,5800.0,700.0,700.0,500.0,2021.0,0.0,0.0,2021.0,7.00,7.00,7.078400e+06,2,3600,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.636866,16.948567,11.183983,82.559134,0.000000,0.000000,0.000000,0.000000,0.000000,4.577306e-05,0.000496,9.233400,6239
2021-02-05,0.006667,0.003322,-0.016287,0.003294,0.004348,0.004910,1.009077,1.007422,1.006201,899.803371,899.045863,898.802984,898.879509,-0.076525,-1.052723,5.968168e+05,1.114394e+07,2021.0,13073.0,1039.0,1042.0,711.0,17369.0,13276.0,0.0,0.0,0.0,2021.0,18000.0,1170.0,1190.0,800.0,2021.0,0.0,0.0,2021.0,26.00,26.00,8.745408e+06,3,3550,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.947678,7.970626,5.438690,98.470925,0.000000,0.000000,0.000000,0.000000,0.000000,8.129981e-05,0.001518,5.355529,3435
2021-02-12,0.185044,0.183544,0.164384,0.023451,0.022166,0.035287,1.090315,1.158756,1.145693,906.426358,894.827611,887.847641,854.207171,33.640470,25.997551,2.372882e+06,-2.859803e+07,2020.0,2458.0,-220.0,-206.0,-173.0,3784.0,2085.0,-28.0,201.0,-249.0,2021.0,2584.0,26.0,56.0,37.0,2020.0,0.0,0.0,2021.0,0.00,0.00,5.291400e+06,4,3500,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.950366,-8.380797,-7.038242,117.889688,0.000000,0.000000,0.000000,0.000000,0.000000,-3.269456e-05,0.000394,-8.297362,5704
2021-02-08,0.053129,0.148005,0.176781,0.008392,0.012039,0.010482,1.030797,1.057906,1.087076,4385.554498,4372.624324,4363.931964,4295.336892,68.595072,72.410070,5.090665e+05,1.314538e+07,2021.0,16995.0,384.0,3099.0,2034.0,70866.0,52523.0,0.0,0.0,0.0,2021.0,23200.0,360.0,3100.0,2100.0,2021.0,0.0,0.0,2021.0,30.00,30.00,5.995000e+06,2,3050,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.259488,18.234775,11.968226,32.357253,0.000000,0.000000,0.000000,0.000000,0.000000,3.392827e-04,0.008761,3.872589,2221
2021-02-10,0.091491,0.346501,0.395322,0.021934,0.023732,0.022573,1.041103,1.146743,1.205680,2339.098966,2323.319428,2311.800455,2215.538942,96.261512,104.468710,5.463628e+05,1.226576e+07,2021.0,1570642.0,37448.0,34530.0,16490.0,1672522.0,370198.0,0.0,0.0,0.0,2021.0,2240000.0,76000.0,75000.0,50000.0,2021.0,0.0,0.0,2021.0,80.00,80.00,8.477051e+07,1,3300,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.384248,2.198464,1.049889,424.270796,0.000000,0.000000,0.000000,0.000000,0.000000,1.945252e-04,0.004367,4.454373,5021
2021-02-12,0.052960,0.021148,0.056250,0.012536,0.017395,0.017628,1.024398,1.040480,1.026056,335.461383,333.983031,333.144448,329.609627,3.534821,2.342553,3.797659e+05,8.081747e+06,2021.0,18328.0,728.0,499.0,324.0,29942.0,6895.0,0.0,0.0,0.0,2021.0,26000.0,1100.0,800.0,500.0,2021.0,0.0,0.0,2021.0,12.00,12.00,7.747000e+06,3,8050,17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.972065,2.722610,1.767787,265.815809,0.000000,0.000000,0.000000,0.000000,0.000000,4.182264e-05,0.000890,4.699057,3238
2021-02-08,0.148997,0.101648,0.062252,0.016003,0.015434,0.018865,1.111419,1.121443,1.124457,3810.642064,3756.829090,3730.208680,3639.731783,90.476897,39.902218,1.871726e+06,-6.944301e+07,2021.0,71232.0,-1171.0,93.0,-3305.0,204350.0,122619.0,0.0,0.0,0.0,2021.0,96500.0,-4000.0,-2600.0,600.0,2021.0,56.0,0.0,2021.0,57.00,113.00,5.723420e+07,1,6050,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.643924,0.130559,-4.639769,58.092139,0.000000,0.000000,0.000000,0.000000,0.000000,-5.774519e-05,0.002142,-2.695341,7458
2021-02-05,-0.028713,-0.002035,-0.048497,0.011541,0.009622,0.010158,0.970422,0.976241,0.981622,9822.290229,9870.513224,9898.981164,9998.215340,-99.234176,-49.053745,7.865449e+05,2.227937e+07,2021.0,21283.0,2660.0,2755.0,1987.0,65004.0,56283.0,0.0,0.0,0.0,2021.0,25100.0,2500.0,2900.0,2000.0,2021.0,0.0,0.0,2021.0,250.00,250.00,4.512651e+06,2,3200,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.498238,12.944604,9.336090,37.814260,0.000000,0.000000,0.000000,0.000000,0.000000,4.403177e-04,0.012472,3.530373,4365


In [73]:
labels

{'label_high_10',
 'label_high_20',
 'label_high_5',
 'label_low_10',
 'label_low_20',
 'label_low_5'}

In [75]:
model_path = os.path.join(os.path.dirname("__file__"), "../../new_data_dir/test_X")
# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
for label in labels:
    with open(os.path.join(model_path, f"test_X_{label}.pkl"), "wb") as f:
        # モデルをpickle形式で保存
        pickle.dump(_X[label], f)

In [None]:
tmp.shape

In [None]:
model_path = os.path.join(os.path.dirname("__file__"), "../model")
# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
with open(os.path.join(model_path, "new_feature.pkl"), "wb") as f:
    # モデルをpickle形式で保存
    pickle.dump(feature, f)

In [None]:
model_path = os.path.join(os.path.dirname("__file__"), "../model")

with open(os.path.join(model_path, "test_X"), "rb") as f:
    test_X = pickle.load(f)
with open(os.path.join(model_path, "test_y"), "rb") as f:
    test_y = pickle.load(f)

In [None]:
with open(os.path.join(model_path, "train_X"), "rb") as f:
    train_X = pickle.load(f)
with open(os.path.join(model_path, "train_y"), "rb") as f:
    train_y = pickle.load(f)

In [None]:
with open(os.path.join(model_path, "val_X"), "rb") as f:
    val_X = pickle.load(f)
with open(os.path.join(model_path, "val_y"), "rb") as f:
    val_y = pickle.load(f)

In [None]:
# 目的変数を指定
label = "label_high_20"

# 学習用データセット定義
# ファンダメンタル情報
#fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
fundamental_cols = pd.Index(SELECT_FIN_DATA_COLUMNS)
fundamental_cols = fundamental_cols[fundamental_cols != "Result_Dividend DividendPayableDate"]
fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
# 価格変化率
returns_cols = [x for x in train_X[label].columns if "return" in x]
# テクニカル
technical_cols = [x for x in train_X[label].columns if (x not in fundamental_cols) and (x != "code")]

columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}
# 学習用データセットを指定
col = "fundamental+technical"

In [None]:
'''reg_cv = GridSearchCV(xgb_model, {
    "eta": [0.1], 
    "gamma": [0.1,0.2,0.3,0.4,0.5],
    "n_estimators": [50, 100, 200], 
    "max_depth": [5, 7, 9,10,20,30],
    "subsample":[0.6,0.8,1],
    "colsample_bytree": [0.5,0.7,0.9],
}, verbose=1)
'''

In [None]:
'''
{'colsample_bytree': 0.5, 'eta': 0.1, 'gamma': 0.5, 'max_depth': 5, 'n_estimators': 50, 'subsample': 1}
0.07780464612358796
'''

In [None]:
# ライブラリインポート
from sklearn.model_selection import GridSearchCV

# モデル定義
model = XGBRegressor()

# ハイパーパラメータ探索
reg_cv = GridSearchCV(model, {
    "eta": [0.1], 
    "gamma": [0.4, 0.5],
    "max_depth": [5],
    "n_estimators": [50], 
    "subsample":[1],
    "colsample_bytree": [0.5],
}, verbose=1)

# 訓練実施
reg_cv.fit(train_X[label][columns[col]].values, train_y[label])

In [None]:
# 結果を表示
print(reg_cv.best_params_)
print(reg_cv.best_score_)

In [None]:
best_model = reg_cv.best_estimator_

In [None]:
# 目的変数を指定
label = "label_high_20"

# 学習用データセット定義
# ファンダメンタル情報
#fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
fundamental_cols = pd.Index(SELECT_FIN_DATA_COLUMNS)
fundamental_cols = fundamental_cols[fundamental_cols != "Result_Dividend DividendPayableDate"]
fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
# 価格変化率
returns_cols = [x for x in train_X[label].columns if "return" in x]
# テクニカル
technical_cols = [x for x in train_X[label].columns if (x not in fundamental_cols) and (x != "code")]

columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}
# 学習用データセットを指定
col = "fundamental+technical"

# 学習
#pred_model = models[model](reg_cv.best_estimator_, random_state=0)
best_model.fit(train_X[label][columns[col]].values, train_y[label])


In [None]:
columns[col]

In [None]:
test_X[label][columns[col]].T.shape

In [None]:
train_X[label][columns[col]].T.shape

In [None]:
val_X[label][columns[col]].T.shape

In [None]:
test_y[label].shape

In [None]:
train_y[label].shape

In [None]:
val_y[label].shape

In [None]:
# 予測
result = {}
result[label] = pd.DataFrame(
    best_model.predict(val_X[label][columns[col]]), columns=["predict"]
)

# 予測結果に日付と銘柄コードを追加
result[label]["datetime"] = val_X[label][columns[col]].index
result[label]["code"] = val_X[label]["code"].values

# 予測の符号を取得
result[label]["predict_dir"] = np.sign(result[label]["predict"])

# 実際の値を追加
result[label]["actual"] = val_y[label].values

In [None]:
pred_model.fit(train_X[label][columns[col]].values, train_y[label])

In [None]:
pred_model.feature_importances_

# Submit Model

In [None]:
model_path = os.path.join(os.path.dirname("__file__"), "../model")
# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
with open(os.path.join(model_path, f"my_model_{label}.pkl"), "wb") as f:
    # モデルをpickle形式で保存
    pickle.dump(pred_model, f)


In [None]:
sns.jointplot(data=result[label], x="predict", y="actual")

In [None]:
pred_model

In [None]:
# 学習済みモデルを指定
rf = pred_model

# 重要度順を取得
sorted_idx = rf.feature_importances_.argsort()
# プロット
fig, ax = plt.subplots(figsize=(8, 8))
ax.barh(fundamental_cols[sorted_idx], rf.feature_importances_[sorted_idx])
ax.set_xlabel("Random Forest Feature Importance")

In [None]:
# モデルを定義します
sample_model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(train_X["label_high_20"], label=train_y["label_high_20"]), 100)

In [None]:
shap.initjs()
explainer = shap.TreeExplainer(model=sample_model, feature_perturbation='tree_path_dependent', model_output='margin')
# SHAP値
shap_values = explainer.shap_values(X=train_X["label_high_20"])
# プロット
shap.summary_plot(shap_values, train_X["label_high_20"], plot_type="bar")

In [None]:
shap.summary_plot(shap_values, train_X["label_high_20"])

In [None]:
# モデルを定義
models = {
    "rf": RandomForestRegressor,
    "extraTree": ExtraTreesRegressor,
    "gbr": GradientBoostingRegressor,
}

# 学習用データセット定義
columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}

# 結果保存用
all_results = dict()
# モデル毎に処理
for model in tqdm(models.keys()):
    all_results[model] = dict()
    # データセット毎に処理
    for col in columns.keys():
        result = dict()
        # 目的変数毎に処理
        for label in tqdm(labels):
            if len(test_X[label][columns[col]]) > 0:
                # モデル取得
                pred_model = models[model](random_state=0)
                # 学習
                pred_model.fit(train_X[label][columns[col]].values, train_y[label])
                # 結果データ作成
                result[label] = test_X[label][["code"]].copy()
                result[label]["datetime"] = test_X[label][columns[col]].index
                # 予測
                result[label]["predict"] = pred_model.predict(test_X[label][columns[col]])
                result[label]["predict_dir"] = np.sign(result[label]["predict"])
                # 実際の結果
                result[label]["actual"] = test_y[label].values
                result[label]["actual_dir"] = np.sign(result[label]["actual"])
                result[label].dropna(inplace=True)

        all_results[model][col] = result

In [None]:
results = []
for model in all_results.keys():
    for col in all_results[model]:
        tmp = pd.concat(all_results[model][col])
        tmp["model"] = model
        tmp["feature"] = col
        results.append(tmp)
results = pd.concat(results)
results["label"] = [x[0] for x in results.index]
results.head(5)

In [None]:
# 結果保存用変数
all_metrics = []

# データセット毎に処理
for feature in columns:
    matrix = dict()
    # モデル毎に処理
    for model in models:
        # 目的変数毎に処理
        for label in labels:
            # 処理対象データに絞り込み
            tmp_df = results[(results["model"] == model) & (results["label"] == label) & (results["feature"] == feature)]
            # RMSE
            rmse = np.sqrt(mean_squared_error(tmp_df["predict"], tmp_df["actual"]))
            # 精度
            accuracy = accuracy_score(tmp_df["predict_dir"], tmp_df["actual_dir"])
            # 相関係数
            corr = np.corrcoef(tmp_df["actual"], tmp_df["predict"])[0, 1]
            # 順位相関
            spearman_corr = spearmanr(tmp_df["actual"], tmp_df["predict"])[0]
            # 結果を保存
            matrix[label] = [rmse, accuracy, spearman_corr,corr, corr**2, feature, model, tmp_df.shape[0]]
        res = pd.DataFrame.from_dict(matrix).T
        res.columns = ["RMSE","accuracy","spearman_corr","corr","R^2 score","feature", "model", "# of samples"]
        all_metrics.append(res)
all_metrics = pd.concat(all_metrics)
all_metrics.reset_index()