# Imports & Load Data
作業に必要なライブラリをインポートして、 以下のデータを読み込みます。

* stock_price : 株価情報
* stock_list : 銘柄情報
* stock_fin : 財務諸表
* stock_labels : 目的変数

In [1]:
# shap用にg++とgccをインストールします
! apt-get update
! apt-get install -y --no-install-recommends g++ gcc

# 必要なライブラリをインストールします
! pip install shap==0.37.0 slicer==0.0.3 xgboost==1.3.0.post0

Hit:1 http://security.debian.org/debian-security stretch/updates InRelease
Ign:2 http://deb.debian.org/debian stretch InRelease
Get:3 http://deb.debian.org/debian stretch-updates InRelease [93.6 kB]
Hit:4 http://deb.debian.org/debian stretch Release           
Fetched 93.6 kB in 0s (173 kB/s)
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
g++ is already the newest version (4:6.3.0-4).
gcc is already the newest version (4:6.3.0-4).
0 upgraded, 0 newly installed, 0 to remove and 60 not upgraded.


In [1]:
import os
import pickle
import sys
import warnings
from glob import glob

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from xgboost import XGBRegressor
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.auto import tqdm


# 表示用の設定を変更します
%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

In [2]:
# python 3.7.3であることを確認します
import sys
print(sys.version)

3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]


In [3]:
# データセット保存先ディレクトリ（""の中身はご自身の環境に合わせて定義してください。）
dataset_dir="/path/to"

In [4]:
# 読み込むファイルを定義します。
inputs = {
    "stock_list": f"{dataset_dir}/stock_list.csv.gz",
    "stock_price": f"{dataset_dir}/stock_price.csv.gz",
    "stock_fin": f"{dataset_dir}/stock_fin.csv.gz",
    # 本チュートリアルでは使用しないため、コメントアウトしています。
    # "stock_fin_price": f"{dataset_dir}/stock_fin_price.csv.gz",
    "stock_labels": f"{dataset_dir}/stock_labels.csv.gz",
}

# ファイルを読み込みます
dfs = {}
for k, v in inputs.items():
    print(k)
    dfs[k] = pd.read_csv(v)

stock_list
stock_price
stock_fin
stock_labels


# Stock List

In [10]:
tmp_stock_list = dfs['stock_list'].copy()

In [12]:
tmp_stock_list['Effective Date'].value_counts()

20201230    3711
Name: Effective Date, dtype: int64

In [24]:
tmp_stock_list

Unnamed: 0,prediction_target,Effective Date,Local Code,Name (English),Section/Products,33 Sector(Code),33 Sector(name),17 Sector(Code),17 Sector(name),Size Code (New Index Series),Size (New Index Series),IssuedShareEquityQuote AccountingStandard,IssuedShareEquityQuote ModifyDate,IssuedShareEquityQuote IssuedShare
0,True,20201230,1301,"KYOKUYO CO.,LTD.",First Section (Domestic),50,"Fishery, Agriculture and Forestry",1,FOODS,7,TOPIX Small 2,ConsolidatedJP,2020/11/06,1.092828e+07
1,True,20201230,1332,"Nippon Suisan Kaisha,Ltd.",First Section (Domestic),50,"Fishery, Agriculture and Forestry",1,FOODS,4,TOPIX Mid400,ConsolidatedJP,2020/11/05,3.124303e+08
2,True,20201230,1333,Maruha Nichiro Corporation,First Section (Domestic),50,"Fishery, Agriculture and Forestry",1,FOODS,4,TOPIX Mid400,ConsolidatedJP,2020/11/02,5.265691e+07
3,True,20201230,1352,HOHSUI CORPORATION,First Section (Domestic),6050,Wholesale Trade,13,COMMERCIAL & WHOLESALE TRADE,7,TOPIX Small 2,ConsolidatedJP,2020/10/30,8.379000e+06
4,False,20201230,1375,"YUKIGUNI MAITAKE CO.,LTD.",First Section (Domestic),50,"Fishery, Agriculture and Forestry",1,FOODS,7,TOPIX Small 2,ConsolidatedIFRS,2020/11/05,3.985000e+07
5,True,20201230,1376,"KANEKO SEEDS CO.,LTD.",First Section (Domestic),50,"Fishery, Agriculture and Forestry",1,FOODS,7,TOPIX Small 2,ConsolidatedJP,2021/01/05,1.177263e+07
6,True,20201230,1377,SAKATA SEED CORPORATION,First Section (Domestic),50,"Fishery, Agriculture and Forestry",1,FOODS,6,TOPIX Small 1,ConsolidatedJP,2021/01/13,4.741075e+07
7,True,20201230,1379,HOKUTO CORPORATION,First Section (Domestic),50,"Fishery, Agriculture and Forestry",1,FOODS,6,TOPIX Small 1,ConsolidatedJP,2020/11/02,3.335904e+07
8,True,20201230,1380,"AKIKAWA FOODS & FARMS CO.,LTD.",JASDAQ(Standard / Domestic),50,"Fishery, Agriculture and Forestry",1,FOODS,-,-,ConsolidatedJP,2020/11/06,4.179000e+06
9,True,20201230,1381,"AXYZ CO.,Ltd.",JASDAQ(Standard / Domestic),50,"Fishery, Agriculture and Forestry",1,FOODS,-,-,ConsolidatedJP,2020/10/23,5.617500e+06


In [25]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../get_data")

with open(os.path.join(model_path, "stock_lists.pkl"), "rb") as f:
    stock_lists = pickle.load(f)

In [26]:
stock_lists['Effective Date'].value_counts()

20210226    3712
20201230    3711
20210129    3707
20201030       9
Name: Effective Date, dtype: int64

# Stock Price

In [264]:
dfs['stock_price'].head()

Unnamed: 0,Local Code,EndOfDayQuote Date,EndOfDayQuote Open,EndOfDayQuote High,EndOfDayQuote Low,EndOfDayQuote Close,EndOfDayQuote ExchangeOfficialClose,EndOfDayQuote Volume,EndOfDayQuote CumulativeAdjustmentFactor,EndOfDayQuote PreviousClose,EndOfDayQuote PreviousCloseDate,EndOfDayQuote PreviousExchangeOfficialClose,EndOfDayQuote PreviousExchangeOfficialCloseDate,EndOfDayQuote ChangeFromPreviousClose,EndOfDayQuote PercentChangeFromPreviousClose,EndOfDayQuote VWAP
0,1301,2016/01/04,2800.0,2820.0,2740.0,2750.0,2750.0,32000.0,0.1,2770.0,2015/12/30,2770.0,2015/12/30,-20.0,-0.722,2778.25
1,1301,2016/01/05,2750.0,2780.0,2750.0,2760.0,2760.0,20100.0,0.1,2750.0,2016/01/04,2750.0,2016/01/04,10.0,0.364,2761.99
2,1301,2016/01/06,2760.0,2770.0,2740.0,2760.0,2760.0,15000.0,0.1,2760.0,2016/01/05,2760.0,2016/01/05,0.0,0.0,2758.867
3,1301,2016/01/07,2740.0,2760.0,2710.0,2710.0,2710.0,31400.0,0.1,2760.0,2016/01/06,2760.0,2016/01/06,-50.0,-1.812,2733.471
4,1301,2016/01/08,2700.0,2740.0,2690.0,2700.0,2700.0,26200.0,0.1,2710.0,2016/01/07,2710.0,2016/01/07,-10.0,-0.369,2709.122


In [254]:
tmp_stock_price = dfs['stock_price'].copy()

In [255]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../get_data")

with open(os.path.join(model_path, "stock_prices.pkl"), "rb") as f:
    stock_prices = pickle.load(f)

In [256]:
stock_prices = stock_prices.astype(dict(tmp_stock_price.dtypes))

In [257]:
stock_prices = stock_prices[tmp_stock_price.columns]

In [258]:
tmp_stock_price = tmp_stock_price.append(stock_prices)

In [259]:
tmp_stock_price = tmp_stock_price.drop_duplicates(keep='last', subset=["EndOfDayQuote Date", "Local Code"])

In [260]:
tmp_stock_price[tmp_stock_price.duplicated(keep='last', subset=["EndOfDayQuote Date", "Local Code"])]

Unnamed: 0,Local Code,EndOfDayQuote Date,EndOfDayQuote Open,EndOfDayQuote High,EndOfDayQuote Low,EndOfDayQuote Close,EndOfDayQuote ExchangeOfficialClose,EndOfDayQuote Volume,EndOfDayQuote CumulativeAdjustmentFactor,EndOfDayQuote PreviousClose,EndOfDayQuote PreviousCloseDate,EndOfDayQuote PreviousExchangeOfficialClose,EndOfDayQuote PreviousExchangeOfficialCloseDate,EndOfDayQuote ChangeFromPreviousClose,EndOfDayQuote PercentChangeFromPreviousClose,EndOfDayQuote VWAP


In [261]:
tmp_stock_price = tmp_stock_price.sort_values(['Local Code',  'EndOfDayQuote Date'])

In [262]:
tmp_stock_price = tmp_stock_price.reset_index(drop=True)

In [263]:
tmp_stock_price

Unnamed: 0,Local Code,EndOfDayQuote Date,EndOfDayQuote Open,EndOfDayQuote High,EndOfDayQuote Low,EndOfDayQuote Close,EndOfDayQuote ExchangeOfficialClose,EndOfDayQuote Volume,EndOfDayQuote CumulativeAdjustmentFactor,EndOfDayQuote PreviousClose,EndOfDayQuote PreviousCloseDate,EndOfDayQuote PreviousExchangeOfficialClose,EndOfDayQuote PreviousExchangeOfficialCloseDate,EndOfDayQuote ChangeFromPreviousClose,EndOfDayQuote PercentChangeFromPreviousClose,EndOfDayQuote VWAP
0,1301,2016/01/04,2800.0,2820.0,2740.0,2750.0,2750.0,32000.0,0.1,2770.0,2015/12/30,2770.0,2015/12/30,-20.0,-0.722,2778.250
1,1301,2016/01/05,2750.0,2780.0,2750.0,2760.0,2760.0,20100.0,0.1,2750.0,2016/01/04,2750.0,2016/01/04,10.0,0.364,2761.990
2,1301,2016/01/06,2760.0,2770.0,2740.0,2760.0,2760.0,15000.0,0.1,2760.0,2016/01/05,2760.0,2016/01/05,0.0,0.000,2758.867
3,1301,2016/01/07,2740.0,2760.0,2710.0,2710.0,2710.0,31400.0,0.1,2760.0,2016/01/06,2760.0,2016/01/06,-50.0,-1.812,2733.471
4,1301,2016/01/08,2700.0,2740.0,2690.0,2700.0,2700.0,26200.0,0.1,2710.0,2016/01/07,2710.0,2016/01/07,-10.0,-0.369,2709.122
5,1301,2016/01/12,2700.0,2730.0,2640.0,2640.0,2640.0,27500.0,0.1,2700.0,2016/01/08,2700.0,2016/01/08,-60.0,-2.222,2671.927
6,1301,2016/01/13,2680.0,2710.0,2670.0,2690.0,2690.0,20400.0,0.1,2640.0,2016/01/12,2640.0,2016/01/12,50.0,1.894,2693.235
7,1301,2016/01/14,2650.0,2650.0,2620.0,2630.0,2630.0,29700.0,0.1,2690.0,2016/01/13,2690.0,2016/01/13,-60.0,-2.230,2633.502
8,1301,2016/01/15,2650.0,2660.0,2630.0,2650.0,2650.0,11400.0,0.1,2630.0,2016/01/14,2630.0,2016/01/14,20.0,0.760,2647.544
9,1301,2016/01/18,2620.0,2630.0,2610.0,2630.0,2630.0,17300.0,0.1,2650.0,2016/01/15,2650.0,2016/01/15,-20.0,-0.755,2617.110


In [265]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../current_data")

# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
with open(os.path.join(model_path, "stock_price.pkl"), "wb") as f:
    # モデルをpickle形式で保存
    pickle.dump(tmp_stock_price, f)

# Stock Fin

In [266]:
tmp_stock_fin = dfs['stock_fin'].copy()

In [267]:
tmp_stock_fin

Unnamed: 0,base_date,Local Code,Result_FinancialStatement AccountingStandard,Result_FinancialStatement FiscalPeriodEnd,Result_FinancialStatement ReportType,Result_FinancialStatement FiscalYear,Result_FinancialStatement ModifyDate,Result_FinancialStatement CompanyType,Result_FinancialStatement ChangeOfFiscalYearEnd,Result_FinancialStatement NetSales,Result_FinancialStatement OperatingIncome,Result_FinancialStatement OrdinaryIncome,Result_FinancialStatement NetIncome,Result_FinancialStatement TotalAssets,Result_FinancialStatement NetAssets,Result_FinancialStatement CashFlowsFromOperatingActivities,Result_FinancialStatement CashFlowsFromFinancingActivities,Result_FinancialStatement CashFlowsFromInvestingActivities,Forecast_FinancialStatement AccountingStandard,Forecast_FinancialStatement FiscalPeriodEnd,Forecast_FinancialStatement ReportType,Forecast_FinancialStatement FiscalYear,Forecast_FinancialStatement ModifyDate,Forecast_FinancialStatement CompanyType,Forecast_FinancialStatement ChangeOfFiscalYearEnd,Forecast_FinancialStatement NetSales,Forecast_FinancialStatement OperatingIncome,Forecast_FinancialStatement OrdinaryIncome,Forecast_FinancialStatement NetIncome,Result_Dividend FiscalPeriodEnd,Result_Dividend ReportType,Result_Dividend FiscalYear,Result_Dividend ModifyDate,Result_Dividend RecordDate,Result_Dividend DividendPayableDate,Result_Dividend QuarterlyDividendPerShare,Result_Dividend AnnualDividendPerShare,Forecast_Dividend FiscalPeriodEnd,Forecast_Dividend ReportType,Forecast_Dividend FiscalYear,Forecast_Dividend ModifyDate,Forecast_Dividend RecordDate,Forecast_Dividend QuarterlyDividendPerShare,Forecast_Dividend AnnualDividendPerShare
0,2016/01/04,2753,ConsolidatedJP,2015/12,Q3,2016.0,2016/01/04,GB,False,22354.0,2391.0,2466.0,1645.0,21251.0,16962.0,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/04,GB,False,30500.0,3110.0,3200.0,2130.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,45.00,90.00
1,2016/01/04,3353,ConsolidatedJP,2015/11,Q3,2016.0,2016/01/04,GB,False,21550.0,1038.0,1053.0,697.0,24949.0,7145.0,,,,ConsolidatedJP,2016/02,Annual,2016.0,2016/01/04,GB,False,27800.0,1320.0,1310.0,840.0,,,,,,,,,2016/02,Annual,2016.0,2016/01/04,2016/02/29,32.50,65.00
2,2016/01/04,4591,,,,,,,,,,,,,,,,,NonConsolidated,2016/03,Annual,2016.0,2016/01/04,GB,False,119.0,-474.0,-451.0,-452.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,0.00,0.00
3,2016/01/04,6786,,,,,,,,,,,,,,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/04,GB,False,15700.0,,1800.0,1100.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,0.00,0.00
4,2016/01/04,7463,ConsolidatedJP,2015/12,Q3,2016.0,2016/01/04,GB,False,14307.0,4518.0,4136.0,2685.0,38498.0,30980.0,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/04,GB,False,19500.0,6000.0,5660.0,3500.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,18.00,18.00
5,2016/01/05,1376,ConsolidatedJP,2015/11,Q2,2016.0,2016/01/05,GB,False,24881.0,491.0,546.0,354.0,33055.0,15629.0,,,,ConsolidatedJP,2016/05,Annual,2016.0,2016/01/05,GB,False,59000.0,1850.0,1950.0,1200.0,,,,,,,,,2016/05,Annual,2016.0,2016/01/05,2016/05/31,14.00,25.00
6,2016/01/05,2659,ConsolidatedJP,2015/11,Q3,2016.0,2016/01/05,GB,False,128716.0,10206.0,10451.0,6212.0,118788.0,92277.0,,,,ConsolidatedJP,2016/02,Annual,2016.0,2016/01/05,GB,False,167525.0,12941.0,13244.0,7969.0,,,,,,,,,2016/02,Annual,2016.0,2016/01/05,2016/02/29,36.00,36.00
7,2016/01/05,5216,,,,,,,,,,,,,,,,,ConsolidatedJP,2015/12,Annual,2015.0,2016/01/05,GB,False,5800.0,-590.0,-670.0,100.0,,,,,,,,,2015/12,Annual,2015.0,2016/01/05,2015/12/31,0.00,0.00
8,2016/01/05,6788,,,,,,,,,,,,,,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/05,GB,False,15100.0,3070.0,3150.0,1950.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/05,2016/03/31,60.00,60.00
9,2016/01/05,8168,NonConsolidated,2015/11,Q3,2016.0,2016/01/05,GB,False,119395.0,118.0,872.0,-183.0,96633.0,38984.0,,,,NonConsolidated,2016/02,Annual,2016.0,2016/01/05,GB,False,161000.0,950.0,1850.0,200.0,,,,,,,,,2016/02,Annual,2016.0,2016/01/05,2016/02/29,6.25,12.50


In [268]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../get_data")

with open(os.path.join(model_path, "stock_fins.pkl"), "rb") as f:
    stock_fins = pickle.load(f)

In [269]:
stock_fins[["Result_FinancialStatement FiscalYear", 
"Result_FinancialStatement NetSales", 
"Result_FinancialStatement OperatingIncome", 
"Result_FinancialStatement OrdinaryIncome", 
"Result_FinancialStatement NetIncome", 
"Result_FinancialStatement TotalAssets", 
"Result_FinancialStatement NetAssets", 
"Result_FinancialStatement CashFlowsFromOperatingActivities", 
"Result_FinancialStatement CashFlowsFromFinancingActivities", 
"Result_FinancialStatement CashFlowsFromInvestingActivities", 
"Forecast_FinancialStatement FiscalYear", 
"Forecast_FinancialStatement NetSales", 
"Forecast_FinancialStatement OperatingIncome", 
"Forecast_FinancialStatement OrdinaryIncome", 
"Forecast_FinancialStatement NetIncome", 
"Result_Dividend FiscalYear", 
"Result_Dividend QuarterlyDividendPerShare", 
"Result_Dividend AnnualDividendPerShare", 
"Forecast_Dividend FiscalYear", 
"Forecast_Dividend QuarterlyDividendPerShare", 
"Forecast_Dividend AnnualDividendPerShare"]]  = stock_fins[["Result_FinancialStatement FiscalYear", 
"Result_FinancialStatement NetSales", 
"Result_FinancialStatement OperatingIncome", 
"Result_FinancialStatement OrdinaryIncome", 
"Result_FinancialStatement NetIncome", 
"Result_FinancialStatement TotalAssets", 
"Result_FinancialStatement NetAssets", 
"Result_FinancialStatement CashFlowsFromOperatingActivities", 
"Result_FinancialStatement CashFlowsFromFinancingActivities", 
"Result_FinancialStatement CashFlowsFromInvestingActivities", 
"Forecast_FinancialStatement FiscalYear", 
"Forecast_FinancialStatement NetSales", 
"Forecast_FinancialStatement OperatingIncome", 
"Forecast_FinancialStatement OrdinaryIncome", 
"Forecast_FinancialStatement NetIncome", 
"Result_Dividend FiscalYear", 
"Result_Dividend QuarterlyDividendPerShare", 
"Result_Dividend AnnualDividendPerShare", 
"Forecast_Dividend FiscalYear", 
"Forecast_Dividend QuarterlyDividendPerShare", 
"Forecast_Dividend AnnualDividendPerShare"]].apply(pd.to_numeric)

In [270]:
stock_fins = stock_fins.astype(dict(tmp_stock_fin.dtypes))

In [271]:
stock_fins = stock_fins[tmp_stock_fin.columns]

In [272]:
tmp_stock_fin = tmp_stock_fin.append(stock_fins)

In [273]:
tmp_stock_fin = tmp_stock_fin.drop_duplicates(keep='last', subset=["base_date", "Local Code"])

In [274]:
tmp_stock_fin[tmp_stock_fin.duplicated(keep='last', subset=["base_date", "Local Code"])]

Unnamed: 0,base_date,Local Code,Result_FinancialStatement AccountingStandard,Result_FinancialStatement FiscalPeriodEnd,Result_FinancialStatement ReportType,Result_FinancialStatement FiscalYear,Result_FinancialStatement ModifyDate,Result_FinancialStatement CompanyType,Result_FinancialStatement ChangeOfFiscalYearEnd,Result_FinancialStatement NetSales,Result_FinancialStatement OperatingIncome,Result_FinancialStatement OrdinaryIncome,Result_FinancialStatement NetIncome,Result_FinancialStatement TotalAssets,Result_FinancialStatement NetAssets,Result_FinancialStatement CashFlowsFromOperatingActivities,Result_FinancialStatement CashFlowsFromFinancingActivities,Result_FinancialStatement CashFlowsFromInvestingActivities,Forecast_FinancialStatement AccountingStandard,Forecast_FinancialStatement FiscalPeriodEnd,Forecast_FinancialStatement ReportType,Forecast_FinancialStatement FiscalYear,Forecast_FinancialStatement ModifyDate,Forecast_FinancialStatement CompanyType,Forecast_FinancialStatement ChangeOfFiscalYearEnd,Forecast_FinancialStatement NetSales,Forecast_FinancialStatement OperatingIncome,Forecast_FinancialStatement OrdinaryIncome,Forecast_FinancialStatement NetIncome,Result_Dividend FiscalPeriodEnd,Result_Dividend ReportType,Result_Dividend FiscalYear,Result_Dividend ModifyDate,Result_Dividend RecordDate,Result_Dividend DividendPayableDate,Result_Dividend QuarterlyDividendPerShare,Result_Dividend AnnualDividendPerShare,Forecast_Dividend FiscalPeriodEnd,Forecast_Dividend ReportType,Forecast_Dividend FiscalYear,Forecast_Dividend ModifyDate,Forecast_Dividend RecordDate,Forecast_Dividend QuarterlyDividendPerShare,Forecast_Dividend AnnualDividendPerShare


In [275]:
tmp_stock_fin = tmp_stock_fin.sort_values(['base_date', 'Local Code'])

In [276]:
tmp_stock_fin = tmp_stock_fin.reset_index(drop=True)

In [278]:
dfs['stock_fin'].head()

Unnamed: 0,base_date,Local Code,Result_FinancialStatement AccountingStandard,Result_FinancialStatement FiscalPeriodEnd,Result_FinancialStatement ReportType,Result_FinancialStatement FiscalYear,Result_FinancialStatement ModifyDate,Result_FinancialStatement CompanyType,Result_FinancialStatement ChangeOfFiscalYearEnd,Result_FinancialStatement NetSales,Result_FinancialStatement OperatingIncome,Result_FinancialStatement OrdinaryIncome,Result_FinancialStatement NetIncome,Result_FinancialStatement TotalAssets,Result_FinancialStatement NetAssets,Result_FinancialStatement CashFlowsFromOperatingActivities,Result_FinancialStatement CashFlowsFromFinancingActivities,Result_FinancialStatement CashFlowsFromInvestingActivities,Forecast_FinancialStatement AccountingStandard,Forecast_FinancialStatement FiscalPeriodEnd,Forecast_FinancialStatement ReportType,Forecast_FinancialStatement FiscalYear,Forecast_FinancialStatement ModifyDate,Forecast_FinancialStatement CompanyType,Forecast_FinancialStatement ChangeOfFiscalYearEnd,Forecast_FinancialStatement NetSales,Forecast_FinancialStatement OperatingIncome,Forecast_FinancialStatement OrdinaryIncome,Forecast_FinancialStatement NetIncome,Result_Dividend FiscalPeriodEnd,Result_Dividend ReportType,Result_Dividend FiscalYear,Result_Dividend ModifyDate,Result_Dividend RecordDate,Result_Dividend DividendPayableDate,Result_Dividend QuarterlyDividendPerShare,Result_Dividend AnnualDividendPerShare,Forecast_Dividend FiscalPeriodEnd,Forecast_Dividend ReportType,Forecast_Dividend FiscalYear,Forecast_Dividend ModifyDate,Forecast_Dividend RecordDate,Forecast_Dividend QuarterlyDividendPerShare,Forecast_Dividend AnnualDividendPerShare
0,2016/01/04,2753,ConsolidatedJP,2015/12,Q3,2016.0,2016/01/04,GB,False,22354.0,2391.0,2466.0,1645.0,21251.0,16962.0,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/04,GB,False,30500.0,3110.0,3200.0,2130.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,45.0,90.0
1,2016/01/04,3353,ConsolidatedJP,2015/11,Q3,2016.0,2016/01/04,GB,False,21550.0,1038.0,1053.0,697.0,24949.0,7145.0,,,,ConsolidatedJP,2016/02,Annual,2016.0,2016/01/04,GB,False,27800.0,1320.0,1310.0,840.0,,,,,,,,,2016/02,Annual,2016.0,2016/01/04,2016/02/29,32.5,65.0
2,2016/01/04,4591,,,,,,,,,,,,,,,,,NonConsolidated,2016/03,Annual,2016.0,2016/01/04,GB,False,119.0,-474.0,-451.0,-452.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,0.0,0.0
3,2016/01/04,6786,,,,,,,,,,,,,,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/04,GB,False,15700.0,,1800.0,1100.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,0.0,0.0
4,2016/01/04,7463,ConsolidatedJP,2015/12,Q3,2016.0,2016/01/04,GB,False,14307.0,4518.0,4136.0,2685.0,38498.0,30980.0,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/04,GB,False,19500.0,6000.0,5660.0,3500.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,18.0,18.0


In [277]:
tmp_stock_fin

Unnamed: 0,base_date,Local Code,Result_FinancialStatement AccountingStandard,Result_FinancialStatement FiscalPeriodEnd,Result_FinancialStatement ReportType,Result_FinancialStatement FiscalYear,Result_FinancialStatement ModifyDate,Result_FinancialStatement CompanyType,Result_FinancialStatement ChangeOfFiscalYearEnd,Result_FinancialStatement NetSales,Result_FinancialStatement OperatingIncome,Result_FinancialStatement OrdinaryIncome,Result_FinancialStatement NetIncome,Result_FinancialStatement TotalAssets,Result_FinancialStatement NetAssets,Result_FinancialStatement CashFlowsFromOperatingActivities,Result_FinancialStatement CashFlowsFromFinancingActivities,Result_FinancialStatement CashFlowsFromInvestingActivities,Forecast_FinancialStatement AccountingStandard,Forecast_FinancialStatement FiscalPeriodEnd,Forecast_FinancialStatement ReportType,Forecast_FinancialStatement FiscalYear,Forecast_FinancialStatement ModifyDate,Forecast_FinancialStatement CompanyType,Forecast_FinancialStatement ChangeOfFiscalYearEnd,Forecast_FinancialStatement NetSales,Forecast_FinancialStatement OperatingIncome,Forecast_FinancialStatement OrdinaryIncome,Forecast_FinancialStatement NetIncome,Result_Dividend FiscalPeriodEnd,Result_Dividend ReportType,Result_Dividend FiscalYear,Result_Dividend ModifyDate,Result_Dividend RecordDate,Result_Dividend DividendPayableDate,Result_Dividend QuarterlyDividendPerShare,Result_Dividend AnnualDividendPerShare,Forecast_Dividend FiscalPeriodEnd,Forecast_Dividend ReportType,Forecast_Dividend FiscalYear,Forecast_Dividend ModifyDate,Forecast_Dividend RecordDate,Forecast_Dividend QuarterlyDividendPerShare,Forecast_Dividend AnnualDividendPerShare
0,2016/01/04,2753,ConsolidatedJP,2015/12,Q3,2016.0,2016/01/04,GB,False,22354.0,2391.0,2466.0,1645.0,21251.0,16962.0,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/04,GB,False,30500.0,3110.0,3200.0,2130.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,45.00,90.00
1,2016/01/04,3353,ConsolidatedJP,2015/11,Q3,2016.0,2016/01/04,GB,False,21550.0,1038.0,1053.0,697.0,24949.0,7145.0,,,,ConsolidatedJP,2016/02,Annual,2016.0,2016/01/04,GB,False,27800.0,1320.0,1310.0,840.0,,,,,,,,,2016/02,Annual,2016.0,2016/01/04,2016/02/29,32.50,65.00
2,2016/01/04,4591,,,,,,,,,,,,,,,,,NonConsolidated,2016/03,Annual,2016.0,2016/01/04,GB,False,119.0,-474.0,-451.0,-452.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,0.00,0.00
3,2016/01/04,6786,,,,,,,,,,,,,,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/04,GB,False,15700.0,,1800.0,1100.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,0.00,0.00
4,2016/01/04,7463,ConsolidatedJP,2015/12,Q3,2016.0,2016/01/04,GB,False,14307.0,4518.0,4136.0,2685.0,38498.0,30980.0,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/04,GB,False,19500.0,6000.0,5660.0,3500.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/04,2016/03/31,18.00,18.00
5,2016/01/05,1376,ConsolidatedJP,2015/11,Q2,2016.0,2016/01/05,GB,False,24881.0,491.0,546.0,354.0,33055.0,15629.0,,,,ConsolidatedJP,2016/05,Annual,2016.0,2016/01/05,GB,False,59000.0,1850.0,1950.0,1200.0,,,,,,,,,2016/05,Annual,2016.0,2016/01/05,2016/05/31,14.00,25.00
6,2016/01/05,2659,ConsolidatedJP,2015/11,Q3,2016.0,2016/01/05,GB,False,128716.0,10206.0,10451.0,6212.0,118788.0,92277.0,,,,ConsolidatedJP,2016/02,Annual,2016.0,2016/01/05,GB,False,167525.0,12941.0,13244.0,7969.0,,,,,,,,,2016/02,Annual,2016.0,2016/01/05,2016/02/29,36.00,36.00
7,2016/01/05,5216,,,,,,,,,,,,,,,,,ConsolidatedJP,2015/12,Annual,2015.0,2016/01/05,GB,False,5800.0,-590.0,-670.0,100.0,,,,,,,,,2015/12,Annual,2015.0,2016/01/05,2015/12/31,0.00,0.00
8,2016/01/05,6788,,,,,,,,,,,,,,,,,ConsolidatedJP,2016/03,Annual,2016.0,2016/01/05,GB,False,15100.0,3070.0,3150.0,1950.0,,,,,,,,,2016/03,Annual,2016.0,2016/01/05,2016/03/31,60.00,60.00
9,2016/01/05,8168,NonConsolidated,2015/11,Q3,2016.0,2016/01/05,GB,False,119395.0,118.0,872.0,-183.0,96633.0,38984.0,,,,NonConsolidated,2016/02,Annual,2016.0,2016/01/05,GB,False,161000.0,950.0,1850.0,200.0,,,,,,,,,2016/02,Annual,2016.0,2016/01/05,2016/02/29,6.25,12.50


In [279]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../current_data")

# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
with open(os.path.join(model_path, "stock_fin.pkl"), "wb") as f:
    # モデルをpickle形式で保存
    pickle.dump(tmp_stock_fin, f)

# Stock Labels

In [280]:
tmp_stock_label = dfs['stock_labels'].copy()

In [281]:
tmp_stock_label.head(10)

Unnamed: 0,base_date,Local Code,label_date_5,label_high_5,label_low_5,label_date_10,label_high_10,label_low_10,label_date_20,label_high_20,label_low_20
0,2016-01-04,1301,2016-01-12,0.01091,-0.04,2016-01-19,0.01091,-0.05455,2016-02-02,0.01091,-0.08727
1,2016-01-05,1301,2016-01-13,0.00362,-0.04348,2016-01-20,0.00362,-0.07609,2016-02-03,0.00362,-0.09058
2,2016-01-06,1301,2016-01-14,0.0,-0.05072,2016-01-21,0.0,-0.08696,2016-02-04,0.00362,-0.09058
3,2016-01-07,1301,2016-01-15,0.01107,-0.03321,2016-01-22,0.01107,-0.0738,2016-02-05,0.02214,-0.0738
4,2016-01-08,1301,2016-01-18,0.01111,-0.03333,2016-01-25,0.01111,-0.07037,2016-02-08,0.02593,-0.07037
5,2016-01-12,1301,2016-01-19,0.02652,-0.01515,2016-01-26,0.02652,-0.04924,2016-02-09,0.04924,-0.04924
6,2016-01-13,1301,2016-01-20,-0.01115,-0.05204,2016-01-27,-0.01115,-0.06691,2016-02-10,0.02974,-0.06691
7,2016-01-14,1301,2016-01-21,0.01141,-0.04183,2016-01-28,0.01521,-0.04563,2016-02-12,0.05323,-0.04943
8,2016-01-15,1301,2016-01-22,-0.00377,-0.05283,2016-01-29,0.03019,-0.05283,2016-02-15,0.04528,-0.0566
9,2016-01-18,1301,2016-01-25,0.0038,-0.04563,2016-02-01,0.05323,-0.04563,2016-02-16,0.05323,-0.04943


In [282]:
tmp_stock_label.tail()

Unnamed: 0,base_date,Local Code,label_date_5,label_high_5,label_low_5,label_date_10,label_high_10,label_low_10,label_date_20,label_high_20,label_low_20
4225436,2020-12-24,9997,,,,,,,,,
4225437,2020-12-25,9997,,,,,,,,,
4225438,2020-12-28,9997,,,,,,,,,
4225439,2020-12-29,9997,,,,,,,,,
4225440,2020-12-30,9997,,,,,,,,,


In [283]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../get_data")

with open(os.path.join(model_path, "stock_labels.pkl"), "rb") as f:
    stock_labels = pickle.load(f)

In [284]:
tmp_stock_label.dtypes

base_date         object
Local Code         int64
label_date_5      object
label_high_5     float64
label_low_5      float64
label_date_10     object
label_high_10    float64
label_low_10     float64
label_date_20     object
label_high_20    float64
label_low_20     float64
dtype: object

In [285]:
stock_labels.dtypes

Local Code       object
base_date        object
label_date_10    object
label_date_20    object
label_date_5     object
label_high_10    object
label_high_20    object
label_high_5     object
label_low_10     object
label_low_20     object
label_low_5      object
dtype: object

In [286]:
stock_labels[['label_high_5',
'label_low_5',
'label_high_10',
'label_low_10',
'label_high_20',
'label_low_20']] = stock_labels[['label_high_5',
'label_low_5',
'label_high_10',
'label_low_10',
'label_high_20',
'label_low_20'
]].apply(pd.to_numeric)

In [287]:
stock_labels = stock_labels.astype(dict(tmp_stock_label.dtypes))

In [288]:
stock_labels = stock_labels[tmp_stock_label.columns]

In [289]:
tmp_stock_label = tmp_stock_label.append(stock_labels)

In [290]:
tmp_stock_label = tmp_stock_label.drop_duplicates(keep='last', subset=["base_date", "Local Code"])

In [291]:
tmp_stock_label[tmp_stock_label.duplicated(keep='last', subset=["base_date", "Local Code"])]

Unnamed: 0,base_date,Local Code,label_date_5,label_high_5,label_low_5,label_date_10,label_high_10,label_low_10,label_date_20,label_high_20,label_low_20


In [292]:
tmp_stock_label = tmp_stock_label.sort_values(['Local Code', 'base_date'])

In [293]:
tmp_stock_label = tmp_stock_label.reset_index(drop=True)

In [294]:
dfs['stock_labels'].head()

Unnamed: 0,base_date,Local Code,label_date_5,label_high_5,label_low_5,label_date_10,label_high_10,label_low_10,label_date_20,label_high_20,label_low_20
0,2016-01-04,1301,2016-01-12,0.01091,-0.04,2016-01-19,0.01091,-0.05455,2016-02-02,0.01091,-0.08727
1,2016-01-05,1301,2016-01-13,0.00362,-0.04348,2016-01-20,0.00362,-0.07609,2016-02-03,0.00362,-0.09058
2,2016-01-06,1301,2016-01-14,0.0,-0.05072,2016-01-21,0.0,-0.08696,2016-02-04,0.00362,-0.09058
3,2016-01-07,1301,2016-01-15,0.01107,-0.03321,2016-01-22,0.01107,-0.0738,2016-02-05,0.02214,-0.0738
4,2016-01-08,1301,2016-01-18,0.01111,-0.03333,2016-01-25,0.01111,-0.07037,2016-02-08,0.02593,-0.07037


In [309]:
tmp_stock_label[(tmp_stock_label['Local Code'] == 9997) & (tmp_stock_label['base_date'] == "2020-12-24")]

Unnamed: 0,base_date,Local Code,label_date_5,label_high_5,label_low_5,label_date_10,label_high_10,label_low_10,label_date_20,label_high_20,label_low_20
4436804,2020-12-24,9997,2021-01-04,0.13659,-0.00683,2021-01-12,0.13659,-0.00683,2021-01-26,0.1639,-0.00683


In [311]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../current_data")

# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
with open(os.path.join(model_path, "stock_labels.pkl"), "wb") as f:
    # モデルをpickle形式で保存
    pickle.dump(tmp_stock_label, f)

In [312]:

dfs = {}
dfs['stock_list'] = pd.read_csv(f"{dataset_dir}/stock_list.csv.gz")

In [315]:

dfs['stock_price'] = tmp_stock_price
dfs['stock_fin'] = tmp_stock_fin
dfs['stock_labels'] = tmp_stock_label

In [321]:
dfs['stock_price'].tail()

Unnamed: 0,Local Code,EndOfDayQuote Date,EndOfDayQuote Open,EndOfDayQuote High,EndOfDayQuote Low,EndOfDayQuote Close,EndOfDayQuote ExchangeOfficialClose,EndOfDayQuote Volume,EndOfDayQuote CumulativeAdjustmentFactor,EndOfDayQuote PreviousClose,EndOfDayQuote PreviousCloseDate,EndOfDayQuote PreviousExchangeOfficialClose,EndOfDayQuote PreviousExchangeOfficialCloseDate,EndOfDayQuote ChangeFromPreviousClose,EndOfDayQuote PercentChangeFromPreviousClose,EndOfDayQuote VWAP
4436861,9997,2021/03/22,1350.0,1350.0,1301.0,1306.0,1306.0,308800.0,1.0,1350.0,2021/03/19,1350.0,2021/03/19,-44.0,-3.259,1319.927
4436862,9997,2021/03/23,1322.0,1337.0,1308.0,1314.0,1314.0,235900.0,1.0,1306.0,2021/03/22,1306.0,2021/03/22,8.0,0.613,1323.588
4436863,9997,2021/03/24,1299.0,1317.0,1252.0,1268.0,1268.0,298500.0,1.0,1314.0,2021/03/23,1314.0,2021/03/23,-46.0,-3.501,1273.23
4436864,9997,2021/03/25,1262.0,1305.0,1262.0,1298.0,1298.0,143600.0,1.0,1268.0,2021/03/24,1268.0,2021/03/24,30.0,2.366,1288.961
4436865,9997,2021/03/26,1312.0,1333.0,1306.0,1317.0,1317.0,174700.0,1.0,1298.0,2021/03/25,1298.0,2021/03/25,19.0,1.464,1319.591


In [5]:
FEATURES = ['MA_gap_2month',
            'MA_gap_3month',
            'volatility_2month',
            'volatility_3month',
            'Result_Dividend FiscalYear',
            'return_3month',
            'Forecast_Dividend FiscalYear',
            'volatility_1month',
            'Forecast_FinancialStatement FiscalYear',
            'MA_gap_1month',
            'pbr',
            'Result_FinancialStatement FiscalYear',
            'return_1month',
            'ema_12',
            'Result_FinancialStatement TotalAssets',
            'signal',
            'Previous_FinancialStatement NetIncome',
            'per',
            'Result_FinancialStatement CashFlowsFromOperatingActivities',
            'Result_FinancialStatement CashFlowsFromInvestingActivities',
            'ema_10']

FEATURES_HIGH = ['MA_gap_2month_high',
                 'MA_gap_3month_high',
                 'volatility_2month_high',
                 'volatility_3month_high',
                 'Result_Dividend FiscalYear',
                 'return_3month_high',
                 'Forecast_Dividend FiscalYear',
                 'volatility_1month_high',
                 'Forecast_FinancialStatement FiscalYear',
                 'MA_gap_1month_high',
                 'pbr',
                 'Result_FinancialStatement FiscalYear',
                 'return_1month_high',
                 'ema_12',
                 'Result_FinancialStatement TotalAssets',
                 'signal',
                 'Previous_FinancialStatement NetIncome',
                 'per',
                 'Result_FinancialStatement CashFlowsFromOperatingActivities',
                 'Result_FinancialStatement CashFlowsFromInvestingActivities',
                 'ema_10']

FEATURES_LOW = ['MA_gap_2month_low',
                'MA_gap_3month_low',
                'volatility_2month_low',
                'volatility_3month_low',
                'Result_Dividend FiscalYear',
                'return_3month_low',
                'Forecast_Dividend FiscalYear',
                'volatility_1month_low',
                'Forecast_FinancialStatement FiscalYear',
                'MA_gap_1month_low',
                'pbr',
                'Result_FinancialStatement FiscalYear',
                'return_1month_low',
                'ema_12',
                'Result_FinancialStatement TotalAssets',
                'signal',
                'Previous_FinancialStatement NetIncome',
                'per',
                'Result_FinancialStatement CashFlowsFromOperatingActivities',
                'Result_FinancialStatement CashFlowsFromInvestingActivities',
                'ema_10']

In [6]:
SELECT_FIN_DATA_COLUMNS = ['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales',
       'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome',
       'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets',
       'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities',
       'Result_FinancialStatement CashFlowsFromFinancingActivities',
       'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear',
       'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome',
       'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome',
       'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare',
       'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear',
       'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDividendPerShare',
       'IssuedShareEquityQuote IssuedShare','Section/Products', '33 Sector(Code)', '17 Sector(Code)']

In [7]:
section_products = {
    "First Section (Domestic)" : 1,
    "JASDAQ(Standard / Domestic)" :2,
    "Second Section(Domestic)" :3,
    "Mothers (Domestic)" : 4,
    "JASDAQ(Growth/Domestic)" :5
}

In [8]:
def calculate_glossary_of_financial_analysis(row):
    operating_profit_margin = 0
    ordinary_profit_margin = 0
    net_profit_margin = 0
    total_asset_turnover = 0
    net_sales_growth_rate = 0
    ordinary_income_growth_rate = 0
    operationg_income_growth_rate = 0
    total_assets_growth_rate = 0
    net_assets_growth_rate = 0
    eps = 0
    bps = 0
    roe = 0

    # 売上高営業利益率 売上高営業利益率（％）＝営業利益÷売上高×100
    if row['Result_FinancialStatement NetSales'] != 0:
        operating_profit_margin = \
            row['Result_FinancialStatement OperatingIncome'] / \
            row['Result_FinancialStatement NetSales'] * 100
    # 売上高経常利益率　売上高経常利益率（％）＝経常利益÷売上高×100
    if row['Result_FinancialStatement NetSales'] != 0:
        ordinary_profit_margin = \
            row['Result_FinancialStatement OrdinaryIncome'] / \
            row['Result_FinancialStatement NetSales'] * 100
    # 売上高純履歴率　売上高純利益率（％）＝当期純利益÷売上高×100
    if row['Result_FinancialStatement NetSales'] != 0:
        net_profit_margin = row['Result_FinancialStatement NetIncome'] / \
                            row['Result_FinancialStatement NetSales'] * 100
    # 総資本回転率 総資本回転率（％）＝売上高÷総資本（自己資本＋他人資本）×100
    if row['Result_FinancialStatement NetAssets'] != 0:
        total_asset_turnover = row['Result_FinancialStatement NetSales'] / \
                            row['Result_FinancialStatement NetAssets'] * 100
    # 売上高増加率
    if row['Previous_FinancialStatement NetSales'] != 0:
        net_sales_growth_rate = \
            (row['Result_FinancialStatement NetSales'] -
            row['Previous_FinancialStatement NetSales']) / \
            row['Previous_FinancialStatement NetSales'] * 100
    # 経常利益増加率
    if row['Previous_FinancialStatement OrdinaryIncome'] != 0:
        ordinary_income_growth_rate = \
            (row['Result_FinancialStatement OrdinaryIncome'] -
            row['Previous_FinancialStatement OrdinaryIncome']) / \
            row['Previous_FinancialStatement OrdinaryIncome'] * 100

    # 営業利益増加率
    if row['Previous_FinancialStatement OperatingIncome'] != 0:
        operationg_income_growth_rate = \
            (row['Result_FinancialStatement OperatingIncome'] -
            row['Previous_FinancialStatement OperatingIncome']) / \
            row['Previous_FinancialStatement OperatingIncome'] * 100
    # 総資本増加率
    if row['Previous_FinancialStatement TotalAssets'] != 0:
        total_assets_growth_rate = \
            (row['Result_FinancialStatement TotalAssets'] -
            row['Previous_FinancialStatement TotalAssets']) / \
            row['Previous_FinancialStatement TotalAssets'] * 100
    # 純資本増加率
    if row['Previous_FinancialStatement NetAssets'] != 0:
        net_assets_growth_rate = \
            (row['Result_FinancialStatement NetAssets'] -
            row['Previous_FinancialStatement NetAssets']) / \
            row['Previous_FinancialStatement NetAssets'] * 100
    # 一株当たり当期純利益（EPS）
    if row['IssuedShareEquityQuote IssuedShare'] != 0:
        eps = row['Result_FinancialStatement NetIncome'] / \
            row['IssuedShareEquityQuote IssuedShare']
        # BPS 一株当たり純資産（円） ＝ 純資産 ÷ 発行済株式総数
        bps = row['Result_FinancialStatement NetAssets'] / \
            row['IssuedShareEquityQuote IssuedShare']
        # ROE EPS（一株当たり利益）÷ BPS（一株当たり純資産）× 100
        if bps > 0:
            roe = eps / bps * 100
    return pd.Series(
        [operating_profit_margin, ordinary_profit_margin,
            net_profit_margin, total_asset_turnover,
            net_sales_growth_rate, ordinary_income_growth_rate,
            operationg_income_growth_rate, total_assets_growth_rate,
            net_assets_growth_rate, eps, bps, roe])

# 特徴量の生成

In [9]:
TRAIN_END = "2021-03-25"
VAL_START = "2020-02-01"
VAL_END = "2021-03-25"
TEST_START = "2021-01-01"

In [10]:
def get_features_for_predict(dfs, code):
    """
    Args:
        dfs (dict)  : dict of pd.DataFrame include stock_fin, stock_price
        code (int)  : A local code for a listed company
    Returns:
        feature DataFrame (pd.DataFrame)
    """
    # おおまかな手順の1つ目
    # stock_finデータを読み込み
    stock_fin = dfs["stock_fin"].copy()
    
    stock_list = dfs["stock_list"].copy()
    stock_fin = pd.merge(stock_fin, stock_list, on=["Local Code"] )

    # 特定の銘柄コードのデータに絞る
    fin_data = stock_fin[stock_fin["Local Code"] == code].copy()
    # 日付列をpd.Timestamp型に変換してindexに設定
    fin_data["datetime"] = pd.to_datetime(fin_data["base_date"])
    fin_data.set_index("datetime", inplace=True)
    # fin_dataを選択
    fin_data = fin_data[SELECT_FIN_DATA_COLUMNS]
    fin_data = fin_data.join(fin_data[['Result_FinancialStatement NetSales', 'Result_FinancialStatement OperatingIncome', 
                                   'Result_FinancialStatement OrdinaryIncome', 'Result_FinancialStatement NetIncome', 
                                   'Result_FinancialStatement TotalAssets', 'Result_FinancialStatement NetAssets',
                                   'Result_FinancialStatement CashFlowsFromOperatingActivities', 
                                   'Result_FinancialStatement CashFlowsFromFinancingActivities',
                                   'Result_FinancialStatement CashFlowsFromInvestingActivities']].rename(columns =
                                                                                                         {'Result_FinancialStatement NetSales': 'Previous_FinancialStatement NetSales',
                                                                                                          'Result_FinancialStatement OperatingIncome': 'Previous_FinancialStatement OperatingIncome', 
                                                                                                          'Result_FinancialStatement OrdinaryIncome': 'Previous_FinancialStatement OrdinaryIncome', 
                                                                                                          'Result_FinancialStatement NetIncome':'Previous_FinancialStatement NetIncome', 
                                                                                                          'Result_FinancialStatement TotalAssets': 'Previous_FinancialStatement TotalAssets', 
                                                                                                          'Result_FinancialStatement NetAssets':'Previous_FinancialStatement NetAssets',
                                                                                                          'Result_FinancialStatement CashFlowsFromOperatingActivities': 'Previous_FinancialStatement CashFlowsFromOperatingActivities', 
                                                                                                          'Result_FinancialStatement CashFlowsFromFinancingActivities':'Previous_FinancialStatement CashFlowsFromFinancingActivities',
                                                                                                          'Result_FinancialStatement CashFlowsFromInvestingActivities':'Previous_FinancialStatement CashFlowsFromInvestingActivities'}).shift(-1))
    fin_data[['operating_profit_margin', 'ordinary_profit_margin', 'net_profit_margin', 'total_asset_turnover',
         'net_sales_growth_rate', 'ordinary_income_growth_rate', 'operationg_income_growth_rate',
          'total_assets_growth_rate', 'net_assets_growth_rate', 'eps', 'bps', 'roe']] = fin_data.apply(calculate_glossary_of_financial_analysis, axis=1)

    # 欠損値処理
    fin_feats = fin_data.fillna(0)

    # おおまかな手順の2つ目
    # stock_priceデータを読み込む
    price = dfs["stock_price"].copy()
    # 特定の銘柄コードのデータに絞る
    price_data = price[price["Local Code"] == code].copy()
    # 日付列をpd.Timestamp型に変換してindexに設定
    price_data["datetime"] = pd.to_datetime(price_data["EndOfDayQuote Date"])
    price_data.set_index("datetime", inplace=True)
    # 終値、最高値、最安値に絞る
    feats = price_data[["EndOfDayQuote ExchangeOfficialClose","EndOfDayQuote High", "EndOfDayQuote Low"]].copy()
    
    # 終値
    # 終値の20営業日リターン
    feats["return_1month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(20)
    # 終値の40営業日リターン
    feats["return_2month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(40)
    # 終値の60営業日リターン
    feats["return_3month"] = feats["EndOfDayQuote ExchangeOfficialClose"].pct_change(60)
    # 終値の20営業日ボラティリティ
    feats["volatility_1month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(20).std()
    )
    # 終値の40営業日ボラティリティ
    feats["volatility_2month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(40).std()
    )
    # 終値の60営業日ボラティリティ
    feats["volatility_3month"] = (
        np.log(feats["EndOfDayQuote ExchangeOfficialClose"]).diff().rolling(60).std()
    )
    # 終値と20営業日の単純移動平均線の乖離
    feats["MA_gap_1month"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote ExchangeOfficialClose"].rolling(20).mean()
    )
    # 終値と40営業日の単純移動平均線の乖離
    feats["MA_gap_2month"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote ExchangeOfficialClose"].rolling(40).mean()
    )
    # 終値と60営業日の単純移動平均線の乖離
    feats["MA_gap_3month"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote ExchangeOfficialClose"].rolling(60).mean()
    )
    
        
    # 最高値
    # 最高値の20営業日リターン
    feats["return_1month_high"] = feats["EndOfDayQuote High"].pct_change(20)
    # 最高値の40営業日リターン
    feats["return_2month_high"] = feats["EndOfDayQuote High"].pct_change(40)
    # 最高値の60営業日リターン
    feats["return_3month_high"] = feats["EndOfDayQuote High"].pct_change(60)
    # 最高値の20営業日ボラティリティ
    feats["volatility_1month_high"] = (
        np.log(feats["EndOfDayQuote High"]).diff().rolling(20).std()
    )
    # 最高値の40営業日ボラティリティ
    feats["volatility_2month_high"] = (
        np.log(feats["EndOfDayQuote High"]).diff().rolling(40).std()
    )
    # 最高値の60営業日ボラティリティ
    feats["volatility_3month_high"] = (
        np.log(feats["EndOfDayQuote High"]).diff().rolling(60).std()
    )
    # 終値と20営業日の単純移動平均線の乖離
    feats["MA_gap_1month_high"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote High"].rolling(20).mean()
    )
    # 終値と40営業日の単純移動平均線の乖離
    feats["MA_gap_2month_high"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote High"].rolling(40).mean()
    )
    # 終値と60営業日の単純移動平均線の乖離
    feats["MA_gap_3month_high"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote High"].rolling(60).mean()
    )
        
    # 最安値
    # 最安値の20営業日リターン
    feats["return_1month_low"] = feats["EndOfDayQuote Low"].pct_change(20)
    # 最安値の40営業日リターン
    feats["return_2month_low"] = feats["EndOfDayQuote Low"].pct_change(40)
    # 最安値の60営業日リターン
    feats["return_3month_low"] = feats["EndOfDayQuote Low"].pct_change(60)
    # 最安値の20営業日ボラティリティ
    feats["volatility_1month_low"] = (
        np.log(feats["EndOfDayQuote Low"]).diff().rolling(20).std()
    )
    # 最安値の40営業日ボラティリティ
    feats["volatility_2month_low"] = (
        np.log(feats["EndOfDayQuote Low"]).diff().rolling(40).std()
    )
    # 最安値の60営業日ボラティリティ
    feats["volatility_3month_low"] = (
        np.log(feats["EndOfDayQuote Low"]).diff().rolling(60).std()
    )
    # 終値と20営業日の単純移動平均線の乖離
    feats["MA_gap_1month_low"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote Low"].rolling(20).mean()
    )
    # 終値と40営業日の単純移動平均線の乖離
    feats["MA_gap_2month_low"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote Low"].rolling(40).mean()
    )
    # 終値と60営業日の単純移動平均線の乖離
    feats["MA_gap_3month_low"] = feats["EndOfDayQuote ExchangeOfficialClose"] / (
        feats["EndOfDayQuote Low"].rolling(60).mean()
    )
    
    # EWMA
    ALPHA = 0.25
    feats["EWMA"] = feats["EndOfDayQuote ExchangeOfficialClose"]

    for t in zip(feats.index, feats.index[1:]):
        feats.loc[t[1], "EWMA"] = ALPHA * feats.loc[t[1], "EndOfDayQuote ExchangeOfficialClose"] + (1 - ALPHA) * feats.loc[t[0], "EWMA"]
    
    # EMA 10日
    feats["ema_10"] = feats["EndOfDayQuote ExchangeOfficialClose"].ewm(span=10).mean()
    
    # MACD 
    # EMA12
    feats["ema_12"] = feats["EndOfDayQuote ExchangeOfficialClose"].ewm(span=12).mean()
    # EMA 26
    feats["ema_26"] = feats["EndOfDayQuote ExchangeOfficialClose"].ewm(span=26).mean()
    feats["macd"] = feats["ema_12"] - feats["ema_26"]
    feats["signal"] = feats["macd"].ewm(span=9).mean()
    
    # PBR 株価 ÷ BPS（1株あたり純資産）
    feats["pbr"] = feats["EndOfDayQuote ExchangeOfficialClose"] / fin_data["bps"]
    # PER 株価 ÷ 1株当たり利益（EPS）
    feats["per"] = feats["EndOfDayQuote ExchangeOfficialClose"] / fin_data["eps"]

    # おおまかな手順の3つ目
    # 欠損値処理
    feats = feats.fillna(0)
    # 元データのカラムを削除
    feats = feats.drop(["EndOfDayQuote ExchangeOfficialClose"], axis=1)

    # 財務データの特徴量とマーケットデータの特徴量のインデックスを合わせる
    feats = feats.loc[feats.index.isin(fin_feats.index)]
    fin_feats = fin_feats.loc[fin_feats.index.isin(feats.index)]

    # データを結合
    feats = pd.concat([feats, fin_feats], axis=1).dropna()

    # 欠損値処理を行います。l
    feats = feats.replace([np.inf, -np.inf], 0)
    
    # 市場・商品区分を数値に変換
    feats["Section/Products"] = section_products[feats["Section/Products"][0]]
    # 銘柄コードを設定
    feats["code"] = code

    return feats

In [11]:
def get_features_and_label(dfs, codes, feature, label):
    """
    Args:
        dfs (dict[pd.DataFrame]): loaded data
        codes  (array) : target codes
        feature (pd.DataFrame): features
        label (str) : label column name
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        val_X (pd.DataFrame): validation data
        val_y (pd.DataFrame): label for val_X
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    # 分割データ用の変数を定義
    trains_X, vals_X, tests_X = [], [], []
    trains_y, vals_y, tests_y = [], [], []

    # 銘柄コード毎に特徴量を作成
    for code in tqdm(codes):
        # 特徴量取得
        feats = feature[feature["code"] == code]

        # stock_labelデータを読み込み
        stock_labels = dfs["stock_labels"].copy()
        # 特定の銘柄コードのデータに絞る
        stock_labels = stock_labels[stock_labels["Local Code"] == code]
        # 日付列をpd.Timestamp型に変換してindexに設定
        stock_labels["datetime"] = pd.to_datetime(stock_labels["base_date"])
        stock_labels.set_index("datetime", inplace=True)

        # 特定の目的変数に絞る
        labels = stock_labels[label]
        # nanを削除
        labels.dropna(inplace=True)

        if feats.shape[0] > 0 and labels.shape[0] > 0:
            # 特徴量と目的変数のインデックスを合わせる
            labels = labels.loc[labels.index.isin(feats.index)]
            feats = feats.loc[feats.index.isin(labels.index)]
            labels.index = feats.index

            # データを分割（ホールドアウト法）
            _train_X = feats[: TRAIN_END].copy()
            _val_X = feats[VAL_START : VAL_END].copy()
            _test_X = feats[TEST_START :].copy()

            _train_y = labels[: TRAIN_END].copy()
            _val_y = labels[VAL_START : VAL_END].copy()
            _test_y = labels[TEST_START :].copy()

            # データを配列に格納 (後ほど結合するため)
            trains_X.append(_train_X)
            vals_X.append(_val_X)
            tests_X.append(_test_X)

            trains_y.append(_train_y)
            vals_y.append(_val_y)
            tests_y.append(_test_y)

    # 銘柄毎に作成した説明変数データを結合します。
    train_X = pd.concat(trains_X)
    val_X = pd.concat(vals_X)
    test_X = pd.concat(tests_X)
    # 銘柄毎に作成した目的変数データを結合します。
    train_y = pd.concat(trains_y)
    val_y = pd.concat(vals_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, val_X, val_y, test_X, test_y

In [12]:
# 対象銘柄コードを定義
codes = [9984]
# 対象の目的変数を定義
label = "label_high_20"
# 特徴量を取得
feat = get_features_for_predict(dfs, codes[0])
# 特徴量と目的変数を入力し、分割データを取得
ret = get_features_and_label(dfs, codes, feat, label)
for v in ret:
    print(v.T)

divide by zero encountered in log
divide by zero encountered in log
divide by zero encountered in log
divide by zero encountered in log
divide by zero encountered in log
divide by zero encountered in log


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


datetime                                              2016-02-10    2016-04-21    2016-05-10    2016-07-28  \
EndOfDayQuote High                                  2.397500e+03  3.115000e+03  2.998500e+03  2.700500e+03   
EndOfDayQuote Low                                   2.231500e+03  2.851000e+03  2.910500e+03  2.672000e+03   
return_1month                                      -1.913212e-01  8.908766e-02  8.688554e-02 -7.610193e-02   
return_2month                                       0.000000e+00  1.441458e-01  2.409844e-02 -1.417147e-01   
return_3month                                       0.000000e+00  2.555166e-01  1.739812e-01 -1.038744e-01   
volatility_1month                                   4.402584e-02  1.717726e-02  2.271106e-02  3.175834e-02   
volatility_2month                                   0.000000e+00  1.705030e-02  1.818629e-02  3.062359e-02   
volatility_3month                                   0.000000e+00  3.428422e-02  3.271347e-02  2.758085e-02   
MA_gap_1m

In [12]:
def get_codes(dfs):
    """
    Args:
        dfs (dict[pd.DataFrame]): loaded data
    Returns:
        array: list of stock codes
    """
    stock_list = dfs["stock_list"].copy()
    # 予測対象の銘柄コードを取得
    codes = stock_list[stock_list["prediction_target"] == True][
        "Local Code"
    ].values
    return codes

In [13]:
# 対象の目的変数を定義
labels = {
    "label_high_5",
    "label_high_10",
    "label_high_20",
    "label_low_5",
    "label_low_10",
    "label_low_20",
}
# 目的変数毎にデータを保存するための変数
train_X, val_X, test_X = {}, {}, {}
train_y, val_y, test_y = {}, {}, {}

# 予測対象銘柄を取得
codes = get_codes(dfs)

# 特徴量を作成
buff = []
for code in tqdm(codes):
    feat = get_features_for_predict(dfs, code)
    buff.append(feat)
feature = pd.concat(buff)

HBox(children=(IntProgress(value=0, max=3523), HTML(value='')))

divide by zero encountered in log
divide by zero encountered in log
divide by zero encountered in log
divide by zero encountered in log
divide by zero encountered in log
divide by zero encountered in log
invalid value encountered in subtract





In [16]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../feature")
# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
with open(os.path.join(model_path, f"high_low_feature.pkl"), "wb") as f:
    # モデルをpickle形式で保存
    pickle.dump(feature, f)

In [14]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../feature")
# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
with open(os.path.join(model_path, f"high_low_feature.pkl"), "rb") as f:
    # モデルをpickle形式で保存
    feature = pickle.load(f)

EOFError: Ran out of input

In [17]:
# 対象の目的変数を定義
labels = {
    "label_high_5",
    "label_high_10",
    "label_high_20",
    "label_low_5",
    "label_low_10",
    "label_low_20",
}
# 目的変数毎にデータを保存するための変数
train_X, val_X, test_X = {}, {}, {}
train_y, val_y, test_y = {}, {}, {}

# 予測対象銘柄を取得
codes = get_codes(dfs)


# 目的変数毎に処理
for label in tqdm(labels):
    # 特徴量と目的変数を取得
    _train_X, _train_y, _val_X, _val_y, _test_X, _test_y = get_features_and_label(dfs, codes, feature, label)
    # 目的変数をキーとして値を保存
    train_X[label] = _train_X
    val_X[label] = _val_X
    test_X[label] = _test_X
    train_y[label] = _train_y
    val_y[label] = _val_y
    test_y[label] = _test_y

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3523), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3523), HTML(value='')))




In [21]:
test_X['label_low_20']

Unnamed: 0_level_0,EndOfDayQuote High,EndOfDayQuote Low,return_1month,return_2month,return_3month,volatility_1month,volatility_2month,volatility_3month,MA_gap_1month,MA_gap_2month,MA_gap_3month,return_1month_high,return_2month_high,return_3month_high,volatility_1month_high,volatility_2month_high,volatility_3month_high,MA_gap_1month_high,MA_gap_2month_high,MA_gap_3month_high,return_1month_low,return_2month_low,return_3month_low,volatility_1month_low,volatility_2month_low,volatility_3month_low,MA_gap_1month_low,MA_gap_2month_low,MA_gap_3month_low,EWMA,ema_10,ema_12,ema_26,macd,signal,pbr,per,Result_FinancialStatement FiscalYear,Result_FinancialStatement NetSales,Result_FinancialStatement OperatingIncome,Result_FinancialStatement OrdinaryIncome,Result_FinancialStatement NetIncome,Result_FinancialStatement TotalAssets,Result_FinancialStatement NetAssets,Result_FinancialStatement CashFlowsFromOperatingActivities,Result_FinancialStatement CashFlowsFromFinancingActivities,Result_FinancialStatement CashFlowsFromInvestingActivities,Forecast_FinancialStatement FiscalYear,Forecast_FinancialStatement NetSales,Forecast_FinancialStatement OperatingIncome,Forecast_FinancialStatement OrdinaryIncome,Forecast_FinancialStatement NetIncome,Result_Dividend FiscalYear,Result_Dividend QuarterlyDividendPerShare,Result_Dividend AnnualDividendPerShare,Forecast_Dividend FiscalYear,Forecast_Dividend QuarterlyDividendPerShare,Forecast_Dividend AnnualDividendPerShare,IssuedShareEquityQuote IssuedShare,Section/Products,33 Sector(Code),17 Sector(Code),Previous_FinancialStatement NetSales,Previous_FinancialStatement OperatingIncome,Previous_FinancialStatement OrdinaryIncome,Previous_FinancialStatement NetIncome,Previous_FinancialStatement TotalAssets,Previous_FinancialStatement NetAssets,Previous_FinancialStatement CashFlowsFromOperatingActivities,Previous_FinancialStatement CashFlowsFromFinancingActivities,Previous_FinancialStatement CashFlowsFromInvestingActivities,operating_profit_margin,ordinary_profit_margin,net_profit_margin,total_asset_turnover,net_sales_growth_rate,ordinary_income_growth_rate,operationg_income_growth_rate,total_assets_growth_rate,net_assets_growth_rate,eps,bps,roe,code
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1


In [345]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../current_data/train_X")
# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
for label in labels:
    with open(os.path.join(model_path, f"train_X_{label}.pkl"), "wb") as f:
        # モデルをpickle形式で保存
        pickle.dump(train_X[label], f)

In [None]:
./../../current_data/train_X

In [26]:
model_path = os.path.join(os.path.dirname("__file__"), "../../../high_low_datas/train_X")

with open(os.path.join(model_path, "train_X_label_high_20.pkl"), "rb") as f:
    train_X = pickle.load(f)


In [27]:
train_X

Unnamed: 0_level_0,EndOfDayQuote High,EndOfDayQuote Low,return_1month,return_2month,return_3month,volatility_1month,volatility_2month,volatility_3month,MA_gap_1month,MA_gap_2month,MA_gap_3month,return_1month_high,return_2month_high,return_3month_high,volatility_1month_high,volatility_2month_high,volatility_3month_high,MA_gap_1month_high,MA_gap_2month_high,MA_gap_3month_high,return_1month_low,return_2month_low,return_3month_low,volatility_1month_low,volatility_2month_low,volatility_3month_low,MA_gap_1month_low,MA_gap_2month_low,MA_gap_3month_low,EWMA,ema_10,ema_12,ema_26,macd,signal,pbr,per,Result_FinancialStatement FiscalYear,Result_FinancialStatement NetSales,Result_FinancialStatement OperatingIncome,Result_FinancialStatement OrdinaryIncome,Result_FinancialStatement NetIncome,Result_FinancialStatement TotalAssets,Result_FinancialStatement NetAssets,Result_FinancialStatement CashFlowsFromOperatingActivities,Result_FinancialStatement CashFlowsFromFinancingActivities,Result_FinancialStatement CashFlowsFromInvestingActivities,Forecast_FinancialStatement FiscalYear,Forecast_FinancialStatement NetSales,Forecast_FinancialStatement OperatingIncome,Forecast_FinancialStatement OrdinaryIncome,Forecast_FinancialStatement NetIncome,Result_Dividend FiscalYear,Result_Dividend QuarterlyDividendPerShare,Result_Dividend AnnualDividendPerShare,Forecast_Dividend FiscalYear,Forecast_Dividend QuarterlyDividendPerShare,Forecast_Dividend AnnualDividendPerShare,IssuedShareEquityQuote IssuedShare,Section/Products,33 Sector(Code),17 Sector(Code),Previous_FinancialStatement NetSales,Previous_FinancialStatement OperatingIncome,Previous_FinancialStatement OrdinaryIncome,Previous_FinancialStatement NetIncome,Previous_FinancialStatement TotalAssets,Previous_FinancialStatement NetAssets,Previous_FinancialStatement CashFlowsFromOperatingActivities,Previous_FinancialStatement CashFlowsFromFinancingActivities,Previous_FinancialStatement CashFlowsFromInvestingActivities,operating_profit_margin,ordinary_profit_margin,net_profit_margin,total_asset_turnover,net_sales_growth_rate,ordinary_income_growth_rate,operationg_income_growth_rate,total_assets_growth_rate,net_assets_growth_rate,eps,bps,roe,code
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
2016-02-05,2700.0,2610.0,-0.022140,0.000000,0.000000,0.016141,0.000000,0.000000,1.000189,0.000000,0.000000,-0.021739,0.000000,0.000000,0.011838,0.000000,0.000000,0.990099,0.000000,0.000000,-0.036900,0.000000,0.000000,0.015371,0.000000,0.000000,1.008180,0.000000,0.000000,2688.307004,2681.597299,2677.841946,2667.429080,10.412866,3.542501,1.236706e+06,2.556041e+07,2016.0,178890.0,2467.0,2688.0,1133.0,114363.0,23417.0,0.0,0.0,0.0,2016.0,229000.0,2600.0,3000.0,2200.0,0.0,0.00,0.0,2016.0,5.00,5.0,10928283.0,1,50,1,226626.0,2433.0,2814.0,1799.0,94608.0,23065.0,2689.0,2482.0,-5114.0,1.379060,1.502599,0.633350,763.932186,-21.063779,-4.477612,1.397452,20.880898,1.526122,0.000104,0.002143,4.838365,1301
2016-05-09,2650.0,2590.0,0.015564,-0.036900,-0.015094,0.010985,0.012129,0.012798,1.002112,0.987701,0.990951,0.023166,-0.025735,-0.018519,0.011438,0.010141,0.010460,0.995044,0.980282,0.983607,0.019685,-0.037175,-0.007663,0.008402,0.009864,0.010211,1.008891,0.993718,0.997643,2602.442067,2606.111189,2607.778338,2617.500014,-9.721676,-8.439268,1.236628e+06,1.585482e+07,2016.0,226626.0,2433.0,2814.0,1799.0,94608.0,23065.0,2689.0,2482.0,-5114.0,2017.0,117000.0,1400.0,1300.0,800.0,2016.0,5.00,5.0,2017.0,50.00,50.0,10928283.0,1,50,1,52206.0,467.0,380.0,551.0,101632.0,22995.0,0.0,0.0,0.0,1.073575,1.241693,0.793819,982.553653,334.099529,640.526316,420.985011,-6.911209,0.304414,0.000165,0.002111,7.799697,1301
2016-08-05,2620.0,2590.0,-0.011407,-0.029851,-0.011407,0.009727,0.011334,0.009773,0.989157,0.991704,0.986343,-0.003802,-0.029630,-0.003802,0.004159,0.006068,0.005316,0.983545,0.984942,0.980454,-0.003846,-0.033582,-0.007663,0.005217,0.007810,0.006702,0.993125,0.997028,0.991546,2617.963653,2622.148580,2623.746750,2627.154812,-3.408063,0.925419,1.235640e+06,5.156722e+07,2017.0,52206.0,467.0,380.0,551.0,101632.0,22995.0,0.0,0.0,0.0,2017.0,117000.0,1400.0,1300.0,800.0,2016.0,5.00,5.0,2017.0,50.00,50.0,10928283.0,1,50,1,109570.0,1171.0,1004.0,1186.0,106554.0,23600.0,0.0,0.0,0.0,0.894533,0.727886,1.055434,227.031963,-52.353746,-62.151394,-60.119556,-4.619254,-2.563559,0.000050,0.002104,2.396173,1301
2016-11-04,2767.0,2686.0,-0.024240,0.025475,0.037308,0.007637,0.008217,0.007867,0.980620,0.986900,0.999926,-0.000722,0.040226,0.056107,0.004086,0.005104,0.005219,0.975830,0.983302,0.996232,-0.026812,0.021293,0.037066,0.007369,0.007379,0.006863,0.985080,0.992986,1.005643,2746.368761,2749.816817,2750.476413,2741.911960,8.564453,15.665648,1.248880e+06,2.485125e+07,2017.0,109570.0,1171.0,1004.0,1186.0,106554.0,23600.0,0.0,0.0,0.0,2017.0,244000.0,3500.0,3300.0,2100.0,2016.0,5.00,5.0,2017.0,50.00,50.0,10928283.0,1,50,1,179975.0,2872.0,2827.0,2449.0,117168.0,25779.0,0.0,0.0,0.0,1.068723,0.916309,1.082413,464.279661,-39.119322,-64.485320,-59.227019,-9.058787,-8.452616,0.000109,0.002160,5.025424,1301
2017-02-10,2750.0,2737.0,0.008450,0.018175,0.044918,0.004767,0.005084,0.005005,1.007839,1.006462,1.013570,0.007326,0.018519,0.039698,0.003496,0.004525,0.004797,1.005016,1.003308,1.010058,0.011082,0.027017,0.042270,0.002808,0.004647,0.004541,1.011087,1.010808,1.017955,2734.894325,2732.835863,2731.796328,2726.608030,5.188298,4.019979,1.163666e+06,1.224914e+07,2017.0,179975.0,2872.0,2827.0,2449.0,117168.0,25779.0,0.0,0.0,0.0,2017.0,244000.0,3500.0,3300.0,2100.0,2016.0,5.00,5.0,2017.0,50.00,50.0,10928283.0,1,50,1,179975.0,2872.0,2827.0,2449.0,117168.0,25779.0,0.0,0.0,0.0,1.595777,1.570774,1.360745,698.145778,0.000000,0.000000,0.000000,0.000000,0.000000,0.000224,0.002359,9.499981,1301
2017-02-17,2833.0,2811.0,0.032517,0.040884,0.069240,0.004474,0.004805,0.005282,1.027431,1.032216,1.037978,0.035075,0.035453,0.063438,0.004172,0.004460,0.005245,1.024748,1.029161,1.034420,0.039187,0.038803,0.066388,0.003888,0.004616,0.004964,1.031105,1.036617,1.042689,2797.794259,2785.374800,2778.987846,2754.294182,24.693664,14.130199,1.198003e+06,1.261059e+07,2017.0,179975.0,2872.0,2827.0,2449.0,117168.0,25779.0,0.0,0.0,0.0,2017.0,244000.0,3500.0,3300.0,2100.0,2016.0,5.00,5.0,2017.0,60.00,60.0,10928283.0,1,50,1,236561.0,3723.0,3709.0,2422.0,97391.0,25391.0,601.0,105.0,-1998.0,1.595777,1.570774,1.360745,698.145778,-23.920257,-23.779995,-22.857910,20.306805,1.528101,0.000224,0.002359,9.499981,1301
2017-05-11,3040.0,2972.0,0.038961,-0.030303,0.107468,0.007895,0.011378,0.010606,1.039352,1.014238,1.018278,0.031558,-0.042520,0.105455,0.006482,0.008411,0.008829,1.035176,1.007365,1.012175,0.019554,-0.044373,0.085860,0.005346,0.009750,0.009402,1.046616,1.020648,1.025324,2980.443769,2968.241454,2963.143921,2958.097558,5.046363,-11.256664,1.308416e+06,1.371675e+07,2017.0,236561.0,3723.0,3709.0,2422.0,97391.0,25391.0,601.0,105.0,-1998.0,2018.0,250000.0,4000.0,4000.0,2700.0,2017.0,60.00,60.0,2018.0,50.00,50.0,10928283.0,1,50,1,56844.0,979.0,1103.0,754.0,107422.0,25560.0,0.0,0.0,0.0,1.573801,1.567883,1.023837,931.672640,316.158258,236.264733,280.286006,-9.337938,-0.661189,0.000222,0.002323,9.538813,1301
2017-08-04,3230.0,3100.0,0.033708,0.042071,0.059211,0.009633,0.009569,0.009329,1.034621,1.033542,1.037678,0.022152,0.036918,0.062500,0.009424,0.008764,0.008477,1.030318,1.028097,1.032051,0.000000,0.014730,0.043069,0.004305,0.007394,0.008185,1.042746,1.041523,1.046309,3140.221997,3132.044277,3128.480304,3116.072097,12.408207,5.478823,1.376724e+06,4.666986e+07,2018.0,56844.0,979.0,1103.0,754.0,107422.0,25560.0,0.0,0.0,0.0,2018.0,250000.0,4000.0,4000.0,2700.0,2017.0,60.00,60.0,2018.0,50.00,50.0,10928283.0,1,50,1,120458.0,2246.0,2396.0,1633.0,119806.0,26692.0,0.0,0.0,0.0,1.722257,1.940398,1.326437,222.394366,-52.810108,-53.964942,-56.411398,-10.336711,-4.240971,0.000069,0.002339,2.949922,1301
2017-11-06,3900.0,3655.0,0.072423,0.180982,0.177370,0.014065,0.014092,0.013679,1.055735,1.106560,1.126115,0.075862,0.192661,0.190840,0.015384,0.014202,0.013536,1.047762,1.098431,1.117724,0.045780,0.140406,0.126348,0.009355,0.010970,0.010810,1.072498,1.121468,1.139503,3709.437142,3686.664971,3673.771876,3592.130987,81.640890,76.603400,1.576273e+06,2.576478e+07,2018.0,120458.0,2246.0,2396.0,1633.0,119806.0,26692.0,0.0,0.0,0.0,2018.0,250000.0,4000.0,4000.0,2700.0,2017.0,60.00,60.0,2018.0,50.00,50.0,10928283.0,1,50,1,198323.0,3863.0,4065.0,2784.0,124543.0,28204.0,0.0,0.0,0.0,1.864550,1.989075,1.355659,451.288776,-39.261709,-41.057811,-41.858659,-3.803506,-5.360942,0.000149,0.002442,6.117938,1301
2016-02-05,624.0,580.0,-0.076562,0.000000,0.000000,0.029858,0.000000,0.000000,0.966160,0.000000,0.000000,-0.063063,0.000000,0.000000,0.026422,0.000000,0.000000,0.942058,0.000000,0.000000,-0.088050,0.000000,0.000000,0.026956,0.000000,0.000000,0.985000,0.000000,0.000000,610.589506,611.252069,611.544827,613.613557,-2.068730,-2.574485,1.612631e+06,1.643345e+07,2016.0,486719.0,17341.0,18899.0,11236.0,475504.0,114500.0,0.0,0.0,0.0,2016.0,640000.0,18000.0,20000.0,11500.0,0.0,0.00,0.0,2016.0,2.00,4.0,312430277.0,1,50,1,637164.0,19442.0,20696.0,12307.0,445707.0,114030.0,37395.0,-23141.0,-17051.0,3.562836,3.882939,2.308519,425.082096,-23.611660,-8.682837,-10.806501,6.685334,0.412172,0.000036,0.000366,9.813100,1332


In [18]:
with open(os.path.join(model_path, "train_X"), "rb") as f:
    train_X = pickle.load(f)
with open(os.path.join(model_path, "train_y"), "rb") as f:
    train_y = pickle.load(f)

In [19]:
with open(os.path.join(model_path, "val_X"), "rb") as f:
    val_X = pickle.load(f)
with open(os.path.join(model_path, "val_y"), "rb") as f:
    val_y = pickle.load(f)

In [20]:
# 目的変数を指定
label = "label_high_20"

# 学習用データセット定義
# ファンダメンタル情報
#fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
fundamental_cols = pd.Index(SELECT_FIN_DATA_COLUMNS)
fundamental_cols = fundamental_cols[fundamental_cols != "Result_Dividend DividendPayableDate"]
fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
# 価格変化率
returns_cols = [x for x in train_X[label].columns if "return" in x]
# テクニカル
technical_cols = [x for x in train_X[label].columns if (x not in fundamental_cols) and (x != "code")]

columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}
# 学習用データセットを指定
col = "fundamental+technical"

In [None]:
'''reg_cv = GridSearchCV(xgb_model, {
    "eta": [0.1], 
    "gamma": [0.1,0.2,0.3,0.4,0.5],
    "n_estimators": [50, 100, 200], 
    "max_depth": [5, 7, 9,10,20,30],
    "subsample":[0.6,0.8,1],
    "colsample_bytree": [0.5,0.7,0.9],
}, verbose=1)
'''

In [None]:
'''
{'colsample_bytree': 0.5, 'eta': 0.1, 'gamma': 0.5, 'max_depth': 5, 'n_estimators': 50, 'subsample': 1}
0.07780464612358796
'''

In [53]:
# ライブラリインポート
from sklearn.model_selection import GridSearchCV

# モデル定義
model = XGBRegressor()

# ハイパーパラメータ探索
reg_cv = GridSearchCV(model, {
    "eta": [0.1], 
    "gamma": [0.4, 0.5],
    "max_depth": [5],
    "n_estimators": [50], 
    "subsample":[1],
    "colsample_bytree": [0.5],
}, verbose=1)

# 訓練実施
reg_cv.fit(train_X[label][columns[col]].values, train_y[label])

You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.6s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
       colsample_bynode=None, colsample_bytree=None, gamma=None,
       gpu_id=None, importance_type='gain', interaction_constraints=None,
       learning_rate=None, max_delta_step=None, max_depth=None,
       min_child_we..._pos_weight=None, subsample=None,
       tree_method=None, validate_parameters=None, verbosity=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'eta': [0.1], 'gamma': [0.4, 0.5], 'max_depth': [5], 'n_estimators': [50], 'subsample': [1], 'colsample_bytree': [0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [54]:
# 結果を表示
print(reg_cv.best_params_)
print(reg_cv.best_score_)

{'colsample_bytree': 0.5, 'eta': 0.1, 'gamma': 0.5, 'max_depth': 5, 'n_estimators': 50, 'subsample': 1}
0.07780464612358796


In [55]:
best_model = reg_cv.best_estimator_

In [56]:
# 目的変数を指定
label = "label_high_20"

# 学習用データセット定義
# ファンダメンタル情報
#fundamental_cols = dfs["stock_fin"].select_dtypes("float64").columns
fundamental_cols = pd.Index(SELECT_FIN_DATA_COLUMNS)
fundamental_cols = fundamental_cols[fundamental_cols != "Result_Dividend DividendPayableDate"]
fundamental_cols = fundamental_cols[fundamental_cols != "Local Code"]
# 価格変化率
returns_cols = [x for x in train_X[label].columns if "return" in x]
# テクニカル
technical_cols = [x for x in train_X[label].columns if (x not in fundamental_cols) and (x != "code")]

columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}
# 学習用データセットを指定
col = "fundamental+technical"

# 学習
#pred_model = models[model](reg_cv.best_estimator_, random_state=0)
best_model.fit(train_X[label][columns[col]].values, train_y[label])


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, eta=0.1, gamma=0.5,
       gpu_id=-1, importance_type='gain', interaction_constraints='',
       learning_rate=0.100000001, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=50, n_jobs=6, num_parallel_tree=1,
       objective='reg:squarederror', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [57]:
columns[col]

['Result_FinancialStatement FiscalYear',
 'Result_FinancialStatement NetSales',
 'Result_FinancialStatement OperatingIncome',
 'Result_FinancialStatement OrdinaryIncome',
 'Result_FinancialStatement NetIncome',
 'Result_FinancialStatement TotalAssets',
 'Result_FinancialStatement NetAssets',
 'Result_FinancialStatement CashFlowsFromOperatingActivities',
 'Result_FinancialStatement CashFlowsFromFinancingActivities',
 'Result_FinancialStatement CashFlowsFromInvestingActivities',
 'Forecast_FinancialStatement FiscalYear',
 'Forecast_FinancialStatement NetSales',
 'Forecast_FinancialStatement OperatingIncome',
 'Forecast_FinancialStatement OrdinaryIncome',
 'Forecast_FinancialStatement NetIncome',
 'Result_Dividend FiscalYear',
 'Result_Dividend QuarterlyDividendPerShare',
 'Result_Dividend AnnualDividendPerShare',
 'Forecast_Dividend FiscalYear',
 'Forecast_Dividend QuarterlyDividendPerShare',
 'Forecast_Dividend AnnualDividendPerShare',
 'IssuedShareEquityQuote IssuedShare',
 'Section/Pr

In [58]:
test_X[label][columns[col]].T.shape

(63, 32515)

In [59]:
train_X[label][columns[col]].T.shape

(63, 31752)

In [60]:
val_X[label][columns[col]].T.shape

(63, 14585)

In [61]:
test_y[label].shape

(32515,)

In [62]:
train_y[label].shape

(31752,)

In [63]:
val_y[label].shape

(14585,)

In [27]:
# 予測
result = {}
result[label] = pd.DataFrame(
    best_model.predict(val_X[label][columns[col]]), columns=["predict"]
)

# 予測結果に日付と銘柄コードを追加
result[label]["datetime"] = val_X[label][columns[col]].index
result[label]["code"] = val_X[label]["code"].values

# 予測の符号を取得
result[label]["predict_dir"] = np.sign(result[label]["predict"])

# 実際の値を追加
result[label]["actual"] = val_y[label].values

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62'] ['Result_FinancialStatement FiscalYear', 'Result_FinancialStatement NetSales', 'Result_FinancialStatement OperatingIncome', 'Result_FinancialStatement OrdinaryIncome', 'Result_FinancialStatement NetIncome', 'Result_FinancialStatement TotalAssets', 'Result_FinancialStatement NetAssets', 'Result_FinancialStatement CashFlowsFromOperatingActivities', 'Result_FinancialStatement CashFlowsFromFinancingActivities', 'Result_FinancialStatement CashFlowsFromInvestingActivities', 'Forecast_FinancialStatement FiscalYear', 'Forecast_FinancialStatement NetSales', 'Forecast_FinancialStatement OperatingIncome', 'Forecast_FinancialStatement OrdinaryIncome', 'Forecast_FinancialStatement NetIncome', 'Result_Dividend FiscalYear', 'Result_Dividend QuarterlyDividendPerShare', 'Result_Dividend AnnualDividendPerShare', 'Forecast_Dividend FiscalYear', 'Forecast_Dividend QuarterlyDividendPerShare', 'Forecast_Dividend AnnualDividendPerShare', 'IssuedShareEquityQuote IssuedShare', 'Section/Products', '33 Sector(Code)', '17 Sector(Code)', 'return_1month', 'return_2month', 'return_3month', 'volatility_1month', 'volatility_2month', 'volatility_3month', 'MA_gap_1month', 'MA_gap_2month', 'MA_gap_3month', 'EWMA', 'ema_10', 'ema_12', 'ema_26', 'macd', 'signal', 'pbr', 'per', 'Previous_FinancialStatement NetSales', 'Previous_FinancialStatement OperatingIncome', 'Previous_FinancialStatement OrdinaryIncome', 'Previous_FinancialStatement NetIncome', 'Previous_FinancialStatement TotalAssets', 'Previous_FinancialStatement NetAssets', 'Previous_FinancialStatement CashFlowsFromOperatingActivities', 'Previous_FinancialStatement CashFlowsFromFinancingActivities', 'Previous_FinancialStatement CashFlowsFromInvestingActivities', 'operating_profit_margin', 'ordinary_profit_margin', 'net_profit_margin', 'total_asset_turnover', 'net_sales_growth_rate', 'ordinary_income_growth_rate', 'operationg_income_growth_rate', 'total_assets_growth_rate', 'net_assets_growth_rate', 'eps', 'bps', 'roe']
expected f7, f30, f28, f38, f16, f18, f6, f39, f40, f44, f14, f35, f31, f12, f55, f11, f60, f24, f5, f13, f41, f4, f54, f0, f57, f8, f21, f10, f42, f23, f48, f47, f52, f53, f61, f15, f43, f3, f51, f9, f29, f62, f59, f34, f22, f32, f49, f19, f50, f37, f26, f36, f45, f25, f17, f20, f58, f2, f46, f33, f27, f56, f1 in input data
training data did not have the following fields: Result_FinancialStatement NetAssets, signal, per, roe, Result_FinancialStatement NetSales, Forecast_Dividend QuarterlyDividendPerShare, net_sales_growth_rate, macd, Forecast_Dividend FiscalYear, Previous_FinancialStatement NetAssets, Previous_FinancialStatement CashFlowsFromOperatingActivities, MA_gap_3month, EWMA, 33 Sector(Code), Result_FinancialStatement OperatingIncome, net_assets_growth_rate, ordinary_income_growth_rate, volatility_2month, return_3month, ema_12, Previous_FinancialStatement TotalAssets, bps, Result_FinancialStatement FiscalYear, ema_26, total_assets_growth_rate, pbr, Result_FinancialStatement NetIncome, Previous_FinancialStatement NetSales, MA_gap_1month, Result_FinancialStatement CashFlowsFromOperatingActivities, Forecast_FinancialStatement OperatingIncome, Result_Dividend AnnualDividendPerShare, Previous_FinancialStatement OrdinaryIncome, MA_gap_2month, operating_profit_margin, 17 Sector(Code), Forecast_FinancialStatement NetSales, net_profit_margin, Previous_FinancialStatement OperatingIncome, Result_FinancialStatement CashFlowsFromInvestingActivities, Forecast_Dividend AnnualDividendPerShare, Previous_FinancialStatement CashFlowsFromFinancingActivities, Previous_FinancialStatement CashFlowsFromInvestingActivities, eps, return_2month, volatility_1month, Section/Products, Forecast_FinancialStatement FiscalYear, IssuedShareEquityQuote IssuedShare, Forecast_FinancialStatement OrdinaryIncome, Result_Dividend QuarterlyDividendPerShare, operationg_income_growth_rate, ordinary_profit_margin, volatility_3month, total_asset_turnover, Forecast_FinancialStatement NetIncome, ema_10, Previous_FinancialStatement NetIncome, return_1month, Result_Dividend FiscalYear, Result_FinancialStatement TotalAssets, Result_FinancialStatement CashFlowsFromFinancingActivities, Result_FinancialStatement OrdinaryIncome

In [None]:
pred_model.fit(train_X[label][columns[col]].values, train_y[label])

In [None]:
pred_model.feature_importances_

# Submit Model

In [None]:
model_path = os.path.join(os.path.dirname("__file__"), "../model")
# tag::save_model[]
# モデル保存先ディレクトリを作成
os.makedirs(model_path, exist_ok=True)
with open(os.path.join(model_path, f"my_model_{label}.pkl"), "wb") as f:
    # モデルをpickle形式で保存
    pickle.dump(pred_model, f)


In [None]:
sns.jointplot(data=result[label], x="predict", y="actual")

In [None]:
pred_model

In [None]:
# 学習済みモデルを指定
rf = pred_model

# 重要度順を取得
sorted_idx = rf.feature_importances_.argsort()
# プロット
fig, ax = plt.subplots(figsize=(8, 8))
ax.barh(fundamental_cols[sorted_idx], rf.feature_importances_[sorted_idx])
ax.set_xlabel("Random Forest Feature Importance")

In [None]:
# モデルを定義します
sample_model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(train_X["label_high_20"], label=train_y["label_high_20"]), 100)

In [None]:
shap.initjs()
explainer = shap.TreeExplainer(model=sample_model, feature_perturbation='tree_path_dependent', model_output='margin')
# SHAP値
shap_values = explainer.shap_values(X=train_X["label_high_20"])
# プロット
shap.summary_plot(shap_values, train_X["label_high_20"], plot_type="bar")

In [None]:
shap.summary_plot(shap_values, train_X["label_high_20"])

In [None]:
# モデルを定義
models = {
    "rf": RandomForestRegressor,
    "extraTree": ExtraTreesRegressor,
    "gbr": GradientBoostingRegressor,
}

# 学習用データセット定義
columns = {
    "fundamental_only": fundamental_cols,
    "return_only": returns_cols,
    "technical_only": technical_cols,
    "fundamental+technical": list(fundamental_cols) + list(technical_cols),
}

# 結果保存用
all_results = dict()
# モデル毎に処理
for model in tqdm(models.keys()):
    all_results[model] = dict()
    # データセット毎に処理
    for col in columns.keys():
        result = dict()
        # 目的変数毎に処理
        for label in tqdm(labels):
            if len(test_X[label][columns[col]]) > 0:
                # モデル取得
                pred_model = models[model](random_state=0)
                # 学習
                pred_model.fit(train_X[label][columns[col]].values, train_y[label])
                # 結果データ作成
                result[label] = test_X[label][["code"]].copy()
                result[label]["datetime"] = test_X[label][columns[col]].index
                # 予測
                result[label]["predict"] = pred_model.predict(test_X[label][columns[col]])
                result[label]["predict_dir"] = np.sign(result[label]["predict"])
                # 実際の結果
                result[label]["actual"] = test_y[label].values
                result[label]["actual_dir"] = np.sign(result[label]["actual"])
                result[label].dropna(inplace=True)

        all_results[model][col] = result

In [None]:
results = []
for model in all_results.keys():
    for col in all_results[model]:
        tmp = pd.concat(all_results[model][col])
        tmp["model"] = model
        tmp["feature"] = col
        results.append(tmp)
results = pd.concat(results)
results["label"] = [x[0] for x in results.index]
results.head(5)

In [None]:
# 結果保存用変数
all_metrics = []

# データセット毎に処理
for feature in columns:
    matrix = dict()
    # モデル毎に処理
    for model in models:
        # 目的変数毎に処理
        for label in labels:
            # 処理対象データに絞り込み
            tmp_df = results[(results["model"] == model) & (results["label"] == label) & (results["feature"] == feature)]
            # RMSE
            rmse = np.sqrt(mean_squared_error(tmp_df["predict"], tmp_df["actual"]))
            # 精度
            accuracy = accuracy_score(tmp_df["predict_dir"], tmp_df["actual_dir"])
            # 相関係数
            corr = np.corrcoef(tmp_df["actual"], tmp_df["predict"])[0, 1]
            # 順位相関
            spearman_corr = spearmanr(tmp_df["actual"], tmp_df["predict"])[0]
            # 結果を保存
            matrix[label] = [rmse, accuracy, spearman_corr,corr, corr**2, feature, model, tmp_df.shape[0]]
        res = pd.DataFrame.from_dict(matrix).T
        res.columns = ["RMSE","accuracy","spearman_corr","corr","R^2 score","feature", "model", "# of samples"]
        all_metrics.append(res)
all_metrics = pd.concat(all_metrics)
all_metrics.reset_index()