# Imports & Load Data
作業に必要なライブラリをインポートして、以下のデータを読み込みます。

* stock_price : 株価情報
* stock_list : 銘柄情報
* stock_fin : 財務諸表
* stock_labels : 目的変数

In [5]:
import os
import pickle
import sys
import warnings
from glob import glob

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import  xgboost as xgb
from scipy.stats import spearmanr
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.metrics import accuracy_score, mean_squared_error
from tqdm.auto import tqdm


# 表示用の設定を変更します
%matplotlib inline
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

In [1]:
# Print the interpreter version so it can be compared with Python 3.7.3,
# the version shown in this cell's saved output. Nothing is enforced here.
# `sys` comes from the import cell at the top of the notebook, so it is not
# imported a second time.
print(sys.version)

3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]


In [2]:
# Directory that holds the dataset files.
# Replace the placeholder inside the quotes with the path for your environment.
dataset_dir = "/path/to"

In [3]:
# Map each dataset name to the path of its file under dataset_dir.
inputs = dict(stock_price=f"{dataset_dir}/stock_price.csv.gz")

In [7]:
# Load the stock price table and preview its first row.
df = pd.read_csv(filepath_or_buffer=inputs["stock_price"])
df.iloc[:1]

Unnamed: 0,Local Code,EndOfDayQuote Date,EndOfDayQuote Open,EndOfDayQuote High,EndOfDayQuote Low,EndOfDayQuote Close,EndOfDayQuote ExchangeOfficialClose,EndOfDayQuote Volume,EndOfDayQuote CumulativeAdjustmentFactor,EndOfDayQuote PreviousClose,EndOfDayQuote PreviousCloseDate,EndOfDayQuote PreviousExchangeOfficialClose,EndOfDayQuote PreviousExchangeOfficialCloseDate,EndOfDayQuote ChangeFromPreviousClose,EndOfDayQuote PercentChangeFromPreviousClose,EndOfDayQuote VWAP
0,1301,2016/01/04,2800.0,2820.0,2740.0,2750.0,2750.0,32000.0,0.1,2770.0,2015/12/30,2770.0,2015/12/30,-20.0,-0.722,2778.25


In [8]:
# Parse the "EndOfDayQuote Date" strings into timestamps and use them as the
# index (named "datetime"); the original string column is kept as is.
df = df.assign(datetime=pd.to_datetime(df["EndOfDayQuote Date"])).set_index("datetime")
df.iloc[:1]

Unnamed: 0_level_0,Local Code,EndOfDayQuote Date,EndOfDayQuote Open,EndOfDayQuote High,EndOfDayQuote Low,EndOfDayQuote Close,EndOfDayQuote ExchangeOfficialClose,EndOfDayQuote Volume,EndOfDayQuote CumulativeAdjustmentFactor,EndOfDayQuote PreviousClose,EndOfDayQuote PreviousCloseDate,EndOfDayQuote PreviousExchangeOfficialClose,EndOfDayQuote PreviousExchangeOfficialCloseDate,EndOfDayQuote ChangeFromPreviousClose,EndOfDayQuote PercentChangeFromPreviousClose,EndOfDayQuote VWAP
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2016-01-04,1301,2016/01/04,2800.0,2820.0,2740.0,2750.0,2750.0,32000.0,0.1,2770.0,2015/12/30,2770.0,2015/12/30,-20.0,-0.722,2778.25


In [9]:
def rolling_return_volatility(close: pd.Series, window: int = 20) -> pd.Series:
    """Rolling standard deviation of simple returns for one price series.

    Args:
        close: Prices of a single "Local Code". Rows are assumed to be in
            date order, as the original computation also assumed.
        window: Number of consecutive returns per window; windows with fewer
            than `window` valid returns yield NaN (pandas default).

    Returns:
        A Series of the same length as `close`.
    """
    return close.pct_change().rolling(window).std()


# Compute the 20-row volatility of the official close inside each code.
# Both the returns and the rolling window are evaluated per group, so a
# window can never span two different codes regardless of how the rows of
# the frame are ordered. (Previously only pct_change() was grouped and the
# rolling window ran over the whole column, which is only correct while each
# code's rows are stored contiguously.)
# The result is kept as an (n_rows, 1) array, aligned row-for-row with df.
feats = (
    df.groupby("Local Code")["EndOfDayQuote ExchangeOfficialClose"]
    .transform(rolling_return_volatility)
    .to_frame()
    .values
)
feats[:21]


array([[       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [       nan],
       [0.01590696]])

In [10]:
# Build the row identifier "<YYYY-MM-DD>-<Local Code>" from the datetime
# index and the "Local Code" column, then show the first row transposed.
df["code"] = df.index.strftime("%Y-%m-%d-") + df["Local Code"].astype(str)
df.iloc[:1].transpose()


datetime,2016-01-04 00:00:00
Local Code,1301
EndOfDayQuote Date,2016/01/04
EndOfDayQuote Open,2800
EndOfDayQuote High,2820
EndOfDayQuote Low,2740
EndOfDayQuote Close,2750
EndOfDayQuote ExchangeOfficialClose,2750
EndOfDayQuote Volume,32000
EndOfDayQuote CumulativeAdjustmentFactor,0.1
EndOfDayQuote PreviousClose,2770


In [11]:
# Store the same feature values in both label columns.
for label_column in ("label_high_20", "label_low_20"):
    df.loc[:, label_column] = feats

In [12]:
# Columns written to the output, in this order.
output_columns = [
    "code",
    "label_high_20",
    "label_low_20",
]

In [14]:
# Write the output columns for rows dated 2020-01-01 or later (rows with
# missing values dropped) as header-less CSV into an in-memory buffer, then
# print the first 10 lines as a preview.
import io

# Select the rows with a boolean mask instead of df.loc["2020-01-01":].
# The datetime index repeats the same dates for every code, so it is not
# monotonic, and 2020-01-01 does not appear to be present in it (the first
# row of the saved output is 2020-01-06). String slicing on such an index
# raises KeyError on pandas >= 2.0. The mask keeps the same rows in the
# same order.
from_2020 = df.index >= pd.Timestamp("2020-01-01")

out = io.StringIO()
df.loc[from_2020, output_columns].dropna().to_csv(out, header=False, index=False)

print("\n".join(out.getvalue().split("\n")[:10]))

2020-01-06-1301,0.003585883791862692,0.003585883791862692
2020-01-07-1301,0.004547330404501981,0.004547330404501981
2020-01-08-1301,0.00453195173316532,0.00453195173316532
2020-01-09-1301,0.004999748259166511,0.004999748259166511
2020-01-10-1301,0.004999647093872243,0.004999647093872243
2020-01-14-1301,0.005258378653780425,0.005258378653780425
2020-01-15-1301,0.0052927619941197995,0.0052927619941197995
2020-01-16-1301,0.005278015255069897,0.005278015255069897
2020-01-17-1301,0.005145885202200792,0.005145885202200792
2020-01-20-1301,0.0051441150911965775,0.0051441150911965775
