In [1]:
import pandas as pd
import numpy as np 
# データ可視化のライブラリ
import matplotlib.pyplot as plt
%matplotlib inline
# 機械学習ライブラリ
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# LightGBM関連
import lightgbm as lgb
import japanize_matplotlib
import scipy
from numpy.random import randn #正規乱数
# graphvizのインポート
import graphviz
#grid searchとcross validation用
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [48]:
# CSVファイルの読み込み。2017-2019の２年分→2010-2021年の10年分に変更してみた
#stooqのデータにはvolumeがないので、仮値として100,000を入れてある
df = pd.read_csv('usdjpy_d_2010-2021_2.csv')
# 最後の5行を確認
df.tail()

Unnamed: 0,日付,始値,高値,安値,終値
95,2021年05月10日,108.582,109.052,108.464,108.806
96,2021年05月11日,108.799,108.979,108.344,108.614
97,2021年05月12日,108.612,109.704,108.533,109.639
98,2021年05月13日,109.639,109.783,109.4,109.434
99,2021年05月14日,109.394,109.651,109.184,109.324


In [49]:
df = df.rename(columns={'日付': 'time','始値': 'open', '終値': 'close','安値': 'low','高値': 'high',})
print(df)

           time     open     high      low    close
0   2020年12月25日  103.551  103.669  103.492  103.594
1   2020年12月28日  103.450  103.894  103.399  103.758
2   2020年12月29日  103.764  103.800  103.458  103.523
3   2020年12月30日  103.523  103.580  102.957  103.147
4   2020年12月31日  103.140  103.312  102.991  103.304
..          ...      ...      ...      ...      ...
95  2021年05月10日  108.582  109.052  108.464  108.806
96  2021年05月11日  108.799  108.979  108.344  108.614
97  2021年05月12日  108.612  109.704  108.533  109.639
98  2021年05月13日  109.639  109.783  109.400  109.434
99  2021年05月14日  109.394  109.651  109.184  109.324

[100 rows x 5 columns]


In [50]:
# 翌日終値 - 当日終値で差分を計算
#shift(-1)でcloseを上に1つずらす
df['close+1'] = df.close.shift(-1)
df['diff'] = df['close+1'] - df['close']
#最終日はclose+1がNaNになるので削る
df = df[:-1]
print(df)

           time     open     high      low    close  close+1   diff
0   2020年12月25日  103.551  103.669  103.492  103.594  103.758  0.164
1   2020年12月28日  103.450  103.894  103.399  103.758  103.523 -0.235
2   2020年12月29日  103.764  103.800  103.458  103.523  103.147 -0.376
3   2020年12月30日  103.523  103.580  102.957  103.147  103.304  0.157
4   2020年12月31日  103.140  103.312  102.991  103.304  103.115 -0.189
..          ...      ...      ...      ...      ...      ...    ...
94  2021年05月07日  109.087  109.283  108.331  108.573  108.806  0.233
95  2021年05月10日  108.582  109.052  108.464  108.806  108.614 -0.192
96  2021年05月11日  108.799  108.979  108.344  108.614  109.639  1.025
97  2021年05月12日  108.612  109.704  108.533  109.639  109.434 -0.205
98  2021年05月13日  109.639  109.783  109.400  109.434  109.324 -0.110

[99 rows x 7 columns]


In [51]:
# 上昇と下降のデータ割合を確認
m = len(df['close'])
#df['diff']>0で全行に対してtrueかfalseで返してくれる。df[(df['diff'] > 0)]でdff>0に絞って全てのカラムを出力
print(len(df[(df['diff'] > 0)]) / m * 100)
print(len(df[(df['diff'] < 0)]) / m * 100)

52.52525252525253
47.474747474747474


In [52]:
df.loc[df['diff'] > 0, "target"] = 1
df.loc[df['diff'] < 0, "target"] = 0
#↑修正が必要？
# 不要なカラムを削除
del df['close+1']
del df['time']
# カラムの並び替え
df = df[['target', 'open', 'high', 'low', 'close']]
# 最初の5行を出力
df.head()

Unnamed: 0,target,open,high,low,close
0,1.0,103.551,103.669,103.492,103.594
1,0.0,103.45,103.894,103.399,103.758
2,0.0,103.764,103.8,103.458,103.523
3,1.0,103.523,103.58,102.957,103.147
4,0.0,103.14,103.312,102.991,103.304


In [38]:
#targetに1と0以外の数字が表示されてしまう
#ちなみにここまでコピペ

In [53]:
#移動平均の計算、5日、25日、50日、75日
#ついでにstdも計算する。（=ボリンジャーバンドと同等の情報を持ってる）
#75日分のデータ確保
for i in range(1, 75):
    df['close-'+str(i)] = df.close.shift(+i)
#移動平均の値とstdを計算する, skipnaの設定で一つでもNanがあるやつはNanを返すようにする
nclose = 5    
df['MA5'] = df.iloc[:, np.arange(nclose, nclose+5)].mean(axis='columns', skipna=False)
df['MA25'] = df.iloc[:, np.arange(nclose, nclose+25)].mean(axis='columns', skipna=False)
df['MA50'] = df.iloc[:, np.arange(nclose, nclose+50)].mean(axis='columns', skipna=False)
df['MA75'] = df.iloc[:, np.arange(nclose, nclose+75)].mean(axis='columns', skipna=False)

df['STD5'] = df.iloc[:, np.arange(nclose, nclose+5)].std(axis='columns', skipna=False)
df['STD25'] = df.iloc[:, np.arange(nclose, nclose+25)].std(axis='columns', skipna=False)
df['STD50'] = df.iloc[:, np.arange(nclose, nclose+50)].std(axis='columns', skipna=False)
df['STD75'] = df.iloc[:, np.arange(nclose, nclose+75)].std(axis='columns', skipna=False)
#計算終わったら余分な列は削除
for i in range(1, 75):
    del df['close-'+str(i)]
#それぞれの平均線の前日からの変化（移動平均線が上向か、下向きかわかる）
#shift(-1)でcloseを上に1つずらす
df['diff_MA5'] = df['MA5'] - df.MA5.shift(1) 
df['diff_MA25'] = df['MA25'] - df.MA25.shift(1) 
df['diff_MA50'] = df['MA50'] - df.MA50.shift(1) 
df['diff_MA75'] = df['MA50'] - df.MA50.shift(1) 
#3日前までのopen, close, high, lowも素性に加えたい
for i in range(1, 4):
    df['close-'+str(i)] = df.close.shift(+i)
    df['open-'+str(i)] = df.open.shift(+i)
    df['high-'+str(i)] = df.high.shift(+i)
    df['low-'+str(i)] = df.low.shift(+i)
#NaNを含む行を削除
df = df.dropna()
#何日分使うか決める
nday = 100
df = df[-nday:]
#df.head()
df

Unnamed: 0,target,open,high,low,close,MA5,MA25,MA50,MA75,STD5,...,high-1,low-1,close-2,open-2,high-2,low-2,close-3,open-3,high-3,low-3
74,0.0,109.257,109.956,109.172,109.658,109.9302,109.27916,107.43686,106.254136,0.519031,...,109.899,108.994,109.842,109.729,109.938,109.573,109.729,110.174,110.548,109.665
75,0.0,109.67,109.765,109.241,109.379,109.732,109.33724,107.5362,106.332347,0.331125,...,109.956,109.172,109.257,109.847,109.899,108.994,109.842,109.729,109.938,109.573
76,0.0,109.377,109.748,109.013,109.059,109.573,109.35804,107.62534,106.405173,0.245669,...,109.765,109.241,109.658,109.257,109.956,109.172,109.257,109.847,109.899,108.994
77,0.0,109.057,109.092,108.747,108.911,109.439,109.38108,107.7073,106.4772,0.312847,...,109.748,109.013,109.379,109.67,109.765,109.241,109.658,109.257,109.956,109.172
78,1.0,108.904,108.962,108.607,108.756,109.2528,109.40252,107.78504,106.551571,0.289187,...,109.092,108.747,109.059,109.377,109.748,109.013,109.379,109.67,109.765,109.241
79,0.0,108.765,108.962,108.606,108.769,109.1526,109.41296,107.84976,106.622928,0.364386,...,108.962,108.607,108.911,109.057,109.092,108.747,109.059,109.377,109.748,109.013
80,0.0,108.729,108.829,108.003,108.139,108.9748,109.40456,107.91902,106.695944,0.257277,...,108.962,108.606,108.756,108.904,108.962,108.607,108.911,109.057,109.092,108.747
81,0.0,108.134,108.544,107.967,108.076,108.7268,109.36536,107.97804,106.765237,0.350866,...,108.829,108.003,108.769,108.765,108.962,108.606,108.756,108.904,108.962,108.607
82,0.0,108.077,108.277,107.871,108.039,108.5302,109.32892,108.04838,106.829989,0.391266,...,108.544,107.967,108.139,108.729,108.829,108.003,108.769,108.765,108.962,108.606
83,0.0,108.034,108.228,107.809,107.963,108.3558,109.29704,108.11794,106.884317,0.37301,...,108.277,107.871,108.076,108.134,108.544,107.967,108.139,108.729,108.829,108.003


In [54]:
n = df.shape[0]
p = df.shape[1]
print(n,p)
# 訓練データとテストデータへ分割。シャッフルはしない
train_start = 0
train_end = int(np.floor(0.8*n))
test_start = train_end + 1
test_end = n
data_train = np.arange(train_start, train_end)
data_train = df.iloc[np.arange(train_start, train_end), :]
data_test = df.iloc[np.arange(test_start, test_end), :]
# 訓練データとテストデータのサイズを確認
print(data_train.shape)
print(data_test.shape)

25 29
(20, 29)
(4, 29)


In [55]:
#targetを分離
X_train = data_train.iloc[:, 1:]
y_train = data_train.iloc[:, 0]
X_test = data_test.iloc[:, 1:]
y_test = data_test.iloc[:, 0]
# 決定技モデルの訓練
clf_2 = DecisionTreeClassifier(max_depth=5)

In [56]:
# grid searchでmax_depthの最適なパラメータを決める
#k=10のk分割交差検証も行う
params = {'max_depth': [2, 5, 10, 20]}

grid = GridSearchCV(estimator=clf_2,
                    param_grid=params,
                    cv=10,
                    scoring='roc_auc')
grid.fit(X_train, y_train)
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_['mean_test_score'][r],
             grid.cv_results_['std_test_score'][r] / 2.0,
             grid.cv_results_['params'][r]))
print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

Traceback (most recent call last):
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\metrics\_scorer.py", line 362, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\metrics\_ranking.py", line 542, in roc_auc_score
    return _average_binary_score(partial(_binary_roc_auc_score,
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\

nan +/- nan {'max_depth': 2}
nan +/- nan {'max_depth': 5}
nan +/- nan {'max_depth': 10}
nan +/- nan {'max_depth': 20}
Best parameters: {'max_depth': 2}
Accuracy: nan


Traceback (most recent call last):
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\metrics\_scorer.py", line 362, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\lib\site-packages\sklearn\metrics\_ranking.py", line 542, in roc_auc_score
    return _average_binary_score(partial(_binary_roc_auc_score,
  File "c:\users\ryusm\onedrive\ドキュメント\fx-predict\env\

In [57]:
#grid searchで最適だったパラメータを使って学習する
clf_2 = grid.best_estimator_
clf_2 = clf_2.fit(X_train, y_train)
clf_2

DecisionTreeClassifier(max_depth=2)

In [58]:
pred_test_2 = clf_2.predict(X_test)
#テストデータ 正解率
accuracy_score(y_test, pred_test_2)

0.5

In [59]:
#重要度の高い素性を表示
importances = clf_2.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, 
                            df.columns[1+indices[f]],
                            importances[indices[f]]))    

 1) open-2                         0.684575
 2) open-1                         0.220459
 3) STD25                          0.094967
 4) low-3                          0.000000
 5) diff_MA5                       0.000000
 6) high                           0.000000
 7) low                            0.000000
 8) close                          0.000000
 9) MA5                            0.000000
10) MA25                           0.000000
11) MA50                           0.000000
12) MA75                           0.000000
13) STD5                           0.000000
14) STD50                          0.000000
15) STD75                          0.000000
16) diff_MA25                      0.000000
17) high-3                         0.000000
18) diff_MA50                      0.000000
19) diff_MA75                      0.000000
20) close-1                        0.000000
21) high-1                         0.000000
22) low-1                          0.000000
23) close-2                     