In [1]:
import pandas as pd
import numpy as np 
# データ可視化のライブラリ
import matplotlib.pyplot as plt
# 機械学習ライブラリ
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# graphvizのインポート
import graphviz
#grid searchとcross validation用
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
# CSVファイルの読み込み。2017-2019の２年分
df = pd.read_csv('usdjpy_d_2010-2021_2.csv')
# 最後の5行を確認
df.tail()

Unnamed: 0,time,close,open,high,low,volume
2892,2021/3/3,107.009,106.703,107.153,106.7,100000
2893,2021/3/4,107.977,107.007,107.997,106.969,100000
2894,2021/3/5,108.351,107.976,108.636,107.821,100000
2895,2021/3/8,108.93,108.382,108.943,108.31,100000
2896,2021/3/9,108.522,108.924,109.233,108.417,100000


In [3]:
# 翌日終値 - 当日終値で差分を計算
#shift(-1)でcloseを上に1つずらす
df['close+1'] = df.close.shift(-1)
df['diff'] = df['close+1'] - df['close']
#最終日はclose+1がNaNになるので削る
df = df[:-1]
# close+1を追加して翌日と比較
df.tail()

Unnamed: 0,time,close,open,high,low,volume,close+1,diff
2891,2021/3/2,106.704,106.776,106.957,106.679,100000,107.009,0.305
2892,2021/3/3,107.009,106.703,107.153,106.7,100000,107.977,0.968
2893,2021/3/4,107.977,107.007,107.997,106.969,100000,108.351,0.374
2894,2021/3/5,108.351,107.976,108.636,107.821,100000,108.93,0.579
2895,2021/3/8,108.93,108.382,108.943,108.31,100000,108.522,-0.408


In [4]:
# 上昇と下降のデータ割合を確認
m = len(df['close'])
#df['diff']>0で全行に対してtrueかfalseで返してくれる。df[(df['diff'] > 0)]でdff>0に絞って全てのカラムを出力
#上昇した日の割合
print(len(df[(df['diff'] > 0)]) / m * 100)
#下降した日の割合
print(len(df[(df['diff'] < 0)]) / m * 100)

49.620165745856355
50.03453038674033


In [5]:
df.rename(columns={"diff" : "target"}, inplace=True)

# 不要なカラムを削除
del df['close+1']
del df['time']
# カラムの並び替え
df = df[['target', 'volume', 'open', 'high', 'low', 'close']]
# 最初の5行を出力
df.head()

Unnamed: 0,target,volume,open,high,low,close
0,-0.68,100000,92.914,93.2,92.18,92.4
1,0.62,100000,92.41,92.53,91.25,91.72
2,1.25,100000,91.71,92.73,91.52,92.34
3,-0.94,100000,92.34,93.75,92.1,93.59
4,-0.6,100000,93.58,93.66,92.28,92.65


In [6]:
df.loc[df['target'] > 0, "target"] = 1
df.loc[df['target'] < 0, "target"] = 0
# 最初の5行を出力
df.head()

Unnamed: 0,target,volume,open,high,low,close
0,0.0,100000,92.914,93.2,92.18,92.4
1,1.0,100000,92.41,92.53,91.25,91.72
2,1.0,100000,91.71,92.73,91.52,92.34
3,0.0,100000,92.34,93.75,92.1,93.59
4,0.0,100000,93.58,93.66,92.28,92.65


In [7]:
#移動平均の計算、5日、25日、50日、75日
#ついでにstdも計算する。（=ボリンジャーバンドと同等の情報を持ってる）
#75日分のデータ確保
for i in range(1, 75):
    df['close-'+str(i)] = df.close.shift(+i)
#移動平均の値とstdを計算する, skipnaの設定で一つでもNanがあるやつはNanを返すようにする
nclose = 5    
df['MA5'] = df.iloc[:, np.arange(nclose, nclose+5)].mean(axis='columns', skipna=False)
df['MA25'] = df.iloc[:, np.arange(nclose, nclose+25)].mean(axis='columns', skipna=False)
df['MA50'] = df.iloc[:, np.arange(nclose, nclose+50)].mean(axis='columns', skipna=False)
df['MA75'] = df.iloc[:, np.arange(nclose, nclose+75)].mean(axis='columns', skipna=False)

df['STD5'] = df.iloc[:, np.arange(nclose, nclose+5)].std(axis='columns', skipna=False)
df['STD25'] = df.iloc[:, np.arange(nclose, nclose+25)].std(axis='columns', skipna=False)
df['STD50'] = df.iloc[:, np.arange(nclose, nclose+50)].std(axis='columns', skipna=False)
df['STD75'] = df.iloc[:, np.arange(nclose, nclose+75)].std(axis='columns', skipna=False)
#計算終わったら余分な列は削除
for i in range(1, 75):
    del df['close-'+str(i)]
#それぞれの平均線の前日からの変化（移動平均線が上向か、下向きかわかる）
#shift(-1)でcloseを上に1つずらす
df['diff_MA5'] = df['MA5'] - df.MA5.shift(1) 
df['diff_MA25'] = df['MA25'] - df.MA25.shift(1) 
df['diff_MA50'] = df['MA50'] - df.MA50.shift(1) 
df['diff_MA75'] = df['MA50'] - df.MA50.shift(1) 
#3日前までのopen, close, high, lowも素性に加えたい
for i in range(1, 4):
    df['close-'+str(i)] = df.close.shift(+i)
    df['open-'+str(i)] = df.open.shift(+i)
    df['high-'+str(i)] = df.high.shift(+i)
    df['low-'+str(i)] = df.low.shift(+i)
#NaNを含む行を削除
df = df.dropna()
#何日分使うか決める
nday = 500
df = df[-nday:]
#df.head()
df

Unnamed: 0,target,volume,open,high,low,close,MA5,MA25,MA50,MA75,...,high-1,low-1,close-2,open-2,high-2,low-2,close-3,open-3,high-3,low-3
2396,0.0,100000,110.947,111.444,110.816,111.410,110.7880,111.13020,110.57814,110.334027,...,110.948,110.540,110.657,110.491,110.8320,110.021,110.492,110.529,110.7100,110.236
2397,1.0,100000,111.412,111.456,111.251,111.334,110.9490,111.16080,110.61804,110.306587,...,111.444,110.816,110.852,110.658,110.9480,110.540,110.657,110.491,110.8320,110.021
2398,1.0,100000,111.332,111.575,111.209,111.464,111.1434,111.18136,110.65606,110.288427,...,111.456,111.251,111.410,110.947,111.4440,110.816,110.852,110.658,110.9480,110.540
2399,1.0,100000,111.456,111.678,111.336,111.616,111.3352,111.19080,110.69680,110.276853,...,111.575,111.209,111.334,111.412,111.4560,111.251,111.410,110.947,111.4440,110.816
2400,0.0,100000,111.617,111.820,111.564,111.678,111.5004,111.17880,110.74002,110.266933,...,111.678,111.336,111.464,111.332,111.5750,111.209,111.334,111.412,111.4560,111.251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2891,1.0,100000,106.776,106.957,106.679,106.704,106.4392,105.36424,104.48746,104.344540,...,106.889,106.367,106.598,106.250,106.6925,105.853,106.246,105.874,106.3980,105.846
2892,1.0,100000,106.703,107.153,106.700,107.009,106.6666,105.48160,104.56082,104.376860,...,106.957,106.679,106.776,106.553,106.8890,106.367,106.598,106.250,106.6925,105.853
2893,1.0,100000,107.007,107.997,106.969,107.977,107.0128,105.63152,104.64788,104.427313,...,107.153,106.700,106.704,106.776,106.9570,106.679,106.776,106.553,106.8890,106.367
2894,1.0,100000,107.976,108.636,107.821,108.351,107.3634,105.77552,104.74402,104.487567,...,107.997,106.969,107.009,106.703,107.1530,106.700,106.704,106.776,106.9570,106.679


In [8]:
n = df.shape[0]
p = df.shape[1]
print(n,p)
# 訓練データとテストデータへ分割。シャッフルはしない
train_start = 0
train_end = int(np.floor(0.8*n))
test_start = train_end + 1
test_end = n
data_train = np.arange(train_start, train_end)
data_train = df.iloc[np.arange(train_start, train_end), :]
data_test = df.iloc[np.arange(test_start, test_end), :]
# 訓練データとテストデータのサイズを確認
print(data_train.shape)
print(data_test.shape)

500 30
(400, 30)
(99, 30)


In [9]:
#targetを分離
X_train = data_train.iloc[:, 1:]
y_train = data_train.iloc[:, 0]
X_test = data_test.iloc[:, 1:]
y_test = data_test.iloc[:, 0]
# 決定技モデルの訓練
clf_2 = DecisionTreeClassifier(max_depth=5)

In [10]:
# grid searchでmax_depthの最適なパラメータを決める
#k=10のk分割交差検証も行う
params = {'max_depth': [2, 5, 10, 20]}

grid = GridSearchCV(estimator=clf_2,
                    param_grid=params,
                    cv=10,
                    scoring='roc_auc')
grid.fit(X_train, y_train)
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_['mean_test_score'][r],
             grid.cv_results_['std_test_score'][r] / 2.0,
             grid.cv_results_['params'][r]))
print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)

0.492 +/- 0.04 {'max_depth': 2}
0.470 +/- 0.03 {'max_depth': 5}
0.471 +/- 0.03 {'max_depth': 10}
0.447 +/- 0.04 {'max_depth': 20}
Best parameters: {'max_depth': 2}
Accuracy: 0.49


In [11]:
#grid searchで最適だったパラメータを使って学習する
clf_2 = grid.best_estimator_
clf_2 = clf_2.fit(X_train, y_train)
clf_2

DecisionTreeClassifier(max_depth=2)

In [12]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0,
                       random_state=None, splitter='best')

DecisionTreeClassifier(max_depth=10)

In [13]:
pred_test_2 = clf_2.predict(X_test)
#テストデータ 正解率
accuracy_score(y_test, pred_test_2)

0.48484848484848486

In [14]:
#重要度の高い素性を表示
importances = clf_2.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, 
                            df.columns[1+indices[f]],
                            importances[indices[f]]))    

 1) open-1                         0.418741
 2) MA25                           0.295883
 3) high-3                         0.285376
 4) low-3                          0.000000
 5) STD75                          0.000000
 6) open                           0.000000
 7) high                           0.000000
 8) low                            0.000000
 9) close                          0.000000
10) MA5                            0.000000
11) MA50                           0.000000
12) MA75                           0.000000
13) STD5                           0.000000
14) STD25                          0.000000
15) STD50                          0.000000
16) diff_MA25                      0.000000
17) diff_MA5                       0.000000
18) diff_MA50                      0.000000
19) diff_MA75                      0.000000
20) close-1                        0.000000
21) high-1                         0.000000
22) low-1                          0.000000
23) close-2                     