In [5]:
import pandas as pd
import numpy as np
from io import StringIO
import urllib

# Source of the daily Nikkei Stock Average time series; fetched with read_csv() below.
url = "https://indexes.nikkei.co.jp/nkave/historical/nikkei_stock_average_daily_jp.csv"


def read_csv(url, encoding='shift_jis'):
    """Download a CSV file and return it as a DataFrame without its last row.

    Parameters
    ----------
    url : str
        Location of the CSV file; anything ``urllib.request.urlopen`` accepts.
    encoding : str, default 'shift_jis'
        Text encoding used to decode the downloaded bytes.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV with the final row removed.
    """
    # `import urllib` alone does not guarantee that the `urllib.request`
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # The context manager closes the connection even if decoding fails.
    with urllib.request.urlopen(url) as res:
        text = res.read().decode(encoding)
    df = pd.read_csv(StringIO(text))
    # The last row is not needed (presumably a non-data footer line in the
    # published file). `.copy()` hands callers an independent frame they can
    # assign columns to without chained-assignment warnings.
    return df.iloc[:-1].copy()

df = read_csv(url)

# Parse the date column, make it the index, discard the open/high/low
# columns, and order the rows from oldest to newest in a single chain.
df = (
    df.assign(**{"データ日付": pd.to_datetime(df["データ日付"], format='%Y/%m/%d')})
    .set_index('データ日付')
    .drop(columns=['始値', '高値', '安値'])
    .sort_index()
)

# Persist the cleaned series for the following cells, then show it.
df.to_csv("./time_data.csv")
print(df)

                  終値
データ日付               
2018-01-04  23506.33
2018-01-05  23714.53
2018-01-09  23849.99
2018-01-10  23788.20
2018-01-11  23710.43
...              ...
2021-04-12  29538.73
2021-04-13  29751.61
2021-04-14  29620.99
2021-04-15  29642.69
2021-04-16  29683.37

[801 rows x 1 columns]


In [6]:
 df = pd.read_csv("./time_data.csv" , index_col="データ日付")
print(df)
# dfとdf_tweetsの二つのテーブルを結合し、Nanを消去
df_tweets = pd.read_csv('./df_tweets.csv', index_col='date')
table = df_tweets.join(df, how='right').dropna()
# table.csvとして出力
table.to_csv("./table.csv")
print(table)

                  終値
データ日付               
2018-01-04  23506.33
2018-01-05  23714.53
2018-01-09  23849.99
2018-01-10  23788.20
2018-01-11  23710.43
...              ...
2021-04-12  29538.73
2021-04-13  29751.61
2021-04-14  29620.99
2021-04-15  29642.69
2021-04-16  29683.37

[801 rows x 1 columns]
                  pn        終値
データ日付                         
2018-01-04 -1.005170  23506.33
2018-01-05 -0.667022  23714.53
2018-01-09 -0.357653  23849.99
2018-01-10 -0.602215  23788.20
2018-01-11 -1.342974  23710.43
...              ...       ...
2021-04-09  0.148085  29768.06
2021-04-12  0.451674  29538.73
2021-04-13 -0.832217  29751.61
2021-04-14 -0.459771  29620.99
2021-04-15 -0.087326  29642.69

[800 rows x 2 columns]


In [7]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the joined table; column 0 is used as the feature and column 1 as the target.
table = pd.read_csv("./table.csv", index_col='データ日付')

values = table.values
X, y = values[:, 0], values[:, 1]

# Split 80/20 without shuffling so the row order of the table is preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=False
)

# Standardise both partitions with the statistics of the training part only.
train_mean = X_train.mean()
train_std = X_train.std()
X_train_std = (X_train - train_mean) / train_std
X_test_std = (X_test - train_mean) / train_std

# The first n_train index labels belong to the training rows, the rest to the test rows.
n_train = len(X_train_std)

df_train = pd.DataFrame(
    {'pn': X_train_std, '終値': y_train},
    columns=['pn', '終値'],
    index=table.index[:n_train],
)
df_train.to_csv('./df_train.csv')

df_test = pd.DataFrame(
    {'pn': X_test_std, '終値': y_test},
    columns=['pn', '終値'],
    index=table.index[n_train:],
)
df_test.to_csv('./df_test.csv')

In [10]:
# Build the training features and labels from the day-over-day changes in ./df_train.csv.
exchange_dates = []

pn_rates = []
pn_rates_diff = []

exchange_rates = []
exchange_rates_diff = []

# `with` closes the file even when a row fails to parse.
with open('./df_train.csv', 'r', encoding="utf-8") as rates_fd:
    rates_fd.readline()  # skip the header row

    # The first data row only seeds the "previous day" values, so it produces
    # no diff entry itself. Reading it from the file keeps this cell independent
    # of kernel state and avoids positional `Series[0]` access on a
    # non-integer index, which pandas has deprecated.
    first_row = next(rates_fd, None)
    if first_row is None:
        raise ValueError("./df_train.csv contains no data rows")
    first_fields = first_row.split(",")
    prev_pn = float(first_fields[1])
    prev_exch = float(first_fields[2])

    for line in rates_fd:
        splited = line.split(",")
        date_str = splited[0]         # column 1: date
        pn_val = float(splited[1])    # column 2: PN (sentiment) value
        exch_val = float(splited[2])  # column 3: closing price
        exchange_dates.append(date_str)

        pn_rates.append(pn_val)
        pn_rates_diff.append(pn_val - prev_pn)  # change in PN value

        exchange_rates.append(exch_val)
        exchange_rates_diff.append(exch_val - prev_exch)  # change in closing price

        prev_pn = pn_val
        prev_exch = exch_val

INPUT_LEN = 3  # number of preceding days that make up one feature row
data_len = len(pn_rates_diff)
tr_input_mat = []
tr_angle_mat = []

for i in range(INPUT_LEN, data_len):
    # Feature row for day i: (price change, PN change) for each of the
    # INPUT_LEN days immediately before day i, oldest first.
    tmp_arr = []
    for j in range(i - INPUT_LEN, i):
        tmp_arr.append(exchange_rates_diff[j])
        tmp_arr.append(pn_rates_diff[j])
    tr_input_mat.append(tmp_arr)

    # Label for day i: 1 if the price change is non-negative, otherwise 0.
    tr_angle_mat.append(1 if exchange_rates_diff[i] >= 0 else 0)
train_feature_arr = np.array(tr_input_mat)
train_label_arr = np.array(tr_angle_mat)

# Print train_feature_arr and train_label_arr to get an overview of the result.
print(train_feature_arr)
print(train_label_arr)

[[ 2.08200000e+02  5.68934449e-01  1.35460000e+02  5.20515921e-01
  -6.17900000e+01 -4.11477489e-01]
 [ 1.35460000e+02  5.20515921e-01 -6.17900000e+01 -4.11477489e-01
  -7.77700000e+01 -1.24632996e+00]
 [-6.17900000e+01 -4.11477489e-01 -7.77700000e+01 -1.24632996e+00
  -5.66100000e+01  1.25381094e+00]
 ...
 [ 3.97500000e+01  1.90734995e+00 -1.92610000e+02 -8.78395747e-01
  -4.56700000e+01 -2.06293843e-01]
 [-1.92610000e+02 -8.78395747e-01 -4.56700000e+01 -2.06293843e-01
   5.95300000e+01 -6.56995061e-01]
 [-4.56700000e+01 -2.06293843e-01  5.95300000e+01 -6.56995061e-01
  -2.29990000e+02  1.06997490e+00]]
[0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 1 1 0 1 0 1 1 1 0 0 0 0
 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 1
 1 1 0 0 1 0 1 1 1 0 0 1 1 1 0 0 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1
 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 0 0 0 1
 0 0 1 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 0
 1 0 1 0 1 1 

In [11]:
# Build test_feature_arr and test_label_arr from ./df_test.csv in the same way
# as the training arrays.
exchange_dates = []

pn_rates = []
pn_rates_diff = []

exchange_rates = []
exchange_rates_diff = []

# `with` closes the file even when a row fails to parse.
with open('./df_test.csv', 'r', encoding="utf-8") as rates_fd:
    rates_fd.readline()  # skip the header row

    # The first data row only seeds the "previous day" values, so it produces
    # no diff entry itself. Reading it from the file keeps this cell independent
    # of kernel state and avoids positional `Series[0]` access on a
    # non-integer index, which pandas has deprecated.
    first_row = next(rates_fd, None)
    if first_row is None:
        raise ValueError("./df_test.csv contains no data rows")
    first_fields = first_row.split(",")
    prev_pn = float(first_fields[1])
    prev_exch = float(first_fields[2])

    for line in rates_fd:
        splited = line.split(",")
        date_str = splited[0]         # column 1: date
        pn_val = float(splited[1])    # column 2: PN (sentiment) value
        exch_val = float(splited[2])  # column 3: closing price
        exchange_dates.append(date_str)

        pn_rates.append(pn_val)
        pn_rates_diff.append(pn_val - prev_pn)  # change in PN value

        exchange_rates.append(exch_val)
        exchange_rates_diff.append(exch_val - prev_exch)  # change in closing price

        prev_pn = pn_val
        prev_exch = exch_val

INPUT_LEN = 3  # number of preceding days that make up one feature row
data_len = len(pn_rates_diff)
test_input_mat = []
test_angle_mat = []

for i in range(INPUT_LEN, data_len):
    # Feature row for day i: (price change, PN change) for each of the
    # INPUT_LEN days immediately before day i, oldest first.
    test_arr = []
    for j in range(i - INPUT_LEN, i):
        test_arr.append(exchange_rates_diff[j])
        test_arr.append(pn_rates_diff[j])
    test_input_mat.append(test_arr)

    # Label for day i: 1 if the price change is non-negative, otherwise 0.
    test_angle_mat.append(1 if exchange_rates_diff[i] >= 0 else 0)
test_feature_arr = np.array(test_input_mat)
test_label_arr = np.array(test_angle_mat)

# Fit three classifiers (logistic regression, random forest, SVM) on the
# training arrays and report each model's accuracy on the test arrays.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

for model in [LogisticRegression(), RandomForestClassifier(n_estimators=200, max_depth=8, random_state=0), SVC()]:
    model.fit(train_feature_arr, train_label_arr)
    print("--Method:", model.__class__.__name__, "--")
    # NOTE: despite the printed label, this is the score on the held-out test
    # arrays; no cross-validation is performed here.
    print("Cross validatin scores:{}".format(model.score(test_feature_arr, test_label_arr)))

--Method: LogisticRegression --
Cross validatin scores:0.5576923076923077
--Method: RandomForestClassifier --
Cross validatin scores:0.5448717948717948
--Method: SVC --
Cross validatin scores:0.5064102564102564
