# 目的
OHLCを画像で表現し、それを用いてFXTFのデータのトレンドを予測できるか検討する。


In [None]:
from keras.callbacks import CSVLogger, ModelCheckpoint
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.pooling import MaxPool2D
from keras.models import Sequential
from keras.models import model_from_json, load_model
from keras.optimizers import Adam, Adagrad
from keras.callbacks import EarlyStopping, TensorBoard
from keras.utils import plot_model

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

import copy
import datetime
import glob
import numpy as np
import os
import pandas as pd
import pickle

from PIL import Image
from PIL import ImageDraw

# from FX.FX import SQLAnaforFX
from FX.FX.core import drawfigfunc as dff
from FX.FX.core import datetimefuncs as dtf
from FX.FX.core import analyzefuncs as af
# from FX.FX import KerasModelAdapter

In [None]:
# Root directory; the image ("images-ohlc/"), label ("FXTF/") and model ("ML/")
# paths used in the following cells are built by appending to it.
# basepath = "C:/Users/Surpris/Desktop/20170918/"
basepath = "../../images/20170918/"

## データセットの準備

### 画像

In [None]:
# Collect the OHLC chart images. glob returns paths in an arbitrary,
# filesystem-dependent order, while the labels are paired with the images by
# position, so the list is sorted to make that pairing deterministic.
# NOTE(review): assumes the file names sort in the same order as the label rows — confirm.
filelist = sorted(glob.glob(basepath + "images-ohlc/*.png"))
# Open the first image; as the last expression of the cell it is displayed.
img = Image.open(filelist[0])
img

In [None]:
# Number of images to load, capped so the loop cannot index past the files found.
nbr_of_img = min(6000, len(filelist))

# Only the first channel of every image is used; the first file fixes the size.
with Image.open(filelist[0]) as first:
    img1 = np.array(first)[:, :, 0]
imglist_shape = (nbr_of_img, img1.shape[0], img1.shape[1], 1)
# Allocate float32 directly instead of filling a float64 buffer and converting
# it afterwards; the values are identical but the peak memory is much lower.
imglist = np.zeros(imglist_shape, dtype="float32")
for ii, path in enumerate(filelist[:nbr_of_img]):
    # The context manager closes each file handle as soon as it has been read.
    with Image.open(path) as img:
        # Append a trailing axis of length 1 (single channel).
        imglist[ii] = np.array(img)[:, :, 0][:, :, None]
# Scale by 255 (presumably 8-bit pixel values, giving a [0, 1] range — confirm).
imglist /= 255.0

In [None]:
# Store the preprocessed image array on disk so it can be reloaded later.
np.save(basepath + "ML/imglist.npy", imglist)

In [None]:
# Reload the stored image array and take the number of images from it.
imglist = np.load(basepath + "ML/imglist.npy")
nbr_of_img = len(imglist)

### ラベル

In [None]:
# Load the label table and keep the three label columns as a NumPy array.
data = pd.read_csv(basepath + "FXTF/USDJPY-cd1_20170806_k030.csv")
# `.values` is used because DataFrame.as_matrix() was removed in pandas 1.0;
# it returns the same array on old and new pandas versions.
# Rows 9 .. nbr_of_img + 8 are paired with images 0 .. nbr_of_img - 1.
# NOTE(review): the offset of 9 rows presumably comes from how the images were generated — confirm.
y = data[["label1", "label2", "label3"]].values[9:nbr_of_img + 9].copy()

### 分離

In [None]:
# Hold out 30% of the samples for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(imglist, y, test_size=0.3, random_state=300)
# Display the shapes of the four resulting arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## モデルの構築

In [None]:
# Reuse the sample image from the loading cell when it exists; otherwise (for
# example when only the stored array was reloaded) use the first stored image.
# Only NameError is caught so that unrelated errors are not silently hidden.
try:
    img1
except NameError:
    img1 = imglist[0]
# Single-channel input with the height and width of the sample image.
input_shape = (img1.shape[0], img1.shape[1], 1)

model = Sequential()
# Input layer: 10 filters, 3x3 kernel
model.add(Conv2D(10, 3, input_shape=input_shape, activation="relu"))
# 2nd layer: convolution followed by 3x3 max pooling
model.add(Conv2D(10, 3, activation="relu"))
model.add(MaxPool2D(pool_size=(3,3)))
# 3rd layer: 20 filters followed by 2x2 max pooling
model.add(Conv2D(20, 3, activation="relu"))
model.add(MaxPool2D(pool_size=(2,2)))
# 4th layer: flatten, fully connected layer, dropout
model.add(Flatten())
model.add(Dense(1024, activation="relu"))
model.add(Dropout(0.3))
# Output layer: one softmax unit per label column
model.add(Dense(y.shape[1], activation="softmax"))

In [None]:
# Configure training: categorical cross-entropy loss, Adam optimizer with
# lr=1e-4, and accuracy as the reported metric.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=1e-4),
              metrics=['accuracy'])

In [None]:
# Save the compiled model (not yet trained at this point of the notebook) to disk.
model.save(basepath + "ML/model.h5")

### modelの読み込み

In [None]:
# Restore the model from the saved file.
model = load_model(basepath + "ML/model.h5")

In [None]:
# Write a diagram of the model, including layer shapes, to an image file.
plot_model(model, to_file=basepath+"ML/test.png", show_shapes=True)

## 訓練

In [None]:
# Callbacks: early stopping on the validation loss (patience of 2) and a
# per-epoch CSV log of the training history.
# A ModelCheckpoint callback writing to basepath + "ML/model_ep/" is not enabled here.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2),
    CSVLogger(basepath + "ML/history.csv"),
]

# 10% of the training data is set aside for validation during fitting.
fit_options = dict(
    batch_size=100,
    epochs=80,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=0,
)
hist = model.fit(X_train, y_train, **fit_options)

# Evaluate on the held-out test split and report loss and accuracy.
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('loss=', test_loss)
print('accuracy=', test_accuracy)

### 訓練結果の確認

In [None]:
def plot_probability(model, testdata):
    """
    Plot the class probabilities predicted by 'model' for 'testdata'.

    Two figures are drawn:
    1. the predicted probability of each class against the sample index;
    2. for each class, a normalised histogram of those probabilities with the
       fraction of samples lying above each bin overlaid on a second axis.

    Parameters
    ----------
    model : object providing predict(X, verbose=...) that returns an array of
        shape (n_samples, n_classes).
    testdata : samples handed to model.predict.

    Returns
    -------
    None

    NOTE(review): 'plt' is not imported in the visible part of this file;
    presumably matplotlib.pyplot is provided by the notebook environment — confirm.
    """
    labels = ["high", "lose", "low"]

    ### Calculate probability (one row per class after the transpose).
    ### The data passed in is used, not a global test set.
    probs = np.asarray(model.predict(testdata, verbose=0)).T
    n_classes = len(probs)
    # Fall back to a generic title if there are more classes than known labels.
    titles = [labels[ii] if ii < len(labels) else "class {}".format(ii)
              for ii in range(n_classes)]

    ### Make histograms of each probability
    # The edges cover the full [0, 1] range so that the most confident
    # predictions (probability above 0.95) are counted as well.
    edges = np.linspace(0.0, 1.0, 21)
    centers = 0.5 * (edges[:-1] + edges[1:])
    bin_width = edges[1] - edges[0]
    hists = np.zeros((n_classes, len(centers)))
    for ii in range(n_classes):
        counts, _ = np.histogram(probs[ii], bins=edges)
        total = counts.sum()
        if total > 0:  # avoid dividing by zero when there are no samples
            hists[ii] = counts / float(total)

    ### Plot probability of each dataset
    dff.makefig(18, 5)
    for ii in range(n_classes):
        plt.subplot(1, n_classes, ii + 1)
        plt.plot(probs[ii], linewidth=1.2)
        dff.arrangefig(xlabel="Time index", ylabel="Probability", title="Probability of {}".format(titles[ii]))
        plt.ylim(0, 1)
    plt.tight_layout()

    ### Plot histogram
    dff.makefig(18, 5)
    for ii in range(n_classes):
        plt.subplot(1, n_classes, ii + 1)
        # 'align' is the bar-placement keyword; bars are centred on the bin centres.
        plt.bar(centers, hists[ii], width=0.8 * bin_width, align="center", color="g")
        dff.arrangefig(ylabel="Frequency")
        ax2 = plt.gca().twinx()
        # Fraction of samples whose probability lies above the upper edge of each bin.
        ax2.plot(edges[1:], 1.0 - np.cumsum(hists[ii]), "r-", linewidth=1.5)
        dff.arrangefig(xlabel="Probability", ylabel="Accumulation", title="Hist of {}".format(titles[ii]))
        plt.ylim(0, 1)
    plt.tight_layout()

    return

def calc_accuracy_above_threshold(model, X, y, threshold=0.5, verbose=0):
    """
    Calculate accuracy of model for the datasets
    where the predicted probability is above 'threshold'.

    Parameters
    ----------
    model : object providing predict(X, verbose=...) returning an array of
        shape (n_samples, n_classes) and evaluate(X, y, verbose=...).
    X : array of input samples.
    y : array of labels, indexed along the first axis in step with X.
    threshold : a sample is kept when at least one class probability
        is greater than or equal to this value.
    verbose : when greater than 0, the per-class and total number of selected
        samples and the resulting loss and accuracy are printed.

    Returns
    -------
    The value returned by model.evaluate for the selected samples, or [0, 0]
    when no sample reaches the threshold.
    """
    ### Extract the datasets with the predicted probability above 'threshold'
    # predict() is used instead of predict_proba(), which newer Keras releases
    # no longer provide; for a softmax output both give the class probabilities.
    probs = np.asarray(model.predict(X, verbose=0))
    above = probs >= threshold          # (n_samples, n_classes)
    selected = above.any(axis=1)        # samples with at least one class above

    ### Evaluate the datasets
    if not selected.any():
        score = [0, 0]
    else:
        score = model.evaluate(X[selected], y[selected], verbose=0)
    if verbose > 0:
        print("<# of events over threshold>")
        print("[high, lose, low]:", above.sum(axis=0), ",total:", selected.sum())
        print('loss=', score[0])
        print('accuracy=', score[1])
    return score

In [None]:
# Draw the probability plots; plot_probability has no return value, so fig is None.
fig = plot_probability(model, imglist)

In [None]:
# Score the test split using only samples with a class probability of at least 0.7 (verbose output on).
calc_accuracy_above_threshold(model, X_test, y_test, 0.7, 1)

In [None]:
# Same check over all loaded images (training and test samples together) with a threshold of 0.65.
calc_accuracy_above_threshold(model, imglist, y, 0.65, 1)

In [None]:
# Save the model under a file name that carries the current timestamp.
model.save(basepath + "ML/model_{}.h5".format(datetime.datetime.now().strftime("%Y%m%d%H%M%S")))

## データセットを増やす
一気に読み込むとメモリが足りないので、次の手順で訓練する。   

1. データセットのパスをシャッフルする。
1. シャッフルされたデータセットをいくつかのグループに分ける。
1. 順番にそれぞれのグループを与えてモデルを訓練する。
1. 評価値もそれぞれのグループを与えつつ計算する。

In [None]:
def grouping_dataset(filelist, labelData, nbr_of_grp,
                     shuffle=True, seed=None):
    """
    Separate datasets into 'nbr_of_grp' groups.

    Parameters
    ----------
    filelist : sequence of samples (e.g. file paths); converted to an array.
    labelData : sequence of labels, indexed in step with 'filelist'.
    nbr_of_grp : number of groups to create (must be at least 1).
    shuffle : when true, the samples are permuted before grouping.
    seed : optional integer seed for the permutation. A private generator is
        used so the global NumPy random state is left untouched; the resulting
        permutation is the one the global generator would give for that seed.
        Non-integer seeds are ignored.

    Returns
    -------
    (groups_X, groups_Y) : two lists with 'nbr_of_grp' arrays each. Every group
        but the last holds len(filelist) // nbr_of_grp samples; the last group
        takes all remaining samples.

    Raises
    ------
    ValueError : if 'nbr_of_grp' is smaller than 1.
    """
    if nbr_of_grp < 1:
        raise ValueError("nbr_of_grp must be at least 1")

    # Arrays are required for the index-array lookups below.
    filelist = np.asarray(filelist)
    labelData = np.asarray(labelData)
    n_samples = len(filelist)

    ind = np.arange(0, n_samples)
    if shuffle:
        if seed is not None and isinstance(seed, (int, np.integer)):
            ind = np.random.RandomState(seed).permutation(ind)
        else:
            ind = np.random.permutation(ind)

    # Group boundaries: equal-sized groups first, the last one runs to the end.
    # Building them up front also covers nbr_of_grp == 1, where there is no
    # equal-sized group before the last one.
    grp_size = n_samples // nbr_of_grp
    bounds = [k * grp_size for k in range(nbr_of_grp)] + [n_samples]

    groups_X = []
    groups_Y = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        ind_ii = ind[start:stop]
        groups_X.append(filelist[ind_ii])
        groups_Y.append(labelData[ind_ii])
    return groups_X, groups_Y

def load_images_from_filelist(filelist):
    """
    Load the first channel of every image in 'filelist' into one array.

    Parameters
    ----------
    filelist : sequence of image paths; the first image fixes the height and
        width of the output.

    Returns
    -------
    float32 array of shape (len(filelist), height, width, 1) holding the pixel
    values divided by 255.

    Raises
    ------
    ValueError : if 'filelist' is empty.
    """
    if len(filelist) == 0:
        raise ValueError("filelist is empty; at least one image path is required")

    # The context managers close each file handle as soon as it has been read.
    with Image.open(filelist[0]) as first:
        height, width = np.array(first)[:, :, 0].shape

    # Allocate float32 directly instead of filling a float64 buffer and
    # converting it afterwards; the values are identical, the peak memory is lower.
    imglist = np.zeros((len(filelist), height, width, 1), dtype="float32")
    for ii, path in enumerate(filelist):
        with Image.open(path) as img:
            # Keep the first channel and append a trailing axis of length 1.
            imglist[ii] = np.array(img)[:, :, 0][:, :, None]
    imglist /= 255.0
    return imglist

def create_model():
    """Placeholder for a model factory; nothing is built yet and None is returned."""
    return None

def train_with_groups(model, groups_Xpath_train, groups_y_train,
                      X_test, y_test,
                      epochs=80, useCsvLogger=False, useModelCheckPoint=False):
    """
    Train a model with groups of datasets.

    The images of one group at a time are loaded and the model is fitted on
    them, so only a single group has to be held in memory. After every group
    the model is evaluated on the test data and the result is printed.

    Parameters
    ----------
    model : compiled model providing fit(...) and evaluate(...).
    groups_Xpath_train : list of arrays of image paths, one array per group.
    groups_y_train : list of label arrays, indexed in step with the path groups.
    X_test, y_test : test images and labels used for the evaluation.
    epochs : maximum number of epochs for each group (early stopping on the
        validation loss may end a fit sooner).
    useCsvLogger : when true, the training history is logged to
        basepath + "ML/history.csv".
    useModelCheckPoint : when true, the model is saved after every epoch under
        basepath + "ML/model_ep/".

    Returns
    -------
    (hists, scores) : the objects returned by model.fit and model.evaluate,
        one entry per group.

    Note: the output paths are built from the module-level 'basepath'.
    """
    callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
    if useCsvLogger: # TODO: modify so that logging is for each group.
        callbacks.append(CSVLogger(basepath + "ML/history.csv"))
    if useModelCheckPoint:
        callbacks.append(ModelCheckpoint(filepath=basepath+"ML/model_ep/ep{epoch:02d}.h5"))

    hists = []
    scores = []
    for ii, paths in enumerate(groups_Xpath_train):
        # Only the images of the current group are loaded.
        X = load_images_from_filelist(paths)
        y = groups_y_train[ii]
        # Honour the caller's 'epochs' argument rather than a fixed value.
        hist = model.fit(X, y, batch_size=100, epochs=epochs,
                         validation_split=0.1, callbacks=callbacks, verbose=0)
        score = model.evaluate(X_test, y_test, verbose=0)
        print('Group {0}: loss={1:.4f}, accuracy={2:.4f}'.format(ii, score[0], score[1]))
        hists.append(hist)
        scores.append(score)
    return hists, scores

In [None]:
basepath = "C:/Users/Surpris/Desktop/20170918/"
# glob returns paths in an arbitrary, filesystem-dependent order, while the
# labels are paired with the images by position, so the paths are sorted.
# NOTE(review): assumes the file names sort in the same order as the label rows — confirm.
filelist = np.array(sorted(glob.glob(os.path.join(basepath, "images-ohlc/*.png"))))
data = pd.read_csv(basepath + "FXTF/USDJPY-cd1_20170806_k030.csv")
# `.values` is used because DataFrame.as_matrix() was removed in pandas 1.0.
# The first nine rows and the last row are dropped.
# NOTE(review): presumably this makes the label count match the number of
# images (train_test_split requires equal lengths) — confirm.
y = data[["label1", "label2", "label3"]].values[9:-1].copy()

# Split the paths, not the images, so the images can be loaded group by group later.
Xpath_train, Xpath_test, y_train, y_test = train_test_split(filelist, y, test_size=0.3)

In [None]:
# Split the training paths and labels into 8 groups (shuffling is on by default).
grpX, grpY = grouping_dataset(Xpath_train, y_train, 8)
# Display the sample counts and group sizes.
len(Xpath_train), len(grpX), len(y_train), len(grpY), [len(q) for q in grpX]

# Load all test images into memory.
X_test = load_images_from_filelist(Xpath_test)

In [None]:
# Start from the saved model and train it group by group; the last argument is the 'epochs' parameter.
model = load_model(basepath + "ML/model.h5")
hists, scores = train_with_groups(model, grpX, grpY, X_test, y_test, 80)

In [None]:
# Draw the probability plots for the model trained group by group.
plot_probability(model, X_test)

In [None]:
# Score the test split using only samples with a class probability of at least 0.63 (verbose output on).
calc_accuracy_above_threshold(model, X_test, y_test, threshold=0.63, verbose=1)

## まとめ
CNNを用いたFXの予測を行った。   
シンプルに入出力層＋３層のモデルを構築してみたが、ただ数値を入れるよりは精度が良くなっているかもしれないという印象である。