In [118]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from keras.datasets import mnist
from keras.layers import *
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn import preprocessing

## Data Preprocessing

### Load Data

In [55]:
# Load the pre-computed trading signal (the labels) and the price table.
# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
signal = np.load("trading_signal.npy")
df = pd.read_pickle("database.pkl")
# Closing prices as a plain NumPy array; this is the raw model input below.
data = df["Close"].values

In [3]:
# Display the first 30 labels to inspect the signal's shape and dtype.
signal[:30]

array([[ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [-1]], dtype=int32)

### One-hot Encoding

In [4]:
# One-hot encode the signal into 13 classes.
# The signal contains negative values (see the -1 in the preview above) and
# to_categorical uses each label as an array index, so a negative label would
# wrap around to the end of the class axis and scramble the class order.
# Shifting by +6 makes class index k stand for signal value k - 6, which is the
# same convention as the `signal+6` targets used later in this notebook.
y_train = to_categorical(signal + 6, 13)

### Scaling

In [5]:
# Step 1: zero-mean / unit-variance standardization of the closing prices.
standardize = preprocessing.scale(data)
# Step 2: squeeze the standardized series, viewed as one column, into [-1, 1].
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
x_scaled = scaler.fit_transform(standardize.reshape(-1, 1))



### Timestep Splitting

In [6]:
# Cut the scaled series and the one-hot labels into overlapping windows of
# `time_step` consecutive samples (stride 1).
time_step = 30
n_windows = x_scaled.shape[0] - time_step + 1
n_label_windows = y_train.shape[0] - time_step + 1

# Window length and class count are derived from time_step / y_train instead
# of being repeated as literals, so changing time_step cannot desynchronize
# the array shapes from the slices written into them.
d = np.zeros((n_windows, time_step, 1))
y_step = np.zeros((n_label_windows, time_step, y_train.shape[1]))

for i in range(n_windows):
    d[i] = x_scaled[i:i + time_step]
# Labels may cover fewer rows than the features, so they get their own loop.
for i in range(n_label_windows):
    y_step[i] = y_train[i:i + time_step]

### Train/Test Splitting

In [7]:
# The first len(signal) windows are used for training; every window after
# that goes to the test set.
n_labeled = signal.shape[0]
x_train, x_test = d[:n_labeled], d[n_labeled:]

### Return

In [85]:
# Row-wise return: closing price minus opening price.
return_x = (df["Close"] - df["Open"]).values

In [86]:
# Center the returns on zero (mean removal only; the variance is untouched).
# return_x = preprocessing.scale(return_x)
return_x = return_x.astype("float32") - np.mean(return_x)
# Rescale the centered returns to [-1, 1].
# Bug fix: the scaler used to be fitted on `standardize` (the standardized
# closing prices), which silently threw the returns away and made return_x a
# copy of the price feature.
scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
return_x = scaler.fit_transform(return_x.reshape(return_x.shape[0], 1))

In [87]:
# Timestep Splitting: overlapping windows of `time_step` returns (stride 1).
# The window length comes from time_step and the loop bound from tmp itself,
# so this cell no longer depends on the literal 30 or on the shape of the
# unrelated price array `d`.
tmp = np.zeros((return_x.shape[0] - time_step + 1, time_step, 1))
for i in range(tmp.shape[0]):
    tmp[i] = return_x[i:i + time_step]

In [88]:
# Train/Test Splitting: the first len(signal) return windows are used for
# training; the rest are kept for testing.
n_labeled = signal.shape[0]
return_x_train, return_x_test = tmp[:n_labeled], tmp[n_labeled:]

## DNN

### Build Model

In [95]:
# Fully connected baseline: one scalar input, four tanh hidden layers
# (128 -> 64 -> 32 -> 16) and a 13-way softmax output.
# Disabled alternative input layer kept for reference:
# model0.add(Flatten(input_shape=(30,1), name="input"))
model0 = Sequential([
    Dense(128, activation="tanh", input_shape=(1,), name="fc1"),
    Dense(64, activation="tanh", name="fc2"),
    Dense(32, activation="tanh", name="fc3"),
    Dense(16, activation="tanh", name="fc4"),
    Dense(13, activation="softmax", name="output"),
])

model0.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
fc1 (Dense)                  (None, 128)               256       
_________________________________________________________________
fc2 (Dense)                  (None, 64)                8256      
_________________________________________________________________
fc3 (Dense)                  (None, 32)                2080      
_________________________________________________________________
fc4 (Dense)                  (None, 16)                528       
_________________________________________________________________
output (Dense)               (None, 13)                221       
Total params: 11,341
Trainable params: 11,341
Non-trainable params: 0
_________________________________________________________________


### Train Model

In [96]:
# Train against one-hot targets with categorical cross-entropy and the Adam
# optimizer; accuracy and MSE are reported as additional metrics.
model0.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy", "mse"])

In [97]:
# Fit the dense baseline, holding out 10% for validation and stopping once
# the monitored metric has not improved for 10 epochs.
# The number of input rows is taken from the labels instead of the
# hard-coded 270000, so inputs and targets always have the same length.
# NOTE(review): `data` is the raw, unscaled closing price, not x_scaled;
# confirm that feeding unscaled prices to the tanh layers is intended.
history0 = model0.fit(data[:y_train.shape[0]], y_train, batch_size=128, epochs=500, validation_split=0.1, callbacks=[EarlyStopping(patience=10)])

Train on 243000 samples, validate on 27000 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500


## LSTM

In [104]:
# Hyperparameters
# latent_dim: number of LSTM units in the recurrent layer built below.
latent_dim = 16
# batch_size: fixes the model's batch dimension (via batch_input_shape) and is
# reused for manual batching, evaluation and prediction further down.
batch_size = 300

### Build Model

In [71]:
# LSTM classifier: a sequence-returning LSTM over (time_step, 1) windows,
# flattened and passed through three tanh dense layers (64 -> 32 -> 16) into a
# 13-way softmax. The batch dimension is fixed to `batch_size`.
model1 = Sequential([
    LSTM(latent_dim, return_sequences=True, batch_input_shape=(batch_size, time_step, 1), name="lstm1"),
    Flatten(name="flatten1"),
    Dense(64, activation="tanh", name="fc1"),
    Dense(32, activation="tanh", name="fc2"),
    Dense(16, activation="tanh", name="fc3"),
    Dense(13, activation="softmax", name="output"),
])

model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm1 (LSTM)                 (300, 30, 16)             1152      
_________________________________________________________________
flatten1 (Flatten)           (300, 480)                0         
_________________________________________________________________
fc1 (Dense)                  (300, 64)                 30784     
_________________________________________________________________
fc2 (Dense)                  (300, 32)                 2080      
_________________________________________________________________
fc3 (Dense)                  (300, 16)                 528       
_________________________________________________________________
output (Dense)               (300, 13)                 221       
Total params: 34,765
Trainable params: 34,765
Non-trainable params: 0
_________________________________________________________________


### Train Model

In [111]:
# Train against one-hot targets with categorical cross-entropy and the Adam
# optimizer; accuracy is reported as an additional metric.
model1.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [73]:
# Fit the LSTM on the return windows in their original order (shuffle=False),
# holding out 10% for validation and stopping after 10 epochs without
# improvement.
# model1 is built with a fixed batch dimension of `batch_size`, so the same
# variable is passed here rather than a duplicated literal 300 that would
# silently diverge if the hyperparameter were changed.
history1 = model1.fit(return_x_train, y_train, batch_size=batch_size, epochs=300, validation_split=0.1, shuffle=False, callbacks=[EarlyStopping(patience=10)])

Train on 243000 samples, validate on 27000 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300


In [None]:
# Manual training loop for model1: every epoch draws random mini-batches from
# all but the last `n_holdout` windows, then records loss/accuracy on the full
# training array and on the held-out tail.
n_holdout = 1000
n_fit = return_x_train.shape[0] - n_holdout

h = {"loss": [], "acc": [], "val_loss": [], "val_acc": []}
for e in range(100):
    # Bug fix: the original bound `return_x_train.shape[0]-1000 // batch_size`
    # parsed as `shape[0] - (1000 // batch_size)` because `//` binds tighter
    # than `-`, which ran roughly one batch per sample instead of one pass
    # over the data per epoch.
    for b in range(n_fit // batch_size):
        # Random select batch data (with replacement) from the non-held-out part
        idx = np.random.randint(0, n_fit, batch_size)
        loss, acc = model1.train_on_batch(return_x_train[idx], y_train[idx])
    # NOTE(review): these "training" metrics are computed on the whole array,
    # including the held-out tail — confirm that is intended.
    loss, acc = model1.evaluate(return_x_train, y_train, batch_size=batch_size)
    # NOTE(review): model1 has a fixed batch dimension and n_holdout is not a
    # multiple of batch_size; the final partial batch may be rejected — confirm.
    val_loss, val_acc = model1.evaluate(return_x_train[n_fit:], y_train[n_fit:], batch_size=batch_size)
    print("Epoch %d/100\tloss: %.4f - acc: %.4f - val_loss: %.4f - val_acc: %.4f" % (e+1, loss, acc, val_loss, val_acc))
    h["loss"].append(loss)
    h["acc"].append(acc)
    h["val_loss"].append(val_loss)
    h["val_acc"].append(val_acc)

### Predict

In [37]:
# model1 is fitted on the return windows (return_x_train), so predict on the
# matching return-based test windows rather than on the price windows x_test,
# which are a different feature.
preds1 = model1.predict(return_x_test[:3000], batch_size=batch_size)

In [40]:
# Collapse each row of class probabilities to the index of its most likely
# class. How an index maps back to a signal value depends on how y_train was
# encoded.
preds1 = np.argmax(preds1, axis=1)

## 偉嘉Ver.

### Data Preprocessing

In [115]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jul  9 14:31:02 2018

@author: chia
"""
from keras.utils import np_utils
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from itertools import product
import warnings
warnings.filterwarnings("ignore")

#train_data, test_data = data[:270000], data[270000:]

# Window length (number of consecutive samples per input) and training settings.
Kseconds = 30
epochs = 300
batch = 2048

# One-hot targets, one column per distinct signal value.
target = np.array(pd.get_dummies(signal.flatten()))
# Integer class ids for the sparse loss: shift the signal by +6 so ids start at 0.
y_target = signal + 6
# Drop the first Kseconds labels so that label i lines up with the window of
# Kseconds returns that precedes it. This was a bare literal 30, which would
# silently drift out of sync if Kseconds were changed.
y_target = y_target[Kseconds:]


def normalization_for_class(x):
    """Standardize a 1-D array to zero mean and unit variance.

    Args:
        x: NumPy array whose first dimension is the number of samples; it is
            reshaped to a single column before fitting.

    Returns:
        A tuple ``(x_scaled, ss)`` where ``x_scaled`` is the standardized data
        with shape ``(n, 1)`` and ``ss`` is the fitted ``StandardScaler``,
        returned so the caller can reuse or invert the transform.
    """
    # The unused MinMaxScaler instance and its commented-out call were removed;
    # only the StandardScaler ever affected the result.
    ss = StandardScaler()
    x = ss.fit_transform(x.reshape(x.shape[0], 1))
    return x, ss


def weighted_categorical_crossentropy(weights):
    """
    Build a class-weighted variant of keras.objectives.categorical_crossentropy.

    Variables:
        weights: numpy array of shape (C,) where C is the number of classes

    Returns:
        A function ``loss(y_true, y_pred)`` suitable for ``model.compile``.

    Usage:
        weights = np.array([0.5,2,10]) # Class one at 0.5, class 2 twice the normal weights, class 3 10x.
        loss = weighted_categorical_crossentropy(weights)
        model.compile(loss=loss,optimizer='adam')
    """
    class_weights = K.variable(weights)

    def loss(y_true, y_pred):
        # Renormalize so the class probabilities of every sample sum to 1.
        probs = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        # Keep probabilities away from 0 and 1 so the log stays finite.
        probs = K.clip(probs, K.epsilon(), 1 - K.epsilon())
        # Per-class weighted log-likelihood, summed over the class axis.
        weighted_log_likelihood = y_true * K.log(probs) * class_weights
        return -K.sum(weighted_log_likelihood, -1)

    return loss


def cal_return_old(data):  # computes the return series
    """Return the first differences of a price series.

    Element ``i`` of the result is ``data[i + 1] - data[i]`` (the later
    observation minus the earlier one), so the output has one element fewer
    than the input.

    Args:
        data: array-like of prices; it is flattened to 1-D first.

    Returns:
        1-D NumPy array of length ``max(len(data) - 1, 0)``. The dtype follows
        the input dtype.
    """
    # np.diff replaces the previous hand-rolled version, which zero-padded two
    # copies of the series, subtracted them and deleted the first and last
    # elements again.
    prices = np.array(data).ravel()
    return np.diff(prices)


# Disabled alternative: standardize the raw prices instead of the returns.
#data, ss = normalization_for_class(data)
#data = data.flatten()

# First differences of the closing price, standardized; the scaler returns an
# (n, 1) column, so flatten it back to a 1-D series.
scaled_returns, ss = normalization_for_class(cal_return_old(data))
return_ = scaled_returns.flatten()

"""#price"""
# Overlapping windows of Kseconds consecutive prices, one window per row.
x_train_price = np.zeros((data.shape[0] - Kseconds + 1, Kseconds))
for start, window in enumerate(x_train_price):
    # `window` is a view of the row, so this writes into x_train_price.
    window[:] = data[start: start + Kseconds]

# First 270000 windows are for training, the rest for testing.
x_test_price = x_train_price[270000:]
x_train_price = x_train_price[:270000]

# Subtract each window's own mean. `a` and `b` hold that mean repeated across
# the Kseconds columns so they have the same shape as the windows.
a = np.repeat(np.mean(x_train_price, axis=1, keepdims=True), Kseconds, axis=1)
data_sampling = x_train_price - a
b = np.repeat(np.mean(x_test_price, axis=1, keepdims=True), Kseconds, axis=1)
data_sampling_test = x_test_price - b

"""#return"""
# Overlapping windows of Kseconds consecutive standardized returns.
x_train = np.zeros((return_.shape[0] - Kseconds + 1, Kseconds))
for start, window in enumerate(x_train):
    # `window` is a view of the row, so this writes into x_train.
    window[:] = return_[start: start + Kseconds]

# The first 270000 - Kseconds windows are for training, the rest for testing.
n_train_windows = 270000 - Kseconds
x_test = x_train[n_train_windows:]
x_train = x_train[:n_train_windows]
# One-hot targets built from the signal.
y_train = target

nb_class = 13

# Per-class sample counts, keyed by class id (signal value shifted by +6).
unique, counts = np.unique(signal, return_counts=True)
sum_dict = dict(zip(unique+6, counts))

# "Balanced" class weights: n_samples / (n_classes * class_count).
# The sample count and class count are taken from the data and nb_class
# instead of the literals 270000 and 13.
# A class id that never occurs in the signal raises KeyError here.
n_samples = counts.sum()
c = np.array([n_samples / (sum_dict[k] * nb_class) for k in range(nb_class)])
weights = dict(zip(unique+6, c))

#from sklearn.utils import class_weight
#sample_weights = class_weight.compute_sample_weight('balanced', signal[29:])

### Build Model

In [182]:
def cnn(window_size, nb_input_series, output_dim=1):
    """Build (but do not compile) a small 1-D convolutional classifier.

    Architecture: Conv1D(36 filters, kernel 6) -> PReLU -> Flatten ->
    three L2-regularized dense layers (64, 32, 16), each followed by PReLU ->
    softmax output. The model summary is printed as a side effect.

    Args:
        window_size: number of time steps per input window.
        nb_input_series: number of features per time step.
        output_dim: number of units in the softmax output layer.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    model = Sequential([
        Conv1D(filters=36, kernel_size=6,
               input_shape=(window_size, nb_input_series), name="conv1d"),
        PReLU(name="prelu_1"),
        Flatten(name="faltten"),
        Dense(64, kernel_regularizer="l2", name="fc1"),
        PReLU(name="prelu_2"),
        Dense(32, kernel_regularizer="l2", name="fc2"),
        PReLU(name="prelu_3"),
        Dense(16, kernel_regularizer="l2", name="fc3"),
        PReLU(name="prelu_4"),
        Dense(output_dim, activation="softmax", name="output"),
    ])

    model.summary()

    return model


def cnn_train(model, x, y, epochs=100):
    """Fit ``model`` on 2-D windows ``x`` after adding a trailing feature axis.

    Args:
        model: a compiled Keras model accepting ``(steps, 1)`` inputs.
        x: array of shape ``(n_samples, n_steps)``.
        y: targets passed unchanged to ``model.fit``.
        epochs: maximum number of epochs.

    Returns:
        A tuple ``(model, train_fit)`` where ``train_fit`` is the value
        returned by ``model.fit``.

    Uses the notebook-level ``batch`` as the batch size, holds out 10% for
    validation and stops after 20 epochs without improvement.
    """
    n_samples, n_steps = x.shape[0], x.shape[1]
    x_3d = np.reshape(x, (n_samples, n_steps, 1))
    stopper = EarlyStopping(patience=20)
    train_fit = model.fit(x=x_3d, y=y, validation_split=0.1,
                          epochs=epochs, batch_size=batch, callbacks=[stopper])
    return model, train_fit


# My own approach (correct, and able to overfit; currently uses the price minus the mean of each time-series window)
#model, history = cnn_train(cnn(data_sampling[29:].shape[1], 1, output_dim = 13, loss = "categorical_crossentropy"), data_sampling[29:], y_train[29:], epochs = 100)

#print("Predicting cnn...")
#predict_test = model.predict(data_sampling_test.reshape(data_sampling_test.shape[0],data_sampling_test.shape[1],1))
#predict_train = model.predict(data_sampling.reshape(data_sampling.shape[0],x_train.shape[1],1))
#    
#loss_test = model.evaluate(data_sampling_test.reshape(data_sampling_test.shape[0],data_sampling_test.shape[1],1), predict_test)
#
#test = np.argmax(predict_test, axis = 1) - 6
#train = np.argmax(predict_train, axis = 1) - 6

# Tou-ge's approach
# Build the CNN for windows of x_train.shape[1] steps with one feature per
# step and 13 output classes.
model = cnn(x_train.shape[1], 1, output_dim=13)
# The sparse variant of the loss takes integer class ids as targets rather
# than one-hot rows. Accuracy and MAE are reported as additional metrics.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy", "mae"])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 25, 36)            252       
_________________________________________________________________
prelu_1 (PReLU)              (None, 25, 36)            900       
_________________________________________________________________
faltten (Flatten)            (None, 900)               0         
_________________________________________________________________
fc1 (Dense)                  (None, 64)                57664     
_________________________________________________________________
prelu_2 (PReLU)              (None, 64)                64        
_________________________________________________________________
fc2 (Dense)                  (None, 32)                2080      
_________________________________________________________________
prelu_3 (PReLU)              (None, 32)                32        
__________

### Train

In [183]:
# Class-balanced resampling: draw the same number of training windows from
# every class (with replacement) so rare classes are not swamped, then fit
# the CNN on the resampled set.

# Row index of every labelled window, shaped like y_target so that the same
# boolean mask can index both. The length comes from y_target rather than a
# hard-coded 270000 - Kseconds.
iidx = np.arange(y_target.shape[0]).reshape(y_target.shape[0], 1)
w = np.array([100]*13)  # number of draws per class in each round
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)

# Window indices belonging to each class. These do not change between rounds,
# so they are computed once instead of inside the sampling loops.
class_pools = [iidx[y_target == k] for k in range(len(w))]

n_rounds = 10
sampled = []
for _ in range(n_rounds):
    for k, pool in enumerate(class_pools):
        # randint raises if a class has no samples (empty pool).
        picks = np.random.randint(0, high=pool.shape[0], size=w[k])
        sampled.append(pool[picks])
# Concatenate once at the end instead of growing an array with np.append
# inside the loop.
mat_idx = np.concatenate(sampled).astype('int32')

history = model.fit(x_train[mat_idx], y_target[mat_idx, :],
                    epochs=500, batch_size=256, validation_split=0.1, callbacks=[EarlyStopping(patience=20)])

Train on 11700 samples, validate on 1300 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/50

Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500


### Predict

In [160]:
def _as_cnn_input(windows):
    """Reshape windows to (n_samples, n_steps, 1), the input shape of the CNN."""
    return windows.reshape(windows.shape[0], windows.shape[1], 1)


predict_test = model.predict(_as_cnn_input(x_test))
predict_train = model.predict(_as_cnn_input(x_train))
# Most likely class id per row, shifted by -6 to undo the +6 offset that was
# applied to the signal when the targets were built.
pred_test = predict_test.argmax(axis=1) - 6
pred_train = predict_train.argmax(axis=1) - 6

In [168]:
# Persist the decoded test predictions; np.save appends the ".npy" extension,
# so the file written is "pred7231.npy".
np.save("pred7231", pred_test)