Dataset Classification

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers

import sklearn
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix

import mplfinance as mplf

In [None]:
# Record the versions of the main libraries this notebook was run with.
tuple(lib.__version__ for lib in (np, pd, tf, keras, sklearn, mplf))

In [None]:
# Random seed for reproducible runs; the training cell seeds NumPy with this same value (1291).
SEED = 1291

In [None]:
# Raw OHLC bars; the first CSV column is parsed as dates and used as the index.
ohlc = pd.read_csv(
    "../input/candle-stick-patterns/ohlc.csv",
    index_col=0,
    parse_dates=True,
)
print(ohlc.shape)
ohlc.iloc[:3]

# Candlestick image records from the EDA notebook, ordered by image id.
# NOTE(review): read_pickle can execute arbitrary code; only load pickles from a trusted source.
data_df = (
    pd.read_pickle("../input/candlestick-eda/data_df.pkl")
    .sort_values("imgID")
    .reset_index(drop=True)
)
print(data_df.shape)
data_df.iloc[:3]

# Number of image records; later cells truncate the OHLC-derived series to this length.
Data_Size = len(data_df)

In [None]:
# Put the first Data_Size OHLC rows side by side with the image records.
# Rows are matched purely by position (both frames carry a fresh 0..n-1 index),
# which presumably relies on image i having been rendered from bar i — TODO confirm.
# NOTE: this cell rebinds data_df from itself, so running it twice duplicates the OHLC columns.
ohlc_rows = ohlc.reset_index(drop=True).iloc[:Data_Size]
# `axis` must be passed by keyword: positional use is deprecated and rejected by pandas 2.x.
data_df = pd.concat([ohlc_rows, data_df], axis=1)
print(data_df.shape)
data_df.head(3)

In [None]:
NBars = 3
window_size = 5
targetBarPos = 3

# Relative change between the current high and a centred rolling mean of the
# highs located (targetBarPos + NBars) bars ahead; this is the quantity being labelled.
future_high = (
    ohlc["high"]
    .shift(-(targetBarPos + NBars))
    .rolling(window_size, center=True)
    .mean()
)
pct_changes = (future_high - ohlc["high"]) / ohlc["high"]

pct_changes.plot(kind="hist", bins=120)
plt.xlim(-0.5, 0.5)
plt.xscale("symlog")
plt.show()

# Cut the changes into equal-frequency bins (qsize = 0.125 gives 8 bins).
qsize = 0.125
qs = np.arange(0, 1 + qsize, qsize)
qranges = pd.qcut(pct_changes, q=qs)
print(qs)
print(qranges.value_counts())

# Class encoding: 1 = two highest bins, 2 = two lowest bins, 0 = everything else.
# The bin codes are derived from the number of bins instead of being hard-coded,
# so the mapping stays correct if qsize changes (for 8 bins: 7, 6 -> 1 and 0, 1 -> 2).
n_bins = len(qranges.cat.categories)
assert n_bins >= 4, "need at least 4 quantile bins to separate top-2 and bottom-2"
label_map = {
    n_bins - 1: 1, n_bins - 2: 1,
    0: 2, 1: 2,
}

data_df["h_labels"] = qranges.cat.codes.values[:Data_Size]
data_df["ts"] = qranges.index[:Data_Size]
data_df = data_df.set_index("ts")
data_df["h_labels"] = data_df["h_labels"].map(label_map)
# Codes missing from label_map become NaN and are folded into class 0 here.
# NOTE: that includes code -1 (rows whose pct_change is NaN, e.g. at the edges of
# the shift/rolling window); dropping those rows instead is still a TODO.
data_df["h_labels"] = data_df["h_labels"].fillna(0)

# Show which quantile interval corresponds to which bin code.
{interval: code for code, interval in enumerate(qranges.cat.categories)}

In [None]:
# Preview the first ten labelled rows.
data_df.iloc[:10]

Prepare X, y

In [None]:
# Sort once and take both the images and the labels from the same frame, so that
# row i of X always belongs to label i of y (previously only X was sorted by imgID).
# presumably imgID order is also chronological order, which the time-series split
# further down relies on — TODO confirm.
samples_by_img = data_df.sort_values("imgID")

# Each image is reshaped to (1, 40, 40, 1) and stacked along the first axis.
X = np.concatenate(
    [img.reshape(1, 40, 40, 1) for img in samples_by_img["imgData"]]
)

y = samples_by_img["h_labels"]

assert len(X) == len(y), "images and labels must have the same number of rows"

In [None]:
# Class counts. Per the h_labels mapping: 1 = two highest quantile bins,
# 2 = two lowest quantile bins, 0 = every other row.
y.value_counts() # 1: buy, 2: sell

In [None]:
# Weight each class by (size of the largest class / size of this class),
# so the most frequent class gets weight 1.0 and rarer classes get more.
label_counts = y.astype(int).value_counts()
class_weights = (label_counts.max() / label_counts).to_dict()
class_weights

Split Data

In [None]:
# Sanity check of the sample array's shape before splitting.
X.shape

In [None]:
# Chronological split: only the first of the two TimeSeriesSplit folds is used.
tss = TimeSeriesSplit(n_splits=2)
train_idx, test_idx = next(tss.split(X, y))

X_tr, X_ts = X[train_idx], X[test_idx]

# y is indexed by timestamp ("ts"), while train_idx / test_idx are integer
# positions, so select with .iloc instead of relying on the deprecated positional
# fallback of Series[...].
# num_classes is pinned to 3 (the width of the classifier's softmax output) so both
# targets have 3 columns even if a class is absent from one fold.
y_tr = keras.utils.to_categorical(y.iloc[train_idx].to_numpy(), num_classes=3)
y_ts = keras.utils.to_categorical(y.iloc[test_idx].to_numpy(), num_classes=3)

[x.shape for x in [X_tr, X_ts, y_tr, y_ts]]

In [None]:
# Start from a clean Keras session before loading the pretrained autoencoder.
keras.backend.clear_session()

model = keras.models.load_model('../input/candle-stick-autoencoder/best_model.ckp')

# Reuse the first 15 layers as a feature extractor.
# presumably these layers form the encoder half of the autoencoder —
# verify against model.summary().
encoder = keras.models.Sequential(model.layers[:15])
encoder.compile(loss='binary_crossentropy', optimizer="adam")

# Freeze every reused layer so its weights are not updated by later training.
for layer in encoder.layers:
    layer.trainable = False

encoder.summary()

In [None]:
def define_model():
    """Build and compile the 3-class classifier on top of the frozen encoder.

    The network is the layers of the module-level ``encoder`` followed by a
    dense head (256 -> 128 -> 64 -> 12 units) and a 3-way softmax output.

    Returns:
        A compiled ``keras.models.Sequential`` expecting inputs of shape
        (batch, 40, 40, 1) and one-hot targets with 3 columns.
    """
    # NOTE(review): the BatchNormalization layers are created with trainable=False.
    # Per the Keras docs a frozen BatchNormalization layer runs in inference mode,
    # so these layers would keep their initial statistics — confirm this is intended.
    head = [
        layers.Flatten(),
        layers.Dense(256, activation="relu", name="dense_clf"),
        layers.BatchNormalization(trainable=False, name="bn1"),
        layers.Dropout(0.3, name="drop_clf1"),
        layers.Dense(128, activation="relu", name="dense_clf_1"),
        layers.BatchNormalization(trainable=False, name="bn2"),
        layers.Dropout(0.3, name="drop_clf2"),
        layers.Dense(64, activation="relu", name="dense_clf_2"),
        layers.BatchNormalization(trainable=False, name="bn3"),
        layers.Dropout(0.3, name="drop_clf3"),
        layers.Dense(12, activation="relu", name="dense_clf_3"),
        layers.Dense(3, activation="softmax", name="dense_clf_4"),
    ]
    clf = keras.models.Sequential(encoder.layers + head)

    adam = keras.optimizers.Adam(learning_rate=0.001)
    # Mutually exclusive classes with a softmax output and one-hot targets call for
    # categorical cross-entropy; binary cross-entropy treats the 3 outputs as
    # independent yes/no problems.
    clf.compile(optimizer=adam, loss='categorical_crossentropy')
    # Leave the batch dimension unspecified so the model accepts any batch size.
    clf.build(input_shape=(None, 40, 40, 1))
    return clf

clf = define_model()
clf.summary()

In [None]:
# Seed both NumPy and TensorFlow before the model is created, so weight
# initialisation, dropout and shuffling are repeatable (SEED is the notebook-level constant).
np.random.seed(SEED)
tf.random.set_seed(SEED)

clf = define_model()

# Stop once val_loss has not improved by at least min_delta for 15 epochs,
# then roll back to the best weights seen.
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.00001,
    patience=15,
    verbose=1,
    mode='auto',
    restore_best_weights=True)
# Save the full model (not only the weights) whenever val_loss improves.
ckp = keras.callbacks.ModelCheckpoint(
    filepath="best_model_clf.ckp",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    save_freq='epoch',
)

# validation_split=0.2 holds out the last 20% of the training rows for validation.
# NOTE: class_weights (computed earlier) is not passed to fit; add
# class_weight=class_weights to the call to enable it.
clf.fit(X_tr.astype(np.float32), y_tr.astype(np.float32),
                batch_size=128,
                epochs=1500,
                verbose=1,
                validation_split=0.2,
                callbacks=[es, ckp]
               )

In [None]:
# Training vs. validation loss per epoch, on a log-scaled y axis.
# Labels and a legend are added so the two curves can be told apart.
history = clf.history.history

fig, ax = plt.subplots()
ax.plot(history["loss"], ".:", label="loss")
ax.plot(history["val_loss"], ".:", label="val_loss")
ax.set_yscale("log")
ax.set_xlabel("epoch")
ax.set_ylabel("loss (log scale)")
ax.set_title("Classifier training history")
ax.legend()
plt.show()

In [None]:
# Same loss curves as the cell above: training and validation loss, log-scaled y axis.
for curve in ("loss", "val_loss"):
    plt.plot(clf.history.history[curve], ".:")
plt.yscale("log")

In [None]:
# Predicted class per training sample = arg-max over the softmax outputs.
# (pred_y_tr was never defined in the notebook, so this cell failed on a fresh run.)
pred_y_tr = clf.predict(X_tr.astype(np.float32)).argmax(1)
true_y_tr = y_tr.argmax(1)

cr = classification_report(true_y_tr, pred_y_tr)
print(cr)

# Pin the label set so the matrix is always 3x3, even if a class never occurs.
class_ids = [0, 1, 2]
cm = pd.DataFrame(
    confusion_matrix(true_y_tr, pred_y_tr, labels=class_ids),
    columns=pd.MultiIndex.from_product([["pred"], class_ids]),
    index=pd.MultiIndex.from_product([["true"], class_ids]),
)
cm.style.background_gradient()

In [None]:
# Predicted class per test sample = arg-max over the softmax outputs.
# (pred_y_ts was never defined in the notebook, so this cell failed on a fresh run.)
pred_y_ts = clf.predict(X_ts.astype(np.float32)).argmax(1)
true_y_ts = y_ts.argmax(1)

cr = classification_report(true_y_ts, pred_y_ts)
print(cr)

# Pin the label set so the matrix is always 3x3, even if a class never occurs.
class_ids = [0, 1, 2]
cm = pd.DataFrame(
    confusion_matrix(true_y_ts, pred_y_ts, labels=class_ids),
    columns=pd.MultiIndex.from_product([["pred"], class_ids]),
    index=pd.MultiIndex.from_product([["true"], class_ids]),
)
cm.style.background_gradient()

Test Series

In [None]:
# Position (into X / y) of the first sample in the held-out test fold.
test_start_idx = test_idx[0]
test_start_idx