In [1]:
import numpy as np
import time
import os

import keras

# import matplotlib.pyplot as plt
import pandas as pd
import random

import tensorflow as tf
from tensorflow.keras import models

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
# from sklearn.utils import shuffle
from sklearn.utils import class_weight
# from sklearn.metrics import r2_score
# from sklearn.metrics import mean_absolute_error
from tensorflow.keras.optimizers import Adam
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

os.environ["KERAS_BACKEND"] = "tensorflow"
os.environ["TF_ENABLE_GPU_GARBAGE_COLLECTION"] = 'false'
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

tensorboard = TensorBoard(log_dir="./logs")

DEBUG = False

2022-01-25 16:56:35.176898: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-25 16:56:35.211436: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-25 16:56:35.211608: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [2]:
column_names = ['timestamp','open','close','high','low','volume','adosc','atr','macd','macd_signal','macd_hist','mfi','upper_band','middle_band','lower_band','rsi','difference_low_high','difference_open_close','target']

data_directory = '/home/joren/Coding/cryptodata/Normalized_labelled/'
max_df_length = 40000

#####################################
frame_size = 240
batch_size = 128
layers = 5
layer_sizes = [128]*layers
dropouts = [0.1]*layers
batchnormalizations = [1]*layers
learning_rate = 0.0001
optimizer = Adam(learning_rate)

# class_weights = {0: 1,
#                 1: 50.,
#                 2: 50.}

#####################################

In [3]:
field_info = [
    { "type": np.uint64, "count": 1 },
    { "type": np.double, "count": 17 },
    { "type": np.int64, "count": 1 }
]
BYTES_EIGHT = 8

def read_bin_full_file(file):
    f = open(file, 'rb')
    b = f.read(-1)

    BYTES_TO_READ = 0
    for field in field_info:
        BYTES_TO_READ += BYTES_EIGHT * field["count"]

    data = []
    BYTES_READ = 0
    for i in range(0, int(os.path.getsize(file) / BYTES_TO_READ)):
        row = []

        for idx, field in enumerate(field_info):
            row += np.frombuffer(b, dtype=field["type"], count=field["count"], offset=BYTES_READ).tolist()

            BYTES_READ += BYTES_EIGHT * field["count"]

        data.append(row)
    return np.array(data)

In [4]:
def random_file():
    filenames = []
    for file in os.listdir(data_directory):
        filename = os.fsdecode(file)
        filenames.append(filename)
        
    randomname = filenames[random.randint(0, len(filenames)-1)]
    if randomname.endswith(".bin"): 
        print(f"reading file: {os.path.join(data_directory, randomname)}")
        return os.path.join(data_directory, randomname)

In [5]:
class DQN(tf.keras.Model):
    def __init__(self, n_actions, feature_size, layers = 2, layer_sizes = [128, 128], dropouts = [0.1, 0], batchnormalizations = [0, 0], optimizer='adam'):
        super().__init__()
        self._n_actions = n_actions
        self._feature_size = feature_size
        self._frame_size = frame_size

        self._model = self.create_model(layers, layer_sizes, dropouts, batchnormalizations, optimizer)
    
    def create_model(self, layers, layer_sizes, dropouts, batchnormalizations, optimizer):
        model = Sequential()

        for i in range(0, layers):
            if i == 0:
                model.add(LSTM(units=layer_sizes[i], return_sequences = True, stateful = True, batch_input_shape = (batch_size, self._frame_size, self._feature_size)))
            elif i == layers:
                model.add(LSTM(units=layer_sizes[i]))
            elif i >= len(layer_sizes):
                model.add(LSTM(units=layer_sizes[0], return_sequences = True, stateful = True))
            else:
                model.add(LSTM(units=layer_sizes[i], return_sequences = True, stateful = True))
            
            if i < len(dropouts) and dropouts[i] > 0:
                model.add(Dropout(dropouts[i]))
            if i < len(batchnormalizations) and batchnormalizations[i] == 1:
                model.add(BatchNormalization()) 

        model.add(LSTM(units=layer_sizes[0], return_sequences = False, stateful = True))
        
        model.add(Dense(units=128, activation='relu'))
        model.add(Dense(units=self._n_actions, activation='softmax'))
        
        model.compile(optimizer=optimizer, loss=keras.losses.categorical_crossentropy, metrics=['accuracy'])

        if DEBUG:
            print(model.summary())
        return model


In [6]:
# train-test split
def df_split(df):  
    X = df.drop(columns=['timestamp','target'], axis=0).to_numpy()
    Y = df['target'].to_numpy()

    X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, Y, test_size=0.5, shuffle=False)

    y_train_raw = to_categorical(y_train_raw, 3).tolist()
    y_test_raw = to_categorical(y_test_raw, 3).tolist()

    X_train = []
    y_train = []
    # y_train_weights = []
    for i in range(frame_size, X_train_raw.shape[0]): #frame size up to size of array
        X_train.append(X_train_raw[i-frame_size:i])
        # y_train.append(y_train_raw[i-frame_size:i]) # dit wil ik dus graag veranderen naar y_train_raw[i] zodat we enkel op het einde de output hebben
        y_train.append(y_train_raw[i])
    X_train, y_train = np.array(X_train), np.array(y_train)
    # X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], X_train.shape[2], 1))


    X_test = []
    y_test = []
    for i in range(frame_size, X_test_raw.shape[0]): #frame size up to size of array
        X_test.append(X_test_raw[i-frame_size:i])
        # y_test.append(y_test_raw[i-frame_size:i])
        y_test.append(y_test_raw[i])
    X_test, y_test = np.array(X_test), np.array(y_test)

    if DEBUG:
        print(f"""
        X_train shape: {X_train.shape}
        y_train shape: {y_train.shape}
        """)

        print(f"""
        X_train[0] shape: {X_train[0].shape}
        y_train[0] shape: {y_train[0].shape}
        """)

        print(f"X_train[0]: {X_train[0]}")
        print(f"y_train[0]: {y_train[0]}")

    return X_train, X_test, y_train, y_test

In [7]:
# model initialization
dqn = DQN(3, 17, layers, layer_sizes, dropouts, batchnormalizations, optimizer)
# dqn._model = models.load_model('./models/model1')

def test_accuracy(model):
    y_pred_raw = np.array(model.predict(X_test))

    y_pred = np.argmax(y_pred_raw, axis=-1, keepdims=True)
    y_pred = y_pred.flatten()
    y_test_2 = np.argmax(y_test, axis=-1, keepdims=True)
    y_test_2 = y_test_2.flatten()

    # print(y_pred_raw.shape)
    # print(y_test.shape)

    # print(y_pred_raw[0])
    # print(y_test[0])

    # y_pred = y_pred_raw
    # y_test_2 = y_test

    print(f"""
    Class. report:
    {classification_report(y_test_2, y_pred)}
    """)

    cf = confusion_matrix(y_test_2, y_pred)

    print(cf)
    print(accuracy_score(y_test_2, y_pred) * 100) 


2022-01-25 16:56:35.400926: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-25 16:56:35.402055: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-25 16:56:35.402427: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-25 16:56:35.402647: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [8]:
for i in range(100):
    file = random_file()
    data = read_bin_full_file(file)

    df = pd.DataFrame(data, columns=column_names) # variable from cell 1

    df.fillna(0, inplace=True)
    # if df.isnull().values.any():
    #     print('nan values found')
    #     continue

    df = df.iloc[240:]

    if len(df) > max_df_length:
        randstart = random.randint(0, len(df)-max_df_length)
        df = df.iloc[randstart:randstart+max_df_length]

    X_train, X_test, y_train, y_test = df_split(df)

    # print(X_train.shape)
    # print(y_train.shape)

    y_train_list = np.argmax(y_train, axis=-1)
    next_file = False
    for i in [0,1,2]:
        print(y_train_list.tolist().count(i))
        if y_train_list.tolist().count(i) < 150:
            next_file = True
    if next_file:
        continue

    class_weights = dict(enumerate(class_weight.compute_class_weight( class_weight='balanced', classes=[0,1,2], y = y_train_list )))
    print(class_weights)

    dqn._model.fit(X_train, y_train, epochs = 10, batch_size = 320, callbacks=[tensorboard], class_weight=class_weights)
    test_accuracy(dqn._model)

reading file: /home/joren/Coding/cryptodata/Normalized_labelled/KAVAUSDT.bin
19671
44
45
reading file: /home/joren/Coding/cryptodata/Normalized_labelled/CREAMBNB.bin
19357
202
201
{0: 0.3402731139467204, 1: 32.60726072607261, 2: 32.769485903814264}


2022-01-25 16:56:50.845960: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 322483200 exceeds 10% of free system memory.
2022-01-25 16:56:51.030250: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 322483200 exceeds 10% of free system memory.


Epoch 1/10


2022-01-25 16:56:57.651747: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8300




In [None]:
# save model
dqn._model.save(f'models/model_small')

2022-01-25 13:24:59.214609: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: models/model1/assets


INFO:tensorflow:Assets written to: models/model1/assets
