In [1]:
import numpy as np
import pandas as pd
from collections import deque
import random

import sys
sys.path.append("..")
from Environment import cryptoTrade

In [2]:
from tensorflow.keras import layers
import tensorflow as tf

def convet_to_ragged_tensor(obs, single=True):
    # Make sure nesting depth is consistent
    if single:
        for i, value in enumerate(obs):
            if not isinstance(value, list):
                obs[i] = list([value])

        return tf.ragged.constant([obs])

    else:
        for i, entry in enumerate(obs):
            for j, value in enumerate(entry):
                if not isinstance(value, list):
                    obs[i][j] = list([value])

        return tf.ragged.constant(obs)
    
init = tf.keras.initializers.he_uniform(seed=None)

def agent(observation_space, action_space):
    
    # Convert input to a ragged tensor
    observation_space_tensor = convet_to_ragged_tensor(observation_space)
    
    # Get maximum sequence length
    max_seq = observation_space_tensor.bounding_shape()[-1]
    
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=[None, max_seq], dtype=tf.float32, ragged=True),
        tf.keras.layers.LSTM(64, kernel_initializer=init),
        tf.keras.layers.Dense(len(action_space), activation='linear', kernel_initializer=init)
    ])
    
    # Can also use Huber loss?
    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=['accuracy'])
    return model

In [3]:
def train(env, memory, model, target_model, done):
    learning_rate = 0.7
    discount_factor = 0.9
    
    MIN_REPLAY_SIZE = 1000
    if len(memory) < MIN_REPLAY_SIZE:
        return
    
    batch_size = 64
    mini_batch_indexes = np.random.choice(np.arange(len(memory)), size=batch_size, replace=False)

    current_states = [memory[i][0] for i in mini_batch_indexes]
    current_qs_list = model.predict(convet_to_ragged_tensor(current_states, single=False))
    
    new_current_states = [memory[i][3] for i in mini_batch_indexes]
    future_qs_list = target_model.predict(convet_to_ragged_tensor(new_current_states, single=False))

    X = []
    Y = []
    for index, i in enumerate(mini_batch_indexes):
        (observation, action, reward, new_observation, done) = memory[i]
        if not done:
            max_future_q = reward + discount_factor * np.max(future_qs_list[index])
        else:
            max_future_q = reward

        current_qs = current_qs_list[index]
        current_qs[action] = (1 - learning_rate) * current_qs[action] + learning_rate * max_future_q

        X.append(observation)
        Y.append(current_qs)
    
    X = convet_to_ragged_tensor(X, single=False)
    model.fit(X, np.array(Y), batch_size=batch_size, verbose=0, shuffle=True)

In [6]:
# The main loop

data_path = "../data/Coinbase_BTCUSD_dailydata.csv"
env = cryptoTrade(data_path)
env.reset()

epsilon, max_epsilon, min_epsilon = 1, 1, 0.01
decay = 0.01

model = agent(env.observation_space, env.action_space)
target_model = agent(env.observation_space, env.action_space)
target_model.set_weights(model.get_weights())

memory = []

X = []
y = []

f = open("20220329_RNNTraining.txt","w")

steps_to_update_target_model = 0
for episode in range(200):
    total_training_rewards = 0
    
    observation = env.reset()
    done = False
    while not done:
        steps_to_update_target_model += 1

        if np.random.rand() <= epsilon:
            action = int(np.random.choice(len(env.action_space)))
        else: 
            # Choose the best action
            action = int(model.predict(convet_to_ragged_tensor(observation, single=True)).argmax())
            
        # Now step the simulation
        new_observation, reward, done = env.step(action)
        memory.append([observation, action, reward, new_observation, done])
        
        # Update the neural network
        if steps_to_update_target_model % 4 ==0:#or done:
            train(env, memory, model, target_model, done)
            
        #observation = new_observation
        total_training_rewards += reward
        
        if done:
            print('Total training rewards: {} after n steps = {} with final reward = {}'.format(total_training_rewards, episode, reward))
            #total_training_rewards += 1
            txt = "{:.2f}\n"
            f.write(txt.format(total_training_rewards))

            if steps_to_update_target_model >= 100:
                print('Copying main network weights to the target network weights')
                target_model.set_weights(model.get_weights())
                steps_to_update_target_model = 0
            break
        
    # Update epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)

f.close()

Total training rewards: -68981142.37 after n steps = 0 with final reward = -1964840.0
Copying main network weights to the target network weights
Total training rewards: 17726838.199999962 after n steps = 1 with final reward = 147363.0
Copying main network weights to the target network weights
Total training rewards: 1092511311.3300002 after n steps = 2 with final reward = 1817477.0
Copying main network weights to the target network weights
Total training rewards: 1191900145.8300002 after n steps = 3 with final reward = 11789040.0
Copying main network weights to the target network weights
Total training rewards: 1988990188.9499996 after n steps = 4 with final reward = 13606517.0
Copying main network weights to the target network weights
Total training rewards: -34975265.10999998 after n steps = 5 with final reward = 491210.0
Copying main network weights to the target network weights
Total training rewards: 2144544637.010001 after n steps = 6 with final reward = 12574976.0
Copying main n

## Model implementation

In [None]:
# Set up the environment to predict over
observation = env.reset()
done = False
val_memory = []

while not done:
    action = int(model.predict(convet_to_ragged_tensor(observation, single=True)).argmax())
    new_observation, reward, done = env.step(action)
    val_memory.append([observation, action, reward, new_observation, done])

In [None]:
# Run a simulation to see what happened
data = env.data
actions = np.array(val_memory, dtype=object)[:, 1]
data

Unnamed: 0,unix,low,high,open,close,volume,date,vol_fiat
0,1645142400,40323.85,40972.30,40537.94,40659.51,3062.079456,2022-02-18,1.245027e+08
1,1645056000,40099.99,44195.62,43895.55,40536.73,18630.108422,2022-02-17,7.552037e+08
2,1644969600,43330.59,44585.69,44580.73,43895.56,9663.440404,2022-02-16,4.241821e+08
3,1644883200,42433.28,44775.96,42548.71,44583.52,14154.734526,2022-02-15,6.310679e+08
4,1644796800,41570.00,42876.15,42073.37,42548.71,14805.983388,2022-02-14,6.299755e+08
...,...,...,...,...,...,...,...,...
295,1619654400,52369.61,55226.86,54889.81,53580.00,14592.316888,2021-04-29,7.818563e+08
296,1619568000,53887.00,56476.17,55069.61,54894.03,16484.336777,2021-04-28,9.048917e+08
297,1619481600,53321.00,55509.39,54047.80,55069.62,13957.086495,2021-04-27,7.686114e+08
298,1619395200,48817.62,54400.00,49121.00,54053.60,18005.223994,2021-04-26,9.732472e+08


array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4], dtype=object)