In [1]:
import numpy as np
import pandas as pd
from collections import deque
import random

import sys
sys.path.append("..")
from Environment import cryptoTrade




In [2]:
from tensorflow.keras import layers
import tensorflow as tf

def convet_to_ragged_tensor(obs, single=True):
    # Make sure nesting depth is consistent
    if single:
        for i, value in enumerate(obs):
            if not isinstance(value, list):
                obs[i] = list([value])

        return tf.ragged.constant([obs])

    else:
        for i, entry in enumerate(obs):
            for j, value in enumerate(entry):
                if not isinstance(value, list):
                    obs[i][j] = list([value])

        return tf.ragged.constant(obs)
    
init = tf.keras.initializers.he_uniform(seed=None)

def agent(observation_space, action_space):
    
    # Convert input to a ragged tensor
    observation_space_tensor = convet_to_ragged_tensor(observation_space)
    
    # Get maximum sequence length
    max_seq = observation_space_tensor.bounding_shape()[-1]
    
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=[None, max_seq], dtype=tf.float32, ragged=True),
        tf.keras.layers.LSTM(64, kernel_initializer=init),
        tf.keras.layers.Dense(len(action_space), activation='linear', kernel_initializer=init)
    ])
    
    # Can also use Huber loss?
    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=['accuracy'])
    return model

In [23]:
def train(env, memory, model, target_model, done):
    learning_rate = 0.7
    discount_factor = 0.6
    
    MIN_REPLAY_SIZE = 1000
    if len(memory) < MIN_REPLAY_SIZE:
        return
    
    batch_size = 64
    mini_batch_indexes = np.random.choice(np.arange(len(memory)), size=batch_size, replace=False)

    current_states = [memory[i][0] for i in mini_batch_indexes]
    current_qs_list = model.predict(convet_to_ragged_tensor(current_states, single=False))
    
    new_current_states = [memory[i][3] for i in mini_batch_indexes]
    future_qs_list = target_model.predict(convet_to_ragged_tensor(new_current_states, single=False))

    X = []
    Y = []
    for index, i in enumerate(mini_batch_indexes):
        (observation, action, reward, new_observation, done) = memory[i]
        if not done:
            max_future_q = reward + discount_factor * np.max(future_qs_list[index])
        else:
            max_future_q = reward

        current_qs = current_qs_list[index]
        current_qs[action] = (1 - learning_rate) * current_qs[action] + learning_rate * max_future_q

        X.append(observation)
        Y.append(current_qs)
    
    X = convet_to_ragged_tensor(X, single=False)
    model.fit(X, np.array(Y), batch_size=batch_size, verbose=0, shuffle=True)

In [26]:
# The main loop

data_path = "../data/Coinbase_BTCUSD_dailydata.csv"
env = cryptoTrade(data_path)
env.reset()

epsilon, max_epsilon, min_epsilon = 1, 1, 0.01
decay = 0.01

model = agent(env.observation_space, env.action_space)
target_model = agent(env.observation_space, env.action_space)
target_model.set_weights(model.get_weights())

memory = []

X = []
y = []

steps_to_update_target_model = 0
for episode in range(200):
    total_training_rewards = 0
    
    observation = env.reset()
    done = False
    while not done:
        steps_to_update_target_model += 1

        if np.random.rand() <= epsilon:
            action = int(np.random.choice(len(env.action_space)))
        else: 
            # Choose the best action
            action = int(model.predict(convet_to_ragged_tensor(observation, single=True)).argmax())
            
        # Now step the simulation
        new_observation, reward, done = env.step(action)
        memory.append([observation, action, reward, new_observation, done])
        
        # Update the neural network
        if steps_to_update_target_model % 4 ==0:#or done:
            train(env, memory, model, target_model, done)
            
        #observation = new_observation
        total_training_rewards += reward
        
        if done:
            print('Total training rewards: {} after n steps = {} with final reward = {}'.format(total_training_rewards, episode, reward))
            total_training_rewards += 1

            if steps_to_update_target_model >= 100:
                print('Copying main network weights to the target network weights')
                target_model.set_weights(model.get_weights())
                steps_to_update_target_model = 0
            break
        
    # Update epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)

MAX SEQ tf.Tensor(10, shape=(), dtype=int64)
(10, 256)
MAX SEQ tf.Tensor(10, shape=(), dtype=int64)
(10, 256)
Total training rewards: -1214272562.18 after n steps = 0 with final reward = -7515513.0
Copying main network weights to the target network weights
Total training rewards: -248919875.25999996 after n steps = 1 with final reward = -1473630.0
Copying main network weights to the target network weights


2022-03-28 19:09:40.481648: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-28 19:09:40.519334: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Total training rewards: 876495917.1900002 after n steps = 2 with final reward = 3438470.0
Copying main network weights to the target network weights


2022-03-28 19:09:40.698608: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-28 19:09:40.734374: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-28 19:09:41.148614: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-28 19:09:41.267906: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-28 19:09:41.335249: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Total training rewards: 951442026.35 after n steps = 3 with final reward = 4077043.0
Copying main network weights to the target network weights
Total training rewards: 339294564.18 after n steps = 4 with final reward = 2406929.0
Copying main network weights to the target network weights
Total training rewards: -846976675.29 after n steps = 5 with final reward = -4912100.0
Copying main network weights to the target network weights
Total training rewards: 681500648.92 after n steps = 6 with final reward = 3684075.0
Copying main network weights to the target network weights
Total training rewards: -1513475880.6599998 after n steps = 7 with final reward = -11985524.0
Copying main network weights to the target network weights
Total training rewards: -1760244029.2499998 after n steps = 8 with final reward = -10266289.0
Copying main network weights to the target network weights
Total training rewards: -4007587526.5800004 after n steps = 9 with final reward = -23185112.0
Copying main network w

Total training rewards: -5236374304.129998 after n steps = 57 with final reward = -41409003.0
Copying main network weights to the target network weights
Total training rewards: -5601320083.990005 after n steps = 58 with final reward = -44307142.0
Copying main network weights to the target network weights
Total training rewards: -6923744270.210001 after n steps = 59 with final reward = -56292666.0
Copying main network weights to the target network weights
Total training rewards: -5056756077.790003 after n steps = 60 with final reward = -41065156.0
Copying main network weights to the target network weights
Total training rewards: -7694321294.919996 after n steps = 61 with final reward = -53296285.0
Copying main network weights to the target network weights
Total training rewards: -6061537002.979998 after n steps = 62 with final reward = -46812313.0
Copying main network weights to the target network weights
Total training rewards: -6110772749.060001 after n steps = 63 with final reward = 

Total training rewards: -4358104697.5599985 after n steps = 111 with final reward = -37528444.0
Copying main network weights to the target network weights
Total training rewards: -5180216178.560002 after n steps = 112 with final reward = -38707348.0
Copying main network weights to the target network weights
Total training rewards: -4860074067.850003 after n steps = 113 with final reward = -36447782.0
Copying main network weights to the target network weights
Total training rewards: -4921915141.509997 after n steps = 114 with final reward = -35514483.0
Copying main network weights to the target network weights
Total training rewards: -5687113304.04 after n steps = 115 with final reward = -40573946.0
Copying main network weights to the target network weights
Total training rewards: -4183810367.839998 after n steps = 116 with final reward = -34679426.0
Copying main network weights to the target network weights
Total training rewards: -4986171196.340002 after n steps = 117 with final rewar

Total training rewards: -1234633574.7599998 after n steps = 165 with final reward = -9234748.0
Copying main network weights to the target network weights
Total training rewards: -1958834667.2 after n steps = 166 with final reward = -13655638.0
Copying main network weights to the target network weights
Total training rewards: -1490222057.7199998 after n steps = 167 with final reward = -13164428.0
Copying main network weights to the target network weights
Total training rewards: -1183486604.1199994 after n steps = 168 with final reward = -11543435.0
Copying main network weights to the target network weights
Total training rewards: -1286777488.0199993 after n steps = 169 with final reward = -10855741.0
Copying main network weights to the target network weights
Total training rewards: -411485737.38000005 after n steps = 170 with final reward = -4568253.0
Copying main network weights to the target network weights
Total training rewards: -2287632597.1200004 after n steps = 171 with final rew

## Model implementation

In [30]:
# Set up the environment to predict over
observation = env.reset()
done = False
val_memory = []

while not done:
    action = int(model.predict(convet_to_ragged_tensor(observation, single=True)).argmax())
    new_observation, reward, done = env.step(action)
    val_memory.append([observation, action, reward, new_observation, done])

In [39]:
# Run a simulation to see what happened
data = env.data
actions = np.array(val_memory, dtype=object)[:, 1]
data

Unnamed: 0,unix,low,high,open,close,volume,date,vol_fiat
0,1645142400,40323.85,40972.30,40537.94,40659.51,3062.079456,2022-02-18,1.245027e+08
1,1645056000,40099.99,44195.62,43895.55,40536.73,18630.108422,2022-02-17,7.552037e+08
2,1644969600,43330.59,44585.69,44580.73,43895.56,9663.440404,2022-02-16,4.241821e+08
3,1644883200,42433.28,44775.96,42548.71,44583.52,14154.734526,2022-02-15,6.310679e+08
4,1644796800,41570.00,42876.15,42073.37,42548.71,14805.983388,2022-02-14,6.299755e+08
...,...,...,...,...,...,...,...,...
295,1619654400,52369.61,55226.86,54889.81,53580.00,14592.316888,2021-04-29,7.818563e+08
296,1619568000,53887.00,56476.17,55069.61,54894.03,16484.336777,2021-04-28,9.048917e+08
297,1619481600,53321.00,55509.39,54047.80,55069.62,13957.086495,2021-04-27,7.686114e+08
298,1619395200,48817.62,54400.00,49121.00,54053.60,18005.223994,2021-04-26,9.732472e+08


array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4], dtype=object)