In [1]:
import numpy as np
import pandas as pd
from collections import deque
import random

import sys
sys.path.append("..")
from Environment import cryptoTrade




In [2]:
from tensorflow.keras import layers
import tensorflow as tf

def convet_to_ragged_tensor(obs, single=True):
    """Build a ragged tensor from one observation or a batch of observations.

    Every value that is not already a ``list`` is wrapped in a one-element
    list so that all entries share the same nesting depth before the data is
    handed to ``tf.ragged.constant``.

    The input is never modified: the normalised structure is built as a new
    list, so observations that are also referenced elsewhere (for example in
    a replay memory) keep their original contents.

    Args:
        obs: A single observation (a sequence of scalars and/or lists) when
            ``single`` is true, otherwise a sequence of such observations.
        single: If true, ``obs`` is treated as one observation and wrapped
            in a batch of size one; if false, ``obs`` is already a batch.

    Returns:
        The value returned by ``tf.ragged.constant`` for the normalised data.
    """
    def _wrap_scalars(entry):
        # Only genuine ``list`` instances count as "already nested"; anything
        # else is promoted to a one-element list.
        return [value if isinstance(value, list) else [value] for value in entry]

    if single:
        return tf.ragged.constant([_wrap_scalars(obs)])

    return tf.ragged.constant([_wrap_scalars(entry) for entry in obs])
    
# Kernel initializer (He uniform, no fixed seed) passed to the layers built in agent().
init = tf.keras.initializers.he_uniform(seed=None)

def agent(observation_space, action_space):
    """Build and compile the recurrent Q-network.

    The network is ``ragged Input -> LSTM(64) -> Dense(len(action_space))``
    with a linear output, i.e. one unbounded Q-value estimate per action.

    Args:
        observation_space: Example observation; it is converted to a ragged
            tensor only to read the size of its last axis, which fixes the
            feature dimension of the input layer.
        action_space: Sized collection of actions; its length sets the number
            of output units.

    Returns:
        A compiled ``tf.keras.Sequential`` model.
    """
    # Convert input to a ragged tensor
    observation_space_tensor = convet_to_ragged_tensor(observation_space)

    # Largest size along the last axis of the example observation
    max_seq = observation_space_tensor.bounding_shape()[-1]

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=[None, max_seq], dtype=tf.float32, ragged=True),
        tf.keras.layers.LSTM(64, kernel_initializer=init),
        tf.keras.layers.Dense(len(action_space), activation='linear', kernel_initializer=init)
    ])

    # The outputs are regressed towards real-valued Q-targets, so a regression
    # loss is required. Categorical cross-entropy with from_logits=False
    # expects probabilities and is undefined for the negative values a linear
    # layer can emit. Huber loss is less sensitive to large target errors
    # than plain MSE.
    model.compile(loss=tf.keras.losses.Huber(),
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=['mae'])
    return model

In [3]:
def train(env, memory, model, target_model, done,
          learning_rate=0.7, discount_factor=0.9,
          min_replay_size=1000, batch_size=64):
    """Run one Q-learning update of ``model`` on a random replay mini-batch.

    Nothing happens until ``memory`` holds at least ``min_replay_size``
    transitions (and at least ``batch_size``, since the batch is sampled
    without replacement).

    Args:
        env: Unused; kept so existing call sites keep working.
        memory: Indexable collection of transitions, each
            ``[observation, action, reward, new_observation, done]``.
        model: Network being trained; supplies the current Q-values.
        target_model: Network used to evaluate the next-state Q-values.
        done: Unused; kept so existing call sites keep working. The terminal
            flag of each sampled transition is read from ``memory`` instead.
        learning_rate: Weight given to the new target when it is blended
            with the current Q-value estimate.
        discount_factor: Discount applied to the best next-state Q-value.
        min_replay_size: Minimum number of stored transitions before any
            training takes place.
        batch_size: Number of transitions sampled per update.

    Returns:
        None.
    """
    # Sampling without replacement needs at least batch_size transitions.
    if len(memory) < max(min_replay_size, batch_size):
        return

    mini_batch_indexes = np.random.choice(len(memory), size=batch_size, replace=False)

    current_states = [memory[i][0] for i in mini_batch_indexes]
    current_qs_list = model.predict(convet_to_ragged_tensor(current_states, single=False))

    new_current_states = [memory[i][3] for i in mini_batch_indexes]
    future_qs_list = target_model.predict(convet_to_ragged_tensor(new_current_states, single=False))

    X = []
    Y = []
    for index, i in enumerate(mini_batch_indexes):
        # A distinct name for the stored terminal flag avoids shadowing the
        # `done` parameter.
        observation, action, reward, _, terminal = memory[i]
        if terminal:
            # No future return after a terminal transition.
            target_q = reward
        else:
            target_q = reward + discount_factor * np.max(future_qs_list[index])

        # Only the Q-value of the action actually taken is moved towards the
        # target; the other outputs keep the network's current predictions.
        current_qs = current_qs_list[index]
        current_qs[action] = (1 - learning_rate) * current_qs[action] + learning_rate * target_q

        X.append(observation)
        Y.append(current_qs)

    X = convet_to_ragged_tensor(X, single=False)
    model.fit(X, np.array(Y), batch_size=batch_size, verbose=0, shuffle=True)

In [4]:
# The main loop: epsilon-greedy rollouts over 300 episodes. The online model
# is trained every 4th step; the target model is synchronised at the end of an
# episode once at least 100 steps have passed since the last synchronisation.

data_path = "../data/Coinbase_BTCUSD_dailydata.csv"
env = cryptoTrade(data_path)
env.reset()

epsilon, max_epsilon, min_epsilon = 1, 1, 0.01
decay = 0.01

model = agent(env.observation_space, env.action_space)
target_model = agent(env.observation_space, env.action_space)
target_model.set_weights(model.get_weights())

memory = []

X = []
y = []

steps_to_update_target_model = 0

# The context manager closes the reward log even if an episode raises
# part-way through, so the lines written so far are not lost.
with open("20220330_RNNTraining.txt", "w") as f:
    for episode in range(300):
        total_training_rewards = 0

        observation = env.reset()
        done = False
        while not done:
            steps_to_update_target_model += 1

            if np.random.rand() <= epsilon:
                # Explore: pick a random action index
                action = int(np.random.choice(len(env.action_space)))
            else:
                # Exploit: choose the best action according to the online model
                action = int(model.predict(convet_to_ragged_tensor(observation, single=True)).argmax())

            # Now step the simulation
            new_observation, reward, done = env.step(action)
            memory.append([observation, action, reward, new_observation, done])

            # Update the neural network
            if steps_to_update_target_model % 4 == 0:
                train(env, memory, model, target_model, done)

            # Advance to the next state. Without this, every stored transition
            # and every greedy action would use the episode's first observation.
            observation = new_observation
            total_training_rewards += reward

            if done:
                print('Total training rewards: {} after n steps = {} with final reward = {}'.format(total_training_rewards, episode, reward))
                f.write("{:.2f}\n".format(total_training_rewards))

                if steps_to_update_target_model >= 100:
                    print('Copying main network weights to the target network weights')
                    target_model.set_weights(model.get_weights())
                    steps_to_update_target_model = 0

        # Update epsilon: exponential decay from max_epsilon towards min_epsilon
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)

# NOTE(review): the save path is an empty string, and the target network
# (not the online model) is what gets saved — confirm both are intended.
target_model.save('')

2022-04-04 19:24:59.765347: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-04 19:24:59.765429: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1


AttributeError: 'cryptoTrade' object has no attribute 'current_shares'

## Model implementation

In [None]:
# Roll the greedy policy through one full episode and record every transition
observation = env.reset()
done = False
val_memory = []

while not done:
    # Always take the action with the highest predicted Q-value (no exploration)
    action = int(model.predict(convet_to_ragged_tensor(observation, single=True)).argmax())
    new_observation, reward, done = env.step(action)
    val_memory.append([observation, action, reward, new_observation, done])
    # Advance to the next state; otherwise the model would be queried with the
    # reset observation at every step.
    observation = new_observation

In [None]:
# Inspect the validation run: column 1 of each recorded transition is the
# action taken; the environment's data is displayed as the cell output.
val_transitions = np.array(val_memory, dtype=object)
actions = val_transitions[:, 1]
data = env.data
data