## Main file for training and saving an agent.

In [1]:
import numpy as np
import pandas as pd
from collections import deque
import random

### Import the agent and environment

In [2]:
from Environment import cryptoTrade

from networks.Deep_RL_agents import RNN_agent, convet_to_ragged_tensor




### Define the training loop

In [3]:
def train(env, memory, model, target_model, done):
    learning_rate = 0.7
    discount_factor = 0.9
    
    MIN_REPLAY_SIZE = 1000
    if len(memory) < MIN_REPLAY_SIZE:
        return
    
    batch_size = 64
    mini_batch_indexes = np.random.choice(np.arange(len(memory)), size=batch_size, replace=False)

    current_states = [memory[i][0] for i in mini_batch_indexes]
    current_qs_list = model.predict(convet_to_ragged_tensor(current_states, single=False))
    
    new_current_states = [memory[i][4] for i in mini_batch_indexes]
    future_qs_list = target_model.predict(convet_to_ragged_tensor(new_current_states, single=False))

    X = []
    Y = []
    for index, i in enumerate(mini_batch_indexes):
        (observation, action, actual_action, reward, new_observation, done) = memory[i]
        if not done:
            max_future_q = reward + discount_factor * np.max(future_qs_list[index])
        else:
            max_future_q = reward

        current_qs = current_qs_list[index]
        current_qs[action] = (1 - learning_rate) * current_qs[action] + learning_rate * max_future_q

        X.append(observation)
        Y.append(current_qs)
    
    X = convet_to_ragged_tensor(X, single=False)
    model.fit(X, np.array(Y), batch_size=batch_size, verbose=0, shuffle=True)

### Do the training

In [4]:
# Create the env and the model
data_path = "data/Coinbase_BTCUSD_dailydata.csv"
env = cryptoTrade(data_path)
env.reset()

model = RNN_agent(env.observation_space, env.action_space)
target_model = RNN_agent(env.observation_space, env.action_space)
target_model.set_weights(model.get_weights())

Metal device set to: Apple M1


2022-04-04 20:17:50.319287: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-04 20:17:50.319377: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
# The main loop
epsilon, max_epsilon, min_epsilon = 1, 1, 0.01
decay = 0.01

memory = []

X, y = [], []

steps_to_update_target_model = 0
for episode in range(10):
    total_training_rewards = 0
    
    observation = env.reset()
    
    done = False
    while not done:
        steps_to_update_target_model += 1

        # Implement epsilon greedy learning
        if np.random.rand() <= epsilon:
            action = int(np.random.choice(len(env.action_space)))
        else: 
            action = int(model.predict(convet_to_ragged_tensor(observation, single=True)).argmax())
            
        # Now step the simulation
        actual_action, new_observation, reward, done = env.step(action)
        memory.append([observation, action, actual_action, reward, new_observation, done])
        
        # Update the neural network
        if (steps_to_update_target_model % 4 == 0) or done:
            train(env, memory, model, target_model, done)
            
        observation = new_observation
        total_training_rewards += reward
        
        if done:
            print('Total epoch rewards (profit): {:.2e} after {} steps'.format(total_training_rewards, episode))

            if steps_to_update_target_model >= 100:
#                 print('Copying main network weights to the target network weights')
                target_model.set_weights(model.get_weights())
                steps_to_update_target_model = 0
            break
        
    # Update epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)

# target_model.save('')

Total epoch rewards (profit): 3.04e+07 after 0 steps
Total epoch rewards (profit): 3.44e+07 after 1 steps


2022-04-04 20:17:50.655390: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-04-04 20:17:50.656051: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-04-04 20:17:50.700516: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Total epoch rewards (profit): -1.54e+06 after 2 steps


2022-04-04 20:17:50.940190: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-04-04 20:17:50.976057: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-04-04 20:17:51.436014: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-04-04 20:17:51.558998: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-04-04 20:17:51.640199: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Total epoch rewards (profit): 1.11e+07 after 3 steps
Total epoch rewards (profit): 4.06e+07 after 4 steps
Total epoch rewards (profit): 3.77e+07 after 5 steps
Total epoch rewards (profit): -2.08e+06 after 6 steps
Total epoch rewards (profit): 4.83e+07 after 7 steps
Total epoch rewards (profit): 3.57e+07 after 8 steps
Total epoch rewards (profit): 2.09e+07 after 9 steps


# Test the model

In [6]:
observation = env.reset()
done = False
val_memory = []

while not done:
    action = int(model.predict(convet_to_ragged_tensor(observation, single=True)).argmax())
    actual_action, new_observation, reward, done = env.step(action)
    val_memory.append([observation, action, actual_action, reward, new_observation, done])

In [7]:
# Plot the results
observations = [x[0] for x in val_memory]
actions = [x[1] for x in val_memory]
actual_actions = [x[2] for x in val_memory]
rewards = [x[3] for x in val_memory]
new_observations = [x[4] for x in val_memory]

data = env.data

In [8]:
for i in range(len(rewards)):
    print(actual_actions[i], rewards[i])

10 0
-4 -870.0799999999872
6 -870.0799999999872
-10 -20995.76000000004
6 -20995.76000000004
8 -20995.76000000004
6 -20995.76000000004
3 -20995.76000000004
0 -20995.76000000004
0 -20995.76000000004
0 -20995.76000000004
0 -20995.76000000004
-10 -58290.040000000125
-4 -60247.16000000015
-10 -64081.08000000019
6 -64081.08000000019
7 -64081.08000000019
3 -64081.08000000019
6 -64081.08000000019
2 -64081.08000000019
-4 -46180.8400000002
4 -46180.8400000002
0 -46180.8400000002
-4 -20125.910000000207
4 -20125.910000000207
0 -20125.910000000207
0 -20125.910000000207
0 -20125.910000000207
-4 5757.969999999797
4 5757.969999999797
-10 64474.23999999976
9 64474.23999999976
1 64474.23999999976
-10 78739.39999999979
-10 94281.11999999988
6 94281.11999999988
-4 113379.99999999988
6 113379.99999999988
10 113379.99999999988
0 113379.99999999988
0 113379.99999999988
0 113379.99999999988
-4 124543.34999999986
-4 142671.52999999982
0 142671.52999999982
-4 155188.48999999982
6 155188.48999999982
6 155188.489

In [9]:
net_worth = [x[-3] for x in new_observations]

In [10]:
net_worth

[[57515.69,
  53241.92,
  57212.73,
  56625.2,
  57859.28,
  57798.77,
  53580.0,
  54894.03,
  55069.62,
  54053.6],
 [57515.69,
  53241.92,
  57212.73,
  56625.2,
  57859.28,
  57798.77,
  53580.0,
  54894.03,
  55069.62,
  54053.6],
 [57515.69,
  53241.92,
  57212.73,
  56625.2,
  57859.28,
  57798.77,
  53580.0,
  54894.03,
  55069.62,
  54053.6],
 [57515.69,
  53241.92,
  57212.73,
  56625.2,
  57859.28,
  57798.77,
  53580.0,
  54894.03,
  55069.62,
  54053.6],
 [57515.69,
  53241.92,
  57212.73,
  56625.2,
  57859.28,
  57798.77,
  53580.0,
  54894.03,
  55069.62,
  54053.6],
 [57515.69,
  53241.92,
  57212.73,
  56625.2,
  57859.28,
  57798.77,
  53580.0,
  54894.03,
  55069.62,
  54053.6],
 [57515.69,
  53241.92,
  57212.73,
  56625.2,
  57859.28,
  57798.77,
  53580.0,
  54894.03,
  55069.62,
  54053.6],
 [57515.69,
  53241.92,
  57212.73,
  56625.2,
  57859.28,
  57798.77,
  53580.0,
  54894.03,
  55069.62,
  54053.6],
 [57515.69,
  53241.92,
  57212.73,
  56625.2,
  57859.2

In [11]:
3//1

3

In [12]:
for i in range(-10000):
    print(i)