## Main file for training and saving an agent.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import deque
import random

### Import the agent and environment

In [2]:
from Environment import cryptoTrade

from networks.Deep_RL_agents import DNN_agent, convert_to_1d




### Define the training loop

In [3]:
def train(env, memory, model, target_model, done,
          learning_rate=0.7, discount_factor=0.9,
          min_replay_size=1000, batch_size=64):
    """Fit ``model`` on one random mini-batch drawn from the replay memory.

    Each memory entry is indexed as
    ``(observation, action, actual_action, reward, new_observation, done)``.
    The chosen action's Q-value is blended towards the bootstrapped target
    ``reward + discount_factor * max(target_model Q(new_observation))`` (or
    just ``reward`` for terminal transitions) using ``learning_rate``, and
    ``model`` is then fitted on the resulting (state, Q-vector) pairs.

    Parameters
    ----------
    env, done
        Unused; kept so existing positional calls keep working.
    memory : sequence
        Indexable replay buffer of transitions (list or deque).
    model
        Network being trained; must provide ``predict`` and ``fit``.
    target_model
        Network used to evaluate the next-state Q-values.
    learning_rate : float
        Blend factor between the old Q-value and the new target.
    discount_factor : float
        Discount applied to the best next-state Q-value.
    min_replay_size : int
        Minimum number of stored transitions before any training happens.
    batch_size : int
        Number of transitions sampled (without replacement) per update.

    Returns
    -------
    tuple or None
        ``(X, Y)`` - the network inputs and the list of target Q-vectors the
        model was fitted on - or ``None`` if the buffer is still too small.
    """
    # Also require batch_size entries: sampling below uses replace=False and
    # would raise if the buffer held fewer transitions than the batch.
    if len(memory) < max(min_replay_size, batch_size):
        return None

    mini_batch_indexes = np.random.choice(len(memory), size=batch_size, replace=False)
    transitions = [memory[i] for i in mini_batch_indexes]

    current_states = [transition[0] for transition in transitions]
    current_qs_list = model.predict(convert_to_1d(current_states, single=False))

    new_current_states = [transition[4] for transition in transitions]
    future_qs_list = target_model.predict(convert_to_1d(new_current_states, single=False))

    X = []
    Y = []
    for index, transition in enumerate(transitions):
        # `terminal` instead of `done` so the function argument is not shadowed.
        observation, action, _actual_action, reward, _new_observation, terminal = transition
        if terminal:
            max_future_q = reward
        else:
            max_future_q = reward + discount_factor * np.max(future_qs_list[index])

        # Only the taken action's Q-value moves; the others keep the model's
        # own prediction so they contribute no error to the fit.
        current_qs = current_qs_list[index]
        current_qs[action] = (1 - learning_rate) * current_qs[action] + learning_rate * max_future_q

        X.append(observation)
        Y.append(current_qs)

    X = convert_to_1d(X, single=False)
    # The fit must happen before returning: previously an early `return X, Y`
    # made this call unreachable, so the model was never updated.
    model.fit(X, np.array(Y), batch_size=batch_size, verbose=0, shuffle=True)
    return X, Y

### Do the training

In [4]:
# Build the training environment, then the main/target network pair.
training_data_path = "data/training_2015_2021.df"
env = cryptoTrade(training_data_path, episode_size=720)

# Both networks are constructed from the same spaces (main model first);
# the target network then starts as an exact weight copy of the main one.
model, target_model = (
    DNN_agent(env.observation_space, env.action_space) for _ in range(2)
)
target_model.set_weights(model.get_weights())

Metal device set to: Apple M1


2022-04-07 16:08:45.111569: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-07 16:08:45.111700: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
# The main loop
epsilon, max_epsilon, min_epsilon = 1, 1, 0.01
decay = 0.01

# Bounded replay buffer: once full, the oldest transitions are dropped, so
# memory use stays flat instead of growing with every step of every episode.
memory = deque(maxlen=50000)

X, y = [], []

steps_to_update_target_model = 0
for episode in range(1000):
    total_training_rewards = 0

    observation = env.reset()

    done = False
    while not done:
        steps_to_update_target_model += 1

        # Epsilon-greedy: explore with probability epsilon, otherwise take the
        # action the main network currently rates highest.
        if np.random.rand() <= epsilon:
            action = int(np.random.choice(len(env.action_space)))
        else:
            action = int(model.predict(convert_to_1d(observation, single=True)).argmax())

        # Now step the simulation
        actual_action, new_observation, reward, done = env.step(action)
        memory.append([observation.copy(), action, actual_action, reward, new_observation.copy(), done])

        # Update the neural network every 4th step and at the end of an episode.
        # train() returns None until the buffer is warm; the result is kept in
        # X so it can be inspected from later cells.
        if (steps_to_update_target_model % 4 == 0) or done:
            X = train(env, memory, model, target_model, done)

        observation = new_observation
        total_training_rewards += reward

        if done:
            # NOTE(review): this prints the final step's reward and the episode
            # index, not total_training_rewards / a step count - confirm which
            # value is intended before changing the message.
            print('Total epoch rewards (profit): {:.2e} after {} steps'.format(reward, episode))

            # Sync the target network only at episode boundaries, once at
            # least 100 steps have passed since the previous sync.
            if steps_to_update_target_model >= 100:
                target_model.set_weights(model.get_weights())
                steps_to_update_target_model = 0

    # Update epsilon: exponential decay from max_epsilon towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)

# target_model.save('')

Total epoch rewards (profit): 0.00e+00 after 0 steps


2022-04-07 16:08:56.279465: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-04-07 16:08:56.346149: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-04-07 16:08:56.397838: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Total epoch rewards (profit): -9.10e+00 after 1 steps


2022-04-07 16:09:13.571535: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Total epoch rewards (profit): 9.34e+01 after 2 steps
Total epoch rewards (profit): -1.24e+02 after 3 steps
Total epoch rewards (profit): -1.24e+02 after 4 steps
Total epoch rewards (profit): 4.26e+01 after 5 steps
Total epoch rewards (profit): -2.93e+02 after 6 steps
Total epoch rewards (profit): 2.82e+02 after 7 steps
Total epoch rewards (profit): 0.00e+00 after 8 steps
Total epoch rewards (profit): -6.20e+02 after 9 steps
Total epoch rewards (profit): 0.00e+00 after 10 steps
Total epoch rewards (profit): 0.00e+00 after 11 steps
Total epoch rewards (profit): 0.00e+00 after 12 steps
Total epoch rewards (profit): 0.00e+00 after 13 steps
Total epoch rewards (profit): 0.00e+00 after 14 steps
Total epoch rewards (profit): 0.00e+00 after 15 steps
Total epoch rewards (profit): -2.47e+01 after 16 steps
Total epoch rewards (profit): 2.54e+02 after 17 steps
Total epoch rewards (profit): 4.45e+01 after 18 steps
Total epoch rewards (profit): -6.05e+01 after 19 steps
Total epoch rewards (profit): 

In [None]:
# Probe a freshly initialised network on the latest observation.
# Bound to its own name so the trained `model` from the training loop is not
# overwritten before the evaluation cell uses it.
# NOTE(review): the training loop passes single=True for one observation;
# confirm single=False is intended here.
untrained_model = DNN_agent(env.observation_space, env.action_space)
untrained_model.predict(convert_to_1d(observation, single=False))

In [None]:
# Debug probe: length of what convert_to_1d returns for the current single
# observation (presumably the network's input size - TODO confirm).
len(convert_to_1d(observation, single=True))

In [None]:
# Debug probe: display the training environment's `dim_2_features` attribute.
env.dim_2_features

In [None]:
# Debug probe: element 7 of the new_observation (slot 4) stored in the most
# recent replay-memory transition.
memory[-1][4][7]

In [None]:
# Debug probe: same lookup as the previous cell - element 7 of the
# new_observation in the latest transition.
# NOTE(review): duplicate cell; likely safe to delete.
memory[-1][4][7]

In [None]:
i += 1
X[i]

In [None]:
i = 2

# Test the model

In [None]:
# Create the testing env and roll the trained model through one episode.
testing_data_path = "data/testing_2022.df"
testing_env = cryptoTrade(testing_data_path, episode_size=720)

observation = testing_env.reset()
done = False
val_memory = []

while not done:
    # Purely greedy policy (no exploration). Uses the same convert_to_1d
    # preprocessing as the training loop; the previously referenced
    # `convet_to_ragged_tensor` is not defined anywhere and raised NameError.
    action = int(model.predict(convert_to_1d(observation, single=True)).argmax())
    actual_action, new_observation, reward, done = testing_env.step(action)

    # Record the full transition so it can be analysed and plotted afterwards.
    info = {"observation":observation.copy(), "action":action, "actual_action":actual_action, 
            "reward":reward, "new_observation":new_observation.copy(), "done":done}
    val_memory.append(info)

    observation = new_observation

In [None]:
# Split the recorded evaluation transitions into one list per field, in the
# same order the steps were taken, ready for plotting.
observations, actions, actual_actions, rewards, new_observations = (
    [step[field] for step in val_memory]
    for field in ("observation", "action", "actual_action", "reward", "new_observation")
)

In [None]:
# Line plot of the per-step rewards collected during the evaluation episode.
fig, ax = plt.subplots()
ax.plot(rewards)
ax.set_title("Profits over a random 12h interval")
ax.set_xlabel("Minutes")
ax.set_ylabel("Profit")
plt.show()

In [None]:
# EXPAND THE DATA WE ARE USING
# TEST ON TESTING DATASET
# PLOT THE RESULTS
# CREATE BUY SELL ANIMATION
# DO A DNN AFTER EVAL IS CREATED