In [1]:
import random
import math
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from environment import Environment

# Number of train-then-evaluate passes performed by DQN.train().
EPOCHS = 10
# NOTE(review): not referenced anywhere in the visible code — possibly a leftover; confirm before removing.
THRESHOLD = 195
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
MONITOR = True

class DQN:
	"""Q-network agent driven by the project's ``Environment``.

	The network maps a flattened state of ``env.input_size`` features to one
	output per entry of ``env.action_space``. Transitions are kept in a bounded
	replay memory and the network is fitted on random minibatches of them.
	"""

	def __init__(self, env_string, batch_size=128, data_path='EURUSD_H1.csv'):
		"""Create the environment, the replay memory and the Keras model.

		Args:
			env_string: Accepted for backward compatibility only; it is never read.
			batch_size: Number of transitions ``train`` asks ``replay`` to sample.
			data_path: Argument handed to ``Environment``. Defaults to the
				previously hard-coded ``'EURUSD_H1.csv'``.
		"""
		self.memory = deque(maxlen=100000)
		self.env = Environment(data_path)
		self.input_size = self.env.input_size
		self.action_size = len(self.env.action_space)
		self.batch_size = batch_size
		# Discount factor. Currently unused: replay() fits clipped immediate
		# rewards and does not bootstrap from the next state.
		self.gamma = 1.0
		self.epsilon = 1.0
		self.epsilon_min = 0.01
		self.epsilon_decay = 0.9998
		alpha = 0.01
		alpha_decay = 0.01
		dropout = 0.3

		# Init model
		self.model = Sequential()
		self.model.add(Dense(8, input_dim=self.input_size, activation='tanh', kernel_regularizer=l2(0.01), activity_regularizer=l2(0.01)))
		self.model.add(Dropout(dropout))
		self.model.add(Dense(16, activation='tanh', kernel_regularizer=l2(0.01), activity_regularizer=l2(0.01)))
		self.model.add(Dropout(dropout))
		self.model.add(Dense(self.action_size, activation='linear'))
		# `learning_rate` replaces the deprecated `lr` alias.
		# NOTE(review): `decay` is only accepted by older/legacy Keras optimizers —
		# confirm against the installed TensorFlow version.
		self.model.compile(loss='mse', optimizer=Adam(learning_rate=alpha, decay=alpha_decay))

		self.model.summary()

	def remember(self, state, action, reward, next_state, done):
		"""Append one transition tuple to the replay memory."""
		self.memory.append((state, action, reward, next_state, done))

	def replay(self, batch_size):
		"""Fit the model on one random minibatch from the replay memory.

		For every sampled transition the target equals the model's current
		prediction, except for the output of the action that was taken, which is
		set to the reward clipped to ``[0, 1]``. The bootstrapped target
		``reward + gamma * max(Q(next_state))`` is intentionally not applied.

		Args:
			batch_size: Maximum number of transitions to sample.

		Does nothing when the memory is empty (or ``batch_size`` is not positive).
		"""
		sample_size = min(len(self.memory), batch_size)
		if sample_size <= 0:
			return
		minibatch = random.sample(self.memory, sample_size)

		# Stored states have shape (1, input_size); stack them so the network is
		# queried once for the whole minibatch instead of once per transition.
		states = np.vstack([transition[0] for transition in minibatch])
		targets = np.array(self.model.predict(states))
		for row, (_, action, reward, _, _) in enumerate(minibatch):
			targets[row][action] = max(min(reward, 1), 0)

		self.model.fit(states, targets, batch_size=len(states), verbose=1)

	def train(self, epochs=None, train_end=70000, eval_end=100000):
		"""Alternate experience collection, one replay fit and an evaluation pass.

		Args:
			epochs: Number of collect/fit/evaluate rounds. Defaults to the
				module-level ``EPOCHS``.
			train_end: Experience collection stops once ``env.next_index``
				reaches this value; evaluation starts at ``train_end + 1``.
			eval_end: Evaluation stops once ``env.next_index`` reaches this value.

		Returns:
			An empty list. No per-epoch score is recorded; the list is kept so
			existing callers that store the result keep working.
		"""
		if epochs is None:
			epochs = EPOCHS
		avg_scores = []

		for epoch in range(epochs):
			print("Training ({}):".format(epoch + 1))
			self._collect_experience(train_end)
			self.replay(self.batch_size)
			print(self.env.report())

			# test on unoptimized data
			print("Evaluating: ")
			self._evaluate(train_end + 1, eval_end)
			print(self.env.report())
			print()

		print('Finished {} training epochs'.format(epochs))
		return avg_scores

	def _collect_experience(self, stop_index):
		"""Run one epsilon-greedy pass from a reset environment, storing every transition."""
		state = self.preprocess_state(self.env.reset())

		# The pass is bounded by the environment index only; the per-sub-step
		# done flags are stored with the transitions but do not end the pass.
		# NOTE(review): if the data ends before `stop_index` this loop relies on
		# the environment to raise — confirm.
		while True:
			action = self.choose_action(state, self.epsilon)
			# step() yields one state/reward/done entry per sub-step of the action.
			raw_states, rewards, dones = self.env.step(action)
			previous = state
			for raw_state, reward, done in zip(raw_states, rewards, dones):
				current = self.preprocess_state(raw_state)
				self.remember(previous, action, reward, current, done)
				previous = current
			state = previous
			self.epsilon = max(self.epsilon_min, self.epsilon_decay * self.epsilon)  # decrease epsilon
			if self.env.next_index >= stop_index:
				break

	def _evaluate(self, start_index, stop_index):
		"""Run the greedy policy from ``start_index`` until ``stop_index`` without learning."""
		state = self.env.reset()
		self.env.next_index = start_index
		state = self.preprocess_state(state)

		# step() returns a sequence of done flags; testing that sequence for
		# truthiness would end the pass after a single step, so the pass is
		# bounded by the environment index, exactly like experience collection.
		while True:
			action = self.choose_action(state, -1)
			raw_states, _, _ = self.env.step(action)
			state = self.preprocess_state(raw_states[-1])
			if self.env.next_index >= stop_index:
				break

	def choose_action(self, state, epsilon):
		"""Pick an action epsilon-greedily.

		With probability ``epsilon`` a random entry of ``env.action_space`` is
		returned; otherwise the index of the largest model output. The model is
		called with ``training=True`` whenever ``epsilon > 0``; a negative
		``epsilon`` therefore gives a purely greedy choice in inference mode.
		"""
		if np.random.random() <= epsilon:
			return random.choice(self.env.action_space)
		return np.argmax(self.model(state, training=epsilon > 0))

	def preprocess_state(self, state):
		"""Reshape a raw state into a ``(1, input_size)`` array."""
		return np.reshape(state, [1, self.input_size])

# NOTE(review): 'CartPole-v0' looks like a leftover label — DQN.__init__ accepts env_string but never reads it.
env_string = 'CartPole-v0'
agent = DQN(env_string)
# train() returns its avg_scores list, which is never appended to, so scores is an empty list.
scores = agent.train()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 8)                 624       
_________________________________________________________________
dropout (Dropout)            (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                144       
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 51        
Total params: 819
Trainable params: 819
Non-trainable params: 0
_________________________________________________________________
Training (1):
Long Trades: 4935 (1390 won), Short Trades: 24340 (6475 won), profit: -677250 (gross gain: 393250, gross loss:

In [3]:
# Run the greedy policy (epsilon = -1, so no random actions) over the whole
# dataset and count how often each action is chosen.
state = agent.env.reset()
agent.env.next_index = 0
state = agent.preprocess_state(state)
long, short, pas = 0, 0, 0
actions = []

# env.step() returns sequences (one entry per sub-step of the action), so the
# done flags are not used as the loop condition: a non-empty sequence is always
# truthy. The loop is bounded by the dataset index instead.
while True:
	action = agent.choose_action(state, -1)
	# This cell has no `self`; the environment is reached through `agent`.
	next_states, rewards, dones = agent.env.step(action)
	if action == 1:
		long += 1
	elif action == 0:
		pas += 1
	else:
		short += 1
	actions.append(action)
	# Continue from the last sub-step, reshaped the way the model expects.
	state = agent.preprocess_state(next_states[-1])
	if agent.env.next_index >= len(agent.env.dataset) - 1:
		break

print(agent.env.report())
print("pas: ", pas)
print("short: ", short)
print("long: ", long)

Long Trades: 0 (0 won), Short Trades: 0 (0 won), profit: 0 (gross gain: 0, gross loss: 0)
pas:  100337
short:  0
long:  0
