In [1]:
import random
import math
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import pdb
import csv
import random

In [2]:
class Environment:
	"""Bar-by-bar trading simulator driven by a semicolon-delimited CSV file.

	Each CSV row must hold nine columns in this order: date, hour, open, high,
	low, close, tick volume, volume, spread. The spread column is read but not
	used; a fixed spread of ``spread_points`` is applied instead.

	Actions (``action_space``): 0 = pass, 1 = open long, 2 = open short.
	A position is held until its take-profit or stop-loss level is touched, so
	one ``step`` may advance the cursor by many bars.

	A state is a 1-D array of ``HOURS_PER_DAY + HISTORY_SIZE`` values: the
	one-hot encoded hour of the current bar followed by the most recent closing
	prices, newest first.
	"""

	HISTORY_SIZE = 48   # number of closing prices in a state
	HOURS_PER_DAY = 24  # width of the one-hot hour encoding

	def __init__(self, csv_path):
		"""Load the price history from ``csv_path``.

		Raises:
			ValueError: if the file contains no data rows, or a row does not
				have exactly nine columns / has non-numeric price fields.
		"""
		self.points = 0.00001
		self.spread_points = 5
		self.tp_points = 500
		self.sl_points = 500

		self.dataset = []
		self.next_index = 0
		self.action_space = [0, 1, 2] # 0: pass, 1: long, 2: short

		# Counters exist from construction on, so report()/step() also work
		# before the first reset().
		self.__reset_statistics()

		with open(csv_path, newline='') as csvfile:
			reader = csv.reader(csvfile, delimiter=';')
			for row in reader:
				if not row:
					# csv.reader yields [] for blank lines (e.g. a trailing newline)
					continue

				date, hour, \
				op, high, low, close, \
				tick_vol, vol, _spread = row

				self.dataset.append({
					'date': date,
					'hour': hour,
					'tick_vol': float(tick_vol),
					'vol': float(vol),
					'open': float(op),
					'close': float(close),
					'high': float(high),
					'low': float(low),
				})

		if not self.dataset:
			raise ValueError("No price rows found in '{}'".format(csv_path))

		self.input_size = len(self.__build_state(0)[0])

	def report(self):
		"""Return a one-line summary of the trades taken since the last reset."""
		total_trades = self.long_trades + self.short_trades
		total_won = self.long_trades_won + self.short_trades_won
		# Without any trade there is no win ratio; report 0.0 instead of dividing by zero.
		won_ratio = total_won / total_trades if total_trades else 0.0

		return "Long Trades: {} ({} won), Short Trades: {} ({} won), won %: {}, profit: {} (gross gain: {}, gross loss: {})".format(
			self.long_trades, self.long_trades_won, self.short_trades, self.short_trades_won,
			won_ratio,
			round(self.gross_profit + self.gross_loss), round(self.gross_profit), round(self.gross_loss)
		)

	def reset(self):
		"""Rewind to the first bar, clear the statistics and return the initial state.

		The initial state is obtained by taking one "pass" step, so after this
		call ``next_index`` is 1 and the returned state describes bar 1.
		"""
		self.next_index = 0
		self.__reset_statistics()

		return self.step(self.action_space[0])[0]

	def step(self, action):
		"""Apply ``action`` at the current bar.

		Returns:
			(next_state, reward, done). ``reward`` is the raw reward divided by
			the number of bars the action consumed. ``next_state`` is None and
			``done`` is True once the cursor has moved past the last bar.

		Raises:
			ValueError: if ``action`` is not one of 0, 1, 2.
		"""
		reward, nindex = self.__calculate_reward(action, self.next_index)

		# A trade that stays open for many bars gets its reward spread over them.
		distance = max(1, nindex - self.next_index)
		self.next_index = nindex
		next_state, has_next_state = self.__build_state(self.next_index)

		return (next_state, reward / distance, not has_next_state)

	def __reset_statistics(self):
		"""Zero all per-episode trade counters."""
		self.short_trades = 0
		self.long_trades = 0
		self.long_trades_won = 0
		self.short_trades_won = 0
		self.short_trades_lost = 0
		self.gross_profit = 0
		self.gross_loss = 0

	def __build_state(self, index):
		"""Build the observation for bar ``index``.

		Returns:
			(state, True), or (None, False) when ``index`` is past the last bar.
		"""
		if index >= len(self.dataset):
			return (None, False)

		row = self.dataset[index]

		# One-hot hour of day, taken from the part of 'hour' before the first ':'.
		hour = np.zeros(self.HOURS_PER_DAY)
		hour[int(row['hour'].split(':')[0])] = 1.0

		# Closing prices of this bar and the preceding ones, newest first.
		# Bars before the start of the dataset do not exist; those slots are
		# padded with the current close. (A negative list index would otherwise
		# silently read from the END of the dataset.)
		state = [
			self.dataset[i]['close'] if i >= 0 else row['close']
			for i in range(index, index - self.HISTORY_SIZE, -1)
		]

		return (np.concatenate((hour, state)), True)

	def __calculate_reward(self, action, index):
		"""Return (reward, next bar index) for ``action`` taken at bar ``index``.

		Passing costs -1 and moves one bar ahead. Opening a trade yields the
		trade result in points and moves to the bar after the one that closed it;
		the trade counters and gross profit/loss are updated accordingly.
		"""
		if action == 0: # pass
			return (-1, index + 1)

		if action not in (1, 2):
			raise ValueError("Unknown action: {!r} (expected one of {})".format(action, self.action_space))

		is_long = action == 1
		reward, nindex = self.__calculate_position_reward(is_long, index)

		if is_long:
			self.long_trades += 1
			if reward > 0: self.long_trades_won += 1
		else:
			self.short_trades += 1
			if reward > 0: self.short_trades_won += 1

		self.gross_profit += max(0, reward)
		self.gross_loss += min(0, reward)

		return (reward, nindex)

	def __calculate_position_reward(self, long, index):
		"""Simulate a position opened at the close of bar ``index``.

		Bid/ask prices are derived from bar prices by subtracting/adding half of
		the fixed spread. Longs open at the ask and are checked against bid
		prices; shorts open at the bid and are checked against ask prices.

		Returns:
			(result in points, index of the bar after the closing bar). If
			neither level is touched before the data ends the result is 0.
		"""
		half_spread = self.__points(self.spread_points) / 2
		open_bid = self.dataset[index]['close'] - half_spread
		open_ask = self.dataset[index]['close'] + half_spread

		open_price = open_ask if long else open_bid
		if long:
			open_position_tp_price = open_bid + self.__points(self.tp_points) + self.__points(self.spread_points)
			open_position_sl_price = open_bid - self.__points(self.sl_points) + self.__points(self.spread_points)
		else:
			open_position_tp_price = open_ask - self.__points(self.tp_points) - self.__points(self.spread_points)
			open_position_sl_price = open_ask + self.__points(self.sl_points) - self.__points(self.spread_points)

		index += 1
		while index < len(self.dataset):
			# Within one bar the stop-loss is tested before the take-profit, so
			# a bar that touches both levels counts as a loss.
			if long:
				high_bid = self.dataset[index]['high'] - half_spread
				low_bid = self.dataset[index]['low'] - half_spread

				if low_bid <= open_position_sl_price:
					return ((open_position_sl_price - open_price) / self.points, index + 1)
				if high_bid >= open_position_tp_price:
					return ((open_position_tp_price - open_price) / self.points, index + 1)
			else:
				high_ask = self.dataset[index]['high'] + half_spread
				low_ask = self.dataset[index]['low'] + half_spread

				if high_ask >= open_position_sl_price:
					return ((open_price - open_position_sl_price) / self.points, index + 1)
				if low_ask <= open_position_tp_price:
					return ((open_price - open_position_tp_price) / self.points, index + 1)

			index += 1

		# if I'm here, it's end of the dataset
		return (0, index)

	def __points(self, val):
		"""Convert a distance expressed in points into a price difference."""
		return val * self.points



In [None]:
# Number of train/evaluate rounds performed by one DQN.train() call. train()
# looks this module-level name up when it runs, so later cells reassign it
# before calling train() again.
EPOCHS = 100
# NOTE(review): THRESHOLD and MONITOR are not referenced by any code visible in
# this notebook — confirm nothing else needs them before removing.
THRESHOLD = 195
MONITOR = True

class DQN:
	"""Deep Q-learning agent trading on an ``Environment`` built from 'EURUSD_H1.csv'.

	The Q-network maps a state to one value per action. Rewards are clipped to
	[-1, 1] before being used as regression targets, matching the tanh output
	layer.
	"""

	def __init__(self, env_string, batch_size=128):
		"""Create the environment, the replay memory and the Q-network.

		Args:
			env_string: kept for backward compatibility; it is not used, the
				environment is always loaded from 'EURUSD_H1.csv'.
			batch_size: maximum number of transitions sampled per replay() call.
		"""
		self.env = Environment('EURUSD_H1.csv')
		self.memory = deque(maxlen=1000)
		self.input_size = self.env.input_size
		self.action_size = len(self.env.action_space)
		self.batch_size = batch_size
		self.gamma = 0.0
		self.epsilon = 1.0
		self.epsilon_min = 0.01
		self.epsilon_decay = 0.9998
		alpha=0.01
		alpha_decay=0.01
		dropout = 0.3

		# Init model
		self.model = Sequential()
		self.model.add(Dense(24 + 8, input_dim=self.input_size, activation='tanh', kernel_regularizer=l2(0.01), activity_regularizer=l2(0.01)))
		self.model.add(Dense(32, activation='tanh', kernel_regularizer=l2(0.01), activity_regularizer=l2(0.01)))
		self.model.add(Dropout(dropout))
		self.model.add(Dense(32, activation='tanh', kernel_regularizer=l2(0.01), activity_regularizer=l2(0.01)))
		self.model.add(Dropout(dropout))
		self.model.add(Dense(self.action_size, activation='tanh'))

		# The `lr` and `decay` optimizer arguments are deprecated/removed in
		# newer Keras releases. An inverse-time schedule with decay_steps=1
		# gives the same lr / (1 + decay * step) learning-rate curve.
		learning_rate = tf.keras.optimizers.schedules.InverseTimeDecay(
			initial_learning_rate=alpha, decay_steps=1, decay_rate=alpha_decay)
		self.model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))

		self.model.summary()

	def remember(self, state, action, reward, next_state, done):
		"""Append one transition to the bounded replay memory."""
		self.memory.append((state, action, reward, next_state, done))

	def replay(self, batch_size):
		"""Fit the Q-network on a random minibatch drawn from the replay memory.

		For every sampled transition only the Q-value of the action actually
		taken is moved towards its target; the other outputs keep the network's
		current prediction. Does nothing while the memory is empty.
		"""
		if not self.memory:
			return

		minibatch = random.sample(self.memory, min(len(self.memory), batch_size))

		# Predict all sampled states in one call instead of once per sample.
		x_batch = np.vstack([transition[0] for transition in minibatch])
		y_batch = self.model.predict(x_batch, verbose=0)

		for row, (state, action, reward, next_state, done) in enumerate(minibatch):
			reward = max(min(reward, 1), -1)
			if done or self.gamma == 0:
				# Terminal transition, or no bootstrapping: target is the clipped reward.
				y_batch[row][action] = reward
			else:
				y_batch[row][action] = reward + self.gamma * np.max(self.model.predict(next_state, verbose=0)[0])

		self.model.fit(x_batch, y_batch, batch_size=len(x_batch), verbose=1)

	def train(self):
		"""Run EPOCHS rounds of: explore the first 70% of the data, replay once,
		then evaluate greedily on the remaining bars.

		Returns:
			An empty list (kept for backward compatibility with callers that
			store the result).
		"""
		avg_scores = []
		split_index = int(len(self.env.dataset) * 0.7)

		for e in range(EPOCHS):
			print("Training ({}):".format(e + 1))

			state = self.preprocess_state(self.env.reset())
			done = False

			while not done:
				action = self.choose_action(state, self.epsilon)
				next_state, reward, done = self.env.step(action)

				# At the end of the data the environment returns None as state;
				# it must not be reshaped. replay() never reads next_state of a
				# terminal transition.
				if not done:
					next_state = self.preprocess_state(next_state)

				self.remember(state, action, reward, next_state, done)
				state = next_state
				self.epsilon = max(self.epsilon_min, self.epsilon_decay*self.epsilon)
				if self.env.next_index >= split_index: break

			self.replay(self.batch_size)
			print(self.env.report())

			# test on unoptimized data
			print("Evaluating: ")

			# reset() clears the statistics but returns the state of bar 1. Jump
			# to the split point and take one "pass" step so that the first
			# evaluation state belongs to the bar the first action is applied to
			# (split_index + 1).
			self.env.reset()
			self.env.next_index = split_index
			state, _, done = self.env.step(self.env.action_space[0])

			while not done:
				action = self.choose_action(self.preprocess_state(state), -1)
				state, reward, done = self.env.step(action)
				if self.env.next_index >= len(self.env.dataset) - 2: break

			print(self.env.report())
			print("Current epsilon: {}".format(self.epsilon))
			print()

		# The former "Did not solve" message belonged to a solve-threshold check
		# that this loop does not have, and it failed when EPOCHS was 0.
		print('Finished training after {} epochs'.format(EPOCHS))
		return avg_scores

	def choose_action(self, state, epsilon):
		"""Epsilon-greedy action selection.

		With probability ``epsilon`` a random action is returned, otherwise the
		action with the highest predicted value. A non-positive ``epsilon``
		always selects greedily and runs the network in inference mode; a
		positive one runs it with ``training=True``.
		"""
		if np.random.random() <= epsilon:
			return random.choice(self.env.action_space)
		else:
			return np.argmax(self.model(state, training=epsilon>0))

	def preprocess_state(self, state):
		"""Reshape a flat state into a batch of one sample for the network."""
		return np.reshape(state, [1, self.input_size])

# NOTE(review): DQN.__init__ never reads env_string; the environment is always
# built from 'EURUSD_H1.csv', so this value has no effect.
env_string = 'CartPole-v0'
agent = DQN(env_string)
# train() runs EPOCHS rounds and returns its avg_scores list, which is never
# appended to, so `scores` is an empty list.
scores = agent.train()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                2336      
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 99        
Total params: 4,547
Trainable params: 4,547
Non-trainable params: 0
______________________________________________________

In [None]:
# Continue training the existing `agent` (its model weights, replay memory and
# epsilon carry over) for another 50 rounds; train() reads the module-level EPOCHS.
EPOCHS = 50
scores = agent.train()

In [None]:
# Another 50 rounds on the same `agent`; this relies on the agent created in an
# earlier cell still being alive in the kernel.
EPOCHS = 50
scores = agent.train()

In [19]:
# A further 100 rounds on the same `agent`. The captured output of this cell
# ends with a KeyboardInterrupt during the fourth round.
EPOCHS = 100
scores = agent.train()

Training (1):
Long Trades: 5398 (2719 won), Short Trades: 1 (0 won), won %: 0.5036117799592517, profit: 19500 (gross gain: 1359500, gross loss: -1340000)
Evaluating: 
Long Trades: 1156 (580 won), Short Trades: 0 (0 won), won %: 0.5017301038062284, profit: 2500 (gross gain: 290000, gross loss: -287500)
Current epsilon: 0.001

Training (2):
Long Trades: 5398 (2719 won), Short Trades: 1 (0 won), won %: 0.5036117799592517, profit: 19500 (gross gain: 1359500, gross loss: -1340000)
Evaluating: 
Long Trades: 1156 (580 won), Short Trades: 0 (0 won), won %: 0.5017301038062284, profit: 2500 (gross gain: 290000, gross loss: -287500)
Current epsilon: 0.001

Training (3):
Long Trades: 5397 (2719 won), Short Trades: 3 (1 won), won %: 0.5037037037037037, profit: 20000 (gross gain: 1360000, gross loss: -1340000)
Evaluating: 
Long Trades: 1156 (580 won), Short Trades: 0 (0 won), won %: 0.5017301038062284, profit: 2500 (gross gain: 290000, gross loss: -287500)
Current epsilon: 0.001

Training (4):
Long 

KeyboardInterrupt: 

In [None]:
# Greedy run of the trained agent over the whole dataset, counting how often
# each action is chosen.
#
# reset() already leaves the environment at bar 1 and returns the state of that
# bar, so the cursor is left untouched: rewinding next_index to 0 afterwards
# would make the first decision use the state of a different bar than the one
# it is applied to.
state = agent.preprocess_state(agent.env.reset())
done = False
long, short, pas = 0, 0, 0
actions = []

while not done:
	# epsilon = -1: never explore, run the network in inference mode
	action = agent.choose_action(state, -1)
	next_state, reward, done = agent.env.step(action)

	# Tally before the exit checks so the action that ends the run is counted
	# too, keeping these totals consistent with the environment's own report.
	if action == 1:
		long += 1
	elif action == 0:
		pas += 1
	else:
		short += 1
	actions.append(action)

	if done or agent.env.next_index >= len(agent.env.dataset) - 1:
		break
	state = agent.preprocess_state(next_state)

print(agent.env.report())
print("pas: ", pas)
print("short: ", short)
print("long: ", long)