In [10]:
from collections import deque

import numpy as np
import gym
import gym_anytrading

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, concatenate
from tensorflow.keras.optimizers import Adam

In [11]:
class DDPGAgent:
    """Actor-critic agent skeleton in the style of DDPG.

    Holds an actor network (state -> action), a critic network
    ((state, action) -> scalar value), a target copy of each, a bounded
    replay buffer, and one Adam optimizer per network.

    NOTE: ``replay`` is still a placeholder, so no training happens yet.
    """

    def __init__(self, state_size, action_size):
        """Build the networks, their target copies, and the optimizers.

        Args:
            state_size: Number of features in a state vector (used as the
                input width of the actor and of the critic's state branch).
            action_size: Number of components in an action vector (actor
                output width and critic action-branch input width).
        """
        self.state_size = state_size  # width of the state input
        self.action_size = action_size  # width of the action output
        self.gamma = 0.99  # discount rate applied to future value
        self.tau = 0.005  # soft-update rate for the target networks
        self.memory = deque(maxlen=10000)  # replay buffer; oldest entries drop first
        self.batch_size = 64
        self.actor_model = self.build_actor()  # the agent's current policy
        self.critic_model = self.build_critic()  # value of the actions taken
        self.target_actor = self.build_actor()  # target copy of the actor
        self.target_critic = self.build_critic()  # target copy of the critic
        # Start each target network from the same weights as its online network.
        self.target_actor.set_weights(self.actor_model.get_weights())
        self.target_critic.set_weights(self.critic_model.get_weights())
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras releases reject.
        self.actor_optimizer = Adam(learning_rate=0.001)
        self.critic_optimizer = Adam(learning_rate=0.002)

    def build_actor(self):
        """Return a new actor model mapping a state to a tanh-bounded action.

        Returns:
            A Keras ``Model`` with input shape ``(state_size,)`` and output
            shape ``(action_size,)``; tanh keeps every output in [-1, 1].
        """
        inputs = Input(shape=(self.state_size,))
        out = Dense(400, activation="relu")(inputs)
        out = Dense(300, activation="relu")(out)  # Dense = fully connected layer
        outputs = Dense(self.action_size, activation="tanh")(out)
        return Model(inputs, outputs)

    def build_critic(self):
        """Return a new critic model scoring a (state, action) pair.

        Returns:
            A Keras ``Model`` taking ``[state, action]`` inputs and producing
            a single linear output per sample.
        """
        state_input = Input(shape=(self.state_size,))
        # State and action are joined so the critic evaluates them together.
        action_input = Input(shape=(self.action_size,))
        # Use the functional `concatenate` that the file imports; the
        # `Concatenate` layer class is not in scope here.
        concat = concatenate([state_input, action_input])
        out = Dense(400, activation="relu")(concat)
        out = Dense(300, activation="relu")(out)
        outputs = Dense(1, activation="linear")(out)
        return Model([state_input, action_input], outputs)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return the actor's action for a single state.

        Args:
            state: Array-like holding ``state_size`` values in any shape.

        Returns:
            The first (only) row of the actor's prediction.
        """
        state = np.reshape(state, [1, self.state_size])
        # verbose=0 keeps Keras from printing a progress bar on every call.
        return self.actor_model.predict(state, verbose=0)[0]

    def replay(self):
        """Placeholder for the training step; currently does nothing.

        TODO: sample a batch from ``self.memory`` and train the actor and
        critic networks.
        """

    def update_target(self, target_model, model):
        """Soft-update ``target_model`` toward ``model``.

        Each target weight becomes ``tau * model + (1 - tau) * target``.

        Args:
            target_model: Object exposing ``get_weights``/``set_weights``;
                it receives the blended weights.
            model: Object exposing ``get_weights``; the source weights.
        """
        blended = [
            self.tau * source + (1 - self.tau) * target
            for source, target in zip(
                model.get_weights(), target_model.get_weights()
            )
        ]
        target_model.set_weights(blended)