In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import gym
import numpy as np
import random as rand

In [None]:
class Agent(object):
  def __init__(self):
    """Set up the CartPole-v1 environment, the Keras model and all settings.

    Everything is stored as instance attributes; the constructor takes no
    arguments. Calling it creates a gym environment and builds (and prints a
    summary of) the model via ``build_model()``.
    """
    # Environment, plus the observation/action dimensions read from it.
    env = gym.make('CartPole-v1')
    self.env = env
    self.state_size = env.observation_space.shape[0]
    self.action_size = env.action_space.n

    # Network settings. build_model() reads node_num and learning_rate (and the
    # two sizes above), so they must be assigned before the model is built.
    self.node_num, self.learning_rate, self.epochs_cnt = 12, 0.001, 5
    self.model = self.build_model()

    # Discount rate and the penalty value used by train().
    self.discount_rate, self.penalty = 0.97, -100

    # Number of episodes train() iterates over.
    self.episode_num = 500

    # Replay buffer: capacity limit, sample size and the (initially empty) store.
    self.replay_memory = []
    self.replay_memory_limit = 2048
    self.replay_size = 32

    # Exploration settings.
    # NOTE(review): the original (Korean) comment on epsilon_decay says roughly
    # "0.2: applied once 20% of the episodes have run; epsilon becomes 0" —
    # confirm against the code that consumes epsilon_decay.
    self.epsilon, self.epsilon_decay, self.epsilon_min = 0.99, 0.2, 0.05

    # Per-episode statistics and the window size passed to moving_avg().
    self.moving_avg_size = 20
    self.reward_list, self.count_list, self.moving_avg_list = [], [], []

  def build_model(self):
    """Build and compile the Keras model that outputs one value per action.

    The model takes inputs of shape ``(1, state_size)``, applies one hidden
    ``Dense`` layer with ``node_num`` ReLU units and ends in a linear ``Dense``
    layer with ``action_size`` outputs. It is compiled with the Adam optimizer
    (step size ``self.learning_rate``) and mean-squared-error loss.

    Side effects:
      Prints the model summary.

    Returns:
      The compiled ``tf.keras.models.Model``.
    """
    input_states = Input(shape=(1, self.state_size), name='input_states')
    hidden = Dense(self.node_num, activation='relu')(input_states)
    out_actions = Dense(self.action_size, activation='linear', name='output')(hidden)
    model = tf.keras.models.Model(inputs=[input_states], outputs=[out_actions])
    # Use `learning_rate`: the old `lr` keyword is a deprecated alias that newer
    # Keras releases no longer accept.
    model.compile(optimizer=Adam(learning_rate=self.learning_rate),
                  loss='mean_squared_error')
    model.summary()
    return model
  
  def train(self):
    for episode in range(self.episode_num):
      state = self.env.reset()
      Q, count, reward_tot = self.take_action_and_append_memory(episode, state)

      if count < 500:
        reward_tot = reward_tot - self.penalty

      self.reward_list.append(reward_tot)
      self.count_list.append(count)
      self.moving_avg_list.append(self.moving_avg(self.count_list, self.moving_avg_size))

      self.train_mini_batch(Q)

      if (episode % 10 == 0):
        print("episode:{}, moving_avg:{}, rewards_avg:{}".format(episode, self.moving_avg_list[-1], np.mean(self.reward_list)))