In [27]:
import tensorflow as tf
import numpy as np
import gym
import random

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from collections import deque
# tf.compat.v1.disable_eager_execution()

'1.25.2'

In [23]:
class DQN:
  """Two Keras networks of identical architecture: a trained "main" one and a "target" copy.

  The main network is the one that gets fitted; the target network only ever
  receives weights copied from the main network. `target_main_delta` is a plain
  counter that is reset to zero whenever the weights are copied; this class
  never increments it itself.
  """

  def __init__(self, state_dim, action_dim):
    """Build both networks and bring the target in line with the main one.

    Args:
      state_dim: Number of input features of the first dense layer.
      action_dim: Number of units of the linear output layer.
    """
    self.state_dim = state_dim
    self.action_dim = action_dim

    self.main_model = self.create_model()
    self.target_model = self.create_model()

    # Copies main -> target weights and sets the sync counter to 0.
    self.update_target()

  def create_model(self):
    """Return a freshly compiled 128-128-`action_dim` dense network (Adam, MSE loss)."""
    stack = [
      Dense(128, input_dim=self.state_dim, activation="relu"),
      Dense(128, activation="relu"),
      Dense(self.action_dim, activation="linear"),
    ]
    network = Sequential(stack)
    network.compile(optimizer="adam", loss="mean_squared_error")
    return network

  def query_main(self, states):
    """Return the main network's predictions for `states` (converted with `np.array`)."""
    inputs = np.array(states)
    return self.main_model.predict(inputs, verbose=0)

  def query_target(self, states):
    """Return the target network's predictions for `states` (converted with `np.array`)."""
    inputs = np.array(states)
    return self.target_model.predict(inputs, verbose=0)

  def update_target(self):
    """Overwrite the target network's weights with the main network's and reset the counter."""
    main_weights = self.main_model.get_weights()
    self.target_model.set_weights(main_weights)
    self.target_main_delta = 0

  def fit_main(self, X, y):
    """Run a single `train_on_batch` step of the main network on inputs `X` and targets `y`."""
    inputs, targets = np.array(X), np.array(y)
    self.main_model.train_on_batch(inputs, targets)

In [24]:
class Memory:
  """Bounded replay buffer of `(state, action, reward, next_state, done)` tuples.

  Backed by a `deque` with `maxlen=size`, so once `size` transitions are stored
  each new one pushes out the oldest.
  """

  def __init__(self, size):
    """Create an empty buffer that holds at most `size` transitions."""
    self.size = size
    self.replay_buffer = deque(maxlen=size)

  def len(self):
    """Return how many transitions are currently stored."""
    stored = len(self.replay_buffer)
    return stored

  def add(self, state, action, reward, next_state, done):
    """Append one transition to the right-hand end of the buffer."""
    transition = (state, action, reward, next_state, done)
    self.replay_buffer.append(transition)

  def sample(self, batch_size):
    """Return `batch_size` distinct stored transitions chosen by `random.sample`.

    `random.sample` raises `ValueError` when `batch_size` exceeds the number of
    stored transitions.
    """
    return random.sample(self.replay_buffer, batch_size)

In [25]:
class Agent:
  """Epsilon-greedy agent that trains a `DQN` from transitions held in a `Memory`."""

  def __init__(self, state_dim, action_dim, batch_size, memory_size, epsilon, min_epsilon, epsilon_decay, discount, target_update):
    """Store the hyper-parameters and create the networks and the replay buffer.

    Args:
      state_dim: Length of a state vector; also used to reshape single states.
      action_dim: Number of discrete actions.
      batch_size: Transitions drawn per training step; also the minimum buffer
        fill before any training happens.
      memory_size: Capacity passed to `Memory`.
      epsilon: Initial exploration threshold compared against `np.random.random()`.
      min_epsilon: Lower bound applied by `decay_epsilon`.
      epsilon_decay: Factor `epsilon` is multiplied by on each decay.
      discount: Multiplier on the target network's best next-state value.
      target_update: Number of training steps between target-network syncs.
    """
    self.state_dim, self.action_dim = state_dim, action_dim
    self.batch_size, self.memory_size = batch_size, memory_size
    self.epsilon, self.min_epsilon = epsilon, min_epsilon
    self.epsilon_decay = epsilon_decay
    self.discount = discount
    self.target_update = target_update

    self.dqn = DQN(self.state_dim, self.action_dim)
    self.memory = Memory(self.memory_size)

  def choose_action(self, state):
    """Return the main network's argmax action, or a uniformly random one.

    A single `np.random.random()` draw is compared with `epsilon`; only a draw
    strictly greater than `epsilon` selects the greedy action.
    """
    act_greedily = np.random.random() > self.epsilon
    if not act_greedily:
      return np.random.randint(0, self.action_dim)

    single_batch = np.reshape(state, (1, self.state_dim))
    q_values = self.dqn.query_main(single_batch)
    return np.argmax(q_values)

  def decay_epsilon(self):
    """Multiply `epsilon` by `epsilon_decay`, never going below `min_epsilon`."""
    if not self.epsilon > self.min_epsilon:
      return
    self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

  def train(self):
    """Run one training step on a sampled batch, if the buffer holds enough transitions.

    The fit targets are the main network's current predictions with the entry
    of the taken action replaced by `reward` (when `done`) or by
    `reward + discount * max(target prediction for next_state)`. After fitting,
    the target network is synced whenever the step counter is a multiple of
    `target_update`.
    """
    if self.memory.len() < self.batch_size:
      return

    self.dqn.target_main_delta += 1
    transitions = self.memory.sample(self.batch_size)

    states = [transition[0] for transition in transitions]
    next_states = [transition[3] for transition in transitions]

    targets = self.dqn.query_main(states)
    next_qs = self.dqn.query_target(next_states)

    for row, (_, action, reward, _, done) in enumerate(transitions):
      # No bootstrapping from the next state once the transition is marked done.
      targets[row][action] = reward if done else reward + self.discount * np.max(next_qs[row])

    self.dqn.fit_main(states, targets)

    sync_due = self.dqn.target_main_delta % self.target_update == 0
    if sync_due:
      self.dqn.update_target()

In [26]:
# Seed the RNGs the agent relies on (replay sampling uses `random`, action
# selection uses `np.random`, Keras weight init uses TensorFlow's global seed).
# The environment's own RNG is NOT seeded here because the seeding call differs
# between gym versions, so initial car positions still vary from run to run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = gym.make('MountainCar-v0')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

batch_size = 64
memory_size = 20_000
epsilon = 1
min_epsilon = 0.01
epsilon_decay = 0.999
discount = 0.97
target_update = 2_000

agent = Agent(state_dim, action_dim, batch_size, memory_size, epsilon, min_epsilon, epsilon_decay, discount, target_update)

num_episodes = 5_000

try:
  for episode in range(num_episodes):
    # gym >= 0.26 returns (observation, info) from reset(); older releases
    # return just the observation.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    done = False
    success = False
    sum_reward = 0
    while not done:
      action = agent.choose_action(state)

      step_result = env.step(action)
      if len(step_result) == 5:
        # New step API: termination and time-limit truncation are separate flags.
        next_state, reward, terminated, truncated, _ = step_result
      else:
        # Old step API: one `done` flag; the TimeLimit wrapper marks timeouts
        # with info["TimeLimit.truncated"].
        next_state, reward, step_done, info = step_result
        truncated = bool(step_done) and bool(info.get("TimeLimit.truncated", False))
        terminated = bool(step_done) and not truncated
      done = terminated or truncated
      sum_reward += reward

      # Store only genuine termination. A time-limit cut-off is not a terminal
      # state, so the learning target must still bootstrap from `next_state`.
      agent.memory.add(state, action, reward, next_state, terminated)

      state = next_state

      if state[0] >= env.unwrapped.goal_position:
        success = True
      agent.train()

    print(f"Episode {episode}: Success = {success}, Reward = {sum_reward}")

    agent.decay_epsilon()
finally:
  # Release the environment even if training is interrupted from the keyboard.
  env.close()

Episode 0: Success = False, Reward = -200.0
Episode 1: Success = False, Reward = -200.0
Episode 2: Success = False, Reward = -200.0
Episode 3: Success = False, Reward = -200.0
Episode 4: Success = False, Reward = -200.0
Episode 5: Success = False, Reward = -200.0
Episode 6: Success = False, Reward = -200.0
Episode 7: Success = False, Reward = -200.0
Episode 8: Success = False, Reward = -200.0
Episode 9: Success = False, Reward = -200.0
Episode 10: Success = False, Reward = -200.0
Episode 11: Success = False, Reward = -200.0
Episode 12: Success = False, Reward = -200.0
Episode 13: Success = False, Reward = -200.0
Episode 14: Success = False, Reward = -200.0
Episode 15: Success = False, Reward = -200.0
Episode 16: Success = False, Reward = -200.0
Episode 17: Success = False, Reward = -200.0
Episode 18: Success = False, Reward = -200.0
Episode 19: Success = False, Reward = -200.0
Episode 20: Success = False, Reward = -200.0
Episode 21: Success = False, Reward = -200.0
Episode 22: Success 

KeyboardInterrupt: 