In [16]:
import tensorflow as tf
import gym
import random
import numpy as np
from collections import deque
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

# Environment setup: LunarLander-v2 created with render_mode="human".
# NOTE(review): in recent Gym releases "human" mode is documented to draw the
# window automatically on every step, which would make the RENDER_AFTER gate in
# the training loop ineffective and slow training down -- confirm against the
# installed Gym version.
env = gym.make("LunarLander-v2", render_mode = "human")
state_dim = env.observation_space.shape[0]  # length of one observation vector
action_dim = env.action_space.n  # number of discrete actions

# Hyperparameters (read by the functions and the training loop below).
GAMMA = 0.99  # discount factor applied to the bootstrapped next-state value
BATCH_SIZE = 64  # transitions sampled per gradient step; also the minimum buffer fill before training
LR = 1e-4  # learning rate passed to the Adam optimizer
EPSILON_START = 1.0  # initial exploration probability
EPSILON_END = 0.01  # lower bound for the exploration probability
EPSILON_DECAY = 0.995  # multiplicative epsilon decay applied once per episode
MEMORY_CAPACITY = 100000  # maximum number of transitions kept in the replay buffer
TARGET_UPDATE = 1000  # hard-copy weights into the target network when episode % TARGET_UPDATE == 0
SOFT_UPDATE_TAU = 0.005  # blend factor used by soft_update() for the target network
EPISODES = 500  # number of training episodes
MAX_TIMESTEPS = 1000  # per-episode step cap enforced by the training loop
RENDER_AFTER = 1000  # env.render() is called explicitly only for episodes >= this index (never, since EPISODES is smaller)

def build_model(input_shape, output_shape):
    """Build a small fully connected Q-network with the Keras functional API.

    Args:
        input_shape: Per-sample input shape passed to ``Input`` (no batch axis).
        output_shape: Number of units in the final linear layer.

    Returns:
        An uncompiled ``Model`` made of two 128-unit ReLU hidden layers
        followed by a linear ``Dense`` output layer.
    """
    network_input = Input(shape=input_shape)

    # Two identical hidden layers stacked on top of the input.
    hidden = network_input
    for _ in range(2):
        hidden = Dense(128, activation="relu")(hidden)

    # Linear head: one unbounded output per unit.
    head = Dense(output_shape, activation="linear")(hidden)
    return Model(inputs=network_input, outputs=head)

# Two networks with the same architecture: `learning_network` is the one
# updated by the optimizer, `target_network` supplies the next-state values
# used when building training targets.
learning_network = build_model((state_dim,), action_dim)
target_network = build_model((state_dim,), action_dim)
# Start the target network from exactly the same weights as the learner.
target_network.set_weights(learning_network.get_weights())


# Adam optimizer used by train_step() to update `learning_network`.
optimizer = Adam(learning_rate=LR)


class ReplayBuffer:
    """Bounded FIFO store of transitions for experience replay.

    Transitions live in a ``deque`` created with ``maxlen=buffer_size``, so
    once the buffer is full each new transition evicts the oldest one.
    """

    def __init__(self, buffer_size):
        """Create an empty buffer that keeps at most ``buffer_size`` items."""
        self.buffer = deque(maxlen=buffer_size)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions at random.

        Args:
            batch_size: Number of transitions to draw; ``random.sample``
                raises ``ValueError`` if it exceeds the current length.

        Returns:
            A 5-tuple ``(states, actions, rewards, next_states, dones)`` of
            NumPy arrays, each with the sampled items along the first axis.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one sequence per field.
        states, actions, rewards, next_states, dones = zip(*picked)
        fields = (states, actions, rewards, next_states, dones)
        return tuple(np.array(field) for field in fields)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


def select_action(state, epsilon):
    """Choose an action with an epsilon-greedy policy.

    Args:
        state: A single observation without a batch axis.
        epsilon: Probability of picking a uniformly random action instead of
            the action with the highest predicted Q-value.

    Returns:
        The index of the chosen action.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Greedy branch: add a batch axis, score every action, take the best.
        state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
        batched_state = state_tensor[tf.newaxis, :]
        best_actions = tf.argmax(learning_network(batched_state), axis=1)
        return best_actions.numpy()[0]
    return np.random.choice(action_dim)


def train_step():
    """Run one Double-DQN gradient step on a minibatch from the replay buffer.

    Returns immediately (without training) while the buffer holds fewer than
    ``BATCH_SIZE`` transitions. Otherwise it samples a batch, builds the
    targets with the target network evaluated at the learning network's
    greedy next action, and applies one optimizer update that minimises the
    mean squared error between targets and predicted Q-values.
    """
    if len(replay_buffer) < BATCH_SIZE:
        return

    # Sample a batch and convert each field to a tensor of the dtype it needs.
    batch = replay_buffer.sample(BATCH_SIZE)
    field_dtypes = (tf.float32, tf.int32, tf.float32, tf.float32, tf.float32)
    states, actions, rewards, next_states, dones = (
        tf.convert_to_tensor(values, dtype=dtype)
        for values, dtype in zip(batch, field_dtypes)
    )

    # Targets: the learning network picks the next action, the target network
    # supplies its value; (1 - dones) removes the bootstrap term for
    # transitions flagged as done.
    target_estimates = target_network(next_states)
    greedy_next = tf.argmax(learning_network(next_states), axis=1)
    greedy_mask = tf.one_hot(greedy_next, action_dim)
    bootstrap = tf.reduce_sum(greedy_mask * target_estimates, axis=1)
    target_q_values = rewards + GAMMA * bootstrap * (1.0 - dones)

    # Predicted Q-values of the actions actually taken, and the MSE loss.
    online_variables = learning_network.trainable_variables
    with tf.GradientTape() as tape:
        predicted = learning_network(states)
        taken_mask = tf.one_hot(actions, action_dim)
        chosen_q = tf.reduce_sum(taken_mask * predicted, axis=1)
        td_error = target_q_values - chosen_q
        loss = tf.reduce_mean(tf.square(td_error))

    gradients = tape.gradient(loss, online_variables)
    optimizer.apply_gradients(zip(gradients, online_variables))

def soft_update():
    """Blend the learning network's weights into the target network.

    Every trainable variable of the target network is overwritten with
    ``SOFT_UPDATE_TAU * learning + (1 - SOFT_UPDATE_TAU) * target``.
    """
    retain = 1.0 - SOFT_UPDATE_TAU
    variable_pairs = zip(
        target_network.trainable_variables,
        learning_network.trainable_variables,
    )
    for slow_var, fast_var in variable_pairs:
        blended = SOFT_UPDATE_TAU * fast_var + retain * slow_var
        slow_var.assign(blended)


# Training loop: collect experience with an epsilon-greedy policy, train on
# replayed minibatches after every environment step, and decay epsilon once per
# episode.
replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
epsilon = EPSILON_START

try:
    for episode in range(EPISODES):
        state = env.reset()[0]
        total_reward = 0

        for t in range(MAX_TIMESTEPS):
            if episode >= RENDER_AFTER:
                env.render()

            action = select_action(state, epsilon)
            # env.step returns separate `terminated` and `truncated` flags.
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Only real termination is stored as "done": a time-limit
            # truncation is not a terminal state, so train_step() should still
            # bootstrap from next_state for such transitions.
            replay_buffer.push(state, action, reward, next_state, terminated)

            state = next_state
            total_reward += reward

            train_step()
            soft_update()

            # End the episode on either signal so the loop never keeps stepping
            # an environment that has already finished.
            done = terminated or truncated
            if done:
                break

        epsilon = max(EPSILON_END, epsilon * EPSILON_DECAY)
        print(f"Episode: {episode}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")

        # Periodic hard copy of the learner into the target network (in addition
        # to the per-step soft update above).
        if episode % TARGET_UPDATE == 0:
            target_network.set_weights(learning_network.get_weights())
finally:
    # Release the environment (and its render window) even if training is
    # interrupted or raises.
    env.close()

# NOTE(review): this saves the slowly-updated target network after the final
# episode, not the learning network or a best-scoring checkpoint -- confirm
# that this is the intended model to export under the "best_model" name.
target_network.save("LunarLander_DQN_best_model.keras")


Episode: 0, Total Reward: -130.324636685211, Epsilon: 0.995
Episode: 1, Total Reward: -436.11390949540794, Epsilon: 0.990
Episode: 2, Total Reward: 27.407045112888667, Epsilon: 0.985
Episode: 3, Total Reward: -172.28648103884257, Epsilon: 0.980
Episode: 4, Total Reward: -67.00033599349246, Epsilon: 0.975
Episode: 5, Total Reward: -188.67133300830983, Epsilon: 0.970
Episode: 6, Total Reward: -158.7345760816637, Epsilon: 0.966
Episode: 7, Total Reward: -145.56532802365035, Epsilon: 0.961
Episode: 8, Total Reward: -91.13401574501015, Epsilon: 0.956
Episode: 9, Total Reward: -94.0800457673635, Epsilon: 0.951
Episode: 10, Total Reward: -172.40861657618927, Epsilon: 0.946
Episode: 11, Total Reward: -473.39309357107607, Epsilon: 0.942
Episode: 12, Total Reward: -188.05612511154936, Epsilon: 0.937
Episode: 13, Total Reward: -120.02067006011927, Epsilon: 0.932
Episode: 14, Total Reward: -209.34936573318424, Epsilon: 0.928
Episode: 15, Total Reward: -310.2075947739928, Epsilon: 0.923
Episode: 16

In [None]:
import tensorflow as tf
import gym
import random
from collections import deque
import numpy as np
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Input
# Evaluation: load the network written to "LunarLander_DQN_best_model.keras"
# and play one greedy episode with on-screen rendering.
Lunar_lander_model = load_model("LunarLander_DQN_best_model.keras")

environment = gym.make("LunarLander-v2", render_mode="human")
action_dimension = environment.action_space.n

total_rew = 0
try:
    state = environment.reset()[0]
    done = False
    while not done:
        # The model expects a batch axis; keep `state` itself unbatched.
        batched_state = np.expand_dims(state, axis=0)
        action = np.argmax(Lunar_lander_model.predict(batched_state, verbose=0))
        next_state, reward, terminated, truncated, _ = environment.step(action)
        # Stop on either flag: ignoring `truncated` can loop forever when the
        # lander never reaches a terminal state before the time limit.
        done = terminated or truncated
        state = next_state
        total_rew += reward

    print("Total reward: ", total_rew)
finally:
    # Always close the render window, even on error or interrupt.
    environment.close()
