In [4]:
import gym
import matplotlib.pyplot as plt
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import random
from IPython.display import clear_output
from collections import deque
from tqdm.notebook import tqdm
from collections import deque

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras.backend as K

# tf.compat.v1.disable_eager_execution()

In [5]:
class DQAgent():
    """Online deep Q-learning agent backed by a fully connected Keras network.

    The agent keeps a single Q-network (no replay buffer and no target
    network) and updates it from one transition at a time via `train`.
    """

    def __init__(self,
                 observation_space,
                 action_space,
                 gamma=0.99,
                 lr=0.001):
        """Store the hyperparameters and build the Q-network.

        Args:
            observation_space: Input shape handed to `keras.Input(shape=...)`.
            action_space: Number of discrete actions; also the width of the
                network's output layer.
            gamma: Discount factor applied to the bootstrapped next-state value.
            lr: Learning rate passed to the Adam optimizer.
        """
        self.observation_space = observation_space
        self.action_space = action_space
        self.gamma = gamma
        self.lr = lr
        self.model = self.build_model(name='model')

    def build_model(self, name):
        """Create and compile the Q-network.

        The network is three 128-unit ReLU layers followed by a linear layer
        with one output per action, compiled with Adam and an MSE loss.

        Args:
            name: Name given to the Keras `Sequential` model.

        Returns:
            The compiled Keras model.
        """
        model = keras.Sequential(name = name)
        model.add(keras.Input(shape=self.observation_space))
        model.add(keras.layers.Dense(128, activation='relu'))
        model.add(keras.layers.Dense(128, activation='relu'))
        model.add(keras.layers.Dense(128, activation='relu'))
        model.add(keras.layers.Dense(self.action_space, activation='linear'))

        model.compile(
            optimizer=keras.optimizers.legacy.Adam(learning_rate=self.lr),
            loss='mse'
        )
        return model

    def predict(self, observation):
        """Return the network's Q-value vector for a single observation."""
        # The model expects a batch, so wrap the observation and unwrap the result.
        return self.model.predict(np.array([observation]), verbose=False)[0]

    def predict_action(self, observation):
        """Return the index of the action with the highest predicted Q-value."""
        return np.argmax(self.predict(observation))

    def e_greedy(self, observation, e=0.1):
        """Pick an action epsilon-greedily.

        Args:
            observation: Current observation.
            e: Exploration threshold; the greedy action is taken whenever a
                uniform draw from [0, 1) is >= `e`, otherwise a uniformly
                random action index is returned.

        Returns:
            An action index in `[0, action_space - 1]`.
        """
        if random.random() >= e:
            return self.predict_action(observation)
        return random.randint(0, self.action_space-1)

    def train(self, s, a, r, s_, terminated):
        """Fit the network on a single one-step Q-learning target.

        The target equals the current prediction for `s`, except for entry `a`,
        which becomes `r` on a terminal transition and
        `r + gamma * max(Q(s_))` otherwise.

        Args:
            s: Observation before the action.
            a: Index of the action taken.
            r: Reward received.
            s_: Observation after the action.
            terminated: Whether the transition ended the episode.
        """
        if terminated:
            # No bootstrapping on terminal transitions, so Q(s_) is never
            # needed: skip that forward pass entirely.
            y = self.predict(s)
            y[a] = r
        else:
            # Evaluate both states in one batched forward pass instead of two
            # separate single-sample predict calls.
            q_s, q_next = self.model.predict(np.array([s, s_]), verbose=False)
            y = q_s
            y[a] = r + self.gamma*np.max(q_next)
        self.model.fit(np.array([s]), np.array([y]), verbose=False)

In [6]:
# Seed every RNG the run depends on (Python's `random` drives exploration,
# TensorFlow drives weight initialisation) so the experiment is repeatable.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = gym.make("LunarLander-v2")
agent = DQAgent(env.observation_space.shape, env.action_space.n)

episodes = 2000   # upper bound on the number of training episodes
max_t = 500       # per-episode step cap
e=0.05            # exploration threshold passed to agent.e_greedy
scores = []       # undiscounted return of every episode
avgs = []         # mean of the most recent (up to 50) episode scores

try:
    for episode in range(episodes):
        # Seed the environment only on the first reset; reseeding on every
        # episode would replay the same initial state over and over.
        observation, _ = env.reset(seed=SEED if episode == 0 else None)

        score = 0
        for t in range(max_t):
            s = observation
            a = agent.e_greedy(s, e=e)
            observation, reward, terminated, truncated, _ = env.step(a)
            score +=  reward
            s_ = observation
            # Only a real termination suppresses bootstrapping; a truncation
            # still has a meaningful next-state value.
            agent.train(s, a, reward, s_, terminated)
            # Stop on either signal: stepping an environment after it reported
            # the end of an episode is undefined until the next reset.
            if terminated or truncated:
                break
        scores.append(score)
        avgs.append(np.mean(scores[-50:]))
        print(f"episode: {episode}, e: {e}, t: {t}, score: {score : .2f}, avg score: {avgs[-1]: .2f}")
        # Stop once the moving average reaches the score threshold of 200.
        if avgs[-1] >= 200:
            break
finally:
    # Release the environment even if training is interrupted from the notebook.
    env.close()

fig, ax = plt.subplots()
ax.plot(scores, label="episode score")
ax.plot(avgs, label="moving average (last 50 episodes)")
ax.set(xlabel="episode", ylabel="score", title="DQAgent on LunarLander-v2")
ax.legend()
plt.show()

episode: 0, e: 0.05, t: 88, score: -407.63, avg score: -407.63
episode: 1, e: 0.05, t: 101, score: -361.56, avg score: -384.60
episode: 2, e: 0.05, t: 49, score: -422.63, avg score: -397.27
episode: 3, e: 0.05, t: 63, score: -526.63, avg score: -429.61
episode: 4, e: 0.05, t: 61, score: -484.16, avg score: -440.52
episode: 5, e: 0.05, t: 85, score: -755.04, avg score: -492.94
episode: 6, e: 0.05, t: 52, score: -327.22, avg score: -469.27
episode: 7, e: 0.05, t: 82, score: -700.90, avg score: -498.22
episode: 8, e: 0.05, t: 77, score: -792.00, avg score: -530.86
episode: 9, e: 0.05, t: 61, score: -562.06, avg score: -533.98
episode: 10, e: 0.05, t: 76, score: -580.79, avg score: -538.24
episode: 11, e: 0.05, t: 77, score: -636.13, avg score: -546.40
episode: 12, e: 0.05, t: 66, score: -488.79, avg score: -541.96
episode: 13, e: 0.05, t: 53, score: -490.44, avg score: -538.28
episode: 14, e: 0.05, t: 67, score: -638.14, avg score: -544.94
episode: 15, e: 0.05, t: 59, score: -432.99, avg 