In [1]:
import random
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque
import tensorflow as tf
import numpy as np
import gymnasium as gym

In [2]:
class Transition:
    """One environment step: (state, action, reward, next_state, terminated).

    The five values are available both as named attributes and, via ``data``
    and iteration, as a tuple in the order given above.
    """

    def __init__(self, state, action, reward, next_state, terminated):
        fields = (state, action, reward, next_state, terminated)
        (
            self.state,
            self.action,
            self.reward,
            self.next_state,
            self.terminated,
        ) = fields

        # Tuple view captured at construction time; used by __iter__ and __repr__.
        self.data = fields

    def __iter__(self):
        """Yield the five fields in constructor order so instances can be unpacked."""
        return iter(self.data)

    def __repr__(self):
        return f"Transition{self.data}"


In [3]:
class Memory:
    """Fixed-capacity replay buffer backed by a ``deque``."""

    def __init__(self, capacity):
        self.capacity = capacity
        # With maxlen set, appending to a full deque discards the oldest entry.
        self.memory = deque(maxlen=capacity)

    def store(self, transition):
        """Append ``transition`` to the buffer."""
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` distinct entries chosen at random.

        ``random.sample`` raises ``ValueError`` when ``batch_size`` exceeds the
        number of stored entries.
        """
        return random.sample(self.memory, batch_size)

In [4]:
class Agent:
    """Couples an action-selection policy with a replay memory and trains the policy's networks.

    The policy object is expected to expose ``model``, ``target_model``,
    ``epsilon``, ``epsilon_decay_rate``, ``min_epsilon``, ``gamma``, ``tau``
    and ``select_action(state)``. The memory object is expected to expose a
    sized ``memory`` container and ``sample(batch_size)``.
    """

    def __init__(self, policy, memory):
        self.policy = policy
        self.memory = memory

    def select_action(self, state):
        """Delegate action selection for ``state`` to the policy."""
        return self.policy.select_action(state)

    def decay(self):
        """Multiply the policy's epsilon by its decay rate (no lower bound is applied here)."""
        self.policy.epsilon *= self.policy.epsilon_decay_rate

    def train(self, batch_size):
        """Fit the online model on ``batch_size`` sampled transitions, one at a time.

        For each transition the online model's current prediction is used as
        the regression target, with only the entry of the taken action
        replaced: by ``reward`` for terminal transitions, otherwise by
        ``reward + gamma * Q_target(next_state, argmax_a Q_online(next_state, a))``.
        Afterwards epsilon is decayed once, never dropping below ``min_epsilon``.

        Does nothing (and does not decay epsilon) while the memory holds fewer
        than ``batch_size`` transitions.
        """
        # Make sure we don't crash because there aren't enough Transitions in memory yet.
        if batch_size > len(self.memory.memory):
            return

        policy = self.policy
        for state, action, reward, next_state, terminated in self.memory.sample(batch_size):
            state_batch = np.expand_dims(np.array(state), axis=0)
            target = policy.model.predict(state_batch, verbose=0)

            # A terminal state has no future Q-value to bootstrap from.
            if terminated:
                target[0][action] = reward
            else:
                next_state_batch = np.expand_dims(np.array(next_state), axis=0)
                # The online network picks the action, the target network evaluates it.
                next_action = np.argmax(policy.model.predict(next_state_batch, verbose=0))
                next_q = policy.target_model.predict(next_state_batch, verbose=0)[0][next_action]
                target[0][action] = reward + policy.gamma * next_q

            policy.model.fit(state_batch, target, verbose=0)

        if policy.epsilon > policy.min_epsilon:
            self.decay()
            # The last multiplicative step can undershoot the floor; clamp it.
            policy.epsilon = max(policy.epsilon, policy.min_epsilon)

    def align_target_network(self):
        """Move the target weights towards the online weights.

        Each target weight array becomes ``tau * online + (1 - tau) * target``.
        """
        tau = self.policy.tau
        blended = [
            tau * online + (1 - tau) * target
            for online, target in zip(
                self.policy.model.get_weights(),
                self.policy.target_model.get_weights(),
            )
        ]
        self.policy.target_model.set_weights(blended)

    def save_model(self, path):
        """Save the policy's online model to ``path``."""
        self.policy.model.save(path)

    def load_model(self, path):
        """Replace the policy's online model with the one stored at ``path``.

        NOTE: only ``policy.model`` is replaced; ``policy.target_model`` is left
        untouched until ``align_target_network`` is called.
        """
        # Imported locally: the notebook only imports names *from* keras, so the
        # bare name ``keras`` used previously was undefined here.
        from keras.models import load_model as keras_load_model

        self.policy.model = keras_load_model(path)

        
        

In [5]:
class EpsilonGreedyPolicy:
    """Epsilon-greedy action selection over a Keras Q-network.

    Holds the exploration and learning hyperparameters plus two networks built
    by ``build_model``: ``model``, which ``select_action`` queries, and
    ``target_model``, a second network with the same architecture.
    """

    def __init__(self,epsilon, epsilon_decay_rate, state_size, action_size, alpha, gamma, min_epsilon, tau):
        # Exploration schedule.
        self.epsilon = epsilon
        self.epsilon_decay_rate = epsilon_decay_rate
        self.min_epsilon = min_epsilon

        # Learning hyperparameters: alpha is the optimizer learning rate.
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau

        # Network input/output widths; must be set before build_model() runs.
        self.state_size = state_size
        self.action_size = action_size

        self.model = self.build_model()
        self.target_model = self.build_model()

    def select_action(self, state):
        """Return a random action with probability epsilon, else the model's argmax action."""
        explore = random.uniform(0,1) < self.epsilon
        if explore:
            return random.choice(range(self.action_size))

        # Add a leading batch axis so a single state can be passed to predict().
        q_values = self.model.predict(np.expand_dims(np.array(state), axis=0), verbose = 0)
        return np.argmax(q_values)

    def build_model(self):
        """Build and compile a network: two 32-unit ReLU layers and a linear output per action."""
        network = Sequential([
            Dense(32, input_dim=self.state_size, activation='relu'),
            Dense(32, activation='relu'),
            Dense(self.action_size, activation='linear'),
        ])
        network.compile(loss='mse', optimizer=Adam(learning_rate=self.alpha))
        return network

In [6]:
# Build the agent: an epsilon-greedy policy over an 8-input / 4-action network
# plus a replay memory holding up to 250000 transitions.
agent = Agent(
    policy=EpsilonGreedyPolicy(
        epsilon=1,
        epsilon_decay_rate=0.99941,
        state_size=8,
        action_size=4,
        alpha=0.0001,
        gamma=0.99,
        min_epsilon=0.05,
        tau=1,  # with tau == 1, align_target_network copies the online weights outright
    ),
    memory=Memory(capacity=250000),
)

In [None]:
env = gym.make("LunarLander-v2", render_mode="rgb_array")

episodes = 5000
# try/finally so the environment is released even if training is interrupted
# or raises part-way through.
try:
    for episode in range(episodes):

        state, info = env.reset()
        ep_reward = 0
        while True:
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, info = env.step(action)
            ep_reward += reward
            # Only `terminated` is stored: Agent.train skips bootstrapping for
            # transitions flagged as terminated, and a truncated step is not one.
            agent.memory.store(Transition(state, action, reward, next_state, terminated))
            state = next_state

            if terminated or truncated:
                break
        print(f"episode: {episode}\nreward: {ep_reward}\nepsilon {agent.policy.epsilon}\n")
        # One training pass (batch of 32) and one target-network update per episode.
        agent.train(32)
        agent.align_target_network()
finally:
    env.close()

episode: 0
reward: -87.47672131110915
epsilon 1

episode: 1
reward: -312.69658051532645
epsilon 0.99941

episode: 2
reward: -168.73025914748004
epsilon 0.9988203481000001

episode: 3
reward: -175.3297580717918
epsilon 0.9982310440946212

episode: 4
reward: -128.32086925224488
epsilon 0.9976420877786053

episode: 5
reward: -171.46019473872093
epsilon 0.997053478946816

episode: 6
reward: -118.21458979457884
epsilon 0.9964652173942373

episode: 7
reward: -159.51275627081344
epsilon 0.9958773029159748

episode: 8
reward: -311.9353139055627
epsilon 0.9952897353072544

episode: 9
reward: -182.23306359785192
epsilon 0.9947025143634232

episode: 10
reward: -140.3631542879391
epsilon 0.9941156398799488

episode: 11
reward: -87.7178758601919
epsilon 0.9935291116524196

episode: 12
reward: -339.75461737876947
epsilon 0.9929429294765447

episode: 13
reward: -180.66060686930464
epsilon 0.9923570931481536

episode: 14
reward: -105.3862258713829
epsilon 0.9917716024631962

episode: 15
reward: -315.3

In [None]:
# Persist the agent's online network under the path "model".
agent.save_model(path="model")