In [8]:
import random
import gym
import numpy as np

In [9]:
import tensorflow as tf
from collections import deque

In [12]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [13]:
# --- Environment ---
ENV_NAME = "CartPole-v1"  # id passed to gym.make

# --- Learning ---
GAMMA = 0.95           # discount factor applied to the best next-state Q-value
LEARNING_RATE = 0.001  # step size handed to the Adam optimizer

# --- Experience replay ---
MEMORY_SIZE = 1_000_000  # maximum number of transitions kept in the replay deque
BATCH_SIZE = 20          # transitions sampled per replay call

# --- Epsilon-greedy exploration ---
EXPLORATION_MAX = 1.0      # initial exploration rate
EXPLORATION_MIN = 0.01     # floor the exploration rate never drops below
EXPLORATION_DECAY = 0.995  # multiplicative decay applied after each replay call

# --- Training length ---
EPISODES = 1000  # number of episodes the training loop runs

In [14]:
class DQNSolver:
    """Deep Q-Network agent using experience replay and epsilon-greedy exploration.

    The agent owns a small fully connected Keras network that maps an
    observation to one Q-value per action, a bounded replay memory of past
    transitions, and an exploration rate that decays after every replay call.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay memory and build/compile the Q-network.

        Args:
            observation_space: Number of values in one observation; used as the
                input width of the network.
            action_space: Number of discrete actions; used as the output width
                of the network and as the range for random actions.
        """
        self.exploration_rate = EXPLORATION_MAX

        # Store the action count and create the replay memory (a bounded deque:
        # the oldest transitions are dropped once MEMORY_SIZE is reached).
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Build the neural network: two hidden ReLU layers of 24 units.
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))

        # The last layer has one linear output per action (the Q-values).
        self.model.add(Dense(self.action_space, activation="linear"))
        # `learning_rate` is the supported keyword; the old `lr` alias triggers a
        # deprecation warning and is rejected by newer Keras releases.
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        Args:
            state: Observation the action was taken from.
            action: Index of the action that was taken.
            reward: Reward received for the action.
            next_state: Observation that followed the action.
            done: Whether the episode ended with this transition.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with the epsilon-greedy rule.

        With probability ``self.exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.

        Args:
            state: Observation in the form accepted by ``self.model.predict``
                (the training loop passes an array of shape (1, observation_space)).

        Returns:
            The index of the chosen action.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)

        # verbose=0 keeps Keras from printing a progress bar for every step.
        q_values = self.model.predict(state, verbose=0)

        return np.argmax(q_values[0])

    def experience_replay(self):
        """Train the network on a random mini-batch of remembered transitions.

        Does nothing until at least BATCH_SIZE transitions are stored. For each
        sampled transition the target for the taken action is the reward, plus
        (for non-terminal transitions) GAMMA times the best predicted Q-value of
        the next state; the network is then fitted on that single sample.
        Afterwards the exploration rate is decayed, but never below
        EXPLORATION_MIN.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        # random.sample picks BATCH_SIZE distinct transitions uniformly at random.
        batch = random.sample(self.memory, BATCH_SIZE)

        for state, action, reward, state_next, terminal in batch:
            q_update = reward

            if not terminal:
                next_q_values = self.model.predict(state_next, verbose=0)[0]
                q_update = reward + GAMMA * np.amax(next_q_values)

            # Only the Q-value of the action actually taken is moved towards the
            # target; the other outputs keep their current predictions.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)

        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

    def save_model(self, path="cartpole_weight_v2.h5"):
        """Save the model to disk.

        Args:
            path: Destination file; defaults to the file name this notebook
                has always written to.
        """
        self.model.save(path)

    def load_trained_model(self, weights_path):
        """Load previously saved weights into the existing network.

        Args:
            weights_path: Path of the weights file to load.
        """
        self.model.load_weights(weights_path)

In [None]:
# Train a DQNSolver on ENV_NAME for EPISODES episodes, saving the model after
# every episode (also when an episode is interrupted by an exception).
env = gym.make(ENV_NAME)

observation_space = env.observation_space.shape[0]
action_space = env.action_space.n

dqn_solver = DQNSolver(observation_space, action_space)
run = 0

for index_episode in range(EPISODES):
    run += 1

    # gym >= 0.26 / Gymnasium return (observation, info) from reset();
    # older gym returns the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = np.reshape(state, [1, observation_space])
    step = 0
    try:
        while True:
            step += 1
            # Choose an action with the epsilon-greedy rule.
            action = dqn_solver.act(state)

            # Perform the action. Newer gym returns a 5-tuple with separate
            # `terminated` / `truncated` flags; older gym returns a single
            # `done` flag that covers both cases.
            step_result = env.step(action)
            if len(step_result) == 5:
                state_next, reward, terminated, truncated, info = step_result
                terminal = terminated or truncated
            else:
                state_next, reward, terminal, info = step_result

            # The step that ends the episode gets its reward negated.
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])

            # Store the step in the replay memory.
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                # Score is the number of steps the episode lasted.
                print(f"Run: {run}, exploration: {dqn_solver.exploration_rate}, score: {step}")

                break
            # Learn from a random mini-batch of stored steps (skipped on the
            # terminal step because of the break above).
            dqn_solver.experience_replay()
    finally:
        # Runs after every episode, and also if the episode raised.
        dqn_solver.save_model()

  super(Adam, self).__init__(name, **kwargs)


Run: 1, exploration: 1.0, score: 17
Run: 2, exploration: 0.8475428503023453, score: 36
Run: 3, exploration: 0.7705488893118823, score: 20
Run: 4, exploration: 0.7076077347272662, score: 18
Run: 5, exploration: 0.6832098777212641, score: 8
Run: 6, exploration: 0.6180388156137953, score: 21
Run: 7, exploration: 0.5819594443402982, score: 13
Run: 8, exploration: 0.5535075230322891, score: 11
Run: 9, exploration: 0.510849320360386, score: 17
Run: 10, exploration: 0.4810273709480478, score: 13
Run: 11, exploration: 0.46211964903917074, score: 9
Run: 12, exploration: 0.42013897252428334, score: 20
Run: 13, exploration: 0.3976004408064698, score: 12
Run: 14, exploration: 0.3614809303671764, score: 20
Run: 15, exploration: 0.3472722151889232, score: 9
Run: 16, exploration: 0.32864265128599696, score: 12
Run: 17, exploration: 0.3125753549412418, score: 11
Run: 18, exploration: 0.29285644267656924, score: 14
Run: 19, exploration: 0.27853872940185365, score: 11
Run: 20, exploration: 0.26492100726