In [1]:
import gym
import pylab
import random
import tflearn
import numpy as np

from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam

Using TensorFlow backend.


## Task: fill empty spaces in the following agent code

In [2]:
class DeepQAgent:
    def __init__(self, state_size, action_size, render=True):
        # Tip: if you are training this on AWS the best way is to turn off rendering
        # and load it later with the serialized model
        self.render = render
        self.state_size = state_size
        self.action_size = action_size

        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.005
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
        self.batch_size = 64
        self.train_start = 1000
        # replay memory
        self.memory = deque(maxlen=10000)

        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(32, input_shape=(self.state_size,)))
        model.add(Activation('relu'))
        model.add(Dense(32))
        model.add(Activation('relu'))
        model.add(Dense(self.action_size))
        model.add(Activation('linear'))
        model.compile(optimizer=Adam(lr=self.learning_rate), loss='mse')
        return model

    def update_target_model(self):
        """Update your target model to the model you are currently learning at regular time intervals"""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """The choice of action uses the epsilon-greedy policy for the current network."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    def replay_memory(self, state, action, reward, next_state, done):
        """Save <s, a, r, s'> to replay_memory"""
        if action == 2:
            action = 1
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
            # print(len(self.memory))

    def train_replay(self):
        """Random sampling of batch_size samples from replay memory"""
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.action_size))

        for i in range(batch_size):
            state, action, reward, next_state, done = mini_batch[i]
            target = self.model.predict(state)[0]

            # As in queuing, it gets the maximum Q Value at s'. However, it is imported from the target model.
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.discount_factor * \
                                          np.amax(self.target_model.predict(next_state)[0])
            update_input[i] = state
            update_target[i] = target

        # You can create a minibatch of the correct target answer and the current value of your own,
        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)

    def load_model(self, name):
        self.model.load_model(name)

    def save_model(self, name):
        self.model.save(name)


In [4]:
env = gym.make('MountainCar-v0')
state_size = env.observation_space.shape[0] # should be equal 2
ACTION_SIZE = 2
agent = DeepQAgent(state_size, ACTION_SIZE, False)
# agent.load_model("./save_model/<your_saved_model_name>")
scores, episodes = [], []
N_EPISODES = 4000

In [5]:
for e in range(N_EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    print(state)

    # Action 0 (left), 1 (do nothing), 3 (declare fake_action to avoid doing nothing
    fake_action = 0

    # Counter for the same action 4 times
    action_count = 0

    while not done:
        if agent.render:
            env.render()

        # Select an action in the current state and proceed to a step
        action_count = action_count + 1

        if action_count == 4:
            action = agent.get_action(state)
            action_count = 0

            if action == 0:
                fake_action = 0
            elif action == 1:
                fake_action = 2

        # Take 1 step with the selected action
        next_state, reward, done, info = env.step(fake_action)
        next_state = np.reshape(next_state, [1, state_size])
        # Give a penalty of -100 for actions that end an episode
        # reward = reward if not done else -100

        # Save <s, a, r, s'> to replay memory
        agent.replay_memory(state, fake_action, reward, next_state, done)
        # Continue to learn every time step
        agent.train_replay()
        score += reward
        state = next_state

        if done:
            env.reset()
            # Copy the learning model for each episode to the target model
            agent.update_target_model()

            # For each episode, the time step where cartpole stood is plot
            scores.append(score)
            episodes.append(e)
            print("episode:", e, "  score:", score, "  memory length:", len(agent.memory),
                  "  epsilon:", agent.epsilon)

    # Save model for every 50 episodes
    if e % 50 == 0:
        agent.save_model("./save_model/dl_model")

[[-0.41502999  0.        ]]
episode: 0   score: -200.0   memory length: 200   epsilon: 0.9960200000000077
[[-0.43181353  0.        ]]
episode: 1   score: -200.0   memory length: 400   epsilon: 0.9920400000000154
[[-0.57525762  0.        ]]
episode: 2   score: -200.0   memory length: 600   epsilon: 0.988060000000023
[[-0.49792732  0.        ]]
episode: 3   score: -200.0   memory length: 800   epsilon: 0.9840800000000307
[[-0.40654661  0.        ]]
episode: 4   score: -200.0   memory length: 1000   epsilon: 0.9801000000000384
[[-0.46646807  0.        ]]
episode: 5   score: -200.0   memory length: 1200   epsilon: 0.9761200000000461
[[-0.53694406  0.        ]]
episode: 6   score: -200.0   memory length: 1400   epsilon: 0.9721400000000537
[[-0.51648133  0.        ]]
episode: 7   score: -200.0   memory length: 1600   epsilon: 0.9681600000000614
[[-0.55446494  0.        ]]
episode: 8   score: -200.0   memory length: 1800   epsilon: 0.9641800000000691
[[-0.57313031  0.        ]]
episode: 9   s

episode: 76   score: -200.0   memory length: 10000   epsilon: 0.6935400000005911
[[-0.4364738  0.       ]]
episode: 77   score: -200.0   memory length: 10000   epsilon: 0.6895600000005988
[[-0.48176328  0.        ]]
episode: 78   score: -200.0   memory length: 10000   epsilon: 0.6855800000006065
[[-0.53538093  0.        ]]
episode: 79   score: -200.0   memory length: 10000   epsilon: 0.6816000000006142
[[-0.42575221  0.        ]]
episode: 80   score: -200.0   memory length: 10000   epsilon: 0.6776200000006218
[[-0.46137891  0.        ]]
episode: 81   score: -200.0   memory length: 10000   epsilon: 0.6736400000006295
[[-0.51970632  0.        ]]
episode: 82   score: -200.0   memory length: 10000   epsilon: 0.6696600000006372
[[-0.46921449  0.        ]]
episode: 83   score: -200.0   memory length: 10000   epsilon: 0.6656800000006449
[[-0.59276909  0.        ]]
episode: 84   score: -200.0   memory length: 10000   epsilon: 0.6617000000006525
[[-0.49107611  0.        ]]
episode: 85   score: 

episode: 151   score: -200.0   memory length: 10000   epsilon: 0.3963335000008752
[[-0.56741032  0.        ]]
episode: 152   score: -160.0   memory length: 10000   epsilon: 0.39314950000087245
[[-0.51523703  0.        ]]
episode: 153   score: -179.0   memory length: 10000   epsilon: 0.3895874000008694
[[-0.51691597  0.        ]]
episode: 154   score: -200.0   memory length: 10000   epsilon: 0.38560740000086596
[[-0.57679519  0.        ]]
episode: 155   score: -200.0   memory length: 10000   epsilon: 0.38162740000086254
[[-0.50541603  0.        ]]
episode: 156   score: -164.0   memory length: 10000   epsilon: 0.37836380000085973
[[-0.58799171  0.        ]]
episode: 157   score: -200.0   memory length: 10000   epsilon: 0.3743838000008563
[[-0.44941759  0.        ]]
episode: 158   score: -200.0   memory length: 10000   epsilon: 0.3704038000008529
[[-0.51613051  0.        ]]
episode: 159   score: -158.0   memory length: 10000   epsilon: 0.3672596000008502
[[-0.47467917  0.        ]]
episod

episode: 226   score: -154.0   memory length: 10000   epsilon: 0.14260850000080663
[[-0.44635538  0.        ]]
episode: 227   score: -90.0   memory length: 10000   epsilon: 0.1408175000008076
[[-0.48546077  0.        ]]
episode: 228   score: -161.0   memory length: 10000   epsilon: 0.1376136000008093
[[-0.44034822  0.        ]]
episode: 229   score: -88.0   memory length: 10000   epsilon: 0.13586240000081024
[[-0.42257184  0.        ]]
episode: 230   score: -86.0   memory length: 10000   epsilon: 0.13415100000081115
[[-0.41521844  0.        ]]
episode: 231   score: -172.0   memory length: 10000   epsilon: 0.13072820000081298
[[-0.55117948  0.        ]]
episode: 232   score: -149.0   memory length: 10000   epsilon: 0.12776310000081456
[[-0.41474666  0.        ]]
episode: 233   score: -161.0   memory length: 10000   epsilon: 0.12455920000081595
[[-0.56695136  0.        ]]
episode: 234   score: -146.0   memory length: 10000   epsilon: 0.12165380000081548
[[-0.58366546  0.        ]]
episod

episode: 300   score: -145.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.51667795  0.        ]]
episode: 301   score: -146.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.45750554  0.        ]]
episode: 302   score: -153.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.46187163  0.        ]]
episode: 303   score: -153.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.44724922  0.        ]]
episode: 304   score: -158.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.4733464  0.       ]]
episode: 305   score: -166.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.50725535  0.        ]]
episode: 306   score: -149.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.4743594  0.       ]]
episode: 307   score: -150.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.50791574  0.        ]]
episode: 308   score: -157.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.44907247  0.       

episode: 374   score: -107.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.45800591  0.        ]]
episode: 375   score: -94.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.49419669  0.        ]]
episode: 376   score: -153.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.49149174  0.        ]]
episode: 377   score: -189.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.47478731  0.        ]]
episode: 378   score: -102.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.47819693  0.        ]]
episode: 379   score: -114.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.4464161  0.       ]]
episode: 380   score: -94.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.47530163  0.        ]]
episode: 381   score: -104.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.54321789  0.        ]]
episode: 382   score: -149.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.48099282  0.       

episode: 448   score: -146.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.4938338  0.       ]]
episode: 449   score: -173.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.4659034  0.       ]]
episode: 450   score: -97.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.47971461  0.        ]]
episode: 451   score: -106.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.53013152  0.        ]]
episode: 452   score: -150.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.54173336  0.        ]]
episode: 453   score: -149.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.44461785  0.        ]]
episode: 454   score: -89.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.49398056  0.        ]]
episode: 455   score: -181.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.46332081  0.        ]]
episode: 456   score: -96.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.45186935  0.        ]]

KeyboardInterrupt: 