In [1]:
import gym
import pylab
import random
import numpy as np
from collections import deque
import tflearn

import tflearn
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import sgd

Using TensorFlow backend.


## Task: fill empty spaces in the following agent code

In [2]:
class DeepQAgent:
    def __init__(self, state_size, action_size, render=True):
        # Tip: if you are training this on AWS the best way is to turn off rendering
        # and load it later with the serialized model
        self.render = render
        self.state_size = state_size
        self.action_size = action_size

        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.005
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
        self.batch_size = 64
        self.train_start = 1000
        # replay memory
        self.memory = deque(maxlen=10000)

        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def build_model(self):
        model = Sequential()
        model.add(Dense(16, input_dim = self.state_size))
        model.add(Dense(16))
        model.add(Dense(self.action_size))
        
        model.compile(sgd(lr=self.learning_rate), "mse")
        return model

    def update_target_model(self):
        """Update your target model to the model you are currently learning at regular time intervals"""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """The choice of action uses the epsilon-greedy policy for the current network."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    def replay_memory(self, state, action, reward, next_state, done):
        """Save <s, a, r, s'> to replay_memory"""
        if action == 2:
            action = 1
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
            # print(len(self.memory))

    def train_replay(self):
        """Random sampling of batch_size samples from replay memory"""
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.action_size))

        for i in range(batch_size):
            state, action, reward, next_state, done = mini_batch[i]
            target = self.model.predict(state)[0]

            # As in queuing, it gets the maximum Q Value at s'. However, it is imported from the target model.
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.discount_factor * \
                                          np.amax(self.target_model.predict(next_state)[0])
            update_input[i] = state
            update_target[i] = target

        # You can create a minibatch of the correct target answer and the current value of your own,
        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)

    def load_model(self, name):
        self.model.load_model(name)

    def save_model(self, name):
        self.model.save(name)


In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

In [4]:
env = gym.make('MountainCar-v0')
state_size = env.observation_space.shape[0] # should be equal 2
ACTION_SIZE = 2
agent = DeepQAgent(state_size, ACTION_SIZE)
# agent.load_model("./save_model/saved_model")
scores, episodes = [], []
N_EPISODES = 1000

In [5]:
for e in range(N_EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    print(state)

    # Action 0 (left), 1 (do nothing), 3 (declare fake_action to avoid doing nothing
    fake_action = 0

    # Counter for the same action 4 times
    action_count = 0

    while not done:
        #if agent.render:
        #    env.render()

        # Select an action in the current state and proceed to a step
        action_count = action_count + 1

        if action_count == 4:
            action = agent.get_action(state)
            action_count = 0

            if action == 0:
                fake_action = 0
            elif action == 1:
                fake_action = 2

        # Take 1 step with the selected action
        next_state, reward, done, info = env.step(fake_action)
        next_state = np.reshape(next_state, [1, state_size])
        # Give a penalty of -100 for actions that end an episode
        # reward = reward if not done else -100

        # Save <s, a, r, s'> to replay memory
        agent.replay_memory(state, fake_action, reward, next_state, done)
        # Continue to learn every time step
        agent.train_replay()
        score += reward
        state = next_state

        if done:
            env.reset()
            # Copy the learning model for each episode to the target model
            agent.update_target_model()

            # For each episode, the time step where cartpole stood is plot
            scores.append(score)
            episodes.append(e)
            print("episode:", e, "  score:", score, "  memory length:", len(agent.memory),
                  "  epsilon:", agent.epsilon)
    # Save model for every 50 episodes
    if e % 50 == 0:
        agent.save_model("./save_model/saved_model")

[[-0.52352783  0.        ]]
episode: 0   score: -200.0   memory length: 200   epsilon: 0.9960200000000077
[[-0.40452396  0.        ]]
episode: 1   score: -200.0   memory length: 400   epsilon: 0.9920400000000154
[[-0.49222922  0.        ]]
episode: 2   score: -200.0   memory length: 600   epsilon: 0.988060000000023
[[-0.46732916  0.        ]]
episode: 3   score: -200.0   memory length: 800   epsilon: 0.9840800000000307
[[-0.52638911  0.        ]]
episode: 4   score: -200.0   memory length: 1000   epsilon: 0.9801000000000384
[[-0.4410795  0.       ]]
episode: 5   score: -200.0   memory length: 1200   epsilon: 0.9761200000000461
[[-0.53362492  0.        ]]
episode: 6   score: -200.0   memory length: 1400   epsilon: 0.9721400000000537
[[-0.51388514  0.        ]]
episode: 7   score: -200.0   memory length: 1600   epsilon: 0.9681600000000614
[[-0.51407775  0.        ]]
episode: 8   score: -200.0   memory length: 1800   epsilon: 0.9641800000000691
[[-0.48864559  0.        ]]
episode: 9   sco

episode: 76   score: -200.0   memory length: 10000   epsilon: 0.6935400000005911
[[-0.55055602  0.        ]]
episode: 77   score: -200.0   memory length: 10000   epsilon: 0.6895600000005988
[[-0.52784067  0.        ]]
episode: 78   score: -200.0   memory length: 10000   epsilon: 0.6855800000006065
[[-0.5456937  0.       ]]
episode: 79   score: -200.0   memory length: 10000   epsilon: 0.6816000000006142
[[-0.41756499  0.        ]]
episode: 80   score: -200.0   memory length: 10000   epsilon: 0.6776200000006218
[[-0.48039923  0.        ]]
episode: 81   score: -200.0   memory length: 10000   epsilon: 0.6736400000006295
[[-0.58429303  0.        ]]
episode: 82   score: -200.0   memory length: 10000   epsilon: 0.6696600000006372
[[-0.48533569  0.        ]]
episode: 83   score: -200.0   memory length: 10000   epsilon: 0.6656800000006449
[[-0.42389421  0.        ]]
episode: 84   score: -200.0   memory length: 10000   epsilon: 0.6617000000006525
[[-0.51862279  0.        ]]
episode: 85   score: 

episode: 151   score: -200.0   memory length: 10000   epsilon: 0.3950400000008741
[[-0.515157  0.      ]]
episode: 152   score: -200.0   memory length: 10000   epsilon: 0.39106000000087066
[[-0.54552033  0.        ]]
episode: 153   score: -200.0   memory length: 10000   epsilon: 0.38708000000086723
[[-0.58055073  0.        ]]
episode: 154   score: -200.0   memory length: 10000   epsilon: 0.3831000000008638
[[-0.47360517  0.        ]]
episode: 155   score: -146.0   memory length: 10000   epsilon: 0.3801946000008613
[[-0.49133542  0.        ]]
episode: 156   score: -200.0   memory length: 10000   epsilon: 0.3762146000008579
[[-0.55785466  0.        ]]
episode: 157   score: -200.0   memory length: 10000   epsilon: 0.37223460000085445
[[-0.51667368  0.        ]]
episode: 158   score: -200.0   memory length: 10000   epsilon: 0.36825460000085103
[[-0.48176228  0.        ]]
episode: 159   score: -156.0   memory length: 10000   epsilon: 0.36515020000084836
[[-0.48213667  0.        ]]
episode: 

episode: 226   score: -200.0   memory length: 10000   epsilon: 0.0985101000008117
[[-0.53428599  0.        ]]
episode: 227   score: -200.0   memory length: 10000   epsilon: 0.09453010000081105
[[-0.49891621  0.        ]]
episode: 228   score: -200.0   memory length: 10000   epsilon: 0.0905501000008104
[[-0.44674946  0.        ]]
episode: 229   score: -200.0   memory length: 10000   epsilon: 0.08657010000080975
[[-0.47007686  0.        ]]
episode: 230   score: -200.0   memory length: 10000   epsilon: 0.0825901000008091
[[-0.5158789  0.       ]]
episode: 231   score: -200.0   memory length: 10000   epsilon: 0.07861010000080845
[[-0.40070614  0.        ]]
episode: 232   score: -200.0   memory length: 10000   epsilon: 0.0746301000008078
[[-0.56010004  0.        ]]
episode: 233   score: -200.0   memory length: 10000   epsilon: 0.07065010000080715
[[-0.53300134  0.        ]]
episode: 234   score: -200.0   memory length: 10000   epsilon: 0.0666701000008065
[[-0.58139022  0.        ]]
episode:

episode: 300   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.49288524  0.        ]]
episode: 301   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.42069284  0.        ]]
episode: 302   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.47071493  0.        ]]
episode: 303   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.47287561  0.        ]]
episode: 304   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.46242048  0.        ]]
episode: 305   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.53362162  0.        ]]
episode: 306   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.55054985  0.        ]]
episode: 307   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.46236511  0.        ]]
episode: 308   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.57291952  0.   

episode: 374   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.50584332  0.        ]]
episode: 375   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.42878416  0.        ]]
episode: 376   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.56984714  0.        ]]
episode: 377   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.43802788  0.        ]]
episode: 378   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.4039137  0.       ]]
episode: 379   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.58690379  0.        ]]
episode: 380   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.42925964  0.        ]]
episode: 381   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.44441996  0.        ]]
episode: 382   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.42107655  0.     

episode: 448   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.51804838  0.        ]]
episode: 449   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.55189044  0.        ]]
episode: 450   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.52767123  0.        ]]
episode: 451   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.42586425  0.        ]]
episode: 452   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.52644941  0.        ]]
episode: 453   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41142632  0.        ]]
episode: 454   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.53774814  0.        ]]
episode: 455   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.59306809  0.        ]]
episode: 456   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41939306  0.   

episode: 522   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.42398449  0.        ]]
episode: 523   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.51153612  0.        ]]
episode: 524   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.4398031  0.       ]]
episode: 525   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.56387039  0.        ]]
episode: 526   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.44481742  0.        ]]
episode: 527   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.51031326  0.        ]]
episode: 528   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.50864888  0.        ]]
episode: 529   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.40748831  0.        ]]
episode: 530   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.58886162  0.     

episode: 596   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.58054603  0.        ]]
episode: 597   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.54741432  0.        ]]
episode: 598   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.42703588  0.        ]]
episode: 599   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.46950175  0.        ]]
episode: 600   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.5076545  0.       ]]
episode: 601   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.45967447  0.        ]]
episode: 602   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41568011  0.        ]]
episode: 603   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41685015  0.        ]]
episode: 604   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.48841872  0.     

episode: 670   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.44083867  0.        ]]
episode: 671   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.56623256  0.        ]]
episode: 672   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.48767444  0.        ]]
episode: 673   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.50084742  0.        ]]
episode: 674   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.5851766  0.       ]]
episode: 675   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.50460861  0.        ]]
episode: 676   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.5966974  0.       ]]
episode: 677   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.4180996  0.       ]]
episode: 678   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.59467716  0.        ]

episode: 744   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.46985664  0.        ]]
episode: 745   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41300135  0.        ]]
episode: 746   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.55585383  0.        ]]
episode: 747   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.49672531  0.        ]]
episode: 748   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.58592201  0.        ]]
episode: 749   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.56048187  0.        ]]
episode: 750   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41481121  0.        ]]
episode: 751   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.42060534  0.        ]]
episode: 752   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.53021642  0.   

episode: 818   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.51560602  0.        ]]
episode: 819   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.57593335  0.        ]]
episode: 820   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.40365317  0.        ]]
episode: 821   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.54338938  0.        ]]
episode: 822   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41803144  0.        ]]
episode: 823   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.55149838  0.        ]]
episode: 824   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.46470843  0.        ]]
episode: 825   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.42470374  0.        ]]
episode: 826   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.44112172  0.   

episode: 892   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.48119151  0.        ]]
episode: 893   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.5384065  0.       ]]
episode: 894   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.47946656  0.        ]]
episode: 895   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.54262519  0.        ]]
episode: 896   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41265329  0.        ]]
episode: 897   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.49631394  0.        ]]
episode: 898   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41167357  0.        ]]
episode: 899   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.48623443  0.        ]]
episode: 900   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.50731232  0.     

episode: 966   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.44457355  0.        ]]
episode: 967   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.49612933  0.        ]]
episode: 968   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.48249255  0.        ]]
episode: 969   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.44500968  0.        ]]
episode: 970   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.41477624  0.        ]]
episode: 971   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.52411019  0.        ]]
episode: 972   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.43053615  0.        ]]
episode: 973   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.40975111  0.        ]]
episode: 974   score: -200.0   memory length: 10000   epsilon: 0.004980100000801017
[[-0.55812117  0.   