In [None]:
import gym
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque
from IPython import display
from IPython.display import clear_output

# Create the Cart-Pole game environment.
# `env` is the one simulator instance shared by the later cells: it is stepped
# while pre-filling the replay memory, during training, and in the final
# greedy evaluation run.
env = gym.make('CartPole-v0') 

In [None]:
# ---- Replay memory --------------------------------------------------------
memory_size = 10000            # capacity of the experience buffer
batch_size = 32                # transitions drawn for each mini-batch update
pretrain_length = batch_size   # random transitions stored before training begins

# ---- Q-network ------------------------------------------------------------
hidden_size = 16               # units in each hidden layer
learning_rate = 1e-3           # optimizer step size

# ---- Exploration (epsilon-greedy) -----------------------------------------
explore_start = 1.0            # probability of a random action at step 0
explore_stop = 0.01            # floor the exploration probability decays towards
decay_rate = 1e-4              # rate of the exponential decay, per environment step

# ---- Training schedule ----------------------------------------------------
train_episodes = 600           # max number of episodes to learn from
max_steps = 200                # cap on agent steps within one episode
gamma = 0.99                   # discount applied to future reward

In [1]:
class QNetwork:
    """Small fully connected Q-value approximator built with Keras.

    The wrapped ``model`` maps a state vector of length ``state_size`` to
    ``action_size`` Q-values through two ReLU hidden layers followed by a
    linear output layer. It is compiled with mean-squared-error loss and
    the Adam optimizer.

    Parameters
    ----------
    learning_rate : float
        Step size handed to the Adam optimizer.
    state_size : int
        Number of input features (length of one state vector).
    action_size : int
        Number of output units, one Q-value per action.
    hidden_size : int
        Number of units in each of the two hidden layers.

    Attributes
    ----------
    model : Sequential
        The compiled Keras model.
    optimizer : Adam
        The optimizer instance the model was compiled with.
    """

    def __init__(self, learning_rate=0.01, state_size=4,
                 action_size=2, hidden_size=10):
        # state inputs to the Q-network
        self.model = Sequential()

        self.model.add(Dense(hidden_size, activation='relu',
                             input_dim=state_size))
        self.model.add(Dense(hidden_size, activation='relu'))
        # Linear output: Q-values are unbounded regression targets
        self.model.add(Dense(action_size, activation='linear'))

        # Use the `learning_rate` keyword: `lr` is a deprecated alias that
        # newer Keras releases no longer accept.
        self.optimizer = Adam(learning_rate=learning_rate)
        self.model.compile(loss='mse', optimizer=self.optimizer)


class Memory():
    """Fixed-capacity experience store with uniform random sampling."""

    def __init__(self, max_size=1000):
        # A deque with maxlen discards its oldest entry once it is full
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return a list of `batch_size` stored experiences.

        Positions are drawn uniformly without replacement, so no stored
        entry appears twice in one sample.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)
        picked = []
        for pos in chosen:
            picked.append(self.buffer[pos])
        return picked


# Build the Q-network from the hyperparameters configured earlier
mainQN = QNetwork(hidden_size=hidden_size, learning_rate=learning_rate)

###################################
## Populate the experience memory
###################################

# Start the simulation, then nudge it with a single random action so the
# pole and cart are already moving when collection begins
env.reset()
state, reward, done, _ = env.step(env.action_space.sample())
state = np.reshape(state, [1, 4])

memory = Memory(max_size=memory_size)

# Fill the memory with `pretrain_length` transitions from a purely random
# policy; exactly one transition is stored per iteration
for ii in range(pretrain_length):
    # Uncomment the line below to watch the simulation
    # env.render()

    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)

    if not done:
        # Ordinary transition: store it and carry on from the new state
        next_state = np.reshape(next_state, [1, 4])
        memory.add((state, action, reward, next_state))
        state = next_state
        continue

    # The simulation failed, so there is no next state: an all-zero array
    # is stored in its place
    next_state = np.zeros(state.shape)
    memory.add((state, action, reward, next_state))

    # Begin a fresh episode, again with one random step to get things moving
    env.reset()
    state, reward, done, _ = env.step(env.action_space.sample())
    state = np.reshape(state, [1, 4])

#############
## Training
#############
# `step` counts environment steps across all episodes and drives the
# exploration decay below.
step = 0
# Episodes are numbered from 1; the upper bound is inclusive so that exactly
# `train_episodes` episodes are run.
for ep in range(1, train_episodes + 1):
    total_reward = 0
    t = 0
    while t < max_steps:
        step += 1
        # Uncomment this next line to watch the training
        # env.render()

        # Explore or Exploit: explore_p decays exponentially with the global
        # step count from explore_start towards explore_stop
        explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step)
        if explore_p > np.random.rand():
            # Make a random action
            action = env.action_space.sample()
        else:
            # Get action from Q-network
            Qs = mainQN.model.predict(state, verbose=0)[0]
            action = np.argmax(Qs)

        # Take action, get new state and reward
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, 4])
        total_reward += reward

        if done:
            # the episode ends so no next state; an all-zero array marks the
            # transition as terminal for the replay update below
            next_state = np.zeros(state.shape)
            # Force the while-loop to exit after this iteration's replay step
            t = max_steps

            print('Episode: {}'.format(ep),
                  'Total reward: {}'.format(total_reward),
                  'Explore P: {:.4f}'.format(explore_p))

            # Add experience to memory
            memory.add((state, action, reward, next_state))

            # Start new episode
            env.reset()
            # Take one random step to get the pole and cart moving
            state, reward, done, _ = env.step(env.action_space.sample())
            state = np.reshape(state, [1, 4])
        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state
            t += 1

        # Replay: one gradient step on a random mini-batch of stored
        # transitions. The network is queried once for all states and once
        # for all next states instead of several times per sample.
        minibatch = memory.sample(batch_size)
        inputs = np.vstack([exp[0] for exp in minibatch])        # one row per state
        actions = np.array([exp[1] for exp in minibatch])
        rewards = np.array([exp[2] for exp in minibatch], dtype=float)
        next_inputs = np.vstack([exp[3] for exp in minibatch])   # one row per next state

        # Terminal transitions were stored with an all-zero next state
        non_terminal = next_inputs.any(axis=1)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss (np.array copies, so
        # the result is safe to modify in place)
        targets = np.array(mainQN.model.predict(inputs, verbose=0), dtype=float)
        next_Qs = mainQN.model.predict(next_inputs, verbose=0)

        # Target is r + gamma * max_a' Q(s', a') for non-terminal
        # transitions, and just r for terminal ones
        bootstrapped = rewards + gamma * np.amax(next_Qs, axis=1)
        targets[np.arange(len(minibatch)), actions] = np.where(non_terminal,
                                                               bootstrapped,
                                                               rewards)
        mainQN.model.fit(inputs, targets, epochs=1, verbose=0)

  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode: 1 Total reward: 6.0 Explore P: 0.9994
Episode: 2 Total reward: 13.0 Explore P: 0.9981
Episode: 3 Total reward: 18.0 Explore P: 0.9963
Episode: 4 Total reward: 29.0 Explore P: 0.9935
Episode: 5 Total reward: 17.0 Explore P: 0.9918
Episode: 6 Total reward: 10.0 Explore P: 0.9908
Episode: 7 Total reward: 25.0 Explore P: 0.9884
Episode: 8 Total reward: 17.0 Explore P: 0.9867
Episode: 9 Total reward: 16.0 Explore P: 0.9852
Episode: 10 Total reward: 18.0 Explore P: 0.9834
Episode: 11 Total reward: 10.0 Explore P: 0.9824
Episode: 12 Total reward: 15.0 Explore P: 0.9810
Episode: 13 Total reward: 23.0 Explore P: 0.9787
Episode: 14 Total reward: 65.0 Explore P: 0.9725
Episode: 15 Total reward: 11.0 Explore P: 0.9714
Episode: 16 Total reward: 14.0 Explore P: 0.9701
Episode: 17 Total reward: 14.0 Explore P: 0.9687
Episode: 18 Total reward: 13.0 Explore P: 0.9675
Episode: 19 Total re

Episode: 165 Total reward: 34.0 Explore P: 0.5817
Episode: 166 Total reward: 44.0 Explore P: 0.5792
Episode: 167 Total reward: 134.0 Explore P: 0.5716
Episode: 168 Total reward: 104.0 Explore P: 0.5658
Episode: 169 Total reward: 50.0 Explore P: 0.5630
Episode: 170 Total reward: 115.0 Explore P: 0.5567
Episode: 171 Total reward: 165.0 Explore P: 0.5478
Episode: 172 Total reward: 18.0 Explore P: 0.5468
Episode: 173 Total reward: 114.0 Explore P: 0.5407
Episode: 174 Total reward: 172.0 Explore P: 0.5317
Episode: 175 Total reward: 27.0 Explore P: 0.5302
Episode: 176 Total reward: 151.0 Explore P: 0.5225
Episode: 177 Total reward: 85.0 Explore P: 0.5181
Episode: 178 Total reward: 199.0 Explore P: 0.5081
Episode: 179 Total reward: 97.0 Explore P: 0.5033
Episode: 180 Total reward: 78.0 Explore P: 0.4995
Episode: 181 Total reward: 26.0 Explore P: 0.4982
Episode: 182 Total reward: 144.0 Explore P: 0.4912
Episode: 183 Total reward: 92.0 Explore P: 0.4868
Episode: 184 Total reward: 108.0 Explore 

Episode: 327 Total reward: 199.0 Explore P: 0.0547
Episode: 328 Total reward: 199.0 Explore P: 0.0538
Episode: 329 Total reward: 199.0 Explore P: 0.0529
Episode: 330 Total reward: 154.0 Explore P: 0.0523
Episode: 331 Total reward: 120.0 Explore P: 0.0518
Episode: 332 Total reward: 19.0 Explore P: 0.0517
Episode: 333 Total reward: 13.0 Explore P: 0.0516
Episode: 334 Total reward: 11.0 Explore P: 0.0516
Episode: 335 Total reward: 12.0 Explore P: 0.0515
Episode: 336 Total reward: 8.0 Explore P: 0.0515
Episode: 337 Total reward: 10.0 Explore P: 0.0515
Episode: 338 Total reward: 10.0 Explore P: 0.0514
Episode: 339 Total reward: 16.0 Explore P: 0.0514
Episode: 340 Total reward: 113.0 Explore P: 0.0509
Episode: 341 Total reward: 138.0 Explore P: 0.0503
Episode: 342 Total reward: 199.0 Explore P: 0.0495
Episode: 343 Total reward: 199.0 Explore P: 0.0488
Episode: 344 Total reward: 199.0 Explore P: 0.0480
Episode: 345 Total reward: 199.0 Explore P: 0.0472
Episode: 346 Total reward: 199.0 Explore

Episode: 489 Total reward: 199.0 Explore P: 0.0127
Episode: 490 Total reward: 199.0 Explore P: 0.0126
Episode: 491 Total reward: 199.0 Explore P: 0.0125
Episode: 492 Total reward: 199.0 Explore P: 0.0125
Episode: 493 Total reward: 199.0 Explore P: 0.0124
Episode: 494 Total reward: 199.0 Explore P: 0.0124
Episode: 495 Total reward: 144.0 Explore P: 0.0124
Episode: 496 Total reward: 9.0 Explore P: 0.0124
Episode: 497 Total reward: 12.0 Explore P: 0.0124
Episode: 498 Total reward: 10.0 Explore P: 0.0124
Episode: 499 Total reward: 12.0 Explore P: 0.0124
Episode: 500 Total reward: 11.0 Explore P: 0.0124
Episode: 501 Total reward: 10.0 Explore P: 0.0124
Episode: 502 Total reward: 8.0 Explore P: 0.0123
Episode: 503 Total reward: 7.0 Explore P: 0.0123
Episode: 504 Total reward: 11.0 Explore P: 0.0123
Episode: 505 Total reward: 10.0 Explore P: 0.0123
Episode: 506 Total reward: 7.0 Explore P: 0.0123
Episode: 507 Total reward: 43.0 Explore P: 0.0123
Episode: 508 Total reward: 51.0 Explore P: 0.01

In [11]:
# Play a single episode, always taking the action with the highest
# predicted Q-value (no exploration).
state = env.reset()

try:
    while True:
        # The network expects a batch dimension in front of the state vector
        state = np.expand_dims(state, axis=0)
        Qs = mainQN.model.predict(state, verbose=0)[0]
        # Act greedily
        action = np.argmax(Qs)

        # Render the visual state in a window
        # (comment this line out to run without a display)
        env.render()

        # Step through environment using chosen action
        state, reward, done, _ = env.step(action)
        if done:
            break
finally:
    # Release the render window even if the episode is interrupted
    env.close()