# Continuous Action Mountain Car trained with Deep Deterministic Policy Gradients

## Step 0: Import necessary packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
import random

## Step 1: Activate the environment and examine state/action spaces

In [2]:
# Identifier of the Gym task used throughout the notebook.
ENV_ID = 'MountainCarContinuous-v0'
env = gym.make(ENV_ID)



In [3]:
# Show each space followed by its upper and lower bounds:
# the observation space first, then the action space.
for space in (env.observation_space, env.action_space):
    print(space)
    print(space.high)
    print(space.low)

Box(2,)
[0.6  0.07]
[-1.2  -0.07]
Box(1,)
[1.]
[-1.]


In [4]:
# Length of the observation vector and of the action vector.
state_size, action_size = env.observation_space.shape[0], env.action_space.shape[0]

# Upper and lower bounds of the action space, as reported by the environment.
action_high, action_low = env.action_space.high, env.action_space.low

Let's take some random actions in the environment and see what happens...

In [None]:
# Sanity check: run 20 short episodes (at most 100 steps each) with randomly
# sampled actions, rendering the scene and printing every observation.
for i_episode in range(20):
    observation = env.reset()
    for step in range(1, 101):
        env.render()
        print(observation)
        # Draw an action from the action space and advance the simulation.
        observation, reward, done, info = env.step(env.action_space.sample())
        if not done:
            continue
        print("Episode finished after {} timesteps".format(step))
        break
# Release the rendering resources once all rollouts are done.
env.close()

## Step 2: Create and train the agent

In [5]:
from agent import DDPG
from collections import deque

In [6]:
# Define all hyperparameters here.
# NOTE(review): the meaning of each value is determined by agent.DDPG, which is
# not visible in this notebook; the descriptions below are the conventional
# DDPG interpretations and should be confirmed against agent.py.
ACTOR_LR = 1e-4          # learning rate passed for the actor
CRITIC_LR = 1e-3         # learning rate passed for the critic
RANDOM_SEED = 42         # seed handed to the agent
MU = 0.0                 # presumably the mean of the exploration noise process
THETA = 0.15             # presumably the mean-reversion rate of the noise process
SIGMA = 0.2              # presumably the scale of the noise process
BUFFER_SIZE = int(1e5)   # a buffer capacity is a count, so keep it an int (1e5 alone is a float)
BATCH_SIZE = 128         # presumably the number of transitions per learning batch
GAMMA = 0.99             # presumably the discount factor
TAU = 1e-3               # presumably the soft-update coefficient for target networks
N_TIME_STEPS = 1         # presumably: learn every N environment steps
N_LEARN_UPDATES = 1      # presumably: number of updates per learning phase

# Use the first GPU when TensorFlow can see one, otherwise fall back to the CPU.
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices is
# the documented replacement (an empty list is falsy).
if tf.config.list_physical_devices('GPU'):
    DEVICE = "/GPU:0"
else:
    DEVICE = "/device:CPU:0"

In [None]:
# Build the DDPG agent. Arguments are positional, so their order must match
# the DDPG constructor exactly.
agent = DDPG(
    state_size,
    action_size,
    action_high,
    action_low,
    ACTOR_LR,
    CRITIC_LR,
    RANDOM_SEED,
    MU,
    THETA,
    SIGMA,
    BUFFER_SIZE,
    BATCH_SIZE,
    GAMMA,
    TAU,
    N_TIME_STEPS,
    N_LEARN_UPDATES,
    DEVICE,
)

In [None]:
def ddpg(n_episodes=1000, print_every=100, solved_score=90.0):
    """Train the module-level ``agent`` on the module-level ``env``.

    Each episode runs until the environment reports ``done``. After every
    episode the local actor and critic models are written to
    ``checkpoint_actor.h5`` and ``checkpoint_critic.h5``.

    Training stops early once the mean score over a *full* window of the last
    ``print_every`` episodes reaches ``solved_score``. Requiring a full window
    prevents a single lucky early episode from being reported as "solved".

    Args:
        n_episodes: Maximum number of training episodes.
        print_every: Size of the moving-average window; a permanent progress
            line is also printed every ``print_every`` episodes.
        solved_score: Moving-average score at which training stops.

    Returns:
        List with the total reward of every episode that was run.
    """
    scores_deque = deque(maxlen=print_every)
    scores = []

    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        agent.reset()
        score = 0
        t = 0

        while True:
            t += 1
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # The step counter is forwarded to the agent along with the transition.
            agent.step(t, state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_deque.append(score)
        scores.append(score)
        average_score = np.mean(scores_deque)

        # '\r' with end="" rewrites the same console line after each episode.
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score), end="")
        agent.actor_local.model.save('checkpoint_actor.h5')
        agent.critic_local.model.save('checkpoint_critic.h5')
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))

        # Only judge "solved" on a full averaging window. The checkpoints saved
        # just above already hold the final weights, so no second save is needed.
        if len(scores_deque) == print_every and average_score >= solved_score:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - print_every, average_score))
            break

    return scores

# Train with the default settings and keep the per-episode scores.
scores = ddpg()

# Plot every episode's score against its 1-based episode number.
fig = plt.figure()
ax = fig.add_subplot(111)
episode_numbers = np.arange(1, len(scores)+1)
ax.plot(episode_numbers, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

## Step 3: See the trained agent in action

In [9]:
from tensorflow.keras.models import load_model

# Load the saved actor network for inference; compile=False loads the model
# without compiling it.
trained_model = load_model(
    filepath='checkpoint_actor.h5',
    compile=False,
)

In [10]:
# Watch the loaded actor drive the car for 20 episodes of at most 999 steps.
for i_episode in range(20):
    state = env.reset()
    score = 0.0

    for t in range(999):
        env.render()
        # The model is called on a batch, so add a leading batch axis and
        # take the first (only) row of the output as the action.
        batched_state = np.expand_dims(state, axis=0)
        action = trained_model(batched_state).numpy()[0]
        state, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break

    # t is zero-based, hence t+1 for the number of steps actually taken.
    print("Episode {0} finished after {1} timesteps. Total score: {2}".format(i_episode+1, t+1, score))

env.close()

Episode 1 finished after 107 timesteps. Total score: 92.6259056646498
Episode 2 finished after 77 timesteps. Total score: 94.92700402670195
Episode 3 finished after 105 timesteps. Total score: 92.24396102552018
Episode 4 finished after 107 timesteps. Total score: 92.1137368917725
Episode 5 finished after 77 timesteps. Total score: 94.89446283916597
Episode 6 finished after 107 timesteps. Total score: 91.51780576851142
Episode 7 finished after 104 timesteps. Total score: 92.31759281236737
Episode 8 finished after 104 timesteps. Total score: 92.289101656774
Episode 9 finished after 103 timesteps. Total score: 92.28996766335256
Episode 10 finished after 103 timesteps. Total score: 92.16521049372446
Episode 11 finished after 103 timesteps. Total score: 92.31726429213056
Episode 12 finished after 103 timesteps. Total score: 92.13241779875503
Episode 13 finished after 104 timesteps. Total score: 92.29394132765881
Episode 14 finished after 103 timesteps. Total score: 92.16346638377718
Episode