# DeepQ Learning: Mountain Car example in Gym

In [1]:
import gym
import numpy as np
from rl.core import Processor
from keras.layers import Dense, Flatten
from keras.models import Sequential
from keras.optimizers import RMSprop

from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy,LinearAnnealedPolicy,BoltzmannQPolicy

  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  'hamming': pil_image.HAMMING,
  'box': pil_image.BOX,
  'lanczos': pil_image.LANCZOS,


# 1. Import the MountainCar environment

In [2]:
env = gym.make('MountainCar-v0')
print("Shape of state = ", env.observation_space.shape[0])
print("No of actions =", env.action_space.n)


Shape of state =  2
No of actions = 3


# 2.  Try some random moves to provide a baseline

In [3]:
episodes = 5
cumulative_reward = 0

for e in range(1,episodes+1):

    state = env.reset()
    done = False

    total_reward = 0
    
    while not done:
        #env.render()
        action = env.action_space.sample()
        print(">",action, end='')
        new_state, reward, done, info = env.step(action)
        total_reward += reward
    
    print(f'\n Episode {e} Total Reward = {total_reward}')
    cumulative_reward += total_reward
    
average_reward = cumulative_reward/episodes 
print('Average Reward =',average_reward )

> 0> 2> 1> 2> 2> 1> 0> 2> 2> 1> 0> 2> 1> 1> 0> 1> 2> 2> 1> 1> 0> 0> 0> 0> 2> 1> 0> 0> 1> 2> 0> 1> 1> 1> 2> 2> 2> 2> 2> 2> 1> 2> 0> 2> 0> 1> 2> 0> 2> 1> 0> 0> 0> 0> 1> 2> 0> 0> 2> 2> 0> 0> 0> 0> 2> 2> 0> 2> 1> 1> 1> 1> 2> 1> 0> 0> 2> 1> 2> 1> 1> 0> 0> 1> 2> 2> 2> 2> 2> 0> 0> 1> 2> 1> 2> 0> 1> 1> 0> 1> 1> 1> 0> 0> 2> 2> 1> 0> 2> 2> 2> 1> 2> 2> 0> 1> 1> 1> 0> 1> 0> 1> 0> 0> 0> 0> 2> 1> 0> 2> 0> 0> 2> 1> 0> 2> 2> 0> 2> 2> 2> 2> 0> 1> 1> 1> 0> 0> 2> 2> 2> 2> 2> 1> 0> 0> 0> 1> 1> 0> 2> 0> 1> 2> 0> 1> 2> 2> 2> 1> 0> 2> 0> 0> 2> 0> 1> 1> 0> 1> 0> 2> 1> 0> 2> 1> 2> 2> 1> 0> 0> 0> 0> 2> 1> 0> 1> 1> 1> 1
 Episode 1 Total Reward = -200.0
> 0> 2> 2> 2> 2> 2> 2> 2> 0> 0> 0> 1> 2> 0> 2> 1> 1> 2> 2> 1> 0> 2> 2> 1> 0> 0> 1> 0> 1> 1> 1> 2> 1> 1> 2> 0> 1> 0> 0> 2> 0> 0> 0> 1> 1> 0> 0> 0> 0> 0> 2> 2> 2> 0> 1> 0> 1> 1> 0> 0> 2> 0> 0> 2> 0> 0> 2> 1> 0> 0> 1> 1> 0> 2> 0> 1> 1> 2> 0> 1> 2> 1> 2> 1> 0> 1> 1> 2> 1> 1> 0> 1> 2> 1> 2> 2> 1> 0> 0> 1> 1> 0> 2> 0> 0> 2> 1> 1> 1> 2> 1> 1> 2> 1> 1> 1> 1> 0> 2> 1> 2> 1

# 3. Use the DQN model for the MountainCar example

In [4]:

def create_agent(states, actions):
    model = Sequential()

    model.add(Flatten(input_shape = (1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(48, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    
    return model

print(env.observation_space.shape[0])

# no_states = (env.observation_space.high - env.observation_space.low)*\
#                 np.array([10, 100])
# no_states = np.round(no_states, 0).astype(int) + 1
no_states = 18
print(no_states)
model = create_agent( no_states , env.action_space.n)
model.summary()


2
18
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 18)                0         
                                                                 
 dense (Dense)               (None, 24)                456       
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dense_2 (Dense)             (None, 48)                1200      
                                                                 
 dense_3 (Dense)             (None, 3)                 147       
                                                                 
Total params: 2,403
Trainable params: 2,403
Non-trainable params: 0
_________________________________________________________________


# 4. Defining the connection between  the environment and agent


In [5]:
# 1, change state to hot encoding encoding and other 
class MountainCarProcessor(Processor):
    def process_observation(self, observation):
        #print(observation)
        one_hot = np.zeros(18)

        # identify the index of observation state in x-axis:
        i = int(np.round(( observation[0] + 1.2 / 0.1 )))
        one_hot[i] = 1
        return one_hot
        
    def process_reward(self, reward):
        if (env.state[0] >= 0.5):
            new_reward = 2
        else: 
            new_reward = (env.state[0] + 1.2) / 1.8 - 1
        return new_reward

In [6]:
env.reset()

memory = SequentialMemory(limit= 100000, window_length= 1 )

processor = MountainCarProcessor()

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', 
                             value_max=1., value_min=.1, value_test=.05, nb_steps=10000)

dqn = DQNAgent(model=model, nb_actions= env.action_space.n, processor = processor ,
               memory= memory, nb_steps_warmup=100, gamma=0.99, policy=policy, 
               enable_double_dqn= True,target_model_update= 1e-3 )



# 5. Compile and train

In [7]:
dqn.compile(RMSprop(lr=1e-3), metrics=['mae'])
res_train = dqn.fit(env, nb_steps=100000, visualize=False, verbose=1 )

  super(RMSprop, self).__init__(name, **kwargs)
2022-05-30 12:57:16.642014: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-30 12:57:16.654502: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled


Training for 100000 steps ...
Interval 1 (0 steps performed)
   78/10000 [..............................] - ETA: 6s - reward: -0.6278   

  updates=self.state_updates,


50 episodes - episode_reward: -123.947 [-129.383, -118.710] - loss: 0.026 - mae: 2.079 - mean_q: -3.065 - mean_eps: 0.545

Interval 2 (10000 steps performed)
50 episodes - episode_reward: -124.515 [-131.287, -113.853] - loss: 0.166 - mae: 5.479 - mean_q: -8.110 - mean_eps: 0.100

Interval 3 (20000 steps performed)
50 episodes - episode_reward: -124.314 [-132.508, -111.625] - loss: 0.412 - mae: 8.536 - mean_q: -12.653 - mean_eps: 0.100

Interval 4 (30000 steps performed)
50 episodes - episode_reward: -119.040 [-127.398, -106.605] - loss: 0.723 - mae: 11.146 - mean_q: -16.535 - mean_eps: 0.100

Interval 5 (40000 steps performed)
50 episodes - episode_reward: -116.783 [-127.294, -109.972] - loss: 0.981 - mae: 13.284 - mean_q: -19.720 - mean_eps: 0.100

Interval 6 (50000 steps performed)
50 episodes - episode_reward: -116.371 [-126.160, -104.230] - loss: 1.247 - mae: 14.916 - mean_q: -22.153 - mean_eps: 0.100

Interval 7 (60000 steps performed)
50 episodes - episode_reward: -113.923 [-119.

In [8]:

print(np.average(res_train.history['episode_reward']))

-117.6267274810171


# 6. Test the DQN agent

In [9]:
env.reset()
res = dqn.test(env, nb_episodes= 10 , visualize=False)
print(np.average(res.history['episode_reward']))

Testing for 10 episodes ...
Episode 1: reward: -108.514, steps: 200
Episode 2: reward: -109.238, steps: 200
Episode 3: reward: -108.856, steps: 200
Episode 4: reward: -107.750, steps: 200
Episode 5: reward: -109.415, steps: 200
Episode 6: reward: -108.309, steps: 200
Episode 7: reward: -109.122, steps: 200
Episode 8: reward: -108.476, steps: 200
Episode 9: reward: -107.550, steps: 200
Episode 10: reward: -107.893, steps: 200
-108.51229085464233
