In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Reinforcement Learning

## Session 09a

## Mountain Car with DQN

<img src='../../images/prasami_color_tutorials_small.png' style = 'width:400px;' alt="By Pramod Sharma : pramod.sharma@prasami.com" align="left"/>

In [2]:
# Import statements

import os
import random
import gc
import numpy as np
import matplotlib.pyplot as plt

import gym
import tensorflow as tf
from collections import deque

%matplotlib inline

2022-12-24 08:51:10.431441: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-24 08:51:11.224872: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/pks/RL/lib/python3.10/site-packages/cv2/../../lib64:
2022-12-24 08:51:11.224924: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/pks/RL/lib/python3.10/site-packages/cv2/../../lib64:


In [3]:
# Some basic parameters
inpDir = '../input'      # input data directory (not referenced in the cells shown here)
outDir = '../output'     # output directory (not referenced in the cells shown here)
modelDir = '../models'   # root directory under which trained networks are saved

# parameters for Matplotlib
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (12, 9),
          'axes.labelsize': 'x-large',
          'axes.titlesize':'x-large',
          'xtick.labelsize':'x-large',
          'ytick.labelsize':'x-large'
         }

plt.rcParams.update(params)

# Seed shared by every random number generator used in this notebook
RANDOM_STATE = 24

# Number of training episodes (assigned to MountainCarTrain.episodeNum)
STEPS = 400

# Upper bound on environment steps within one episode
# (assigned to MountainCarTrain.iterationNum)
NUM_ITER = 201

# Number of transitions sampled from the replay buffer per training step
BATCH = 32

# Small tolerance value (not referenced in the cells shown here)
SMALL_ENOUGH = 1e-3

# Discount factor applied to future Q-values
GAMMA = 0.99

# Learning rate of the optimizer
ALPHA = 0.001

# Starting value of the exploration rate epsilon
EPSILON = 1

# Amount subtracted from epsilon after every episode
EPSILON_DECAY = 0.05

# Lower bound for epsilon
EPSILON_MIN = 0.01

# Maximum number of transitions kept in the replay buffer
# (the spelling is kept because other cells reference this name)
MAX_LENGHT = 20000

# Gym environment id
ENV_NAME = 'MountainCar-v0'

# Seed every random source the training code draws from, so that a
# Restart & Run All is as repeatable as possible: `random.sample` picks the
# replay batches, `np.random` drives the epsilon-greedy policy and
# TensorFlow initializes the network weights.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

In [4]:
class ClearMemory(tf.keras.callbacks.Callback):
    """Keras callback that runs a clean-up at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Run the garbage collector, then clear the Keras backend session.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dictionary supplied by Keras (unused).
        """
        gc.collect()
        tf.keras.backend.clear_session()

In [5]:
class MountainCarTrain:
    """Deep Q-Network (DQN) trainer with a replay buffer and a target network.

    Two identical networks are kept: ``trainNetwork`` is fitted after every
    environment step, ``targetNetwork`` supplies the bootstrap Q-values and is
    synchronised with ``trainNetwork`` at the end of every episode.

    Attribute and method names (including the spellings ``learingRate`` and
    ``orginalTry``) are kept unchanged so existing callers keep working.
    """

    # Initialize Class Variables
    def __init__(self, env):
        """Store the hyper-parameters and build both networks.

        Args:
            env: Gym environment exposing ``observation_space``,
                ``action_space.n``, ``reset()`` and the five-value ``step()``.
        """
        self.env = env
        self.gamma = GAMMA

        # Exploration schedule: start at EPSILON, subtract epsilon_decay after
        # every episode, never go below epsilon_min.
        self.epsilon = EPSILON
        self.epsilon_decay = EPSILON_DECAY
        self.epsilon_min = EPSILON_MIN

        # Must be set before createNetwork() is called, which reads it.
        self.learingRate = ALPHA

        self.replayBuffer = deque(maxlen=MAX_LENGHT)

        self.episodeNum = STEPS            # number of episodes run by start()
        self.iterationNum = NUM_ITER       # maximum steps within one episode
        self.numPickFromBuffer = BATCH     # replay batch size

        self.trainNetwork = self.createNetwork()
        self.targetNetwork = self.createNetwork()
        self.targetNetwork.set_weights(self.trainNetwork.get_weights())

        # Template for the saved model path; '{}' receives the episode number.
        self.modelPath = os.path.join(modelDir, 'mountain_car_models', 'trainNetworkInEPS{}.h5')

    # Create a Sequential Model
    def createNetwork(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model mapping an observation to one
            Q-value per discrete action.
        """
        state_shape = self.env.observation_space.shape

        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(24,
                                        activation='relu',
                                        input_shape=state_shape))
        model.add(tf.keras.layers.Dense(48,
                                        activation='relu'))
        model.add(tf.keras.layers.Dense(self.env.action_space.n,
                                        activation='linear'))

        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and newer Keras optimizers ignore or reject it.
        model.compile(loss='mse',
                      optimizer=tf.keras.optimizers.Adam(learning_rate=self.learingRate))

        return model

    # Epsilon Greedy Policy
    def getBestAction(self, state):
        """Choose an action with the epsilon-greedy policy.

        Args:
            state: observation with a leading batch axis of length 1.

        Returns:
            Index of the chosen action.
        """
        self.epsilon = max(self.epsilon_min, self.epsilon)

        if np.random.rand() < self.epsilon:
            # Explore: uniformly random action
            return np.random.randint(0, self.env.action_space.n)

        # Exploit: action with the highest predicted Q-value
        return np.argmax(self.trainNetwork.predict(state, verbose=0)[0])

    # Training the model from the buffer we have saved
    def trainFromBuffer(self):
        """Fit ``trainNetwork`` on one random batch from the replay buffer.

        Does nothing until the buffer holds at least ``numPickFromBuffer``
        transitions.
        """
        if len(self.replayBuffer) < self.numPickFromBuffer:
            return

        samples = random.sample(self.replayBuffer, self.numPickFromBuffer)

        # Each transition is [state, action, reward, newState, terminal];
        # transpose the batch into one sequence per field.
        statesSeq, actionsSeq, rewardsSeq, newStatesSeq, terminalSeq = zip(*samples)

        # Every stored state carries a leading axis of length 1, so
        # concatenating along axis 0 yields one row per transition.
        states = np.concatenate(statesSeq, axis=0)
        newStates = np.concatenate(newStatesSeq, axis=0)
        actions = np.asarray(actionsSeq, dtype=int)
        rewards = np.asarray(rewardsSeq, dtype=float)
        notTerminal = 1.0 - np.asarray(terminalSeq, dtype=float)

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        targets = self.trainNetwork.predict(states, verbose=0)
        Q_futures = self.targetNetwork.predict(newStates, verbose=0).max(axis=1)

        # Bellman target: r for terminal transitions, r + gamma * max Q' otherwise.
        targets[np.arange(self.numPickFromBuffer), actions] = (
            rewards + self.gamma * Q_futures * notTerminal
        )

        self.trainNetwork.fit(states, targets, epochs=1, verbose=0,
                              callbacks=[ClearMemory()])

    def orginalTry(self, currentState, eps):
        """Run one training episode.

        Args:
            currentState: initial observation with a leading axis of length 1.
            eps: episode number, used in the log messages and in the file name
                of the saved model.

        Side effects:
            Appends transitions to the replay buffer, trains after every step,
            saves ``trainNetwork`` when the environment reports termination,
            synchronises the target network and decays epsilon.
        """
        rewardSum = 0
        max_position = -99
        stepsUsed = 0
        reachedGoal = False

        for stepsUsed in range(1, self.iterationNum + 1):

            bestAction = self.getBestAction(currentState)

            # Use the environment handed to the constructor, not a global.
            newState, reward, terminate, truncate, info = self.env.step(bestAction)

            newState = newState.reshape(1, -1)

            # Keep track of max position
            if newState[0][0] > max_position:
                max_position = newState[0][0]

            # Adjust reward for task completion
            if newState[0][0] >= 0.5:
                reward += 10

            # Only true termination is stored as terminal. A time-limit
            # truncation is not a terminal state of the task, so the learning
            # target must still bootstrap from the next state.
            self.replayBuffer.append([currentState, bestAction, reward, newState, bool(terminate)])

            self.trainFromBuffer()

            rewardSum += reward
            currentState = newState

            if terminate:
                reachedGoal = True

            if terminate or truncate:
                break

        # Success is what the environment reports, not a hard-coded step index.
        if reachedGoal:
            print("Success in epsoide {}, used {} iterations!".format(eps, stepsUsed))

            savePath = self.modelPath.format(eps)
            saveDir = os.path.dirname(savePath)
            if saveDir:
                # The target directory may not exist on a fresh checkout.
                os.makedirs(saveDir, exist_ok=True)
            self.trainNetwork.save(savePath)
        else:
            print("Failed to finish task in epsoide {}".format(eps))

        # Sync
        self.targetNetwork.set_weights(self.trainNetwork.get_weights())

        print("now epsilon is {}, the reward is {} maxPosition is {}".format(max(self.epsilon_min, self.epsilon), rewardSum,max_position))

        # Decay, but never below the floor, so epsilon cannot drift negative.
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)

    def start(self):
        """Run ``episodeNum`` training episodes, resetting the environment before each."""
        for eps in range(self.episodeNum):
            observation = self.env.reset()[0]
            self.orginalTry(observation.reshape(1, -1), eps)


In [6]:
# Create the Gym environment named by ENV_NAME; the training cell below
# hands it to MountainCarTrain.
env = gym.make(ENV_NAME)

In [None]:
# Build the DQN trainer around the environment and run all training episodes.
# Progress is printed once per episode.
dqn=MountainCarTrain(env=env)
dqn.start()

2022-12-24 08:51:12.088566: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-24 08:51:12.107747: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-24 08:51:12.107968: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-24 08:51:12.108915: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

Failed to finish task in epsoide 0
now epsilon is 1, the reward is -200.0 maxPosition is -0.42732512950897217
Failed to finish task in epsoide 1
now epsilon is 0.95, the reward is -200.0 maxPosition is -0.39987611770629883
Failed to finish task in epsoide 2
now epsilon is 0.8999999999999999, the reward is -200.0 maxPosition is -0.3053654432296753
Failed to finish task in epsoide 3
now epsilon is 0.8499999999999999, the reward is -200.0 maxPosition is -0.3684562146663666
Failed to finish task in epsoide 4
now epsilon is 0.7999999999999998, the reward is -200.0 maxPosition is -0.42298054695129395
Failed to finish task in epsoide 5
now epsilon is 0.7499999999999998, the reward is -200.0 maxPosition is -0.30854588747024536
Failed to finish task in epsoide 6
now epsilon is 0.6999999999999997, the reward is -200.0 maxPosition is -0.24878241121768951
Failed to finish task in epsoide 7
now epsilon is 0.6499999999999997, the reward is -200.0 maxPosition is -0.3384263515472412
Failed to finish t