https://aleksandarhaber.com/deep-q-networks-dqn-in-python-from-scratch-by-using-openai-gym-and-tensorflow-reinforcement-learning-tutorial/

In [None]:
%pip install gymnasium
%pip install gym==0.26.2 
%pip install tensorflow
%pip install moviepy
%pip install pygame --pre
%pip install ffmpeg --upgrade

In [1]:
import gym
import tensorflow
print(gym.__version__)
print(tensorflow.__version__)

0.26.2
2.12.0


In [2]:
# import the necessary libraries
import numpy as np
import random
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from collections import deque 
from tensorflow import gather_nd
from tensorflow.keras.losses import mean_squared_error 
from tensorflow import keras

In [3]:
class DeepQLearning:
     
    ###########################################################################
    #   START - __init__ function
    ###########################################################################
    # INPUTS: 
    # env - Cart Pole environment
    # gamma - discount rate
    # epsilon - parameter for epsilon-greedy approach
    # numberEpisodes - total number of simulation episodes
     
             
    def __init__(self,env,gamma,epsilon,numberEpisodes):  
        self.env=env
        self.gamma=gamma
        self.epsilon=epsilon
        self.numberEpisodes=numberEpisodes
         
        # state dimension
        self.stateDimension=4
        # action dimension
        self.actionDimension=2
        # this is the maximum size of the replay buffer
        self.replayBufferSize=300
        # this is the size of the training batch that is randomly sampled from the replay buffer
        self.batchReplayBufferSize=100
         
        # number of training episodes it takes to update the target network parameters
        # that is, every updateTargetNetworkPeriod we update the target network parameters
        self.updateTargetNetworkPeriod=100
         
        # this is the counter for updating the target network 
        # if this counter exceeds (updateTargetNetworkPeriod-1) we update the network 
        # parameters and reset the counter to zero, this process is repeated until the end of the training process
        self.counterUpdateTargetNetwork=0
         
        # this sum is used to store the sum of rewards obtained during each training episode
        self.sumRewardsEpisode=[]
         
        # replay buffer
        self.replayBuffer=deque(maxlen=self.replayBufferSize)
         
        # this is the main network
        # create network
        self.mainNetwork=self.createNetwork()
         
        # this is the target network
        # create network
        self.targetNetwork=self.createNetwork()
         
        # copy the initial weights to targetNetwork
        self.targetNetwork.set_weights(self.mainNetwork.get_weights())
         
        # this list is used in the cost function to select certain entries of the 
        # predicted and true sample matrices in order to form the loss
        self.actionsAppend=[]

    # create a neural network
    def createNetwork(self):
        model=Sequential()
        model.add(Dense(128,input_dim=self.stateDimension,activation='relu'))
        model.add(Dense(56,activation='relu'))
        model.add(Dense(self.actionDimension,activation='linear'))
        # compile the network with the custom loss defined in my_loss_fn
        model.compile(optimizer = RMSprop(), loss = self.my_loss_fn, metrics = ['accuracy'])
        return model
    
    def my_loss_fn(self,y_true, y_pred):
        
       s1,s2=y_true.shape
       #print(s1,s2)
        
       # this matrix defines indices of a set of entries that we want to 
       # extract from y_true and y_pred
       # s2=2
       # s1=self.batchReplayBufferSize
       indices=np.zeros(shape=(s1,s2))
       indices[:,0]=np.arange(s1)
       indices[:,1]=self.actionsAppend
        
       # gather_nd and mean_squared_error are TensorFlow functions
       loss = mean_squared_error(gather_nd(y_true,indices=indices.astype(int)), gather_nd(y_pred,indices=indices.astype(int)))
       #print(loss)
       return loss
    
    def trainingEpisodes(self):
    
         
        # here we loop through the episodes
        for indexEpisode in range(self.numberEpisodes):
             
            # list that stores rewards per episode - this is necessary for keeping track of convergence 
            rewardsEpisode=[]
                        
            print("Simulating episode {}".format(indexEpisode))
             
            # reset the environment at the beginning of every episode
            (currentState,_)=self.env.reset()
                       
            # here we step from one state to another
            # this will loop until a terminal state is reached
            terminalState=False
            while not terminalState:
                                       
                # select an action on the basis of the current state, denoted by currentState
                action = self.selectAction(currentState,indexEpisode)
                 
                # here we step and return the state, reward, and boolean denoting if the state is a terminal state
                (nextState, reward, terminalState,_,_) = self.env.step(action)          
                rewardsEpisode.append(reward)
          
                # add current state, action, reward, next state, and terminal flag to the replay buffer
                self.replayBuffer.append((currentState,action,reward,nextState,terminalState))
                 
                # train network
                self.trainNetwork()
                 
                # set the current state for the next step
                currentState=nextState
             
            print("Sum of rewards {}".format(np.sum(rewardsEpisode)))        
            self.sumRewardsEpisode.append(np.sum(rewardsEpisode))

    def selectAction(self,state,index):
       import numpy as np
        
       # first index episodes we select completely random actions to have enough exploration
       # change this
       if index<1:
           return np.random.choice(self.actionDimension)   
            
       # Returns a random real number in the half-open interval [0.0, 1.0)
       # this number is used for the epsilon greedy approach
       randomNumber=np.random.random()
        
       # after index episodes, we slowly start to decrease the epsilon parameter
       if index>200:
           self.epsilon=0.999*self.epsilon
        
       # if this condition is satisfied, we are exploring, that is, we select random actions
       if randomNumber < self.epsilon:
           # returns a random action selected from: 0,1,...,actionNumber-1
           return np.random.choice(self.actionDimension)            
        
       # otherwise, we are selecting greedy actions
       else:
           # we return the index where Qvalues[state,:] has the max value
           # that is, since the index denotes an action, we select greedy actions
                       
           Qvalues=self.mainNetwork.predict(state.reshape(1,4), verbose=0)
          
           return np.random.choice(np.where(Qvalues[0,:]==np.max(Qvalues[0,:]))[0])
           # here we need to return the minimum index since it can happen
           # that there are several identical maximal entries, for example 
           # import numpy as np
           # a=[0,1,1,0]
           # np.where(a==np.max(a))
           # this will return [1,2], but we only need a single index
           # that is why we need to have np.random.choice(np.where(a==np.max(a))[0])
           # note that zero has to be added here since np.where() returns a tuple

    def trainNetwork(self):
 
        # if the replay buffer has at least batchReplayBufferSize elements,
        # then train the model 
        # otherwise wait until the size of the elements exceeds batchReplayBufferSize
        if (len(self.replayBuffer)>self.batchReplayBufferSize):
             
 
            # sample a batch from the replay buffer
            randomSampleBatch=random.sample(self.replayBuffer, self.batchReplayBufferSize)
             
            # here we form current state batch 
            # and next state batch
            # they are used as inputs for prediction
            currentStateBatch=np.zeros(shape=(self.batchReplayBufferSize,4))
            nextStateBatch=np.zeros(shape=(self.batchReplayBufferSize,4))            
            # this will enumerate the tuple entries of the randomSampleBatch
            # index will loop through the number of tuples
            for index,tupleS in enumerate(randomSampleBatch):
                # first entry of the tuple is the current state
                currentStateBatch[index,:]=tupleS[0]
                # fourth entry of the tuple is the next state
                nextStateBatch[index,:]=tupleS[3]
             
            # here, use the target network to predict Q-values 
            QnextStateTargetNetwork=self.targetNetwork.predict(nextStateBatch, verbose=0)
            # here, use the main network to predict Q-values 
            QcurrentStateMainNetwork=self.mainNetwork.predict(currentStateBatch, verbose=0)
             
            # now, we form batches for training
            # input for training
            inputNetwork=currentStateBatch
            # output for training
            outputNetwork=np.zeros(shape=(self.batchReplayBufferSize,2))
             
            # this list will contain the actions that are selected from the batch 
            # this list is used in my_loss_fn to define the loss-function
            self.actionsAppend=[]            
            for index,(currentState,action,reward,nextState,terminated) in enumerate(randomSampleBatch):
                 
                # if the next state is the terminal state
                if terminated:
                    y=reward                  
                # if the next state if not the terminal state    
                else:
                    y=reward+self.gamma*np.max(QnextStateTargetNetwork[index])
                 
                # this is necessary for defining the cost function
                self.actionsAppend.append(action)
                 
                # this actually does not matter since we do not use all the entries in the cost function
                outputNetwork[index]=QcurrentStateMainNetwork[index]
                # this is what matters
                outputNetwork[index,action]=y
             
            # here, we train the network
            self.mainNetwork.fit(inputNetwork,outputNetwork,batch_size = self.batchReplayBufferSize, verbose=0, epochs=100)     
             
            # after updateTargetNetworkPeriod training sessions, update the coefficients 
            # of the target network
            # increase the counter for training the target network
            self.counterUpdateTargetNetwork+=1 
            if (self.counterUpdateTargetNetwork>(self.updateTargetNetworkPeriod-1)):
                # copy the weights to targetNetwork
                self.targetNetwork.set_weights(self.mainNetwork.get_weights())        
                print("Target network updated!")
                print("Counter value {}".format(self.counterUpdateTargetNetwork))
                # reset the counter
                self.counterUpdateTargetNetwork=0

In [None]:
#------------------------------------------------------------------------------------------------------
# training (can skip)
#------------------------------------------------------------------------------------------------------

env=gym.make('CartPole-v1')
# select the parameters
gamma=1
# probability parameter for the epsilon-greedy approach
epsilon=0.1
numberEpisodes=1000

# create an object
LearningQDeep=DeepQLearning(env,gamma,epsilon,numberEpisodes)
# run the learning process
LearningQDeep.trainingEpisodes()
# get the obtained rewards in every episode
LearningQDeep.sumRewardsEpisode

#  summarize the model
LearningQDeep.mainNetwork.summary()
# save the model, this is important, since it takes long time to train the model 
# and we will need model in another file to visualize the trained model performance
LearningQDeep.mainNetwork.save("trained_model_temp.h5")

In [5]:
from time import sleep
import gymnasium

loaded_model = keras.models.load_model("trained_model.h5",custom_objects={'my_loss_fn':DeepQLearning.my_loss_fn})

sumObtainedRewards=0
# simulate the learned policy for verification

# create the environment, here you need to keep render_mode='rgb_array' since otherwise it will not generate the movie
env = gym.make("CartPole-v1", render_mode='rgb_array')
# reset the environment
(currentState, prob)=env.reset()

# Wrapper for recording the video
# https://gymnasium.farama.org/api/wrappers/misc_wrappers/#gymnasium.wrappers.RenderCollection
# the name of the folder in which the video is stored is "stored_video"
# length of the video in the number of simulation steps
# if we do not specify the length, the video will be recorded until the end of the episode 
# that is, when terminalState becomes TRUE
# just make sure that this parameter is smaller than the expected number of 
# time steps within an episode
# for some reason this parameter does not produce the expected results, for smaller than 450 it gives OK results
video_length=400
# the step_trigger parameter is set to 1 in order to ensure that we record the video every step
#env = gym.wrappers.RecordVideo(env, 'stored_video',step_trigger = lambda x: x == 1, video_length=video_length)
env = gymnasium.wrappers.RecordVideo(env, 'stored_video', video_length=video_length)


# since the initial state is not a terminal state, set this flag to false
terminalState=False
while not terminalState:
    # get the Q-value (1 by 2 vector)
    Qvalues=loaded_model.predict(currentState.reshape(1,4), verbose=0)
    # select the action that gives the max Qvalue
    action=np.random.choice(np.where(Qvalues[0,:]==np.max(Qvalues[0,:]))[0])
    # if you want random actions for comparison
    #action = env.action_space.sample()
    # apply the action
    (currentState, currentReward, terminalState,_,_) = env.step(action)
    # sum the rewards
    sumObtainedRewards+=currentReward
    env.render()

env.reset()
env.close()

  logger.warn("Unable to save last video! Did you call close()?")
  logger.warn(


Moviepy - Building video d:\RL\gym_version_new\stored_video\rl-video-episode-0.mp4.
Moviepy - Writing video d:\RL\gym_version_new\stored_video\rl-video-episode-0.mp4



                                                               

Moviepy - Done !
Moviepy - video ready d:\RL\gym_version_new\stored_video\rl-video-episode-0.mp4


KeyboardInterrupt: 

In [7]:
env = gym.make("CartPole-v1", render_mode='human')
env.reset()
terminalState=False
while not terminalState:
    # get the Q-value (1 by 2 vector)
    Qvalues=loaded_model.predict(currentState.reshape(1,4), verbose=0)
    # select the action that gives the max Qvalue
    action=np.random.choice(np.where(Qvalues[0,:]==np.max(Qvalues[0,:]))[0])
    # if you want random actions for comparison
    #action = env.action_space.sample()
    # apply the action
    (currentState, currentReward, terminalState,_,_) = env.step(action)
    # sum the rewards
    sumObtainedRewards+=currentReward
    env.render()


env.close()

KeyboardInterrupt: 

In [None]:
env.close()