In [None]:
try:
    from google.colab import drive
    %tensorflow_version 2.x
    global COLAB
    COLAB = True
    print("Note: using Google CoLab")
except:
    print("Note: not using Google CoLab")
    COLAB = False

if COLAB:
  !sudo apt-get install -y xvfb ffmpeg
  !pip install -q 'gym==0.10.11'
  !pip install -q 'imageio==2.4.0'
  !pip install -q PILLOW
  !pip install -q 'pyglet==1.3.2'
  !pip install -q pyvirtualdisplay
  !pip install -q --upgrade tensorflow-probability
  !pip install -q tf-agents

  # ROM dependencies for atari games
  ! wget http://www.atarimania.com/roms/Roms.rar
  ! mkdir /content/ROM/
  ! unrar e /content/Roms.rar /content/ROM/
  ! python -m atari_py.import_roms /content/ROM/

  # For visualisation on colab
  !pip install gym pyvirtualdisplay > /dev/null 2>&1
  !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

  # gym installation
  !apt-get update > /dev/null 2>&1
  !apt-get install cmake > /dev/null 2>&1
  !pip install --upgrade setuptools 2>&1
  !pip install ez_setup > /dev/null 2>&1
  !pip install gym[atari] > /dev/null 2>&1

  # Required to save models in HDF5 format
  !pip install pyyaml h5py
  
  # For use of GPU
  %tensorflow_version 2.x
  import tensorflow as tf
  device_name = tf.test.gpu_device_name()
  if device_name != '/device:GPU:0':
    print('GPU device not found!!! Using CPU')
    # raise SystemError('GPU device not found')
    global CPU
    CPU = True
  print('Found GPU at: {}'.format(device_name))

In [6]:
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from google.colab import files,drive

# Start an invisible 1400x900 virtual display via pyvirtualdisplay.
# NOTE(review): presumably required so gym can render frames for the video
# recording helpers below on a machine without a screen - confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  """Embed a recorded episode video in the notebook output.

  Searches the ``video/`` directory for ``*.mp4`` files and displays the first
  match returned by ``glob`` as a base64 data URI inside an HTML ``<video>``
  element. Prints "Could not find video" when no recording exists.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode inside a context manager so the handle is closed;
    # the raw bytes are not printed because that floods the cell output.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Return `env` wrapped in a gym ``Monitor`` that writes to ``./video``.

  ``force=True`` is passed to the Monitor, as in every call site of this
  notebook.
  """
  return Monitor(env, './video', force=True)

In [None]:
# env = wrap_env(gym.make("MountainCar-v0"))
with tf.device('/device:GPU:0'):   

  for i in range(3):
    env = wrap_env(gym.make("MsPacman-v0")) 
    observation = env.reset()
    while True:
          
          # env.render()
          #your agent goes here
          action = env.action_space.sample() 
          if(i==1): action = env.action_space.sample()

          observation, reward, done, info = env.step(action)    
              
          if done:
              env.close()
              break;

show_video()

%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [8]:
import tensorflow as tf
import numpy as np
import gym
import scipy.misc
import os
import random
import time
import pickle
import matplotlib.pyplot as plt
from collections import deque
from skimage import transform
from google.colab import files,drive
from tensorflow.keras import layers
from tensorflow.math import add,reduce_mean
from tensorflow.keras.layers import Input, Add, Dense, NoisyDense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, \
                                    AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.losses import MeanSquaredError 
from tensorflow.keras.metrics import MeanSquaredError
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.initializers import random_uniform, glorot_uniform, constant, identity


In [2]:
# Read the size of the action space from a throw-away environment.
# The Monitor wrapper is not needed just to inspect the action space, so the
# plain environment is used and ./video is left untouched.
env = gym.make('MsPacman-v0')
try:
    classes = env.action_space.n
finally:
    env.close()

# Shape of one network input: a leading singleton axis followed by four
# stacked 84x84 grayscale frames.
input_shape = (1,84,84,4)
print(input_shape, classes)

In [None]:
# For saving results in drive
# Mounts Google Drive at /content/drive; the save paths used later in this
# notebook live under /content/drive/MyDrive/pacman_SOC_outputs.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def identity_block(X, f, filters, training=True, initializer=random_uniform):
    """Residual block whose shortcut is the unmodified input.

    Arguments:
    X -- input tensor (channels on axis 3, as used by the BatchNormalization layers)
    f -- kernel size of the middle convolution
    filters -- three filter counts, one per convolution of the main path
    training -- forwarded to every BatchNormalization call
    initializer -- initializer factory, called as initializer(seed=0) for each convolution

    Returns:
    X -- relu(shortcut + main path)
    """
    n_first, n_middle, n_last = filters

    # The input is kept aside and added back after the main path.
    shortcut = X

    # (filters, kernel size, padding, followed by relu?) for each conv of the main path
    main_path = (
        (n_first, 1, 'valid', True),
        (n_middle, f, 'same', True),
        (n_last, 1, 'valid', False),
    )

    main = X
    for n_filters, kernel, pad, apply_relu in main_path:
        main = Conv2D(filters=n_filters, kernel_size=kernel, strides=(1, 1), padding=pad,
                      kernel_initializer=initializer(seed=0))(main)
        main = BatchNormalization(axis=3)(main, training=training)
        if apply_relu:
            main = Activation('relu')(main)

    merged = Add()([shortcut, main])
    merged = Activation('relu')(merged)
    return merged

In [None]:
def convolutional_block(X, f, filters, s = 2, training=True, initializer=glorot_uniform):
    """Residual block whose shortcut is a strided 1x1 convolution.

    Arguments:
    X -- input tensor (channels on axis 3, as used by the BatchNormalization layers)
    f -- kernel size of the middle convolution
    filters -- three filter counts, one per convolution of the main path
    s -- stride of the first main-path convolution and of the shortcut convolution
    training -- forwarded to every BatchNormalization call
    initializer -- initializer factory, called as initializer(seed=0) for each convolution

    Returns:
    X -- relu(main path + projected shortcut)
    """
    n_first, n_middle, n_last = filters

    shortcut = X
    main = X

    # (filters, kernel size, strides, padding, followed by relu?) for the main path
    main_path = (
        (n_first, 1, (s, s), 'valid', True),
        (n_middle, f, (1, 1), 'same', True),
        (n_last, 1, (1, 1), 'valid', False),
    )
    for n_filters, kernel, stride, pad, apply_relu in main_path:
        main = Conv2D(filters=n_filters, kernel_size=kernel, strides=stride, padding=pad,
                      kernel_initializer=initializer(seed=0))(main)
        main = BatchNormalization(axis=3)(main, training=training)
        if apply_relu:
            main = Activation('relu')(main)

    # Shortcut path: project the input to the output's filter count and stride.
    shortcut = Conv2D(filters=n_last, kernel_size=1, strides=(s, s), padding='valid',
                      kernel_initializer=initializer(seed=0))(shortcut)
    shortcut = BatchNormalization(axis=3)(shortcut, training=training)

    merged = Add()([main, shortcut])
    merged = Activation('relu')(merged)
    return merged

In [None]:
def ResNet50(input_shape, classes): # input and classes depends on gym environment
    """
    Stage-wise ResNet50:
    CONV2D -> BATCHNORM -> RELU -> MAXPOOL -> CONVBLOCK -> IDBLOCK*2 -> CONVBLOCK -> IDBLOCK*3
    -> CONVBLOCK -> IDBLOCK*5 -> CONVBLOCK -> IDBLOCK*2 -> AVGPOOL -> FLATTEN -> DENSE

    Arguments:
    input_shape -- shape passed to the Keras Input layer
    classes -- integer, number of units of the softmax output layer

    Returns:
    model -- a Model() instance in Keras
    """
    X_input = Input(input_shape)

    # Zero-padding followed by stage 1 (7x7 stride-2 conv, batch norm, relu, max pool)
    X = ZeroPadding2D((3, 3))(X_input)
    X = Conv2D(64, (7, 7), strides=(2, 2), kernel_initializer=glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis=3)(X)
    X = Activation('relu')(X)
    X = MaxPooling2D((3, 3), strides=(2, 2))(X)

    # Stages 2-5: (filters, stride of the convolutional block, number of identity blocks)
    stage_specs = (
        ([64, 64, 256], 1, 2),
        ([128, 128, 512], 2, 3),
        ([256, 256, 1024], 2, 5),
        ([512, 512, 2048], 2, 2),
    )
    for stage_filters, stride, n_identity in stage_specs:
        X = convolutional_block(X, f=3, filters=stage_filters, s=stride)
        for _ in range(n_identity):
            X = identity_block(X, 3, stage_filters)

    # Head: average pool, flatten, softmax layer
    X = AveragePooling2D(pool_size=(2, 2))(X)
    X = Flatten()(X)
    X = Dense(classes, activation='softmax', kernel_initializer=glorot_uniform(seed=0))(X)

    return Model(inputs=X_input, outputs=X)

In [None]:
def Deepminds_model(input_shape, classes):
    """Three-convolution network with a 512-unit hidden layer and a linear output.

    Arguments:
    input_shape -- shape passed to the Keras Input layer
    classes -- number of units of the final (linear) Dense layer

    Returns:
    model -- a Keras Model mapping the input to `classes` outputs
    """
    X_input = Input(input_shape)

    # (filters, kernel size, strides) of the three relu-activated convolutions
    conv_specs = (
        (32, (8, 8), (4, 4)),
        (64, (4, 4), (2, 2)),
        (64, (3, 3), (1, 1)),
    )
    features = X_input
    for n_filters, kernel, stride in conv_specs:
        features = Conv2D(n_filters, kernel, strides=stride,
                          kernel_initializer=glorot_uniform(seed=0))(features)
        features = Activation('relu')(features)

    features = Flatten()(features)
    hidden = Dense(512, activation='relu', kernel_initializer=glorot_uniform(seed=0))(features)
    outputs = Dense(classes, activation=None, kernel_initializer=glorot_uniform(seed=0))(hidden)

    return Model(inputs=X_input, outputs=outputs)
    
    

In [None]:
def ConvDuelingDQN(input_shape, classes):

    """
    Dueling DQN: the network is split into two separate streams, one estimating the
    state value and the other estimating the state-dependent action advantages.

    The last module of the network implements the aggregation

    Q(s,a;theta,alpha,beta) = V(s;theta,beta) + ( A(s,a;theta,alpha) - (1/|A|) * sum_a' A(s,a';theta,alpha) )

    i.e. the advantages are centred with their mean over the actions of the *same*
    state before being added to the value.

    # Here max_a A can also be used instead of mean A

    Arguments:
    input_shape -- shape passed to the Keras Input layer
    classes -- number of actions (units of the advantage stream)

    Returns:
    model -- a Keras Model producing one Q-value per action
    """

    X_input = Input(input_shape)
    
    X = Conv2D(32, (8, 8), strides = (4, 4), kernel_initializer = glorot_uniform(seed=0))(X_input)
    X = Activation('relu')(X)
    X = Conv2D(64, (4, 4), strides = (2, 2), kernel_initializer = glorot_uniform(seed=0))(X)
    X = Activation('relu')(X)
    X = Conv2D(64, (3, 3), strides = (1, 1), kernel_initializer = glorot_uniform(seed=0))(X)
    X = Activation('relu')(X)
    X = Flatten()(X)

    # Value stream
    V = Dense(128,activation='relu', kernel_initializer = glorot_uniform(seed=0))(X)
    V = Dense(1,activation=None, kernel_initializer = glorot_uniform(seed=0))(V)

    # Action stream
    A = Dense(128,activation='relu',kernel_initializer = glorot_uniform(seed=0))(X)
    A = Dense(classes, activation=None, kernel_initializer= glorot_uniform(seed=0))(A)

    # Mean over the action axis only (keepdims so it broadcasts against A).
    # A bare reduce_mean(A) would average across the whole batch, and dividing the
    # mean by `classes` again would shrink it a second time.
    A_mean = reduce_mean(A, axis=-1, keepdims=True)
    Q = add(V, add(A, -A_mean))

    model = Model(inputs = X_input, outputs = Q)
    
    return model


In [None]:
def NoisyNet_Dueling(input_shape, classes):
    """
    Dueling network whose final value and advantage layers are noisy layers.

    Reference algorithm (NoisyNet-DQN / NoisyNet-Dueling):

    Input : Env environment; ε set of random variables of the network
    Input : DUELING boolean; "true" for NoisyNet-Dueling and "false" for NoisyNet-DQN
    Input : B empty replay buffer; ζ initial network parameters; ζ⁻ initial target network parameters
    Input : N_B replay buffer size; N_T training batch size; N⁻ target network replacement frequency
    Output: Q(·, ε; ζ) action-value function

    for episode e ∈ {1, ..., M} do
        Initialise state sequence x0 ∼ Env
        for t ∈ {1, ...} do
            /* l[−1] is the last element of the list l */
            Set x ← x0
            Sample a noisy network ξ ∼ ε
            Select an action a ← argmax_{b∈A} Q(x, b, ξ; ζ)
            Sample next state y ∼ P(·|x, a), receive reward r ← R(x, a) and set x0 ← y
            Add transition (x, a, r, y) to the replay buffer: B[−1] ← (x, a, r, y)
            if |B| > N_B then
                Delete oldest transition from B
            end
            /* D is a distribution over the replay, it can be uniform or
               implementing prioritised replay */
            Sample a minibatch of N_T transitions ((x_j, a_j, r_j, y_j) ∼ D), j = 1..N_T
            /* Construction of the target values */
            Sample the noisy variables for the online network ξ ∼ ε
            Sample the noisy variables for the target network ξ' ∼ ε
            if DUELING then
                Sample the noisy variables for the action selection network ξ'' ∼ ε
            for j ∈ {1, ..., N_T} do
                if y_j is a terminal state then
                    Q' ← r_j
                else if DUELING then
                    b*(y_j) = argmax_{b∈A} Q(y_j, b, ξ''; ζ)
                    Q' ← r_j + γ Q(y_j, b*(y_j), ξ'; ζ⁻)
                else
                    Q' ← r_j + γ max_{b∈A} Q(y_j, b, ξ'; ζ⁻)
                Do a gradient step with loss (Q' − Q(x_j, a_j, ξ; ζ))²
            end
            if t ≡ 0 (mod N⁻) then
                Update the target network: ζ⁻ ← ζ
        end
    end

    Instead of choosing a noise sample before feeding the states into the model,
    the noise sample is chosen in the model itself, so that whenever a batch or a
    single state is passed a different set of gaussian noise samples is obtained.

    NOTE(review): this function relies on a `NoisyDense` layer accepting
    `use_factorised`. Such a layer is provided by TensorFlow Addons
    (tfa.layers.NoisyDense); confirm that the import at the top of the notebook
    resolves and that the layer accepts `kernel_initializer`.

    Arguments:
    input_shape -- shape passed to the Keras Input layer
    classes -- number of actions (units of the advantage stream)

    Returns:
    model -- a Keras Model producing one Q-value per action
    """
    X_input = Input(input_shape)
    
    X = Conv2D(32, (8, 8), strides = (4, 4), kernel_initializer = glorot_uniform(seed=0))(X_input)
    X = Activation('relu')(X)
    X = Conv2D(64, (4, 4), strides = (2, 2), kernel_initializer = glorot_uniform(seed=0))(X)
    X = Activation('relu')(X)
    X = Conv2D(64, (3, 3), strides = (1, 1), kernel_initializer = glorot_uniform(seed=0))(X)
    X = Activation('relu')(X)
    X = Flatten()(X)

    # Value stream
    V = Dense(128,activation='relu', kernel_initializer = glorot_uniform(seed=0))(X)
    V = NoisyDense(1, activation=None, use_factorised = True, kernel_initializer = glorot_uniform(seed=0))(V)

    # Action stream
    A = Dense(128,activation='relu',kernel_initializer = glorot_uniform(seed=0))(X)
    A = NoisyDense(classes, activation=None, use_factorised = True, kernel_initializer= glorot_uniform(seed=0))(A)

    # Dueling aggregation: centre the advantages with their mean over the action
    # axis of each sample (not over the whole batch, and without dividing the mean
    # by `classes` a second time).
    A_mean = reduce_mean(A, axis=-1, keepdims=True)
    Q = add(V, add(A, -A_mean))

    model = Model(inputs = X_input, outputs = Q)

    return model


In [None]:
def scale_lumininance(img):
    """Collapse the first three channels of `img` into a luminance channel.

    Applies Y = 0.299 R + 0.587 G + 0.114 B along the last axis; any channel
    beyond the third is ignored.
    """
    luminance_weights = [0.299,0.587,0.114]
    rgb = img[...,:3]
    return np.dot(rgb, luminance_weights)

In [None]:
def preprocess_observation(obs):
    """
    Reduce an Atari frame to a small grayscale image to lower the computation load.

    DeepMind took the maximum pixel value over subsequent frames to reduce the
    flickering caused by the limitations of the Atari platform and then scaled the
    frame from its 210x160x3 resolution to 84x84.

    Here the frame is converted to its luminance channel (denoted Y), a linear
    weighting of the RGB channels according to relative luminance,
    Y = 0.299R + 0.587G + 0.114B
    then resized to 84x84, and finally its first two axes are swapped.
    """
    grayscale = scale_lumininance(obs)
    resized = transform.resize(grayscale, (84, 84))
    swapped = np.moveaxis(resized, source=1, destination=0)
    return swapped

In [None]:
class Prioritized_replay_buffer():
    """
    Proportional prioritized experience replay.

    Before prioritized experience replay, samples were chosen uniformly with
    random.sample. Here every sample is drawn according to its priority, and the
    importance of each drawn sample is returned so that the network update can be
    weighted accordingly.

    # sampling                              # parameter updates
    error = Q(s,a) - Q_target               loss = error^2
    p = |error| + offset                    theta -> theta - alpha * w * grad
    p_r(i) = p_i^a/(sigma p^a)              w_i = (1/N * 1/Pr_(i))^b  # importance_weight

    a => priority_scale (0.7)
    b => 0.6 -- 1       # fully compensates for the non-uniform probabilities
    NOTE(review): the exponent b is not applied by get_importance (it behaves as
    b = 1); annealing it is still to be added.

    Each stored experience is a tuple (state, action, reward, next_state, done).
    """

    def __init__(self, buffer_size, initial_collect_step=0, env=None):
        """Create an empty buffer and optionally pre-fill it with random play.

        buffer_size -- maximum number of stored experiences (oldest are dropped)
        initial_collect_step -- number of random environment steps used to
            bootstrap the buffer; 0 leaves the buffer empty
        env -- optional gym environment for the bootstrap (see `initialize`)
        """
        self.buffer = deque(maxlen = buffer_size)
        self.buffer_size = buffer_size
        self.priorities = deque(maxlen = buffer_size) # a different deque
        if initial_collect_step > 0:
            self.initialize(initial_collect_step, env)

    @staticmethod
    def _first_state(observation):
        """Build the initial (1, 84, 84, 4) state by repeating the first frame four times."""
        frame = preprocess_observation(observation) # 84,84 grayscale frame
        stacked = np.stack((frame, frame, frame, frame), axis=2)
        return np.reshape([stacked], (1, 84, 84, 4))

    @staticmethod
    def _next_state(current_state, observation):
        """Prepend the new frame to the three most recent frames of `current_state`."""
        frame = np.reshape([preprocess_observation(observation)], (1, 84, 84, 1))
        return np.append(frame, current_state[:, :, :, :3], axis=3)

    def initialize(self, initial_collect_step, env=None):
        """Fill the buffer with `initial_collect_step` transitions of random play.

        When `env` is None a fresh 'MsPacman-v0' environment is created and
        closed afterwards; an environment supplied by the caller is left open.
        """
        owns_env = env is None
        if owns_env:
            env = gym.make('MsPacman-v0')
        try:
            current_state = self._first_state(env.reset())

            for _ in range(initial_collect_step):
                action = env.action_space.sample()
                observation, reward, done, _info = env.step(action)
                next_state = self._next_state(current_state, observation)

                self.add((current_state, action, reward, next_state, done))

                if done:
                    # a new episode starts from a fresh frame stack
                    current_state = self._first_state(env.reset())
                else:
                    current_state = next_state
        finally:
            if owns_env:
                env.close()

    def add(self,experience):
        """Store `experience` with the current maximum priority (1 for an empty buffer)."""
        self.buffer.append(experience)
        self.priorities.append(max(self.priorities, default = 1))

    def get_probabilities(self, priority_scale):
        """Return the sampling probability of every stored experience as an array."""
        scaled_priorities = np.array(self.priorities, dtype=float) ** priority_scale
        return scaled_priorities / scaled_priorities.sum()

    def get_importance(self, probabilities):
        """Return importance weights for `probabilities`, normalised to a maximum of 1."""
        importance = 1/self.buffer_size * 1/probabilities
        # this has been normalized to keep the update_step size bounded
        # (the constant 1/buffer_size cancels in this normalisation)
        importance_normalized = importance/max(importance)
        return importance_normalized

    def sample(self, batch_size, priority_scale = 1.0):
        """Draw up to `batch_size` experiences (with replacement) by priority.

        Returns (samples, importance, sample_indices): a list of experience
        tuples, their normalised importance weights and their buffer indices.
        An empty buffer yields ([], empty array, []).
        """
        n_stored = len(self.buffer)
        if n_stored == 0:
            return [], np.array([]), []
        sample_size = min(n_stored, batch_size)
        sample_probs = self.get_probabilities(priority_scale)
        # indices range over what is actually stored, which matches the weights
        sample_indices = random.choices(range(n_stored), k=sample_size, weights=sample_probs)
        # a plain list: experiences mix arrays and scalars and cannot form a regular ndarray
        samples = [self.buffer[i] for i in sample_indices]
        importance = self.get_importance(sample_probs[sample_indices])
        return samples, importance , sample_indices

    def set_priorities(self, indices, errors, offset=0.1):
        """Set the priority of each indexed experience to |error| + offset.

        NOTE(review): indices refer to positions at sampling time; they shift if
        the full buffer drops old entries between `sample` and this call.
        """
        for i,e in zip(indices, errors):
            self.priorities[i] = float(abs(e)) + offset

    def save(self,save_index):
        """Pickle the experience deque to the buffers directory on Drive.

        Raises SystemError when the target directory does not exist.
        """
        buffer_dir = '/content/drive/MyDrive/pacman_SOC_outputs/buffers'
        if not os.path.exists(buffer_dir):
              raise SystemError('buffer_dir not found: make a directory at path /content/drive/MyDrive/pacman_SOC_outputs/buffers')
        buffer_path = os.path.join(buffer_dir, 'replay-{}.dat'.format(save_index))
        with open(buffer_path, 'wb') as buffer_file:
            pickle.dump(self.buffer, buffer_file)


In [None]:
def initialization(input_shape , classes, initial_collect_steps, buffer_size):
    """
    Create the global networks and the global replay buffer used by `experiment`.

    The target_network and the prediction (online) network are both built with the
    NoisyNet dueling CNN architecture, and the target network starts from the
    prediction network's weights.

    The experience replay is a prioritized buffer whose elements are tuples
    (current state, action, reward, next state, done).
    A few random actions are executed in the environment to bootstrap the replay data.

    Arguments:
    input_shape -- input shape of the networks
    classes -- number of actions
    initial_collect_steps -- number of random steps used to pre-fill the buffer
    buffer_size -- maximum number of experiences kept in the buffer
    """
    # experience_replay must be global too: `experiment` reads it as a global.
    global target_network, prediction_network, experience_replay

    target_network = NoisyNet_Dueling(input_shape, classes)
    target_network.summary()  # summary() prints by itself and returns None

    prediction_network = NoisyNet_Dueling(input_shape, classes)
    prediction_network.summary()

    # Start both networks from identical parameters.
    target_network.set_weights(prediction_network.get_weights())

    experience_replay = Prioritized_replay_buffer(buffer_size, initial_collect_steps)

In [None]:
def plot(CR , episode, save_fig = True):
    
    """
    Plot the cumulative reward at the end of each episode and persist the rewards.

    Arguments:
    CR -- sequence of per-episode cumulative rewards; must hold `episode` values
    episode -- number of episodes played so far (x axis runs from 1 to `episode`)
    save_fig -- save the figure to the plots directory when True, show it otherwise

    The reward list is pickled to the cum_rewards directory on every call.
    Raises SystemError when one of the output directories does not exist.
    """
    plot_dir = '/content/drive/MyDrive/pacman_SOC_outputs/plots'
    cum_rewards_dir = '/content/drive/MyDrive/pacman_SOC_outputs/cum_rewards'
    # Validate the output locations before drawing so a failure leaves no open figure.
    if not os.path.exists(cum_rewards_dir):
        raise SystemError('cum_rewards_dir not found: make a directory at path /content/drive/MyDrive/pacman_SOC_outputs/cum_rewards')
    if not os.path.exists(plot_dir):
        raise SystemError('plot_dir not found: make a directory at path /content/drive/MyDrive/pacman_SOC_outputs/plots')

    plt.plot(list(range(1, episode + 1)), CR, linewidth=4, label = "NoisyNet n-step PDD DQN")
    plt.xlabel("Episode")
    plt.ylabel("Episode_reward")
    plt.legend(loc='upper left', shadow=True)
    plt.grid()
    plt.xlim([0, episode])

    # Persist the rewards themselves (this is a free function, there is no `self`).
    rewards_path = os.path.join(cum_rewards_dir, 'cum_reward-{}.dat'.format(episode))
    with open(rewards_path, 'wb') as rewards_file:
        pickle.dump(CR, rewards_file)

    if save_fig:
        plt.savefig(os.path.join(plot_dir, "plot-{}th-episode.png".format(episode)), bbox_inches="tight")
    else:
        plt.show()
    plt.close()

In [1]:
# Mount Google Drive at /content/drive.
# NOTE(review): this repeats the mount performed in an earlier cell of this
# notebook - confirm whether both cells are needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def get_n_step_info(n_step_buffer, gamma):
    """
    Collapse the transitions held in `n_step_buffer` into one n-step transition.

    This supports the n-step Temporal Difference method with the prioritized
    replay buffer. n_step_buffer stores (s, a, r, n_s, d) for the n_step (3 here)
    most recent transitions
    S_{t-n_step+1} --> S_{t-n_step+2}
    S_{t-n_step+2} --> S_{t-n_step+3}
    ....
    S_t  --> S_{t+1}

    Each entry is a transition (S_t, action_t, r_t, S_{t+1}, done_t) where
    action_t => action for transition S_t -> S_{t+1}
    r_t => reward for transition S_t -> S_{t+1}
    done_t => done for S_{t+1} i.e, whether it is terminal or not

    Arguments:
    n_step_buffer -- non-empty sequence of transitions, oldest first
    gamma -- discount factor

    Returns:
    (rho, next_state, done) where rho is the discounted sum of rewards starting at
    the oldest transition, and next_state/done belong to the last transition, or
    to the earliest terminal transition if one occurs inside the window (rewards
    after a terminal transition are not accumulated).

    Together with the state and action of n_step_buffer[0] this forms the
    experience (S_{t-n_step+1}, action_{t-n_step+1}, rho, S_{t+1}, done_t), so the
    rest of the training code does not need to change.

    This is a module-level function, so it takes no `self`; it is called as
    get_n_step_info(n_step_buffer, gamma).
    """

    # info of the last transition
    reward, next_state, done = n_step_buffer[-1][-3:]

    # Walk backwards from the second-to-last transition to the oldest one.
    for transition in reversed(list(n_step_buffer)[:-1]):
        r, n_s, d = transition[-3:]

        # a terminal transition cuts off everything that came after it
        reward = r + gamma * reward * (1 - d)
        next_state, done = (n_s, d) if d else (next_state, done)

    return reward, next_state, done



In [None]:
def experiment(N_episodes, gamma, learning_rate, target_model_change, batch_size, log_interval, priority_scale = 0.7,n_step = 3): 
  """
  Train the global `prediction_network` on MsPacman with n-step returns and
  prioritized replay.

  Relies on the globals created by `initialization`: `prediction_network`,
  `target_network` and `experience_replay`.

  Arguments:
  N_episodes -- number of episodes to play
  gamma -- discount factor
  learning_rate -- RMSprop learning rate
  target_model_change -- the target network is synchronised (and a recording
      shown) at the end of every episode whose index is a multiple of this value
  batch_size -- minibatch size drawn from the replay buffer at every step
  log_interval -- the model, the buffer and the reward plot are saved at the end
      of every episode whose index is a multiple of this value
  priority_scale -- exponent applied to the priorities when sampling
  n_step -- number of transitions merged into one stored experience

  Raises SystemError when the models directory on Drive does not exist.
  """
  
  # Instantiate an optimizer to train the model.
  optimizer = tf.keras.optimizers.RMSprop(learning_rate) # used by deepminds
  # Instantiate a loss function.
  loss_fn = tf.keras.losses.MeanSquaredError()
  # Prepare the metrics.
  train_acc_metric = tf.keras.metrics.MeanSquaredError()

  models_dir = '/content/drive/MyDrive/pacman_SOC_outputs/models'
  
  CR = []
  with tf.device('/device:GPU:0'):

    for episode in range(N_episodes):
        start_time = time.time()
        time_step = 0
        rewards = [] # compared to total points earned in the game
        # Sliding window over the last n_step transitions of this episode.
        n_step_buffer = deque(maxlen=n_step)

        env = wrap_env(gym.make('MsPacman-v0'))
        try:
            first_frame = preprocess_observation(env.reset()) # 84,84 grayscale frame
            # for initialization the first frame is repeated four times
            current_state = np.stack((first_frame, first_frame, first_frame, first_frame), axis=2)
            current_state = np.reshape([current_state], (1, 84, 84, 4))

            current_done = False
            while not current_done:

                # act greedily on the online network; ties are broken at random
                Q = prediction_network(np.array([current_state]), training=False).numpy()
                action = int(np.random.choice(np.flatnonzero(Q == Q.max())))
                new_observation , reward , current_done , _ = env.step(action)
                rewards.append(reward)

                # the newest frame is placed in front of the three most recent frames
                next_state = preprocess_observation(new_observation)
                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_state = np.append(next_state, current_state[:, :, :, :3], axis=3)

                # n_step replay buffer: store an experience only once the window is full
                n_step_buffer.append((current_state, action, reward, next_state, current_done))
                if len(n_step_buffer) == n_step:
                    rho , state , done = get_n_step_info(n_step_buffer, gamma)
                    n_prev_state, first_action = n_step_buffer[0][:2]
                    experience_replay.add((n_prev_state, first_action, rho, state, done))

                # sample a batch out of the experience replay
                minibatch, importance , indices = experience_replay.sample(batch_size, priority_scale)
                if len(indices) > 0:
                    n_samples = len(indices)
                    n_prev_state_minibatch = np.array([e[0] for e in minibatch])
                    actions = np.array([e[1] for e in minibatch], dtype=np.int32)
                    rhos = np.array([e[2] for e in minibatch], dtype=np.float32)
                    state_minibatch = np.array([e[3] for e in minibatch])
                    dones = np.array([e[4] for e in minibatch], dtype=np.float32)

                    # Q_target: bootstrap from the target network unless the state is terminal.
                    # NOTE(review): experiences collected during the buffer warm-up are
                    # single-step, yet they are bootstrapped with gamma**n_step as well.
                    Q_target = target_network(state_minibatch, training = False).numpy()
                    Q_target = Q_target.reshape(n_samples, -1)
                    target = (rhos + (gamma**n_step) * Q_target.max(axis=1) * (1.0 - dones)).astype(np.float32)

                    # Only the forward pass that needs gradients is recorded on the tape.
                    with tf.GradientTape() as tape:
                        prediction_network_output = prediction_network(n_prev_state_minibatch, training = True)
                        prediction_network_output = tf.reshape(prediction_network_output, (n_samples, -1))
                        action_mask = tf.one_hot(actions, int(prediction_network_output.shape[-1]))
                        # Q-value of the action that was actually taken
                        prediction = tf.reduce_sum(prediction_network_output * action_mask, axis=1)
                        # importance weights correct for the non-uniform sampling
                        loss_value = loss_fn(tf.expand_dims(target, -1),
                                             tf.expand_dims(prediction, -1),
                                             sample_weight=importance)
                    grads = tape.gradient(loss_value, prediction_network.trainable_variables)
                    optimizer.apply_gradients(zip(grads, prediction_network.trainable_variables))

                    # update the priority of the sampled experiences from their TD error
                    errors = np.abs(target - prediction.numpy())
                    experience_replay.set_priorities(indices, errors)

                    # Update training metric.
                    train_acc_metric.update_state(target, prediction)

                # S -> S'
                current_state = next_state
                time_step+=1

                if time_step%100 == 0 :
                  print("Time_step:{}".format(time_step))
        finally:
            # always close so the Monitor wrapper finalises the recording
            env.close()

        # Display metrics at the end of each episode.
        train_acc = train_acc_metric.result()
        print("Training MSE over episode: {}".format(float(train_acc)))

        # Reset training metrics at the end of each episode
        train_acc_metric.reset_states()

        CR.append(np.squeeze(np.sum(np.array(rewards))))

        # Synchronise the target network once per matching episode (not on every step).
        if episode%target_model_change ==0:
            show_video()
            target_network.set_weights(prediction_network.get_weights())

        if (episode%log_interval==0):
            if not os.path.exists(models_dir):
                raise SystemError('model_dir not found: make a directory at path /content/drive/MyDrive/pacman_SOC_outputs/models')
            save_index = episode // log_interval  # integer index for file names
            prediction_network.save(os.path.join(models_dir,"model-{}.h5".format(save_index)))
            experience_replay.save(save_index)
            plot(CR,episode+1)
        print("Episode {} finished after {} timesteps".format(episode+1, time_step))
        print("{}s taken to finish this episode".format(time.time() - start_time))

In [None]:
# Hyperparameters of the training run
learning_rate = 2.5e-3

log_interval = 20              # episodes between saves of model, buffer and plot

initial_collect_steps = 750    # random steps used to pre-fill the replay buffer
replay_buffer_max_length = 50000

gamma = 0.99
n_TD = 3                       # number of steps of the n-step return
target_model_change = 100      # episodes between target network updates
batch_size = 32
N_episodes = 12000
priority_scale = 0.7

initialization(input_shape,classes,initial_collect_steps,replay_buffer_max_length)
# `experiment` has no epsilon parameter; passing an extra positional argument
# would shift every following argument by one.
experiment(N_episodes, gamma, learning_rate, target_model_change, batch_size, log_interval, priority_scale, n_TD)
