In [7]:
import gym
env = gym.make( "MsPacman-v4")


In [8]:
#pip install gym[accept-rom-license]

In [9]:
#pip install --user tensorflow


In [10]:
import random
import gym
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Flatten,Conv2D,MaxPooling2D
from tensorflow.keras.optimizers import Adam
from collections import deque

In [11]:
env=gym.make("MsPacman-v4",render_mode="rgb_array")
state_size=(88,80,1)
action_space=env.action_space.n

In [12]:
color=np.array([210,164,74]).mean()
def preprocess_state(state):
    image=state[1:176:2,::2]
    image=image.mean(axis=2)
    image[image==color]=0
    image=(image-128)/128-1
    image=np.expand_dims(image.reshape(88,80,1),axis=0)
    return image

In [13]:
class DQN:
    def __init__(self,state_size,action_size):
        self.state_size=state_size
        self.action_size=action_size
        self.replay_buffer=deque(maxlen=5000)
        self.gamma=0.9
        self.epsilon=0.8
        self.update_rate=1000
        self.main_network=self.build_network()
        self.target_network=self.build_network()
        self.target_network.set_weights(self.main_network.get_weights())
    
    def build_network(self):
        model=Sequential()
        model.add(Conv2D(32,(8,8),strides=4,padding='same',input_shape=self.state_size))
        model.add(Activation('relu'))
        
        model.add(Conv2D(64,(4,4),strides=2,padding='same'))
        model.add(Activation('relu'))
        
        model.add(Conv2D(64,(3,3),strides=1,padding='same'))
        model.add(Activation('relu'))
        
        model.add(Flatten())
        model.add(Dense(512,activation='relu'))
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer=Adam())
        return model

    def store_trasition(self,state,action,reward,next_state,done):
        self.replay_buffer.append((state,action,reward,next_state,done))

    def epsilon_greedy(self,state):
        if random.uniform(0,1)<self.epsilon:
            return np.random.randint(self.action_size)
      
        Q_values=self.main_network.predict(state)
        return np.argmax(Q_values[0])


    def train(self,batch_size):
        minibatch=random.sample(self.replay_buffer,batch_size)
        for state,action,reward,next_state,done in minibatch:
            if not done:
                target_Q=(reward+self.gamma*np.amax(self.target_network.predict(next_state)))
            else:
                target_Q=reward

        Q_values=self.main_network.predict(state)
        Q_values[0][action]=target_Q
        self.main_network.fit(state,Q_values,epochs=1,verbose=0)

    def update_target_network(self):
        self.target_network.set_weights(self.main_network.get_weights())

In [14]:
env=gym.make("MsPacman-v4",render_mode="rgb_array")
state_size=(88,80,1)
action_size=env.action_space.n
print(action_size)
print(state_size)
num_episodes=10
num_timesteps=50
batch_size=8
num_screens=4
dqn=DQN(state_size,action_size)
done=False
time_step=0
print()
for i in range(num_episodes):
    Return=0
    s,info=env.reset()
    
    state=preprocess_state(s)
    for t in range(num_timesteps):
        env.render()
        time_step+=1
        if time_step % dqn.update_rate==0:
            dqn.update_target_network()

        action=dqn.epsilon_greedy(state)
        next_state,reward,done,info,_=env.step(action)
        next_state=preprocess_state(next_state)
        dqn.store_trasition(state,action,reward,next_state,done)
        state=next_state
        Return+=reward
        if done:
            print("Episode: ",i,"....","Return",Return)
            break

        if len(dqn.replay_buffer)>batch_size:
            dqn.train(batch_size)

9
(88, 80, 1)



























































In [15]:
import gym
env = gym.make("MsPacman-v4",render_mode="human")
import time 
num_steps = 1500
obs = env.reset()
for step in range(num_steps):    
    action = env.action_space.sample()    
    obs, reward, done, info,k = env.step(action)    
    env.render()    
    time.sleep(0.001)   
    if done:
        env.reset()

# Close the env
env.close()