In [1]:
import os
import random
import numpy as np
import torch 
import torch.nn as nn # neural network module
import torch.optim as optim # optimizers
import torch.nn.functional as F 
import torch.autograd as autograd # for stochastci gradient descent 
from torch.autograd import variable
from collections import deque, namedtuple
import gymnasium as gym
from PIL import Image
from torchvision import transforms
from torch.utils.data import DataLoader, TensorDataset

  from .autonotebook import tqdm as notebook_tqdm


Network Architecture

In [16]:
class Network(nn.Module):
    def __init__(self, action_size, seed = 42):
        super(Network, self).__init__()
        self.seed = torch.manual_seed(seed)
        # Giving Eyes to the Agent in the form of a CNN
        self.conv1 = nn.Conv2d(3, 32, kernel_size = 8, stride = 4) # 3 input channels refer to RGB
        # When performing a convolution operation, the stride determines how many units the filter shifts at each step.
        self.bn1 = nn.BatchNorm2d(32) # num features should be same as the output size of the previous layer
        self.conv2 = nn.Conv2d(32, 64, kernel_size = 4, stride = 2)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 64, kernel_size = 3, stride = 1)
        self.bn3 = nn.BatchNorm2d(64)
        self.conv4 = nn.Conv2d(64, 128, kernel_size = 3, stride = 1)
        self.bn4 = nn.BatchNorm2d(128)
        # Giving a brain to the Agent in the form of an ANN
        self.fc1 = nn.Linear(10 * 10 * 128, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, action_size)

        ### know the equation for number of flattened neurons

    def forward(self, state):
        # propogates the images to the CNN
        x = F.relu(self.bn1(self.conv1(state)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = F.relu(self.bn4(self.conv4(x)))
        x = x.view(x.size(0), -1) # keeps the 1st dimenions and flattens the rest
        x = F.relu(self.fc1(x)) 
        x = F.relu(self.fc2(x))

        return self.fc3(x)

Setting up the environment

In [17]:
env = gym.make('MsPacmanDeterministic-v0', full_action_space=False)
state_shape = env.observation_space.shape
state_size = env.observation_space.shape[0]
number_actions = env.action_space.n

print(state_shape)
print(state_size)
print(number_actions)

(210, 160, 3)
210
9


  logger.deprecation(


Initialize Hyperparameters

In [18]:
learning_rate = 5e-4
minibatch_size = 64
discount_factor = 0.99
# we wont be implementing experience replay since the inputs are not vectors anymore
# instead they are images which need a lot of memory implemented

Preprocess Images from frames

In [19]:
def preprocess_frame(frame):
  frame = Image.fromarray(frame)
  preprocess = transforms.Compose([transforms.Resize((128, 128)), transforms.ToTensor()])
  return preprocess(frame).unsqueeze(0)

Implementing the DCQN class

In [20]:
class Agent():
    def __init__(self, action_size):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.action_size = action_size
        self.local_qnetwork = Network(action_size).to(self.device)
        self.target_qnetwork = Network(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
        self.memory = deque(maxlen=10000) # double ended queue
    
    def step(self, state, action, reward, next_state, done):
        state = preprocess_frame(state)
        next_state = preprocess_frame(next_state)
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > minibatch_size:
            experiences = random.sample(self.memory, k = minibatch_size)
            self.learn(experiences, discount_factor)


    def act(self, state, epsilon):
        # we'll implement epsilon greedy method
        # make the state variable into a torch tensor
        state = preprocess_frame(state).to(self.device) # we use unsqueeze to add another dimension at the beginning which corresponds to the batch id
        self.local_qnetwork.eval()
        with torch.no_grad():
            action_values = self.local_qnetwork(state)
            self.local_qnetwork.train()
        if random.random() > epsilon:
        # if the random number is greather than epsilon then choose the action with the highest Q-value
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, discount_factor):
        states, actions, rewards, next_states, dones = zip(*experiences)
        states = torch.from_numpy(np.vstack(states)).float().to(self.device)
        actions = torch.from_numpy(np.vstack(actions)).long().to(self.device)
        rewards = torch.from_numpy(np.vstack(rewards)).float().to(self.device)
        next_states = torch.from_numpy(np.vstack(next_states)).float().to(self.device)
        dones = torch.from_numpy(np.vstack(dones).astype(np.uint8)).float().to(self.device)
        next_q_targets = self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1)
        q_targets = rewards + discount_factor * next_q_targets * (1 - dones)
        q_expected = self.local_qnetwork(states).gather(1, actions)
        loss = F.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

Initialize the Agent

In [21]:
agent = Agent(number_actions)

Training the DCQN Agent

In [22]:
number_episodes = 2000
maximum_number_timesteps_per_episode = 10000
epsilon_starting_value  = 1.0
epsilon_ending_value  = 0.01
epsilon_decay_value  = 0.995
epsilon = epsilon_starting_value
scores_on_100_episodes = deque(maxlen = 100)

for episode in range(1, number_episodes + 1):
  state = env.reset()[0]
  score = 0
  for t in range(maximum_number_timesteps_per_episode):
    action = agent.act(state, epsilon)
    next_state, reward, done, _, _ = env.step(action)
    agent.step(state, action, reward, next_state, done)
    state = next_state
    score += reward
    if done:
      break
  scores_on_100_episodes.append(score)
  epsilon = max(epsilon_ending_value, epsilon_decay_value * epsilon)
  print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_on_100_episodes)), end = "")
  if episode % 100 == 0:
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_on_100_episodes)))
  if np.mean(scores_on_100_episodes) >= 500.0:
    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode - 100, np.mean(scores_on_100_episodes)))
    torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')
    break

Episode 100	Average Score: 284.50
Episode 200	Average Score: 368.70
Episode 300	Average Score: 330.20
Episode 400	Average Score: 339.90
Episode 468	Average Score: 416.60

KeyboardInterrupt: 