In [1]:
### Train an RL agent using DQN architecture in a Unity environment (bananaModel)

In [2]:
from unityagents import UnityEnvironment
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, defaultdict
import time
import sys
from tqdm import tqdm
from dqnetwork import DQNetwork
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Launch the Unity Banana environment from the build named "Banana.app".
# NOTE(review): the ".app" suffix suggests a macOS build located next to this
# notebook — adjust file_name for other platforms/locations.
env = UnityEnvironment(file_name="Banana.app")

In [None]:
# Get the default brain: take the first brain name the environment exposes and
# look up the matching brain object. `brain_name` is reused below to index the
# results of env.reset()/env.step().
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### Action space:
- `0` - walk forward 
- `1` - walk backward
- `2` - turn left
- `3` - turn right

### State space:
- `37` dimensions.
- Some of the dimensions encode the agent's velocity.
- Others encode ray-based perception of objects in the agent's forward direction.

### Reward:

- `+1` - Yellow Banana collected.
- `-1` - Blue Banana collected.

In [None]:
# Reset the environment (train_mode=True) and keep only this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the quantities that size the Q-network.
action_size = brain.vector_action_space_size   # number of actions
state = env_info.vector_observations[0]        # observation of the first agent
state_size = len(state)                        # length of one observation vector

# Report what the environment looks like.
print('Number of agents:', len(env_info.agents))
print('Number of actions:', action_size)
print('States look like:', state)
print('States have length:', state_size)

In [None]:
# Play one episode with uniformly random actions to sanity-check the environment.
env_info = env.reset(train_mode=False)[brain_name]  # fresh episode, train_mode off
state = env_info.vector_observations[0]             # initial observation
score = 0                                           # running sum of rewards
i = 0                                               # number of steps taken
done = False
while not done:
    i += 1
    action = np.random.randint(action_size)         # random action index in [0, action_size)
    env_info = env.step(action)[brain_name]         # apply the action
    next_state = env_info.vector_observations[0]    # resulting observation
    reward = env_info.rewards[0]                    # reward for this step
    done = env_info.local_done[0]                   # True once the episode has finished
    score += reward
    state = next_state                              # advance to the next time step
    if reward != 0:
        print(reward)                               # only log steps that produced a reward

print("Score: {}".format(score))

In [None]:
# Report how many environment steps the random-action episode above took
# (`i` is incremented once per loop iteration in that cell).
print("iterations:",i)

In [None]:
# Layer sizes for the Q-network, given as parallel lists: layer k maps
# input_features[k] -> output_features[k].
# With state_size=37 and action_size=4 this is:
# (37x128) -> (128x64) -> (64x32) -> (32x4)
input_features = [state_size, 128, 64, 32]
output_features = [128, 64, 32, action_size]

In [None]:
# Build a Q-network from the per-layer input/output sizes defined above.
model = DQNetwork(input_features, output_features)

In [None]:
# Smoke-test the network on the last observed state.
# NOTE(review): `state` is the NumPy observation from the cells above; presumably
# DQNetwork.forward converts it to a tensor — verify. If DQNetwork is an
# nn.Module, calling `model(state)` is the usual way to invoke it — confirm.
model.forward(state)

In [4]:
### define the agent class

In [9]:
import torch
import torch.nn.functional as F
import torch.optim as optim
from dqnetwork import DQNetwork
import random

In [10]:
class Agent():
    """DQN agent that owns a local (online) Q-network, a target Q-network and an optimizer.

    NOTE(review): this class currently only constructs the networks and the
    optimizer. Double Q-learning updates and Prioritized Experience Replay are
    not implemented in the code shown here.
    """
    def __init__(self, state_size=37, action_size=4, gamma=0.99, lr=0.001):
        """
        Initializes the agent's networks and optimizer.
        ----
        @param:
        1. state_size: (int) length of one state/observation vector.
        2. action_size: (int) number of available actions.
        3. gamma: (float) discount factor applied to future returns.
        4. lr: (float) learning rate for the optimizer.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # stored so later learning code can use it; previously discarded
        self.lr = lr

        # Per-layer sizes: layer k maps input_features[k] -> output_features[k].
        hidden_sizes = [128, 64, 32]
        self.input_features = [state_size] + hidden_sizes
        self.output_features = hidden_sizes + [action_size]

        # Q-networks: a local network that is trained and a separate target network
        # (fixed Q-target). Both are built from this instance's own sizes rather than
        # notebook-level globals, so state_size/action_size are honoured and the class
        # works on a fresh kernel.
        self.qnetwork_local = DQNetwork(self.input_features, self.output_features)
        self.qnetwork_target = DQNetwork(self.input_features, self.output_features)

        # Only the local network's parameters are optimized.
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

In [11]:
#define the replay buffer class

In [138]:
class ReplayBuffer():
    """Fixed-capacity experience replay memory with uniform random sampling."""

    def __init__(self, buffer_size, batch_size):
        """
        Creates an empty replay memory.
        ---
        @param:
        1. buffer_size: (int) maximum number of transitions kept; once full, the
            oldest transition is dropped for every new one appended.
        2. batch_size: (int) number of transitions drawn per call to sample().
        """
        self.buffer_size = buffer_size
        self.batch_size = batch_size

        # A bounded deque acts as the circular buffer.
        self.replay_memory = deque(maxlen=buffer_size)

    def add(self, transition):
        """
        Stores one transition at the end of the replay memory.
        ---
        @param:
        1. transition: (tuple) one experience, expected to unpack as
            (state, action, reward, next_state, done).
        """
        self.replay_memory.append(transition)

    def isSampling(self):
        """Returns True once the memory holds strictly more than batch_size transitions."""
        return len(self.replay_memory) > self.batch_size

    def sample(self):
        """
        Draws transitions uniformly at random, without replacement.

        Draws batch_size transitions when isSampling() is True, otherwise every
        stored transition. The result is transposed into one tuple per field, i.e.
        (states, actions, rewards, next_states, dones); an empty memory yields ().
        """
        if self.isSampling():
            draw_count = self.batch_size
        else:
            draw_count = len(self.replay_memory)
        drawn = random.sample(self.replay_memory, k=draw_count)
        return tuple(zip(*drawn))

In [139]:
# Small replay buffer for trying the class out: keeps at most 10 transitions and
# samples up to 6 at a time.
buffer = ReplayBuffer(buffer_size=10, batch_size=6)