In [1]:
### Train an RL agent using DQN architecture in a Unity environment (bananaModel)

In [2]:
from unityagents import UnityEnvironment
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, defaultdict
import time
import sys
from tqdm import tqdm
from dqnetwork import DQNetwork
import torch
import torch.nn as nn
import torch.nn.functional as F

In [177]:
# Start the Unity environment from the local "Banana.app" build.
# NOTE(review): the file name is a relative path — presumably it must sit next to
# the notebook (or in the kernel's working directory); confirm before running elsewhere.
env = UnityEnvironment(file_name="Banana.app")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [178]:
# Use the first (default) brain exposed by the environment. Its name is the key
# used to pick this brain's results out of env.reset() / env.step() in later cells.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

### Action space:
- `0` - walk forward 
- `1` - walk backward
- `2` - turn left
- `3` - turn right

### State space:
- `37` dimensions.
- Includes the agent's velocity.
- Includes ray-based perception of objects in the agent's forward direction.

### Reward:

- `+1` - Yellow Banana collected.
- `-1` - Blue Banana collected.

In [179]:
# Begin a fresh training-mode episode and keep only this brain's info
env_info = env.reset(train_mode=True)[brain_name]

# Gather the quantities of interest first ...
action_size = brain.vector_action_space_size   # number of discrete actions
state = env_info.vector_observations[0]        # first agent's observation vector
state_size = len(state)                        # observation dimensionality

# ... then report them in one place
print('Number of agents:', len(env_info.agents))
print('Number of actions:', action_size)
print('States look like:', state)
print('States have length:', state_size)

Number of agents: 1
Number of actions: 4
States look like: [1.         0.         0.         0.         0.84408134 0.
 0.         1.         0.         0.0748472  0.         1.
 0.         0.         0.25755    1.         0.         0.
 0.         0.74177343 0.         1.         0.         0.
 0.25854847 0.         0.         1.         0.         0.09355672
 0.         1.         0.         0.         0.31969345 0.
 0.        ]
States have length: 37


In [None]:
# Play one episode with uniformly random actions (inference mode) as a baseline.
env_info = env.reset(train_mode=False)[brain_name]  # fresh episode
state = env_info.vector_observations[0]             # initial observation
score = 0                                           # cumulative reward
i = 0                                               # step counter (read by the next cell)
done = False
while not done:
    i += 1
    # pick a random action and apply it
    action = np.random.randint(action_size)
    env_info = env.step(action)[brain_name]
    # unpack the transition for the single agent
    next_state = env_info.vector_observations[0]
    reward = env_info.rewards[0]
    done = env_info.local_done[0]
    score += reward
    state = next_state
    # only non-zero rewards are worth echoing
    if reward != 0:
        print(reward)

print("Score: {}".format(score))

In [None]:
# Report how many environment steps the random-policy episode above lasted.
print("iterations:",i)

In [None]:
# Layer shapes: (37x128) -> (128x64) -> (64x32) -> (32x4)
# Every layer's input width equals the previous layer's output width, so the
# input list is the state size followed by all but the last output width.
output_features = [128, 64, 32, action_size]
input_features = [state_size] + output_features[:-1]

In [None]:
# Build a stand-alone network from the per-layer input/output widths defined above
# (used only for the forward-pass check in the next cell).
model = DQNetwork(input_features, output_features)

In [None]:
# Smoke-test a forward pass on the most recent observation.
# NOTE(review): `state` comes straight from env_info.vector_observations — confirm that
# DQNetwork.forward accepts that type (Agent.get_action converts it to a float tensor first).
# NOTE(review): if DQNetwork is an nn.Module, calling `model(state)` is the usual form — TODO confirm.
model.forward(state)

In [156]:
### define the agent class

In [158]:
### DEFINE CONSTANTS

In [213]:
# --- replay memory ---
BUFFER_SIZE = 10 ** 5   # capacity of the replay buffer
BATCH_SIZE = 64         # transitions per minibatch

# --- learning ---
GAMMA = 0.99            # discount factor
TAU = 0.001             # interpolation factor for the soft target update
LR = 0.0005             # learning rate
UPDATE_EVERY = 4        # how often to update the network

In [214]:
import torch
# import torch.nn.functional as F
import torch.optim as optim
from dqnetwork import DQNetwork
import random
from buffer import ReplayBuffer

In [220]:
class Agent():
    """DQN agent with a local and a target Q-network trained from replayed transitions.

    The local network is optimised with Adam against TD targets computed from the
    target network (fixed Q-targets); after every learning step the target network
    is moved towards the local one with a soft update. How transitions are sampled
    is decided entirely by ``ReplayBuffer``.
    """
    def __init__(self, state_size=37, action_size=4, gamma=0.99, lr=0.001, update_every=5):
        """
        Initializes the model.
        ----
        @param:
        1. state_size: size of input # of states.
        2. action_size: size of # of actions.
        3. gamma: discounted return rate.
        4. lr: learning rate for the model.
        5. update_every: run a learning step every X calls to `step`.

        NOTE(review): `optim.Adam` raises "ValueError: optimizer got an empty
        parameter list" when `DQNetwork.parameters()` yields nothing. The notebook's
        recorded output shows exactly that error, so DQNetwork presumably does not
        register its layers as sub-modules — confirm in dqnetwork.py.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma      # read by step()/train(); was never stored before
        self.lr = lr
        self.input_features = [state_size, 128, 64, 32]
        self.output_features = [128, 64, 32, action_size]

        # Q-networks: local (trained) and target (slowly tracking the local one).
        # Built from this instance's layer sizes, not from notebook-level globals,
        # so state_size / action_size passed to the constructor are honoured.
        self.qnetwork_local = DQNetwork(self.input_features, self.output_features)
        self.qnetwork_target = DQNetwork(self.input_features, self.output_features)
        # define the optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # replay memory
        self.batch_size = BATCH_SIZE
        self.tau = TAU
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)
        self.update_every = update_every
        self.target_update_counter = 0
        # Number of transitions handed to the buffer so far; used to delay learning
        # until a full minibatch is available without relying on the buffer's API.
        self.transitions_seen = 0

    def step(self, transition):
        """Stores a transition and, every `update_every` calls, runs one learning step.
        @param:
        1. transition : (tuple) state, action, reward, next_state, done
        """
        # Save experience in replay memory
        self.memory.add(transition)
        self.transitions_seen += 1
        self.target_update_counter = (self.target_update_counter + 1) % self.update_every

        # Only learn on every `update_every`-th call ...
        if self.target_update_counter != 0:
            return
        # ... and only once at least one full minibatch has been stored.
        if self.transitions_seen < self.batch_size:
            return

        experiences = self.memory.sample()
        self.train(experiences, self.gamma)

    def get_action(self, state, eps=0.0):
        """
        Determines the action to perform based on epsilon-greedy method
        @param:
        1. state - NumPy array with the current observation to determine an action for
        2. eps - value for epsilon, stochastic measure.
        @return:
        - action = index of the greedy action w.r.t. the local Q-network with
          probability (1 - eps), otherwise a uniformly random action index
        """
        state = torch.from_numpy(state).float().unsqueeze(0)
        # Evaluate without tracking gradients, then restore training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_val = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy selection
        if random.random() > eps:  # exploit
            return np.argmax(action_val.cpu().data.numpy())

        return random.choice(np.arange(self.action_size))  # explore

    def train(self, experiences, gamma=None):
        """
        Runs one gradient step on the local network and soft-updates the target network.
        @param:
        1. experiences: (Tuple[torch.Tensor]) (s,a,r,s',done)
           NOTE(review): assumes `actions` is an integer tensor of shape (batch, 1)
           and rewards/done are float tensors of shape (batch, 1) — confirm against
           ReplayBuffer.sample().
        2. gamma: discount factor; defaults to the value given to the constructor.
        """
        if gamma is None:
            gamma = self.gamma
        states, actions, rewards, next_states, done = experiences

        # TD target: r + gamma * max_a' Q_target(s', a'), with no bootstrap on terminal states
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - done))
        # Q-values the local network currently assigns to the actions actually taken
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Minimise the MSE between expected and target Q-values
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network using soft update
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model, tau=None):
        """
        Move the target network towards the local network using a soft update param, τ.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        ------
        @param:
        1. local_model: (DQNetwork) local network model (weights will be copied from)
        2. target_model: (DQNetwork) target network model (weights will be copied into)
        3. tau: interpolation factor; defaults to the value captured at construction (TAU).
        """
        if tau is None:
            tau = self.tau
        # Blend parameter by parameter; modules themselves cannot be multiplied by a scalar.
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [221]:
#define the replay buffer class

In [222]:
# Build the agent from the sizes reported by the environment and from the
# hyperparameters declared in the constants cell, rather than relying on the
# constructor's hard-coded defaults (which differ for lr and update_every).
agent = Agent(
    state_size=state_size,
    action_size=action_size,
    gamma=GAMMA,
    lr=LR,
    update_every=UPDATE_EVERY,
)

<generator object Module.parameters at 0x7fac012e9f10>


ValueError: optimizer got an empty parameter list