In [None]:
### Train an RL agent using a DQN architecture in a Unity environment (bananaModel)

In [41]:
from unityagents import UnityEnvironment
import numpy as np
import matplotlib.pyplot as plt
from collections import deque, defaultdict
import time
import sys
from tqdm import tqdm
from dqnetwork import DQNetwork
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Path to the Unity Banana build. "Banana.app" looks like a macOS bundle;
# point this at the build for your platform if it differs.
BANANA_ENV_FILE = "Banana.app"

# Connect to the Unity environment described by that build.
env = UnityEnvironment(file_name=BANANA_ENV_FILE)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# get the default brain: the first name listed by the environment
# (the environment log above reports a single external brain, "BananaBrain")
brain_name = env.brain_names[0]
# brain object for that name; its vector_action_space_size is read later
brain = env.brains[brain_name]

### Action space:
- `0` - walk forward 
- `1` - walk backward
- `2` - turn left
- `3` - turn right

### State space:
- Each observation has `37` dimensions.
- Some dimensions encode the agent's velocity.
- Others encode ray-based perception of objects around the agent's forward direction.

### Reward:

- `+1` - Yellow Banana collected.
- `-1` - Blue Banana collected.

In [43]:
# Start a fresh episode in training mode and keep this brain's info.
env_info = env.reset(train_mode=True)[brain_name]

# Gather the quantities the rest of the notebook relies on.
action_size = brain.vector_action_space_size   # number of available actions
state = env_info.vector_observations[0]        # first agent's observation
state_size = len(state)                        # length of one observation

# Report them.
print('Number of agents:', len(env_info.agents))
print('Number of actions:', action_size)
print('States look like:', state)
print('States have length:', state_size)

Number of agents: 1
Number of actions: 4
States look like: [1.         0.         0.         0.         0.27741081 0.
 0.         1.         0.         0.10737146 0.         1.
 0.         0.         0.35369602 0.         1.         0.
 0.         0.4190014  0.         1.         0.         0.
 0.34156528 0.         1.         0.         0.         0.4338823
 1.         0.         0.         0.         0.23626557 0.
 0.        ]
States have length: 37


In [5]:
# Play one episode with a uniformly random policy and report the total reward.
env_info = env.reset(train_mode=False)[brain_name]
state = env_info.vector_observations[0]   # observation at the start of the episode
score = 0                                 # running sum of rewards
i = 0                                     # step counter (printed by the next cell)
done = False
while not done:
    i += 1
    # pick an action uniformly at random and advance the environment one step
    action = np.random.randint(action_size)
    env_info = env.step(action)[brain_name]
    # unpack the first agent's observation, reward and termination flag
    next_state, reward, done = (
        env_info.vector_observations[0],
        env_info.rewards[0],
        env_info.local_done[0],
    )
    score += reward
    state = next_state
    # only log steps on which a reward was actually received
    if reward != 0:
        print(reward)

print("Score: {}".format(score))

1.0
Score: 1.0


In [6]:
# number of environment steps taken in the random-policy episode above
print("iterations:",i)

iterations: 300


In [7]:
# (37x128) -> (128x64) -> (64x32) -> (32x4)
# Each layer's input width is the previous layer's output width, so both
# lists are sliced from a single sequence of widths.
layer_widths = [state_size, 128, 64, 32, action_size]
input_features = layer_widths[:-1]
output_features = layer_widths[1:]

In [42]:
# Build the Q-network from the per-layer sizes defined above.
# NOTE(review): DQNetwork comes from the local dqnetwork module (not shown here);
# presumably each (input, output) pair becomes one layer — confirm.
model = DQNetwork(input_features, output_features)

In [44]:
# Smoke test: run the freshly built network on a single observation; the
# displayed result holds one value per action (4 here).
# NOTE(review): if DQNetwork subclasses torch.nn.Module, `model(state)` is the
# conventional call (it runs registered hooks around forward) — confirm before changing.
model.forward(state)

tensor([[-0.1538,  0.0850,  0.0901, -0.0468]], grad_fn=<AddmmBackward>)