In [1]:
from unityagents import UnityEnvironment
import numpy as np

# Initial Environment Exploration

In [2]:
# Create the environment handle from the Linux Reacher executable.
# The path is relative, so it is resolved against the notebook's working directory.
env = UnityEnvironment(file_name='./Reacher_Linux/Reacher.x86_64')

# get the default brain: take the first name the environment lists and
# look up the matching brain object, which is queried later for the action size
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Reset the environment in training mode and keep only the info for our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the problem dimensions up front; later cells rely on
# `num_agents`, `action_size` and `state_size` being defined here.
num_agents = len(env_info.agents)                 # one entry per simulated agent
action_size = brain.vector_action_space_size      # length of each action vector
states = env_info.vector_observations             # one observation row per agent
state_size = states.shape[1]                      # length of each observation

# Report what was found.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


## Main Loop with Random Actions

In [11]:
class ScoreKeeper():
    """Record episode scores and report progress towards a target average.

    Keeps the full score history in ``scores`` and the most recent
    ``WINDOW_LEN`` scores in ``scores_window``. The environment counts as
    solved once the window is full and its mean reaches ``TARGET_SCORE``.

    Args:
        target_score: Window average required to call the environment solved.
        window_len: Number of most recent episodes averaged together.
    """

    def __init__(self, target_score=30.0, window_len=100):
        self.TARGET_SCORE = target_score
        self.WINDOW_LEN = window_len

        self.scores = []
        self.scores_window = deque(maxlen=self.WINDOW_LEN)

    def update(self, score, i_episode):
        """Store the score of a finished episode and print progress.

        Args:
            score: Episode score. A scalar, or an array-like of per-agent
                scores, which is reduced to its mean.
            i_episode: 1-based index of the episode that just finished.

        Returns:
            bool: True if the solved criterion is met after this episode.
        """
        # Collapse per-agent scores to one number so it can be formatted
        # with ':.2f' and averaged over the window.
        score = float(np.mean(score))
        self.scores.append(score)
        self.scores_window.append(score)

        return self._check_solved(i_episode)

    def _check_solved(self, i_episode):
        """Print the latest score and return whether the target is reached.

        Args:
            i_episode: 1-based index of the episode that just finished.

        Returns:
            bool: True when the window is full and its mean is at least
            ``TARGET_SCORE``; False otherwise (including when no score has
            been recorded yet).
        """
        if not self.scores:
            # Nothing recorded yet: there is no score to print or average.
            return False

        window_mean = np.mean(self.scores_window)

        print(f'\rEpisode {i_episode}\t Score: {self.scores[-1]:.2f}', end='', flush=True)

        if i_episode >= self.WINDOW_LEN and i_episode % 10 == 0:
            print(f'\rEpisode {i_episode}\tAverage Score (over past {self.WINDOW_LEN} episodes): {window_mean:.2f}')

        # Only a full window is a meaningful average; this also keeps the
        # reported episode count (i_episode - WINDOW_LEN) from going negative.
        window_full = len(self.scores_window) == self.WINDOW_LEN
        if window_full and window_mean >= self.TARGET_SCORE:
            print(f'Environment solved in {i_episode-self.WINDOW_LEN} episodes!\tAverage Score: {window_mean:.2f}')
            return True

        return False

In [12]:
class DummyAgent():
    """Stand-in agent that acts randomly and learns nothing.

    It offers the methods the episode loop calls (``act``, ``step``,
    ``reset``) plus ``load_networks``; every method except ``act`` is a no-op.
    """

    def __init__(self):
        """Nothing to set up."""

    def act(self, state):
        """Return random actions, ignoring ``state``.

        Draws standard-normal noise of shape ``(num_agents, action_size)``
        (both read from the notebook's globals) and clips it to [-1, 1].
        """
        noise = np.random.randn(num_agents, action_size)
        return np.clip(noise, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Accept a transition and discard it."""

    def reset(self):
        """No per-episode state to reset."""

    def load_networks(self):
        """No networks to load."""

In [13]:
from collections import deque

from tqdm import tqdm


# Hyperparameters
max_t = 1000      # maximum number of environment steps per episode
n_episodes = 5    # number of episodes to roll out

# ------ Instantiations ------
# Agent
agent = DummyAgent()
# Scorekeeper
scorekeeper = ScoreKeeper()

# Run the episodes inside try/finally so env.close() is still reached when the
# loop is interrupted (e.g. KeyboardInterrupt) or raises.
try:
    for i_episode in tqdm(range(1, n_episodes+1)):
        # ------ Resets ------
        # Environment
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations
        # Agent
        agent.reset()
        # Per-agent running return for this episode
        score = np.zeros(num_agents)

        # Collect Episode (leave=False removes the inner bar once the episode ends,
        # so the output is not flooded with one bar per episode)
        for t in tqdm(range(max_t), leave=False):
            # Take Action
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state, reward, done = env_info.vector_observations, env_info.rewards, env_info.local_done

            # Update Environment and Agent
            agent.step(state, action, reward, next_state, done)
            state = next_state

            # Monitoring
            score += reward

            # Stop as soon as any agent reports the episode as finished.
            if np.any(done):
                break

        # Monitoring: record on the `scorekeeper` instance (not the class), passing
        # the mean over agents as the episode score together with the episode index.
        scorekeeper.update(np.mean(score), i_episode)
finally:
    env.close()

  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/1000 [00:00<?, ?it/s][A
  1%|▏         | 14/1000 [00:00<00:07, 137.29it/s][A
  3%|▎         | 31/1000 [00:12<06:18,  2.56it/s] [A
  0%|          | 0/5 [00:12<?, ?it/s]


KeyboardInterrupt: 