In [1]:
from unityagents import UnityEnvironment
import numpy as np

# Initial Environment Exploration

In [2]:
# Start the Reacher build with graphics disabled (no_graphics=True).
env = UnityEnvironment(
    file_name='./Reacher_Linux/Reacher.x86_64',
    no_graphics=True,
)

# The first listed brain is used as the default one; keep its name and handle.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Begin a fresh episode in training mode and keep the info for our brain.
env_info = env.reset(train_mode=True)[brain_name]

# Collect the problem dimensions first ...
num_agents = len(env_info.agents)                 # one entry per agent
action_size = brain.vector_action_space_size      # length of each action vector
states = env_info.vector_observations             # indexed as [agent, feature]
state_size = states.shape[1]                      # length of one agent's observation

# ... then report them.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 20
Size of each action: 4
There are 20 agents. Each observes a state with length: 33
The state for the first agent looks like: [ 0.00000000e+00 -4.00000000e+00  0.00000000e+00  1.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -1.00000000e+01  0.00000000e+00
  1.00000000e+00 -0.00000000e+00 -0.00000000e+00 -4.37113883e-08
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  5.75471878e+00 -1.00000000e+00
  5.55726624e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
 -1.68164849e-01]


## Main Loop with Random Actions

In [27]:
class ScoreKeeper():
    """Accumulate per-agent rewards and report progress toward a target score.

    Within an episode, rewards are summed per agent in ``curr_score``. At the
    end of an episode the mean over agents becomes that episode's score, which
    is appended to ``scores`` (full history) and ``scores_window`` (the most
    recent ``window_len`` episodes).

    Args:
        num_agents: Number of agents; length of the per-agent running totals.
        target_score: Windowed mean score at which the task counts as solved.
        window_len: Number of most recent episodes averaged for the check.
    """

    def __init__(self, num_agents=20, target_score=30.0, window_len=100):
        self.NUM_AGENTS = num_agents
        self.TARGET_SCORE = target_score
        self.WINDOW_LEN = window_len

        self.scores = []
        self.scores_window = deque(maxlen=self.WINDOW_LEN)

        # Create the running totals right away so update_timestep() works on a
        # fresh instance without requiring an explicit reset() first.
        self.reset()

    def reset(self):
        """Zero the per-agent running totals for a new episode."""
        self.curr_score = np.zeros(self.NUM_AGENTS)

    def update_timestep(self, rewards):
        """Add one timestep's rewards (one value per agent) to the totals."""
        self.curr_score += rewards

    def update_episode(self, i_episode):
        """Close the current episode: record its score, report, and reset.

        Args:
            i_episode: Index of the episode that just finished.

        Returns:
            bool: True when the windowed mean has reached the target score
            over a full window, False otherwise.
        """
        score = np.mean(self.curr_score)
        self.scores.append(score)
        self.scores_window.append(score)

        solved = self._check_solved(i_episode)

        self.reset()
        return solved

    def _check_solved(self, i_episode):
        """Print progress and return whether the target has been reached."""
        print(f'\rEpisode {i_episode}\t Score: {self.scores[-1]:.2f}', end='', flush=True)

        window_mean = np.mean(self.scores_window)
        # The average is only meaningful once the window holds WINDOW_LEN
        # episodes; otherwise a single lucky early episode would count as
        # "solved" and report a negative episode count.
        window_full = len(self.scores_window) >= self.WINDOW_LEN

        if i_episode >= self.WINDOW_LEN and i_episode % 10 == 0:
            print(f'\rEpisode {i_episode}\tAverage Score (over past {self.WINDOW_LEN} episodes): {window_mean:.2f}')

        if window_full and window_mean >= self.TARGET_SCORE:
            print(f'Environment solved in {i_episode-self.WINDOW_LEN} episodes!\tAverage Score: {window_mean:.2f}')
            return True

        return False

In [28]:
class DummyAgent():
    """Placeholder agent that ignores observations and acts randomly.

    It exposes the same method names the training loop calls (``act``,
    ``step``, ``reset``, ``load_networks``); everything except ``act`` is a
    no-op.

    Args:
        num_agents: Number of action rows to produce. When None, the
            notebook-level ``num_agents`` global is used.
        action_size: Number of values per action. When None, the
            notebook-level ``action_size`` global is used.
    """

    def __init__(self, num_agents=None, action_size=None):
        self.num_agents = num_agents
        self.action_size = action_size

    def act(self, state):
        """Return random actions of shape (num_agents, action_size).

        Values are standard-normal samples clipped to [-1, 1]. ``state`` is
        accepted for interface compatibility and is not used.
        """
        # Fall back to the notebook globals so that DummyAgent() keeps working
        # exactly as before when no sizes are given.
        rows = num_agents if self.num_agents is None else self.num_agents
        cols = action_size if self.action_size is None else self.action_size

        actions = np.random.randn(rows, cols)
        return np.clip(actions, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Accept a transition and do nothing with it."""
        pass

    def reset(self):
        """No per-episode state to reset."""
        pass

    def load_networks(self):
        """No networks to load."""
        pass

In [None]:
from utils import ScoreKeeper


In [29]:
from collections import deque

from tqdm import tqdm


# Hyperparameters
max_t = 100        # upper bound on timesteps per episode
n_episodes = 5     # number of episodes to roll out

# ------ Instantiations ------
# Agent
agent = DummyAgent()
# Scorekeeper: sized from the environment instead of the class default so the
# per-agent totals always match the length of the reward list.
scorekeeper = ScoreKeeper(num_agents=num_agents)

# try/finally makes sure the environment is closed even if an episode raises
# or the cell is interrupted part-way through.
try:
    for i_episode in tqdm(range(1, n_episodes+1)):
        # ------ Resets ------
        # Environment
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations
        # Agent
        agent.reset()
        # Scorekeeper
        scorekeeper.reset()

        # Collect Episode
        # leave=False removes the finished per-timestep bar so the output is
        # not flooded with one completed bar per episode.
        for t in tqdm(range(max_t), leave=False):
            # Take Action
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state, reward, done = env_info.vector_observations, env_info.rewards, env_info.local_done

            # Update Environment and Agent
            agent.step(state, action, reward, next_state, done)
            state = next_state

            # Monitoring
            scorekeeper.update_timestep(reward)

            # Stop the episode as soon as any agent reports done.
            if np.any(done):
                break

        # Monitoring
        scorekeeper.update_episode(i_episode)
finally:
    env.close()

  0%|          | 0/5 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:07, 13.93it/s][A
  4%|▍         | 4/100 [00:00<00:07, 12.63it/s][A
  6%|▌         | 6/100 [00:00<00:07, 12.48it/s][A
  8%|▊         | 8/100 [00:00<00:07, 12.80it/s][A
 10%|█         | 10/100 [00:00<00:07, 12.50it/s][A
 12%|█▏        | 12/100 [00:00<00:06, 12.84it/s][A
 14%|█▍        | 14/100 [00:01<00:06, 12.59it/s][A
 16%|█▌        | 16/100 [00:01<00:06, 12.42it/s][A
 18%|█▊        | 18/100 [00:01<00:06, 12.69it/s][A
 20%|██        | 20/100 [00:01<00:06, 12.51it/s][A
 22%|██▏       | 22/100 [00:01<00:06, 12.78it/s][A
 24%|██▍       | 24/100 [00:01<00:06, 12.56it/s][A
 26%|██▌       | 26/100 [00:02<00:05, 12.41it/s][A
 28%|██▊       | 28/100 [00:02<00:05, 12.68it/s][A
 30%|███       | 30/100 [00:02<00:05, 12.49it/s][A
 32%|███▏      | 32/100 [00:02<00:05, 12.74it/s][A
 34%|███▍      | 34/100 [00:02<00:05, 12.54it/s][A
 36%|███▌      | 36/100 [00:02<00:05, 1

Episode 1	 Score: 0.01


 20%|██        | 1/5 [00:07<00:31,  7.97s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:07, 12.47it/s][A
  4%|▍         | 4/100 [00:00<00:07, 12.98it/s][A
  6%|▌         | 6/100 [00:00<00:07, 12.56it/s][A
  8%|▊         | 8/100 [00:00<00:07, 12.90it/s][A
 10%|█         | 10/100 [00:00<00:07, 12.55it/s][A
 12%|█▏        | 12/100 [00:00<00:07, 12.38it/s][A
 14%|█▍        | 14/100 [00:01<00:06, 12.73it/s][A
 16%|█▌        | 16/100 [00:01<00:06, 12.50it/s][A
 18%|█▊        | 18/100 [00:01<00:06, 12.79it/s][A
 20%|██        | 20/100 [00:01<00:06, 12.56it/s][A
 22%|██▏       | 22/100 [00:01<00:06, 12.40it/s][A
 24%|██▍       | 24/100 [00:01<00:05, 12.68it/s][A
 26%|██▌       | 26/100 [00:02<00:05, 12.47it/s][A
 28%|██▊       | 28/100 [00:02<00:05, 12.74it/s][A
 30%|███       | 30/100 [00:02<00:05, 12.55it/s][A
 32%|███▏      | 32/100 [00:02<00:05, 12.41it/s][A
 34%|███▍      | 34/100 [00:02<00:05, 12.68it/s][A
 36%|███▌      | 36/100 [00:02

Episode 2	 Score: 0.00


 40%|████      | 2/5 [00:15<00:23,  7.97s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:07, 12.33it/s][A
  4%|▍         | 4/100 [00:00<00:07, 13.04it/s][A
  6%|▌         | 6/100 [00:00<00:07, 12.58it/s][A
  8%|▊         | 8/100 [00:00<00:07, 12.32it/s][A
 10%|█         | 10/100 [00:00<00:07, 12.76it/s][A
 12%|█▏        | 12/100 [00:00<00:07, 12.50it/s][A
 14%|█▍        | 14/100 [00:01<00:06, 12.75it/s][A
 16%|█▌        | 16/100 [00:01<00:06, 12.58it/s][A
 18%|█▊        | 18/100 [00:01<00:06, 12.38it/s][A
 20%|██        | 20/100 [00:01<00:06, 12.67it/s][A
 22%|██▏       | 22/100 [00:01<00:06, 12.49it/s][A
 24%|██▍       | 24/100 [00:01<00:05, 12.78it/s][A
 26%|██▌       | 26/100 [00:02<00:05, 12.56it/s][A
 28%|██▊       | 28/100 [00:02<00:05, 12.40it/s][A
 30%|███       | 30/100 [00:02<00:05, 12.70it/s][A
 32%|███▏      | 32/100 [00:02<00:05, 12.50it/s][A
 34%|███▍      | 34/100 [00:02<00:05, 12.78it/s][A
 36%|███▌      | 36/100 [00:02

Episode 3	 Score: 0.00


 60%|██████    | 3/5 [00:23<00:15,  7.97s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:07, 12.45it/s][A
  4%|▍         | 4/100 [00:00<00:07, 12.23it/s][A
  6%|▌         | 6/100 [00:00<00:07, 12.75it/s][A
  8%|▊         | 8/100 [00:00<00:07, 12.47it/s][A
 10%|█         | 10/100 [00:00<00:07, 12.78it/s][A
 12%|█▏        | 12/100 [00:00<00:07, 12.55it/s][A
 14%|█▍        | 14/100 [00:01<00:06, 12.38it/s][A
 16%|█▌        | 16/100 [00:01<00:06, 12.69it/s][A
 18%|█▊        | 18/100 [00:01<00:06, 12.48it/s][A
 20%|██        | 20/100 [00:01<00:06, 12.77it/s][A
 22%|██▏       | 22/100 [00:01<00:06, 12.54it/s][A
 24%|██▍       | 24/100 [00:01<00:06, 12.37it/s][A
 26%|██▌       | 26/100 [00:02<00:05, 12.69it/s][A
 28%|██▊       | 28/100 [00:02<00:05, 12.49it/s][A
 30%|███       | 30/100 [00:02<00:05, 12.76it/s][A
 32%|███▏      | 32/100 [00:02<00:05, 12.51it/s][A
 34%|███▍      | 34/100 [00:02<00:05, 12.42it/s][A
 36%|███▌      | 36/100 [00:02

Episode 4	 Score: 0.01


 80%|████████  | 4/5 [00:31<00:07,  7.97s/it]
  0%|          | 0/100 [00:00<?, ?it/s][A
  2%|▏         | 2/100 [00:00<00:07, 13.87it/s][A
  4%|▍         | 4/100 [00:00<00:07, 12.75it/s][A
  6%|▌         | 6/100 [00:00<00:07, 13.00it/s][A
  8%|▊         | 8/100 [00:00<00:07, 12.64it/s][A
 10%|█         | 10/100 [00:00<00:07, 12.41it/s][A
 12%|█▏        | 12/100 [00:00<00:06, 12.75it/s][A
 14%|█▍        | 14/100 [00:01<00:06, 12.53it/s][A
 16%|█▌        | 16/100 [00:01<00:06, 12.76it/s][A
 18%|█▊        | 18/100 [00:01<00:06, 12.57it/s][A
 20%|██        | 20/100 [00:01<00:06, 12.31it/s][A
 22%|██▏       | 22/100 [00:01<00:06, 12.61it/s][A
 24%|██▍       | 24/100 [00:01<00:06, 12.46it/s][A
 26%|██▌       | 26/100 [00:02<00:05, 12.72it/s][A
 28%|██▊       | 28/100 [00:02<00:05, 12.51it/s][A
 30%|███       | 30/100 [00:02<00:05, 12.41it/s][A
 32%|███▏      | 32/100 [00:02<00:05, 12.69it/s][A
 34%|███▍      | 34/100 [00:02<00:05, 12.49it/s][A
 36%|███▌      | 36/100 [00:02

Episode 5	 Score: 0.00


100%|██████████| 5/5 [00:39<00:00,  7.97s/it]
