# Continuous Control

---


### Start the Environment

Run the next code cell to import the required packages. The first run might take a little while.

In [1]:
import numpy as np
import gym
import random
import torch
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

from ddpg_agent import Agent
from IPython.display import clear_output

In [2]:
from unityagents import UnityEnvironment
import numpy as np

# Launch the Unity Tennis environment from the local build.
# NOTE(review): the path pairs a "Windows" build directory with a macOS-style
# ".app" suffix — confirm it matches the build actually present on this machine.
env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.app")

# Use the first brain the environment exposes; later cells index the
# environment's reset/step results with `brain_name`.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
# Reset the environment (train_mode=False) and keep the info object for the
# default brain.
env_info = env.reset(train_mode=False)[brain_name]

# Number of agents reported by the environment.
num_agents = len(env_info.agents)

# Size of each agent's action vector, as declared by the brain.
action_size = brain.vector_action_space_size

# Examine the state space: `states` is indexed per agent along axis 0 later
# on, and axis 1 is taken as the per-agent observation length.
states = env_info.vector_observations
state_size = states.shape[1]

### Watch a random agent

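Run the code cell below to watch the agents act randomly. While they play, the cell stores every transition in a replay buffer, periodically calls the MADDPG update on sampled batches, and writes TensorBoard logs and model checkpoints.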

In [None]:
# main function that sets up environments
# perform training loop

from buffer import ReplayBuffer
from maddpg import MADDPG
import torch
import numpy as np
from tensorboardX import SummaryWriter
import os
from utilities import transpose_list, transpose_to_tensor

# keep training awake
#from workspace_utils import keep_awake

# Base of the per-episode schedule `1 - action_weight_growth**episode` that
# main() evaluates. With the value 1 that expression is 0 for every episode;
# 0.99 is the alternative value left here by the author.
action_weight_growth = 1 #0.99

def main():
    """Run the Tennis experience-collection / MADDPG update loop.

    Actions are sampled from a standard normal and clipped to [-1, 1]; the
    policy is not queried. Every transition is pushed into a replay buffer
    and, once the buffer holds more than one batch, ``maddpg.update`` is
    called for each of the two agents every ``episode_per_update`` episodes.

    Relies on the notebook-level globals ``env``, ``brain_name``,
    ``num_agents``, ``action_size`` and ``state_size``.

    Side effects:
        * seeds NumPy and PyTorch with 1;
        * writes TensorBoard scalars under ``<cwd>/log``;
        * writes checkpoints ``episode-<n>.pt`` under ``<cwd>/model_dir``;
        * closes ``env`` after the last episode.
    """
    # Local import so the block is self-contained within this cell.
    import progressbar as pb

    np.random.seed(1)
    torch.manual_seed(1)

    # number of parallel environments stepped per iteration
    parallel_envs = 1
    # number of training episodes.
    # change this to higher number to experiment. say 30000.
    number_of_episodes = 1000
    episode_length = 100
    batchsize = 1000
    # how many episodes between policy checkpoints
    save_interval = 1000
    # running count of environment steps taken
    t = 0

    # amplitude of OU noise; decays towards 0 each step.
    # NOTE: `noise` and `action_weight` are maintained but not consumed while
    # actions are sampled randomly below.
    noise = 2
    action_weight = 0
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.path.join(os.getcwd(), "log")
    model_dir = os.path.join(os.getcwd(), "model_dir")
    os.makedirs(model_dir, exist_ok=True)

    env_info = env.reset(train_mode=False)[brain_name]

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG(action_size, state_size)
    logger = SummaryWriter(logdir=log_path)
    agent0_reward = []
    agent1_reward = []

    # progress bar over episodes
    widget = ['episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
              pb.Percentage(), ' ', pb.ETA(), ' ',
              pb.Bar(marker=pb.RotatingMarker()), ' ']
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    try:
        for episode in range(number_of_episodes):
            action_weight = 1 - action_weight_growth ** episode
            timer.update(episode)

            # per-episode reward totals, one column per agent
            reward_this_episode = np.zeros((parallel_envs, 2))

            env_info = env.reset(train_mode=False)[brain_name]
            states = env_info.vector_observations
            obs = list(states)
            obs_full = states.flatten()

            # checkpoint on every save_interval-th episode and on the last one
            save_info = (episode % save_interval < parallel_envs
                         or episode == number_of_episodes - parallel_envs)

            for _ in range(episode_length):
                t += parallel_envs
                noise *= noise_reduction

                # select a random action for each agent, clipped to [-1, 1]
                actions_array = np.random.randn(num_agents, action_size)
                actions_array = np.clip(actions_array, -1, 1)

                # step forward one frame
                env_info = env.step(actions_array)[brain_name]

                # Build the next observation from the state returned by this
                # step (the original reused the pre-step `states`, so stored
                # transitions never advanced).
                next_states = env_info.vector_observations
                next_obs = list(next_states)
                next_obs_full = next_states.flatten()
                rewards = np.array(env_info.rewards)
                dones = np.array(env_info.local_done)

                # add data to buffer
                transition = [obs, obs_full, actions_array, rewards,
                              next_obs, next_obs_full, dones]
                buffer.push(transition)

                reward_this_episode += rewards

                obs, obs_full = next_obs, next_obs_full

            # update once after every episode_per_update
            if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
                for a_i in range(2):
                    samples = buffer.sample(batchsize)
                    maddpg.update(samples, a_i, logger)
                # soft update the target networks towards the actual networks
                maddpg.update_targets()

            for i in range(parallel_envs):
                agent0_reward.append(reward_this_episode[i, 0])
                agent1_reward.append(reward_this_episode[i, 1])

            # log the mean reward accumulated since the previous log point
            if episode % 100 == 0 or episode == number_of_episodes - 1:
                avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward)]
                agent0_reward = []
                agent1_reward = []
                for a_i, avg_rew in enumerate(avg_rewards):
                    logger.add_scalar('agent%i/mean_episode_rewards' % a_i, avg_rew, episode)

            # saving model
            if save_info:
                save_dict_list = []
                for i in range(2):
                    agent = maddpg.maddpg_agent[i]
                    save_dict_list.append({
                        'actor_params': agent.actor.state_dict(),
                        'actor_optim_params': agent.actor_optimizer.state_dict(),
                        'critic_params': agent.critic.state_dict(),
                        'critic_optim_params': agent.critic_optimizer.state_dict(),
                    })
                # write once, after both agents' state has been collected
                torch.save(save_dict_list,
                           os.path.join(model_dir, 'episode-{}.pt'.format(episode)))
    finally:
        # release the writer and progress bar even if the loop raised
        logger.close()
        timer.finish()

    env.close()

# Run the loop defined above; main() closes `env` itself when it completes.
main()

episode: 9/1000   0% ETA:  2:45:33 |                                         | 

In [None]:
# Shut down the Unity environment.
# NOTE(review): main() already calls env.close() on normal completion, so this
# cell is presumably only needed after an interrupted run — confirm that a
# second close is safe before running it unconditionally.
env.close()