## Watch a Smart Agent!

### 1. Start the Environment for the Trained Agent

In [1]:
import numpy as np
import torch
import gym
import argparse
import os
import time
import pybullet
import pybullet_envs


from TwinDelayed import TD3, device

# Connect to pybullet in DIRECT mode, then create the Walker2D environment
# with rendering requested.
pybullet.connect(pybullet.DIRECT)
env = gym.make('Walker2DBulletEnv-v0', render=True)

# Seed the environment, PyTorch and NumPy with one shared value.
seed = 12345
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Read the problem dimensions off the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
action_high = float(env.action_space.high[0])
print(
    'state_size: ', state_size,
    ', action_size: ', action_size,
    ', action_high: ', action_high,
)

# Build the TD3 agent sized to this environment.
agent = TD3(
    state_dim=state_size,
    action_dim=action_size,
    max_action=action_high,
)


device:  cuda
state_size:  22 , action_size:  6 , action_high:  1.0


### 2. Prepare the Checkpoint Loader

In [2]:
def load(agent, dir, prefix, map_location='cpu'):
    """Restore the four networks of ``agent`` from saved state dicts.

    The files read are ``<prefix>_actor.pth``, ``<prefix>_critic.pth``,
    ``<prefix>_actor_t.pth`` and ``<prefix>_critic_t.pth`` inside ``dir``.

    Args:
        agent: Object exposing ``actor``, ``critic``, ``actor_target`` and
            ``critic_target``, each with a ``load_state_dict`` method.
        dir: Directory containing the checkpoint files. (The name shadows
            the ``dir`` builtin; it is kept so keyword callers keep working.)
        prefix: Common file-name prefix of the checkpoint files.
        map_location: Forwarded to ``torch.load``. Defaults to ``'cpu'`` so
            that a checkpoint written on a CUDA machine can still be opened
            on a CPU-only one; ``load_state_dict`` then copies the values
            into each network's existing parameters.
    """
    # Pair every network with the file-name suffix its weights were saved under.
    checkpoints = (
        (agent.actor, 'actor'),
        (agent.critic, 'critic'),
        (agent.actor_target, 'actor_t'),
        (agent.critic_target, 'critic_t'),
    )
    for network, suffix in checkpoints:
        path = os.path.join(dir, '%s_%s.pth' % (prefix, suffix))
        network.load_state_dict(torch.load(path, map_location=map_location))


### 3. Prepare Player

In [3]:
from collections import deque
import os

# NOTE(review): not referenced by any code visible in this notebook — confirm
# whether it is still needed.
std_noise = 0.1

def play(env, agent, n_episodes, step_delay=0.01):
    """Run ``agent`` in ``env`` for ``n_episodes`` rendered episodes.

    Each episode resets the environment and steps it with the action
    returned by ``agent.select_action`` until the environment reports
    ``done``. One summary line is printed per episode with the running
    average over the last 100 episodes, the episode score and the elapsed
    wall-clock time.

    Args:
        env: Environment exposing ``reset()``, ``render()`` and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        agent: Object exposing ``select_action(state)``.
        n_episodes: Number of episodes to play.
        step_delay: Seconds to sleep after each rendered frame.

    Returns:
        list: The total reward of every episode, in play order.
    """
    recent_scores = deque(maxlen=100)  # window used for the running average
    scores = []

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        done = False
        time_start = time.time()

        while not done:
            action = agent.select_action(np.array(state))
            env.render()
            time.sleep(step_delay)

            state, reward, done, _ = env.step(action)
            score += reward

        elapsed = int(time.time() - time_start)
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)

        recent_scores.append(score)
        scores.append(score)

        print('Episode {}\tAverage Score: {:.2f},\tScore: {:.2f} \tTime: {:02}:{:02}:{:02}'
              .format(i_episode, np.mean(recent_scores), score, hours, minutes, seconds))

    return scores


### 4. Load and Play

In [5]:
# Restore the saved networks, then run ten episodes with the loaded agent.
checkpoint_dir = 'dir_Walker2D_002'
checkpoint_prefix = 'chpnt_ts2500'
load(agent, checkpoint_dir, checkpoint_prefix)
play(env, agent, n_episodes=10)

Episode 1	Average Score: 2564.43,	Score: 2564.43 	Time: 00:00:58
Episode 2	Average Score: 2535.87,	Score: 2507.31 	Time: 00:00:19
Episode 3	Average Score: 2538.75,	Score: 2544.52 	Time: 00:00:19
Episode 4	Average Score: 2523.26,	Score: 2476.79 	Time: 00:00:19
Episode 5	Average Score: 2532.37,	Score: 2568.83 	Time: 00:00:19
Episode 6	Average Score: 2534.58,	Score: 2545.62 	Time: 00:00:19
Episode 7	Average Score: 2221.45,	Score: 342.68 	Time: 00:00:02
Episode 8	Average Score: 2261.54,	Score: 2542.16 	Time: 00:00:56
Episode 9	Average Score: 2293.57,	Score: 2549.84 	Time: 00:00:19
Episode 10	Average Score: 2318.17,	Score: 2539.56 	Time: 00:00:19


In [6]:
# Close the environment now that playback has finished.
env.close()