## Watch a Smart Agent!

### 1. Start the Environment for the Trained Agent

In [1]:
import numpy as np
import torch
import gym
import argparse
import os
import time

from TwinDelay import TD3

# Build the walker environment in which the trained agent will be shown.
env = gym.make('BipedalWalker-v2')

# Seed NumPy, PyTorch and the environment with one shared value.
seed = 12345
np.random.seed(seed)
torch.manual_seed(seed)
env.seed(seed)

# Read the problem dimensions and the action bound from the environment's spaces.
action_space = env.action_space
state_size = env.observation_space.shape[0]
action_size = action_space.shape[0]
action_high = float(action_space.high[0])
print('state_size: ', state_size, ', action_size: ', action_size, ', action_high: ', action_high)

# Construct the TD3 agent sized for this environment.
agent = TD3(
    state_dim=state_size,
    action_dim=action_size,
    max_action=action_high,
)


device:  cuda
state_size:  24 , action_size:  4 , action_high:  1.0


### 2. Prepare the Checkpoint Loader

In [2]:
def load(agent, dir, prefix):
    """Restore the agent's four networks from checkpoint files in ``dir``.

    The files read are ``<prefix>_actor.pth``, ``<prefix>_critic.pth``,
    ``<prefix>_actor_t.pth`` and ``<prefix>_critic_t.pth``; they are loaded
    into ``agent.actor``, ``agent.critic``, ``agent.actor_target`` and
    ``agent.critic_target`` respectively.

    Each checkpoint is mapped onto the device that already holds the
    receiving network's parameters, so weights saved on a GPU can also be
    restored on a CPU-only machine (and vice versa).

    Args:
        agent: object exposing the four networks listed above.
        dir: directory that contains the checkpoint files.
        prefix: common filename prefix of the checkpoint files.
    """
    def _restore(network, filename):
        # Use the network's current device as the load target; if the network
        # has no parameters, fall back to torch.load's default behaviour.
        first_param = next(iter(network.parameters()), None)
        target = first_param.device if first_param is not None else None
        state = torch.load(os.path.join(dir, filename), map_location=target)
        network.load_state_dict(state)

    _restore(agent.actor, '%s_actor.pth' % prefix)
    _restore(agent.critic, '%s_critic.pth' % prefix)
    _restore(agent.actor_target, '%s_actor_t.pth' % prefix)
    _restore(agent.critic_target, '%s_critic_t.pth' % prefix)


### 3. Prepare Player

In [3]:
from collections import deque
import os

def play(env, agent, n_episodes):
    """Render ``n_episodes`` episodes of ``agent`` acting in ``env``.

    After each episode one line is printed with the episode number, the mean
    score over the most recent (at most 100) episodes, the episode's own
    score and the wall-clock duration as HH:MM:SS. Nothing is returned.

    Args:
        env: environment providing ``reset()``, ``render()`` and ``step(action)``.
        agent: object providing ``select_action(state)``.
        n_episodes: number of episodes to play.
    """
    # Initial reset before the loop; every episode below resets again.
    env.reset()

    recent_scores = deque(maxlen=100)
    scores = []
    report = 'Episode {}\tAverage Score: {:.2f},\tScore: {:.2f} \tTime: {:02}:{:02}:{:02}'

    for i_episode in range(1, n_episodes + 1):
        observation = env.reset()
        episode_score = 0
        started_at = time.time()

        finished = False
        while not finished:
            chosen_action = agent.select_action(np.array(observation))
            env.render()
            observation, reward, finished, _ = env.step(chosen_action)
            episode_score += reward

        # Whole seconds spent on this episode, split into h / m / s.
        elapsed = int(time.time() - started_at)
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)

        recent_scores.append(episode_score)
        scores.append(episode_score)

        print(report.format(i_episode, np.mean(recent_scores), episode_score,
                            hours, minutes, seconds))


### 3. Load and Play: Score = 293

In [5]:
# Restore the networks saved under the 'checkpoint_293' prefix, then render five episodes.
load(agent, dir='dir_chk', prefix='checkpoint_293')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 293.95,	Score: 293.95 	Time: 00:00:18
Episode 2	Average Score: 294.16,	Score: 294.37 	Time: 00:00:12
Episode 3	Average Score: 239.93,	Score: 131.47 	Time: 00:00:10
Episode 4	Average Score: 253.46,	Score: 294.04 	Time: 00:00:12
Episode 5	Average Score: 261.49,	Score: 293.63 	Time: 00:00:12


### 4. Load and Play: Score = 300.5,  training time = 9h 44m

In [6]:
# Restore the networks saved under the 'chpnt_88seed_300-5sc_9h44m' prefix, then render five episodes.
load(agent, dir='dir_chk', prefix='chpnt_88seed_300-5sc_9h44m')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 304.02,	Score: 304.02 	Time: 00:00:12
Episode 2	Average Score: 304.14,	Score: 304.26 	Time: 00:00:12
Episode 3	Average Score: 303.97,	Score: 303.62 	Time: 00:00:12
Episode 4	Average Score: 304.11,	Score: 304.54 	Time: 00:00:12
Episode 5	Average Score: 304.08,	Score: 303.94 	Time: 00:00:12


### 5. Load and Play: Score = 306

In [8]:
# Restore the networks saved under the 'checkpoint_306' prefix, then render five episodes.
load(agent, dir='dir_chk', prefix='checkpoint_306')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 304.04,	Score: 304.04 	Time: 00:00:12
Episode 2	Average Score: 305.21,	Score: 306.38 	Time: 00:00:11
Episode 3	Average Score: 305.97,	Score: 307.49 	Time: 00:00:11
Episode 4	Average Score: 306.36,	Score: 307.53 	Time: 00:00:11
Episode 5	Average Score: 306.41,	Score: 306.58 	Time: 00:00:11


In [9]:
# Close the environment now that all playback cells have run.
env.close()