## Watch a Soft-Actor-Critic Agent! 

### 1.Start the Environment for Trained Agent

In [1]:
import numpy as np
import torch
import gym
from gym import wrappers as w
import pybullet_envs
import time
from  collections  import deque
from sac_agent import soft_actor_critic_agent

env = gym.make('Walker2DBulletEnv-v0', render=True)
env = w.monitor.Monitor(env, directory='./videos/ep_6900') 

# Set seeds
seed = 0 ## 12345
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device: ', device)

state_size = env.observation_space.shape[0]
action_size=env.action_space.shape[0]
action_high= float(env.action_space.high[0])
print('state_size: ', state_size, ', action_size: ', action_size, ', action_high: ', action_high)
    
agent = soft_actor_critic_agent(env.observation_space.shape[0], env.action_space, \
        device=device, hidden_size=256, seed=seed, lr, gamma=0.99, tau=0.005, alpha=0.2)

device:  cuda:0
state_size:  22 , action_size:  6 , action_high:  1.0


### 2. Prepare Load

In [2]:
def load(agent, actor, critic):
    print('Load model from {} and {}'.format(actor, critic))
    agent.policy.load_state_dict(torch.load(actor))
    agent.critic.load_state_dict(torch.load(critic))


### 3. Prepare Player

In [3]:
def play(env, agent, num_episodes):
    
    state = env.reset()
    scores_deque = deque(maxlen=100)
    scores = []
    
    for i_episode in range(num_episodes + 1):
        
        state = env.reset()
        score = 0                    
        time_start = time.time()
        
        while True:
            
            ## action = agent.select_action(np.array(state)) ## TD3
            action = agent.select_action(state, eval=True)
            env.render()
            next_state, reward, done, _ = env.step(action)
            score += reward 
            state = next_state
    
            if done:
                break
                
        s = (int)(time.time() - time_start)
        
        scores_deque.append(score)
        scores.append(score)    
        
        print('Episode {}\tAverage Score: {:.2f},\tScore: {:.2f} \tTime: {:02}:{:02}:{:02}'\
                  .format(i_episode, np.mean(scores_deque), score, s//3600, s%3600//60, s%60)) 
    

### 4. Load and Play, Walker2DBulletEnv-SAC, ep.=6900, score= 2529

In [6]:
dir = 'dir_chk_lr0.0003'

actor =  dir + '\weights_actor_final_2551.35.pth'    
critic = dir + '\weights_actor_critic_2551.35.pth'

load(agent, actor, critic)
play(env, agent, num_episodes=3)

Load model from dir_chk_lr0.0003\weights_actor_6900_2528.92.pth and dir_chk_lr0.0003\weights_critic_6900_2528.92.pth
Episode 0	Average Score: 2534.17,	Score: 2534.17 	Time: 00:00:58
Episode 1	Average Score: 2533.00,	Score: 2531.83 	Time: 00:00:21
Episode 2	Average Score: 2534.38,	Score: 2537.14 	Time: 00:00:18
Episode 3	Average Score: 2533.65,	Score: 2531.46 	Time: 00:00:17


In [None]:
env.close()