In [2]:
import gym
from rl.algo.ppo import PPO
import os
import torch as th
import time
import numpy as np
from gym.spaces import Box

from rl.module.actor_critic import ActorCriticNetwork2
from rl.module.general import DummyExtractor

from rl.vecenv import vecenv
import metaworld
from rl.metaenv_wrapper import MetaWrapper

import torch.utils.tensorboard

#, critic_coef=1, ent_coef=0.01, pred_coef=0.0, gamma=0.99, epsilon=0.2, lamda=0.95, policy_epochs=4

alg_args = {
    'name' : 'lunar',
    'nenvs' : 16,
    'nsteps' : 256,
    'gamma' : 0.99,
    'train_steps' : 1e6,
    'minibatches' : 256,
    'lr' : 3e-4,
}


def train(model, env, log_interval=0.01, save_interval=0.1):
    prevtime = time.time()
    saves = 1
    
    next_log = log_interval
    next_save = save_interval
    while next_log <= model.progress:
        next_log += log_interval
    while next_save <= model.progress:
        next_save += save_interval


    while os.path.isdir(f'logs/{model.name}/valid/{saves}'):
        saves += 1

    while model.progress < 1:
        model.train_epoch(env)
        if model.progress >= next_log:
            next_log += log_interval
            avg = np.mean(env.recent_scores)
            
            print(f"Average score:\t{round(avg,3)}")
            print(f"progress:\t{round(model.progress * 100, 2)}%")
            currtime = time.time()
            time_passed = currtime - prevtime
            print(f"elapsed time:\t{round(time_passed, 3)} second")
            print(f"time left:\t{round(time_passed*(1-model.progress)/log_interval/3600, 3)} hour")
            prevtime = currtime
            model.write_log('Average_score', avg)
            print('-----------------------------------------------------------')


if __name__ == '__main__':
    device = th.device('cuda' if th.cuda.is_available() else 'cpu')
    istrain = True
    
    env_name = 'LunarLander-v2'

    env = gym.make(env_name)

    extractor = DummyExtractor(env.observation_space.shape[0])


    if isinstance(env.action_space, Box):
        action_type = 'continuous'
        network = ActorCriticNetwork2(action_type, env.action_space.shape[0], extractor)
    else:
        action_type = 'discrete'
        network = ActorCriticNetwork2(action_type, env.action_space.n, extractor)

    

    envs = vecenv(alg_args['nenvs'], env_name, {})

    model = PPO(device, network, **alg_args)
    
    if istrain:
        train(model, envs, save_interval=1)
    else:
        #env = Monitor(env, f'./logs/{model.name}/video', force=True)
        state = env.reset()
        done = False
        while not done:
            action = model.model.get_action(th.as_tensor(state[None], dtype=th.float32).to(device))
            state, reward, done, info = env.step(action[0])

        env.close()
        

Average score:	-151.379
progress:	1.23%
elapsed time:	38.189 second
time left:	1.048 hour
-----------------------------------------------------------
Average score:	-95.366
progress:	2.05%
elapsed time:	25.835 second
time left:	0.703 hour
-----------------------------------------------------------
Average score:	-54.024
progress:	3.28%
elapsed time:	41.479 second
time left:	1.114 hour
-----------------------------------------------------------
Average score:	-43.85
progress:	4.1%
elapsed time:	38.489 second
time left:	1.025 hour
-----------------------------------------------------------
Average score:	-24.889
progress:	5.32%
elapsed time:	61.112 second
time left:	1.607 hour
-----------------------------------------------------------
Average score:	-5.258
progress:	6.14%
elapsed time:	39.942 second
time left:	1.041 hour
-----------------------------------------------------------
Average score:	15.252
progress:	7.37%
elapsed time:	66.086 second
time left:	1.7 hour
----------------------

  self.write_log('avg_score', self.reward_log / self.cnt_log)


avg_score is NaN. -inf
Average score:	181.153
progress:	25.4%
elapsed time:	52.657 second
time left:	1.091 hour
-----------------------------------------------------------
Average score:	178.02
progress:	26.21%
elapsed time:	37.496 second
time left:	0.769 hour
-----------------------------------------------------------
avg_score is NaN. -inf
Average score:	174.946
progress:	27.03%
elapsed time:	40.733 second
time left:	0.826 hour
-----------------------------------------------------------
avg_score is NaN. -inf
Average score:	163.808
progress:	28.26%
elapsed time:	60.696 second
time left:	1.21 hour
-----------------------------------------------------------
avg_score is NaN. inf
Average score:	160.914
progress:	29.08%
elapsed time:	41.167 second
time left:	0.811 hour
-----------------------------------------------------------
Average score:	158.297
progress:	30.31%
elapsed time:	55.822 second
time left:	1.081 hour
-----------------------------------------------------------
Average scor