In [None]:
import gym
import torch.optim as optim

from planet.models.determinstic_state import DeterministicStateModel
from planet.models.stochastic_state import StochasticStateModel
from planet.models.observation import ObservationModel
from planet.models.reward import RewardModel
from planet.models.encoder import EncoderModel
from planet.trainer import train

from planet.utils.wrappers import RepeatAction
from planet.utils.seed import set_seed


# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to the project's .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Fix the random seed up front so repeated runs are comparable.
# Which RNGs get seeded is decided by planet.utils.seed.set_seed.
SEED = 0
set_seed(SEED)

In [None]:
# Build the environment used for training: continuous-control LunarLander,
# wrapped in the project's RepeatAction wrapper.
# Other candidate environments, kept for reference:
#   gym.make("BipedalWalker-v3", hardcore=False)
#   gym.make("Pendulum-v1")
base_env = gym.make("LunarLander-v2", continuous=True)
env = RepeatAction(base_env)

In [None]:
# Probe the environment once: take the initial observation from a reset and
# draw one sample action from the action space.
observation, info = env.reset()
action = env.action_space.sample()

# Both sizes are the leading dimension of the concrete sample obtained above.
observation_size, action_size = observation.shape[0], action.shape[0]

# Sizes shared by every model built below:
#   state_size        -> passed as `state_size`
#   hidden_state_size -> passed as `hidden_state_size`
#   hidden_layer_size -> passed as `hidden_layer_size`
state_size, hidden_state_size, hidden_layer_size = 30, 200, 300

In [None]:
# Instantiate the DeterministicStateModel from the shared sizes, then move it
# to the GPU. NOTE(review): .cuda() requires a CUDA-capable device.
det_state_model = DeterministicStateModel(
    state_size=state_size,
    action_size=action_size,
    hidden_state_size=hidden_state_size,
)
det_state_model = det_state_model.cuda()

In [None]:
# Instantiate the StochasticStateModel from the shared sizes, then move it to
# the GPU. NOTE(review): .cuda() requires a CUDA-capable device.
stoch_state_model = StochasticStateModel(
    state_size=state_size,
    hidden_state_size=hidden_state_size,
    hidden_layer_size=hidden_layer_size,
)
stoch_state_model = stoch_state_model.cuda()

In [None]:
# Instantiate the ObservationModel from the shared sizes plus the environment's
# observation size, then move it to the GPU.
# NOTE(review): .cuda() requires a CUDA-capable device.
obs_model = ObservationModel(
    observation_size=observation_size,
    state_size=state_size,
    hidden_state_size=hidden_state_size,
    hidden_layer_size=hidden_layer_size,
)
obs_model = obs_model.cuda()

In [None]:
# Instantiate the RewardModel from the shared sizes, then move it to the GPU.
# NOTE(review): .cuda() requires a CUDA-capable device.
reward_obs_model = RewardModel(
    state_size=state_size,
    hidden_state_size=hidden_state_size,
    hidden_layer_size=hidden_layer_size,
)
reward_obs_model = reward_obs_model.cuda()

In [None]:
# Instantiate the EncoderModel from the shared sizes plus the environment's
# observation size, then move it to the GPU.
# NOTE(review): .cuda() requires a CUDA-capable device.
enc_model = EncoderModel(
    observation_size=observation_size,
    state_size=state_size,
    hidden_state_size=hidden_state_size,
    hidden_layer_size=hidden_layer_size,
)
enc_model = enc_model.cuda()

In [None]:
# Registry of every model, keyed by the name that is handed to `train`.
# The key strings mirror the variable names of the instances built above.
models = dict(
    det_state_model=det_state_model,
    stoch_state_model=stoch_state_model,
    obs_model=obs_model,
    reward_obs_model=reward_obs_model,
    enc_model=enc_model,
)


# Adam settings shared by all optimizers: learning rate and epsilon.
lr, eps = 1e-3, 1e-4
# One Adam optimizer per model, all sharing the same `lr` and `eps`.
# Built from `models` so the two dicts always have identical keys (and order);
# adding or renaming a model only has to be done in one place.
optimizers = {
    name: optim.Adam(model.parameters(), lr=lr, eps=eps)
    for name, model in models.items()
}



In [13]:
# Scalar hyperparameters forwarded verbatim to planet.trainer.train.
# NOTE(review): the single-letter names presumably follow the PlaNet paper's
# notation (S seed episodes, C collect interval, B batch size, L chunk length,
# H planning horizon, I optimization iterations, J candidates per iteration,
# K top candidates kept; T likely the episode step limit) — TODO confirm
# against the signature/docstring of `train`.
hyperparams = dict(
    train_steps=10_000,
    T=1000,
    S=5,
    C=100,
    B=50,
    L=50,
    H=12,
    I=10,
    J=1000,
    K=100,
)

# Launch training with the environment, models and optimizers built above;
# progress is logged every step (log_interval=1).
train(
    env=env,
    models=models,
    optimizers=optimizers,
    hidden_state_size=hidden_state_size,
    state_size=state_size,
    action_size=action_size,
    log_interval=1,
    **hyperparams,
)