In [None]:
import gym

import torch
import torch.optim as optim

from planet.dataset.buffer import SequenceBuffer
from planet.dataset.env_objects import EnvStep

from planet.models.determinstic_state import DeterministicStateModel
from planet.models.stochastic_state import StochasticStateModel
from planet.models.observation import ObservationModel
from planet.models.reward import RewardModel
from planet.models.encoder import EncoderModel

from planet.trainer import train
from planet.utils.sample import init_buffer, sample_random_sequences

%load_ext autoreload
%autoreload 2

In [None]:
# Initialize the environment and seed the random number generators.
# env = gym.make("BipedalWalker-v3", hardcore=True)
SEED = 42

# Seed torch's global RNG too, so torch-side randomness (e.g. parameter
# initialisation and sampling) is repeatable across kernel restarts; seeding
# only the gym action space and reset leaves it unseeded.
torch.manual_seed(SEED)

env = gym.make("Pendulum-v1")
env.action_space.seed(SEED)

# One sampled action and the first observation are kept so that the next cell
# can read the action / observation dimensionality from their shapes.
action = env.action_space.sample()
observation, info = env.reset(seed=SEED)

In [None]:
# Problem dimensions, read from the first axis of the sampled observation and
# action produced by the environment cell.
observation_size, action_size = observation.shape[0], action.shape[0]

# state_size and hidden_state_size currently share the same width.
state_size = hidden_state_size = 32

# Value passed as `hidden_layer_size` to the models constructed below.
hidden_layer_size = 128

In [None]:
# Deterministic state model. Placed on the GPU when CUDA is available and on
# the CPU otherwise; a bare `.cuda()` raises on machines without CUDA.
# NOTE(review): confirm planet.trainer.train does not itself assume CUDA tensors.
det_state_model = DeterministicStateModel(
    hidden_state_size=hidden_state_size,
    state_size=state_size,
    action_size=action_size,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Stochastic state model. Placed on the GPU when CUDA is available and on the
# CPU otherwise; a bare `.cuda()` raises on machines without CUDA.
# NOTE(review): confirm planet.trainer.train does not itself assume CUDA tensors.
stoch_state_model = StochasticStateModel(
    hidden_state_size=hidden_state_size,
    state_size=state_size,
    hidden_layer_size=hidden_layer_size,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Observation model. Placed on the GPU when CUDA is available and on the CPU
# otherwise; a bare `.cuda()` raises on machines without CUDA.
# NOTE(review): confirm planet.trainer.train does not itself assume CUDA tensors.
obs_model = ObservationModel(
    hidden_state_size=hidden_state_size,
    state_size=state_size,
    observation_size=observation_size,
    hidden_layer_size=hidden_layer_size,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Reward model. Placed on the GPU when CUDA is available and on the CPU
# otherwise; a bare `.cuda()` raises on machines without CUDA.
# NOTE(review): confirm planet.trainer.train does not itself assume CUDA tensors.
reward_obs_model = RewardModel(
    hidden_state_size=hidden_state_size,
    state_size=state_size,
    hidden_layer_size=hidden_layer_size,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Encoder model. Placed on the GPU when CUDA is available and on the CPU
# otherwise; a bare `.cuda()` raises on machines without CUDA.
# NOTE(review): confirm planet.trainer.train does not itself assume CUDA tensors.
enc_model = EncoderModel(
    hidden_state_size=hidden_state_size,
    observation_size=observation_size,
    state_size=state_size,
    hidden_layer_size=hidden_layer_size,
).to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Disabled: construct a buffer of synthetic sequences (constant observations, random actions)
# buffer = SequenceBuffer()

# def create_env_step():
#     return EnvStep(
#         observation=3 * torch.ones((observation_size, )),
#         action=torch.randn(action_size),
#         reward=torch.tensor([1]),
#         done=torch.tensor([0]),
#     )

# def add_sequence():
#     buffer.add_sequence([create_env_step() for _ in range(50)])

# for _ in range(100):
#     add_sequence()

In [None]:
# Registry of every model handed to the trainer; the keys are reused verbatim
# as the keys of `optimizers` below.
models = dict(
    det_state_model=det_state_model,
    stoch_state_model=stoch_state_model,
    obs_model=obs_model,
    reward_obs_model=reward_obs_model,
    enc_model=enc_model,
)

# One independent Adam optimizer per model, all sharing the same learning rate.
lr = 1e-3
optimizers = {
    model_name: optim.Adam(model.parameters(), lr=lr)
    for model_name, model in models.items()
}



In [11]:
# Launch training. Every argument is passed by keyword, so the grouping below
# is purely for readability.
# NOTE(review): the single-letter hyperparameters presumably follow the PlaNet
# paper's notation — confirm against the signature of planet.trainer.train.
train(
    # environment and learnable components
    env=env,
    models=models,
    optimizers=optimizers,
    # dimensions shared with the models
    hidden_state_size=hidden_state_size,
    state_size=state_size,
    action_size=action_size,
    # run length and logging cadence
    train_steps=10_000,
    log_interval=1,
    # single-letter hyperparameters, forwarded unchanged
    T=200,
    R=2,
    S=5,
    C=10,
    B=50,
    L=50,
    H=12,
    I=10,
    J=1000,
    K=100,
)

 88%|████████▊ | 88/100 [00:14<00:02,  5.96it/s]

In [None]:
# from planet.planning.planner import latent_planning

In [None]:
# hidden_state = torch.zeros(1, 1, hidden_state_size)
# current_state_belief = (
#     torch.zeros(1, state_size),
#     torch.ones(1, state_size),
# )

In [None]:
# latent_planning(
#     H=10,
#     I=5,
#     J=10,
#     K=3,
#     hidden_state=hidden_state,
#     current_state_belief=current_state_belief,
#     deterministic_state_model=det_state_model,
#     stochastic_state_model=stoch_state_model,
#     reward_model=reward_obs_model,
#     action_size=action_size,
# )