In [1]:
%cd ../

/Users/asmazaev/Projects/TensorAeroSpace


In [2]:
import gymnasium as gym

In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Uniform
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from scipy.stats import uniform
from tqdm import tqdm


class Net(nn.Module):
    """Создает нейронную сеть для моделирования динамики системы.

    Сеть состоит из трех линейных слоев и функций активации ReLU между ними.
    Входной слой принимает вектор из 3 элементов, представляющих состояния системы.
    Второй и третий слои - это скрытые слои с 128 нейронами.
    Выходной слой генерирует вектор из 2 элементов, представляющих предсказание следующего состояния системы.
    """
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(3, 256)  # 3 состояния + 1 действие = 4
        self.fc2 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 2)  # Предсказание следующего состояния

    def forward(self, x):
        """Выполняет прямое распространение входных данных через сеть.

        Args:
            x (torch.Tensor): Входные данные, представляющие состояния системы.

        Returns:
            torch.Tensor: Предсказание следующего состояния системы.
        """
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)




class MPCAgent(object):
    """
    Агент, использующий метод Модельно-Прогностического Управления (MPC) для оптимизации действий в среде.

    Attributes:
        gamma (float): Коэффициент дисконтирования.
        action_dim (int): Размерность пространства действий.
        observation_dim (int): Размерность пространства наблюдений.
        model (torch.nn.Module): Модель для аппроксимации динамики среды.
        cost_function (callable): Функция стоимости, используемая для оценки действий.
        lr (float): Скорость обучения для оптимизатора модели.
        criterion (torch.nn.modules.loss): Критерий потерь для обучения модели.

    Methods:
        train_model(states, actions, next_states, epochs=100, batch_size=64):
            Обучает модель динамики среды.
        collect_data(env, num_episodes=1000):
            Собирает данные о состояниях, действиях и следующих состояниях, исполняя политику в среде.
        choose_action(state, rollout, horizon):
            Выбирает оптимальное действие, используя прогнозируемое моделирование.
        choose_action_ref(state, rollout, horizon, reference_signals, step):
            Выбирает оптимальное действие с учетом эталонных сигналов.
        test_model(env, num_episodes=100, rollout=10, horizon=1):
            Тестирует модель, измеряя среднее вознаграждение в среде.
        test_network(states, actions, next_states):
            Тестирует точность предсказаний модели на заданном наборе данных.
    """
    def __init__(self, gamma, action_dim, observation_dim, model, cost_function, lr=1e-3, criterion=torch.nn.MSELoss()):
        self.gamma = gamma
        self.action_dim = action_dim
        self.observation_dim = observation_dim
        self.system_model = model
        self.system_model_optimizer = optim.Adam(self.system_model.parameters(), lr=lr)
        self.cost_function = cost_function
        self.writer = SummaryWriter()
        self.criterion = criterion
    
    def train_model(self, states, actions, next_states, epochs=100, batch_size=64):
        """
        Обучает модель динамики среды, используя данные о состояниях, действиях и следующих состояниях.

        Args:
            states (numpy.ndarray): Массив текущих состояний.
            actions (numpy.ndarray): Массив действий, совершенных в этих состояниях.
            next_states (numpy.ndarray): Массив следующих состояний после совершения действий.
            epochs (int): Количество эпох обучения.
            batch_size (int): Размер батча для обучения.

        Returns:
            None
        """
        for epoch in  (pbar := tqdm(range(epochs))):
            permutation = np.random.permutation(states.shape[0])
            for i in range(0, states.shape[0], batch_size):
                indices = permutation[i:i+batch_size]
                batch_states, batch_actions, batch_next_states = states[indices], actions[indices], next_states[indices]
                inputs = np.hstack((batch_states, batch_actions.reshape(-1, 1)))
                inputs = torch.tensor(inputs, dtype=torch.float32)
                targets = torch.tensor(batch_next_states, dtype=torch.float32)
                self.system_model_optimizer.zero_grad()
                outputs = self.system_model(inputs)
                loss = self.criterion(outputs, targets)
                loss.backward()
                self.system_model_optimizer.step()
            
            self.writer.add_scalar('Loss/train', loss.item(), epoch)
            pbar.set_description(f"Loss {loss.item()}")

    def collect_data(self, env, num_episodes=1000):
        """
        Собирает данные о состояниях, действиях и следующих состояниях, исполняя случайную политику в среде.

        Args:
            env (gym.Env): Среда, в которой собираются данные.
            num_episodes (int): Количество эпизодов для сбора данных.

        Returns:
            tuple: Возвращает кортеж из трех массивов (states, actions, next_states).
        """
        states, actions, next_states = [], [], []
        for _ in tqdm(range(num_episodes)):
            state, info = env.reset()
            done = False
            while not done:
                action = env.action_space.sample()
                next_state, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                states.append(state)
                actions.append(action)
                next_states.append(next_state)
                state = next_state
        return np.array(states), np.array(actions), np.array(next_states)

    def choose_action(self, state, rollout, horizon):
        initial_state = torch.tensor([state], dtype=torch.float32, requires_grad=False)
        best_action = None
        lowest_cost = float('inf')
        
        # Initialize action as a tensor with requires_grad=True to enable gradient computation
        action = torch.zeros(1, 1, requires_grad=True)
        
        # Use an optimizer; here we use Adam for its robustness
        optimizer = optim.Adam([action], lr=0.1)
        
        for trajectory in range(rollout):
            optimizer.zero_grad()  # Reset gradients
            trajectory_cost = 0
            state = initial_state
            
            for h in range(horizon):
                if h == 0:
                    first_action = action.clone()
                
                # Forward pass through the system model
                next_state = self.system_model(torch.cat([state, action], dim=-1))
                
                # Compute cost
                cost = self.cost_function(next_state, action)
                trajectory_cost += cost
                
                state = next_state
            
            # Backward pass to compute gradient of the cost wrt action
            trajectory_cost.backward()
            
            # Update action using its gradient
            optimizer.step()
            
            if trajectory_cost < lowest_cost:
                lowest_cost = trajectory_cost
                best_action = first_action.detach()
        
        return best_action.numpy()


    def choose_action_ref(self, state, rollout, horizon, reference_signals, step):
        """
        Выбирает оптимальное действие с учетом эталонных сигналов.

        Args:
            state (numpy.ndarray): Текущее состояние среды.
            rollout (int): Количество прогнозируемых траекторий для оценки.
            horizon (int): Горизонт планирования.
            reference_signals (numpy.ndarray): Эталонные сигналы для оценки действий.
            step (int): Текущий временной шаг в среде.

        Returns:
            numpy.ndarray: Возвращает массив, содержащий выбранное действие.
        """
        initial_state = torch.tensor([state], dtype=torch.float32)
        best_action = None
        max_trajectory_value = -float('inf')
        action_distribution = Uniform(-60, 60)
        for trajectory in range(rollout):
            state = initial_state
            trajectory_value = 0
            for h in range(horizon):
                
                action = torch.Tensor([[action_distribution.sample()]])
                if h == 0:
                    first_action = action
                next_state = self.system_model(torch.cat([state, action], dim=-1))
                costs = self.cost_function(next_state, action, reference_signals, step)
                trajectory_value += -costs
                
                state = next_state
            if trajectory_value > max_trajectory_value:
                max_trajectory_value = trajectory_value
                best_action = first_action
        return best_action.numpy()
    
    def test_model(self, env, num_episodes=100, rollout=10, horizon=1):
        """
        Тестирует модель в среде, измеряя среднее вознаграждение за серию эпизодов.

        Args:
            env (gym.Env): Среда для тестирования.
            num_episodes (int): Количество эпизодов для тестирования.
            rollout (int): Количество прогнозируемых траекторий для выбора действий.
            horizon (int): Горизонт планирования для выбора действий.

        Returns:
            list: Список суммарных вознаграждений за каждый эпизод.
        """
        total_rewards = []  # Список для хранения суммарных вознаграждений за каждый эпизод
        for episode in range(num_episodes):
            state, info = env.reset()
            total_reward = 0
            done = False
            while not done:
                action = self.choose_action(state, rollout, horizon)
                state, reward, terminated, truncated, info= env.step(action[0])
                done = terminated or truncated
                total_reward += reward
                if done:
                    break
            print(f'Episode {episode+1}: Total Reward = {total_reward}')
            total_rewards.append(total_reward)

        average_reward = sum(total_rewards) / num_episodes
        self.writer.add_scalar('Test/AverageReward', average_reward, num_episodes)
        return total_rewards
    
    def test_network(self, states, actions, next_states):
        """
        Тестирует точность предсказаний модели на заданном наборе данных.

        Args:
            states (numpy.ndarray): Массив текущих состояний.
            actions (numpy.ndarray): Массив действий.
            next_states (numpy.ndarray): Массив следующих состояний.

        Returns:
            None
        """
        self.system_model.eval()  # Перевести модель в режим оценки
        with torch.no_grad():  # Отключить вычисление градиентов
            # Подготовка данных
            inputs = np.hstack((states, actions.reshape(-1, 1)))
            inputs = torch.tensor(inputs, dtype=torch.float32)
            true_next_states = torch.tensor(next_states, dtype=torch.float32)
            
            # Получение предсказаний от модели
            predicted_next_states = self.system_model(inputs)
            
            # Вычисление потерь (среднеквадратичная ошибка)
            mse_loss = torch.nn.functional.mse_loss(predicted_next_states, true_next_states)
            print(f'Test MSE Loss: {mse_loss.item()}')
            
            # Логирование потерь в TensorBoard
            self.writer.add_scalar('Test/MSE_Loss', mse_loss.item(), 0)
        
        self.system_model.train()  # Вернуть модель в режим обучения

In [4]:
env = gym.make('Pendulum-v1', g=9.81)

In [5]:
env.reset()

  logger.warn(f"{pre} is not within the observation space.")


(array([ 0.02761483, -0.36976734], dtype=float32), {})

In [6]:
model = Net()

In [7]:
def example_cost_function(state, action):
    theta = state[0, 0].item()
    theta_dot = state[0, 1].item()
    return (theta ** 2 + 0.1 * theta_dot ** 2 + 0.001 * (action ** 2))

In [8]:
torch.__version__

'2.2.0'

In [9]:
agent = MPCAgent(gamma=0.99, action_dim=1, observation_dim=2, model=model, cost_function=example_cost_function)

In [11]:
states, actions, next_states = agent.collect_data(env, num_episodes=2000)

100%|██████████| 2000/2000 [00:09<00:00, 200.13it/s]


In [12]:
agent.train_model(states, actions, next_states, epochs=300, batch_size=2048)

Loss 0.0019631721079349518: 100%|██████████| 300/300 [03:35<00:00,  1.39it/s]


In [13]:
states, actions, next_states = agent.collect_data(env, num_episodes=3000)

100%|██████████| 3000/3000 [00:15<00:00, 199.96it/s]


In [14]:
agent.test_network(states, actions, next_states)

Test MSE Loss: 0.005056232679635286


In [15]:
state, info = env.reset()

In [16]:
def choose_action(state, rollout, horizon):
    initial_state = torch.tensor([state], dtype=torch.float32)
    best_action = None
    max_trajectory_value = -float('inf')
    action_distribution =torch.distributions.uniform.Uniform(-2,2)


    for trajectory in range(rollout):
        state = initial_state
        trajectory_value = 0
        for h in range(horizon):
            action = torch.Tensor([[action_distribution.sample()]])
            if h == 0:
                first_action = action
            next_state = agent.system_model(torch.cat([state, action], dim=-1))
            costs = agent.cost_function(next_state, action)
            trajectory_value += -costs
                
            state = next_state
        if trajectory_value > max_trajectory_value:
            max_trajectory_value = trajectory_value
            best_action = first_action
    return best_action.numpy()

In [17]:
def choose_action_gradient(state, rollout, horizon, lr=1):
    initial_state = torch.tensor([state], dtype=torch.float32, requires_grad=False)
    action = torch.zeros(1, 1, requires_grad=True)  # Start with a zero action
    optimizer = torch.optim.Adam([action], lr=lr)  # Use Adam optimizer for the action variable

    for step in range(rollout):
        optimizer.zero_grad()
        state = initial_state
        cumulative_cost = 0
        for h in range(horizon):
            next_state = agent.system_model(torch.cat([state, action], dim=-1))
            cost = agent.cost_function(next_state, action)
            cumulative_cost += cost
            state = next_state

        # Since we want to minimize the cost, we take negative of the cumulative cost
        (-cumulative_cost).backward()
        optimizer.step()

    # Clip or adjust the action as per the action space constraints
    optimized_action = action.detach().numpy()
    return optimized_action

In [18]:
def choose_action_grad_opti(state, rollout, horizon):
    initial_state = torch.tensor([state], dtype=torch.float32, requires_grad=False)
    best_cost = float('inf')
    best_action_sequence = None

    for _ in range(rollout):
        action_sequence = torch.randn(horizon, 1, requires_grad=True)  # Инициализируем последовательность действий
        optimizer = optim.Adam([action_sequence], lr=1)

        for optimization_step in range(rollout):  # Количество шагов оптимизации
            optimizer.zero_grad()
            state = initial_state
            total_cost = 0
            for h in range(horizon):
                action = action_sequence[h].unsqueeze(0)
                next_state = agent.system_model(torch.cat([state, action], dim=-1))
                cost = agent.cost_function(next_state, action)
                total_cost += cost
                state = next_state

            if total_cost < best_cost:
                best_cost = total_cost
                best_action_sequence = action_sequence.detach().clone()

            total_cost.backward()
            optimizer.step()

    return best_action_sequence[0].detach().numpy()  # Возвращаем первое действие из наилучшей последовательности



In [19]:
def choose_action_with_noise(state, rollout, horizon):
    initial_state = torch.tensor([state], dtype=torch.float32, requires_grad=False)
    best_cost = float('inf')
    best_action_sequence = None

    for _ in range(rollout):
        # Инициализация последовательности действий с возможностью градиентного спуска
        action_sequence = torch.randn(horizon, 1, requires_grad=True)
        optimizer = optim.Adam([action_sequence], lr=0.1)

        for optimization_step in range(100):  # Количество шагов оптимизации
            optimizer.zero_grad()
            state = initial_state
            total_cost = 0
            for h in range(horizon):
                action = action_sequence[h].unsqueeze(0)
                
                # Добавление возмущения к действию (исправлено, чтобы избежать операции на месте)
                noise = torch.randn_like(action) * 0.1  # Малое стандартное отклонение для возмущения
                action = action + noise  # Исправлено на операцию, не изменяющую переменную на месте

                next_state = agent.system_model(torch.cat([state, action], dim=-1))
                cost = agent.cost_function(next_state, action)
                total_cost += cost
                state = next_state

            if total_cost < best_cost:
                best_cost = total_cost.item()
                best_action_sequence = action_sequence.detach().clone()

            total_cost.backward()
            optimizer.step()

    # Возвращаем первое действие из наилучшей найденной последовательности
    return best_action_sequence[0].detach().numpy()


In [24]:

def choose_action_with_noise_v1(state, rollout, horizon):
    initial_state = torch.tensor([state], dtype=torch.float32, requires_grad=False)
    best_cost = float('inf')
    best_action_sequence = None

    # Предварительное вычисление шума для всех возможных действий
    noise = torch.randn(rollout, horizon, 1) * 0.1

    for rollout_index in range(rollout):
        # Инициализация последовательности действий с возможностью градиентного спуска
        action_sequence = torch.randn(horizon, 1, requires_grad=True)
        optimizer = optim.Adam([action_sequence], lr=0.1)

        for optimization_step in range(100):  # Количество шагов оптимизации
            optimizer.zero_grad()
            state = initial_state.clone()
            total_cost = 0

            # Добавление шума заранее
            action_sequence_with_noise = action_sequence + noise[rollout_index]

            for h in range(horizon):
                action = action_sequence_with_noise[h].unsqueeze(0)
                noise = torch.randn_like(action) * 0.1
                next_state = agent.system_model(torch.cat([state, action], dim=-1))
                cost = agent.cost_function(next_state, action)
                total_cost += cost
                state = next_state.detach()  # Отсоединяем, чтобы избежать ненужного накопления градиентов

            if total_cost < best_cost:
                best_cost = total_cost.item()
                best_action_sequence = action_sequence.detach().clone()

            total_cost.backward()
            optimizer.step()

    return best_action_sequence[0].numpy()

In [126]:

class ActionGenerator(nn.Module):
    def __init__(self, state_dim, action_dim, horizon):
        super(ActionGenerator, self).__init__()
        self.horizon = horizon
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_dim * horizon)

    def forward(self, state):
        x = torch.tanh(self.fc1(state))
        x = torch.tanh(self.fc2(x))
        action_sequence = torch.tanh(self.fc3(x))
        action_sequence = action_sequence.view(-1, self.horizon, 1)  # Reshape to [horizon, action_dim]
        return action_sequence

action_generator = ActionGenerator(2, 1, 10)
optimizer_action = optim.Adam(action_generator.parameters(), lr=0.001)

In [129]:
def choose_action_with_network(state, horizon, rollout=10):
    best_cost = float('inf')
    best_action_sequence = None

    for rollout_iter in range(rollout):
        optimizer_action.zero_grad()  # Очищаем градиенты на каждом роллауте

        action_sequence = action_generator(torch.tensor([state], dtype=torch.float32)) 
        total_cost = torch.tensor(0., requires_grad=True)  # Инициализируем total_cost как тензор для накопления градиентов
        state_tensor = torch.tensor([state], dtype=torch.float32)

        for h in range(horizon):
            action = action_sequence[:, h, :]
            noise = torch.randn_like(action) * 0.1
            action = (action + noise)
            actions_clipped = torch.clip(action, -2, 2)
            next_state = agent.system_model(torch.cat([state_tensor, actions_clipped], dim=-1))
            cost = agent.cost_function(next_state, actions_clipped)

            if cost.requires_grad:
                cost.backward(retain_graph=True)  # Накапливаем градиенты на каждом шаге

            total_cost = total_cost + cost.detach().item()  # Обновляем total_cost, используя значение стоимости
            state_tensor = next_state.detach()

        # Проверяем, является ли найденная последовательность действий лучшей
        if total_cost < best_cost:
            best_cost = total_cost
            best_action_sequence = action_sequence.detach().clone()

    # После всех роллаутов, делаем шаг оптимизации
    optimizer_action.step()

    return best_action_sequence[0, 0].numpy()  # Возвращаем первое действие из лучшей последовательности


In [131]:
env = gym.make('Pendulum-v1', g=9.81, render_mode="rgb_array")
env_rec =gym.wrappers.RecordVideo(env, "./video")
rollout, horizon = 50,10


for _i in range(30):
    episode_reward = 0
    done = False
    state, info = env_rec.reset()
    for _ in tqdm(range(200)):
        action = choose_action_with_network(state, horizon, rollout)
        state, reward, terminated, truncated, info= env_rec.step(action)
        done = terminated or truncated
        env_rec.render()
        episode_reward += reward
    print(episode_reward)
env_rec.close()

  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")
100%|█████████▉| 199/200 [00:22<00:00,  8.95it/s]

Moviepy - Building video /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-0.mp4.
Moviepy - Writing video /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-0.mp4



100%|██████████| 200/200 [00:23<00:00,  8.52it/s]


Moviepy - Done !
Moviepy - video ready /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-0.mp4
-1089.3551462689184


100%|█████████▉| 199/200 [00:22<00:00,  9.10it/s]

Moviepy - Building video /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-1.mp4.
Moviepy - Writing video /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-1.mp4



100%|██████████| 200/200 [00:22<00:00,  8.74it/s]


Moviepy - Done !
Moviepy - video ready /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-1.mp4
-1817.5463590224992


100%|██████████| 200/200 [00:22<00:00,  8.97it/s]


-862.1822102561072


100%|██████████| 200/200 [00:22<00:00,  9.03it/s]


-1689.1850605552236


100%|██████████| 200/200 [00:21<00:00,  9.22it/s]


-861.7928944045152


100%|██████████| 200/200 [00:22<00:00,  9.02it/s]


-1190.9053250696911


100%|██████████| 200/200 [00:22<00:00,  9.07it/s]


-848.5041158585544


100%|██████████| 200/200 [00:21<00:00,  9.19it/s]


-1273.4843445141641


100%|█████████▉| 199/200 [00:23<00:00,  8.77it/s]

Moviepy - Building video /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-8.mp4.
Moviepy - Writing video /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-8.mp4



100%|██████████| 200/200 [00:23<00:00,  8.50it/s]


Moviepy - Done !
Moviepy - video ready /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-8.mp4
-748.7775248185852


100%|██████████| 200/200 [00:22<00:00,  8.95it/s]


-1626.43625899645


100%|██████████| 200/200 [00:22<00:00,  8.74it/s]


-821.8756828588789


100%|██████████| 200/200 [00:22<00:00,  8.79it/s]


-1185.503173078708


100%|██████████| 200/200 [00:21<00:00,  9.23it/s]


-1169.919511180048


100%|██████████| 200/200 [00:21<00:00,  9.31it/s]


-1453.5951487384955


100%|██████████| 200/200 [00:21<00:00,  9.32it/s]


-1540.404016595317


100%|██████████| 200/200 [00:21<00:00,  9.38it/s]


-1789.6466608823462


100%|██████████| 200/200 [00:21<00:00,  9.30it/s]


-1007.2592278410323


100%|██████████| 200/200 [00:21<00:00,  9.35it/s]


-865.8452123515995


100%|██████████| 200/200 [00:21<00:00,  9.35it/s]


-1755.6243432157637


100%|██████████| 200/200 [00:21<00:00,  9.32it/s]


-1162.4545046941728


100%|██████████| 200/200 [00:21<00:00,  9.34it/s]


-1267.095094378468


100%|██████████| 200/200 [00:21<00:00,  9.31it/s]


-1398.5068305315845


100%|██████████| 200/200 [00:21<00:00,  9.33it/s]


-953.6589352224665


100%|██████████| 200/200 [00:21<00:00,  9.34it/s]


-1619.3457175416804


100%|██████████| 200/200 [00:21<00:00,  9.33it/s]


-1001.5421486693716


100%|██████████| 200/200 [00:21<00:00,  9.36it/s]


-629.1082381225402


100%|██████████| 200/200 [00:21<00:00,  9.34it/s]


-953.7574418620668


100%|█████████▉| 199/200 [00:22<00:00,  9.08it/s]

Moviepy - Building video /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-27.mp4.
Moviepy - Writing video /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-27.mp4



100%|██████████| 200/200 [00:22<00:00,  8.86it/s]


Moviepy - Done !
Moviepy - video ready /Users/asmazaev/Projects/TensorAeroSpace/video/rl-video-episode-27.mp4
-1473.7103607311676


100%|██████████| 200/200 [00:21<00:00,  9.31it/s]


-974.1388025820218


100%|██████████| 200/200 [00:21<00:00,  9.34it/s]


-1720.6918680221522


In [74]:
# Is MPS even available? macOS 12.3+
print(torch.backends.mps.is_available())

# Was the current version of PyTorch built with MPS activated?
print(torch.backends.mps.is_built())


True
True


device(type='mps')

In [56]:
rollout, horizon = 200,10
for episode in range(1):
    done = False
    state, info = env.reset()
    episode_reward = 0
    while not done:
        action = agent.choose_action(state, rollout, horizon)
        state, reward, terminated, truncated, info= env.step(action[0])
        done = terminated or truncated
        episode_reward += reward
        if done:
            break
    print('rollout: %d, horizon: %d, episode: %d, reward: %d' % (rollout, horizon, episode, episode_reward))

rollout: 200, horizon: 10, episode: 0, reward: -967


In [59]:
rollout, horizon, episode, episode_reward

(200, 10, 0, -967.0516866740916)