In [1]:
#ppo algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Train and evaluate the model with all metrics
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'NFLX_TRAINING.csv'
    test_file = 'NFLX_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the PPO model and train
    model = PPO("MlpPolicy", env_train, verbose=1)
    model.learn(total_timesteps=100000)

    # Test the model on the training data
    obs, _ = env_train.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_train.step(action)

    # Generate report for the training session
    env_train.generate_report()

    # Calculate and display metrics for the training period
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Test the model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_test.step(action)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display metrics for the testing period
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1896 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1499        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009333496 |
|    clip_fraction        | 0.0758      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -1.05       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0162     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00813    |
|    value_loss         

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.14e+03    |
|    ep_rew_mean          | 14.9        |
| time/                   |             |
|    fps                  | 1336        |
|    iterations           | 12          |
|    time_elapsed         | 18          |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.010572189 |
|    clip_fraction        | 0.106       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.855      |
|    explained_variance   | 0.71        |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0368     |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00547    |
|    value_loss           | 0.00878     |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.14

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.14e+03    |
|    ep_rew_mean          | 34.6        |
| time/                   |             |
|    fps                  | 1321        |
|    iterations           | 22          |
|    time_elapsed         | 34          |
|    total_timesteps      | 45056       |
| train/                  |             |
|    approx_kl            | 0.009380452 |
|    clip_fraction        | 0.0942      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.797      |
|    explained_variance   | 0.631       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00885    |
|    n_updates            | 210         |
|    policy_gradient_loss | -0.00436    |
|    value_loss           | 0.00825     |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.14

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.14e+03    |
|    ep_rew_mean          | 45.2        |
| time/                   |             |
|    fps                  | 1322        |
|    iterations           | 32          |
|    time_elapsed         | 49          |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.014186889 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.738      |
|    explained_variance   | 0.522       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00565    |
|    n_updates            | 310         |
|    policy_gradient_loss | -0.00558    |
|    value_loss           | 0.00781     |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.14

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.14e+03    |
|    ep_rew_mean          | 52.6        |
| time/                   |             |
|    fps                  | 1306        |
|    iterations           | 42          |
|    time_elapsed         | 65          |
|    total_timesteps      | 86016       |
| train/                  |             |
|    approx_kl            | 0.007000295 |
|    clip_fraction        | 0.0833      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.58       |
|    explained_variance   | 0.667       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00592    |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.00577    |
|    value_loss           | 0.00952     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.14e+


--- Agent Report ---
Agent starts with 0 holdings (neutral position), Initial Balance: 10000
Agent sells (short) at 604.88, Current Balance: 10000, Holdings: 1 Short
Agent closes short at 550.3, profit: 54.58000000000004, Current Balance: 10054.58, Holdings: 0
Agent buys at 555.425, Current Balance: 10054.58, Holdings: 1 Long
Agent closes long at 557.99, profit: 2.5650000000000546, Current Balance: 10057.145, Holdings: 0
Agent sells (short) at 557.2253, Current Balance: 10057.145, Holdings: 1 Short
Agent closes short at 553.29, profit: 3.935299999999984, Current Balance: 10061.0803, Holdings: 0
Agent sells (short) at 554.7, Current Balance: 10061.0803, Holdings: 1 Short
Agent closes short at 539.665, profit: 15.035000000000082, Current Balance: 10076.1153, Holdings: 0
Agent buys at 541.06, Current Balance: 10076.1153, Holdings: 1 Long
Agent closes long at 540.88, profit: -0.17999999999994998, Current Balance: 10075.9353, Holdings: 0
Agent sells (short) at 540.85, Current Balance: 1007

In [2]:
#dqn algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Train and evaluate the model with all metrics
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'NFLX_TRAINING.csv'
    test_file = 'NFLX_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the DQN model and train
    model = DQN("MlpPolicy", env_train, verbose=1)
    model.learn(total_timesteps=100000)

    # Test the model on the training data
    obs, _ = env_train.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_train.step(action)

    # Generate report for the training session
    env_train.generate_report()

    # Calculate and display metrics for the training period
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Test the model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_test.step(action)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display metrics for the testing period
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.14e+03 |
|    ep_rew_mean      | 3.23     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1100     |
|    time_elapsed     | 25       |
|    total_timesteps  | 28576    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000634 |
|    n_updates        | 7118     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.14e+03 |
|    ep_rew_mean      | 2.31     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1124     |
|    time_elapsed     | 50       |
|    total_timesteps  | 57152    |
| train/              |        


--- Agent Report ---
Agent starts with 0 holdings (neutral position), Initial Balance: 10000
Agent buys at 604.88, Current Balance: 10000, Holdings: 1 Long
Agent closes long at 592.38, profit: -12.5, Current Balance: 9987.5, Holdings: 0
Agent buys at 594.88, Current Balance: 9987.5, Holdings: 1 Long
Agent closes long at 596.38, profit: 1.5, Current Balance: 9989.0, Holdings: 0
Agent buys at 596.4, Current Balance: 9989.0, Holdings: 1 Long
Agent closes long at 592.6, profit: -3.7999999999999545, Current Balance: 9985.2, Holdings: 0
Agent sells (short) at 585.32, Current Balance: 9985.2, Holdings: 1 Short
Agent closes short at 582.02, profit: 3.300000000000068, Current Balance: 9988.5, Holdings: 0
Agent sells (short) at 587.415, Current Balance: 9988.5, Holdings: 1 Short
Agent closes short at 591.2101, profit: -3.7951000000000477, Current Balance: 9984.7049, Holdings: 0
Agent buys at 592.0, Current Balance: 9984.7049, Holdings: 1 Long
Agent closes long at 590.66, profit: -1.340000000000

In [3]:
#a2c algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import A2C
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Train and evaluate the model with action logging for debugging
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'NFLX_TRAINING.csv'
    test_file = 'NFLX_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the A2C model and train with more timesteps
    model = A2C("MlpPolicy", env_train, verbose=1)
    model.learn(total_timesteps=500000)  # Increased timesteps

    # Testing on the training data with action logging
    obs, _ = env_train.reset()
    done = False
    print("\n--- Training Session ---")
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_train.step(action)
        # Log each action and reward for debugging
        print(f"Step: {env_train.current_step}, Action: {action}, Reward: {reward}")

    # Generate report and metrics for the training session
    env_train.generate_report()
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Testing on the testing data with action logging
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    print("\n--- Testing Session ---")
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_test.step(action)
        # Log each action and reward for debugging
        print(f"Step: {env_test.current_step}, Action: {action}, Reward: {reward}")

    # Generate report and metrics for the testing session
    env_test.generate_report()
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")



# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 1166     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.748   |
|    explained_variance | -32.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.0214   |
|    value_loss         | 0.000801 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1201      |
|    iterations         | 200       |
|    time_elapsed       | 0         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -1.01     |
|    explained_variance | -2.27e+03 |
|    learning_rate      | 0.0007    |
|    n_u

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | -1.38    |
| time/                 |          |
|    fps                | 1016     |
|    iterations         | 1700     |
|    time_elapsed       | 8        |
|    total_timesteps    | 8500     |
| train/                |          |
|    entropy_loss       | -0.531   |
|    explained_variance | -32.5    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1699     |
|    policy_loss        | -0.0183  |
|    value_loss         | 0.00177  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | -1.38    |
| time/                 |          |
|    fps                | 1012     |
|    iterations         | 1800     |
|    time_elapsed       | 8        |
|    total_timesteps    | 9000     |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 0.411     |
| time/                 |           |
|    fps                | 1000      |
|    iterations         | 3000      |
|    time_elapsed       | 14        |
|    total_timesteps    | 15000     |
| train/                |           |
|    entropy_loss       | -0.151    |
|    explained_variance | -10.4     |
|    learning_rate      | 0.0007    |
|    n_updates          | 2999      |
|    policy_loss        | -6.29e-05 |
|    value_loss         | 1.28e-05  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 0.411    |
| time/                 |          |
|    fps                | 999      |
|    iterations         | 3100     |
|    time_elapsed       | 15       |
|    total_timesteps    | 15500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | -0.74    |
| time/                 |          |
|    fps                | 988      |
|    iterations         | 4300     |
|    time_elapsed       | 21       |
|    total_timesteps    | 21500    |
| train/                |          |
|    entropy_loss       | -0.538   |
|    explained_variance | 0.442    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4299     |
|    policy_loss        | -0.00428 |
|    value_loss         | 6.42e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | -0.74    |
| time/                 |          |
|    fps                | 987      |
|    iterations         | 4400     |
|    time_elapsed       | 22       |
|    total_timesteps    | 22000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | -0.74     |
| time/                 |           |
|    fps                | 977       |
|    iterations         | 5600      |
|    time_elapsed       | 28        |
|    total_timesteps    | 28000     |
| train/                |           |
|    entropy_loss       | -0.0356   |
|    explained_variance | -29.1     |
|    learning_rate      | 0.0007    |
|    n_updates          | 5599      |
|    policy_loss        | -2.12e-06 |
|    value_loss         | 4.78e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | -0.74    |
| time/                 |          |
|    fps                | 977      |
|    iterations         | 5700     |
|    time_elapsed       | 29       |
|    total_timesteps    | 28500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | -0.131   |
| time/                 |          |
|    fps                | 974      |
|    iterations         | 6900     |
|    time_elapsed       | 35       |
|    total_timesteps    | 34500    |
| train/                |          |
|    entropy_loss       | -0.0214  |
|    explained_variance | -3.34    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6899     |
|    policy_loss        | 3.2e-06  |
|    value_loss         | 1.21e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | -0.131    |
| time/                 |           |
|    fps                | 974       |
|    iterations         | 7000      |
|    time_elapsed       | 35        |
|    total_timesteps    | 35000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 0.28      |
| time/                 |           |
|    fps                | 974       |
|    iterations         | 8200      |
|    time_elapsed       | 42        |
|    total_timesteps    | 41000     |
| train/                |           |
|    entropy_loss       | -0.165    |
|    explained_variance | -1.46e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 8199      |
|    policy_loss        | 0.00172   |
|    value_loss         | 0.0112    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 0.28     |
| time/                 |          |
|    fps                | 974      |
|    iterations         | 8300     |
|    time_elapsed       | 42       |
|    total_timesteps    | 41500    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 1.53      |
| time/                 |           |
|    fps                | 974       |
|    iterations         | 9500      |
|    time_elapsed       | 48        |
|    total_timesteps    | 47500     |
| train/                |           |
|    entropy_loss       | -0.146    |
|    explained_variance | 0.084     |
|    learning_rate      | 0.0007    |
|    n_updates          | 9499      |
|    policy_loss        | -0.000364 |
|    value_loss         | 8.25e-05  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 1.53     |
| time/                 |          |
|    fps                | 975      |
|    iterations         | 9600     |
|    time_elapsed       | 49       |
|    total_timesteps    | 48000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 2.83     |
| time/                 |          |
|    fps                | 972      |
|    iterations         | 10800    |
|    time_elapsed       | 55       |
|    total_timesteps    | 54000    |
| train/                |          |
|    entropy_loss       | -0.326   |
|    explained_variance | -1.2     |
|    learning_rate      | 0.0007   |
|    n_updates          | 10799    |
|    policy_loss        | -0.00142 |
|    value_loss         | 5.91e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 2.83     |
| time/                 |          |
|    fps                | 972      |
|    iterations         | 10900    |
|    time_elapsed       | 56       |
|    total_timesteps    | 54500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 3.54     |
| time/                 |          |
|    fps                | 969      |
|    iterations         | 12100    |
|    time_elapsed       | 62       |
|    total_timesteps    | 60500    |
| train/                |          |
|    entropy_loss       | -0.43    |
|    explained_variance | -0.495   |
|    learning_rate      | 0.0007   |
|    n_updates          | 12099    |
|    policy_loss        | -0.00454 |
|    value_loss         | 0.000183 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 3.54     |
| time/                 |          |
|    fps                | 969      |
|    iterations         | 12200    |
|    time_elapsed       | 62       |
|    total_timesteps    | 61000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4         |
| time/                 |           |
|    fps                | 970       |
|    iterations         | 13400     |
|    time_elapsed       | 69        |
|    total_timesteps    | 67000     |
| train/                |           |
|    entropy_loss       | -0.241    |
|    explained_variance | 0.538     |
|    learning_rate      | 0.0007    |
|    n_updates          | 13399     |
|    policy_loss        | -0.000469 |
|    value_loss         | 5.99e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4         |
| time/                 |           |
|    fps                | 970       |
|    iterations         | 13500     |
|    time_elapsed       | 69        |
|    total_timesteps    | 67500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.44      |
| time/                 |           |
|    fps                | 971       |
|    iterations         | 14700     |
|    time_elapsed       | 75        |
|    total_timesteps    | 73500     |
| train/                |           |
|    entropy_loss       | -0.29     |
|    explained_variance | -1.41e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 14699     |
|    policy_loss        | -0.00189  |
|    value_loss         | 0.126     |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.44     |
| time/                 |          |
|    fps                | 971      |
|    iterations         | 14800    |
|    time_elapsed       | 76       |
|    total_timesteps    | 74000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.39     |
| time/                 |          |
|    fps                | 971      |
|    iterations         | 16000    |
|    time_elapsed       | 82       |
|    total_timesteps    | 80000    |
| train/                |          |
|    entropy_loss       | -0.00416 |
|    explained_variance | -0.833   |
|    learning_rate      | 0.0007   |
|    n_updates          | 15999    |
|    policy_loss        | 1.95e-07 |
|    value_loss         | 4.34e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.39     |
| time/                 |          |
|    fps                | 971      |
|    iterations         | 16100    |
|    time_elapsed       | 82       |
|    total_timesteps    | 80500    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.5       |
| time/                 |           |
|    fps                | 972       |
|    iterations         | 17300     |
|    time_elapsed       | 88        |
|    total_timesteps    | 86500     |
| train/                |           |
|    entropy_loss       | -0.217    |
|    explained_variance | -4.23     |
|    learning_rate      | 0.0007    |
|    n_updates          | 17299     |
|    policy_loss        | -2.08e-05 |
|    value_loss         | 1.61e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.5      |
| time/                 |          |
|    fps                | 972      |
|    iterations         | 17400    |
|    time_elapsed       | 89       |
|    total_timesteps    | 87000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.6      |
| time/                 |          |
|    fps                | 973      |
|    iterations         | 18600    |
|    time_elapsed       | 95       |
|    total_timesteps    | 93000    |
| train/                |          |
|    entropy_loss       | -0.164   |
|    explained_variance | -751     |
|    learning_rate      | 0.0007   |
|    n_updates          | 18599    |
|    policy_loss        | 8.26e-05 |
|    value_loss         | 4.64e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.6       |
| time/                 |           |
|    fps                | 973       |
|    iterations         | 18700     |
|    time_elapsed       | 96        |
|    total_timesteps    | 93500     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.6       |
| time/                 |           |
|    fps                | 974       |
|    iterations         | 19900     |
|    time_elapsed       | 102       |
|    total_timesteps    | 99500     |
| train/                |           |
|    entropy_loss       | -0.0355   |
|    explained_variance | -7.47     |
|    learning_rate      | 0.0007    |
|    n_updates          | 19899     |
|    policy_loss        | -6.45e-08 |
|    value_loss         | 4.7e-08   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.6      |
| time/                 |          |
|    fps                | 974      |
|    iterations         | 20000    |
|    time_elapsed       | 102      |
|    total_timesteps    | 100000   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.61     |
| time/                 |          |
|    fps                | 974      |
|    iterations         | 21200    |
|    time_elapsed       | 108      |
|    total_timesteps    | 106000   |
| train/                |          |
|    entropy_loss       | -0.00308 |
|    explained_variance | -293     |
|    learning_rate      | 0.0007   |
|    n_updates          | 21199    |
|    policy_loss        | 1.8e-07  |
|    value_loss         | 3.61e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.61      |
| time/                 |           |
|    fps                | 975       |
|    iterations         | 21300     |
|    time_elapsed       | 109       |
|    total_timesteps    | 106500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.66     |
| time/                 |          |
|    fps                | 968      |
|    iterations         | 22500    |
|    time_elapsed       | 116      |
|    total_timesteps    | 112500   |
| train/                |          |
|    entropy_loss       | -0.203   |
|    explained_variance | -9.73    |
|    learning_rate      | 0.0007   |
|    n_updates          | 22499    |
|    policy_loss        | 0.00648  |
|    value_loss         | 0.00144  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.66     |
| time/                 |          |
|    fps                | 967      |
|    iterations         | 22600    |
|    time_elapsed       | 116      |
|    total_timesteps    | 113000   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.72      |
| time/                 |           |
|    fps                | 949       |
|    iterations         | 23800     |
|    time_elapsed       | 125       |
|    total_timesteps    | 119000    |
| train/                |           |
|    entropy_loss       | -0.000702 |
|    explained_variance | -121      |
|    learning_rate      | 0.0007    |
|    n_updates          | 23799     |
|    policy_loss        | 1.15e-08  |
|    value_loss         | 3.22e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.72     |
| time/                 |          |
|    fps                | 947      |
|    iterations         | 23900    |
|    time_elapsed       | 126      |
|    total_timesteps    | 119500   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.77     |
| time/                 |          |
|    fps                | 945      |
|    iterations         | 25100    |
|    time_elapsed       | 132      |
|    total_timesteps    | 125500   |
| train/                |          |
|    entropy_loss       | -0.00066 |
|    explained_variance | -3.73    |
|    learning_rate      | 0.0007   |
|    n_updates          | 25099    |
|    policy_loss        | -2.2e-07 |
|    value_loss         | 2.2e-05  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.77     |
| time/                 |          |
|    fps                | 945      |
|    iterations         | 25200    |
|    time_elapsed       | 133      |
|    total_timesteps    | 126000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.78     |
| time/                 |          |
|    fps                | 943      |
|    iterations         | 26400    |
|    time_elapsed       | 139      |
|    total_timesteps    | 132000   |
| train/                |          |
|    entropy_loss       | -0.0232  |
|    explained_variance | -7.85    |
|    learning_rate      | 0.0007   |
|    n_updates          | 26399    |
|    policy_loss        | 1.74e-05 |
|    value_loss         | 2.49e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.78      |
| time/                 |           |
|    fps                | 944       |
|    iterations         | 26500     |
|    time_elapsed       | 140       |
|    total_timesteps    | 132500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.83      |
| time/                 |           |
|    fps                | 943       |
|    iterations         | 27700     |
|    time_elapsed       | 146       |
|    total_timesteps    | 138500    |
| train/                |           |
|    entropy_loss       | -0.000424 |
|    explained_variance | -44.6     |
|    learning_rate      | 0.0007    |
|    n_updates          | 27699     |
|    policy_loss        | -2.19e-09 |
|    value_loss         | 6.13e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.83      |
| time/                 |           |
|    fps                | 942       |
|    iterations         | 27800     |
|    time_elapsed       | 147       |
|    total_timesteps    | 139000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.8       |
| time/                 |           |
|    fps                | 941       |
|    iterations         | 29000     |
|    time_elapsed       | 153       |
|    total_timesteps    | 145000    |
| train/                |           |
|    entropy_loss       | -0.000684 |
|    explained_variance | -2.72     |
|    learning_rate      | 0.0007    |
|    n_updates          | 28999     |
|    policy_loss        | 1.02e-06  |
|    value_loss         | 0.000166  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.8      |
| time/                 |          |
|    fps                | 941      |
|    iterations         | 29100    |
|    time_elapsed       | 154      |
|    total_timesteps    | 145500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.82      |
| time/                 |           |
|    fps                | 942       |
|    iterations         | 30300     |
|    time_elapsed       | 160       |
|    total_timesteps    | 151500    |
| train/                |           |
|    entropy_loss       | -0.000315 |
|    explained_variance | -1.36e+05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 30299     |
|    policy_loss        | -1.95e-08 |
|    value_loss         | 6.69e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.82      |
| time/                 |           |
|    fps                | 942       |
|    iterations         | 30400     |
|    time_elapsed       | 161       |
|    total_timesteps    | 152000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.94      |
| time/                 |           |
|    fps                | 944       |
|    iterations         | 31600     |
|    time_elapsed       | 167       |
|    total_timesteps    | 158000    |
| train/                |           |
|    entropy_loss       | -0.00034  |
|    explained_variance | -3.98     |
|    learning_rate      | 0.0007    |
|    n_updates          | 31599     |
|    policy_loss        | -3.31e-08 |
|    value_loss         | 1.45e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.94      |
| time/                 |           |
|    fps                | 944       |
|    iterations         | 31700     |
|    time_elapsed       | 167       |
|    total_timesteps    | 158500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.04      |
| time/                 |           |
|    fps                | 943       |
|    iterations         | 32900     |
|    time_elapsed       | 174       |
|    total_timesteps    | 164500    |
| train/                |           |
|    entropy_loss       | -7.66e-05 |
|    explained_variance | -750      |
|    learning_rate      | 0.0007    |
|    n_updates          | 32899     |
|    policy_loss        | 1.07e-08  |
|    value_loss         | 4.15e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.04      |
| time/                 |           |
|    fps                | 942       |
|    iterations         | 33000     |
|    time_elapsed       | 175       |
|    total_timesteps    | 165000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.04      |
| time/                 |           |
|    fps                | 941       |
|    iterations         | 34200     |
|    time_elapsed       | 181       |
|    total_timesteps    | 171000    |
| train/                |           |
|    entropy_loss       | -0.0282   |
|    explained_variance | -25.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 34199     |
|    policy_loss        | -7.35e-07 |
|    value_loss         | 3.83e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.09      |
| time/                 |           |
|    fps                | 940       |
|    iterations         | 34300     |
|    time_elapsed       | 182       |
|    total_timesteps    | 171500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 5.09     |
| time/                 |          |
|    fps                | 939      |
|    iterations         | 35500    |
|    time_elapsed       | 189      |
|    total_timesteps    | 177500   |
| train/                |          |
|    entropy_loss       | -0.0527  |
|    explained_variance | -22.8    |
|    learning_rate      | 0.0007   |
|    n_updates          | 35499    |
|    policy_loss        | 9.43e-06 |
|    value_loss         | 1.19e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.09      |
| time/                 |           |
|    fps                | 939       |
|    iterations         | 35600     |
|    time_elapsed       | 189       |
|    total_timesteps    | 178000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.16      |
| time/                 |           |
|    fps                | 939       |
|    iterations         | 36800     |
|    time_elapsed       | 195       |
|    total_timesteps    | 184000    |
| train/                |           |
|    entropy_loss       | -0.0236   |
|    explained_variance | 0.771     |
|    learning_rate      | 0.0007    |
|    n_updates          | 36799     |
|    policy_loss        | -1.19e-05 |
|    value_loss         | 2e-05     |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.16      |
| time/                 |           |
|    fps                | 939       |
|    iterations         | 36900     |
|    time_elapsed       | 196       |
|    total_timesteps    | 184500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.18      |
| time/                 |           |
|    fps                | 940       |
|    iterations         | 38100     |
|    time_elapsed       | 202       |
|    total_timesteps    | 190500    |
| train/                |           |
|    entropy_loss       | -0.0117   |
|    explained_variance | -64.8     |
|    learning_rate      | 0.0007    |
|    n_updates          | 38099     |
|    policy_loss        | -3.22e-06 |
|    value_loss         | 5.41e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.18      |
| time/                 |           |
|    fps                | 940       |
|    iterations         | 38200     |
|    time_elapsed       | 203       |
|    total_timesteps    | 191000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 5.24     |
| time/                 |          |
|    fps                | 942      |
|    iterations         | 39400    |
|    time_elapsed       | 209      |
|    total_timesteps    | 197000   |
| train/                |          |
|    entropy_loss       | -0.0466  |
|    explained_variance | 0.174    |
|    learning_rate      | 0.0007   |
|    n_updates          | 39399    |
|    policy_loss        | -7.1e-06 |
|    value_loss         | 8.79e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 5.24     |
| time/                 |          |
|    fps                | 942      |
|    iterations         | 39500    |
|    time_elapsed       | 209      |
|    total_timesteps    | 197500   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.38      |
| time/                 |           |
|    fps                | 943       |
|    iterations         | 40700     |
|    time_elapsed       | 215       |
|    total_timesteps    | 203500    |
| train/                |           |
|    entropy_loss       | -0.00917  |
|    explained_variance | -3.03     |
|    learning_rate      | 0.0007    |
|    n_updates          | 40699     |
|    policy_loss        | -2.14e-05 |
|    value_loss         | 0.000553  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.38      |
| time/                 |           |
|    fps                | 943       |
|    iterations         | 40800     |
|    time_elapsed       | 216       |
|    total_timesteps    | 204000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.45      |
| time/                 |           |
|    fps                | 945       |
|    iterations         | 42000     |
|    time_elapsed       | 222       |
|    total_timesteps    | 210000    |
| train/                |           |
|    entropy_loss       | -0.00135  |
|    explained_variance | -8.66     |
|    learning_rate      | 0.0007    |
|    n_updates          | 41999     |
|    policy_loss        | -1.19e-06 |
|    value_loss         | 0.000233  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.45      |
| time/                 |           |
|    fps                | 945       |
|    iterations         | 42100     |
|    time_elapsed       | 222       |
|    total_timesteps    | 210500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.58      |
| time/                 |           |
|    fps                | 946       |
|    iterations         | 43300     |
|    time_elapsed       | 228       |
|    total_timesteps    | 216500    |
| train/                |           |
|    entropy_loss       | -0.00216  |
|    explained_variance | -6.23     |
|    learning_rate      | 0.0007    |
|    n_updates          | 43299     |
|    policy_loss        | -3.67e-06 |
|    value_loss         | 0.000368  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 5.58     |
| time/                 |          |
|    fps                | 947      |
|    iterations         | 43400    |
|    time_elapsed       | 229      |
|    total_timesteps    | 217000   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.67      |
| time/                 |           |
|    fps                | 948       |
|    iterations         | 44600     |
|    time_elapsed       | 235       |
|    total_timesteps    | 223000    |
| train/                |           |
|    entropy_loss       | -0.000177 |
|    explained_variance | -20.4     |
|    learning_rate      | 0.0007    |
|    n_updates          | 44599     |
|    policy_loss        | -1.67e-09 |
|    value_loss         | 3.02e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.67      |
| time/                 |           |
|    fps                | 948       |
|    iterations         | 44700     |
|    time_elapsed       | 235       |
|    total_timesteps    | 223500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.84      |
| time/                 |           |
|    fps                | 949       |
|    iterations         | 45900     |
|    time_elapsed       | 241       |
|    total_timesteps    | 229500    |
| train/                |           |
|    entropy_loss       | -0.000177 |
|    explained_variance | -16.1     |
|    learning_rate      | 0.0007    |
|    n_updates          | 45899     |
|    policy_loss        | 1.12e-08  |
|    value_loss         | 6.53e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 5.84      |
| time/                 |           |
|    fps                | 949       |
|    iterations         | 46000     |
|    time_elapsed       | 242       |
|    total_timesteps    | 230000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 6.07      |
| time/                 |           |
|    fps                | 950       |
|    iterations         | 47200     |
|    time_elapsed       | 248       |
|    total_timesteps    | 236000    |
| train/                |           |
|    entropy_loss       | -0.000407 |
|    explained_variance | -44.3     |
|    learning_rate      | 0.0007    |
|    n_updates          | 47199     |
|    policy_loss        | -1.97e-08 |
|    value_loss         | 6.88e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 6.07      |
| time/                 |           |
|    fps                | 950       |
|    iterations         | 47300     |
|    time_elapsed       | 248       |
|    total_timesteps    | 236500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 6.07      |
| time/                 |           |
|    fps                | 951       |
|    iterations         | 48500     |
|    time_elapsed       | 254       |
|    total_timesteps    | 242500    |
| train/                |           |
|    entropy_loss       | -0.000183 |
|    explained_variance | -34.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 48499     |
|    policy_loss        | -2.34e-09 |
|    value_loss         | 7.56e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 6.3       |
| time/                 |           |
|    fps                | 951       |
|    iterations         | 48600     |
|    time_elapsed       | 255       |
|    total_timesteps    | 243000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 6.3       |
| time/                 |           |
|    fps                | 952       |
|    iterations         | 49800     |
|    time_elapsed       | 261       |
|    total_timesteps    | 249000    |
| train/                |           |
|    entropy_loss       | -0.000163 |
|    explained_variance | -34.5     |
|    learning_rate      | 0.0007    |
|    n_updates          | 49799     |
|    policy_loss        | 3.82e-08  |
|    value_loss         | 9.16e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 6.3       |
| time/                 |           |
|    fps                | 952       |
|    iterations         | 49900     |
|    time_elapsed       | 261       |
|    total_timesteps    | 249500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 6.54      |
| time/                 |           |
|    fps                | 952       |
|    iterations         | 51100     |
|    time_elapsed       | 268       |
|    total_timesteps    | 255500    |
| train/                |           |
|    entropy_loss       | -0.204    |
|    explained_variance | -1.33e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 51099     |
|    policy_loss        | -0.000866 |
|    value_loss         | 4.92e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 6.54      |
| time/                 |           |
|    fps                | 952       |
|    iterations         | 51200     |
|    time_elapsed       | 268       |
|    total_timesteps    | 256000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 6.87      |
| time/                 |           |
|    fps                | 953       |
|    iterations         | 52400     |
|    time_elapsed       | 274       |
|    total_timesteps    | 262000    |
| train/                |           |
|    entropy_loss       | -0.182    |
|    explained_variance | -3.18     |
|    learning_rate      | 0.0007    |
|    n_updates          | 52399     |
|    policy_loss        | -0.000939 |
|    value_loss         | 0.000446  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 6.87     |
| time/                 |          |
|    fps                | 953      |
|    iterations         | 52500    |
|    time_elapsed       | 275      |
|    total_timesteps    | 262500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 7.98      |
| time/                 |           |
|    fps                | 953       |
|    iterations         | 53700     |
|    time_elapsed       | 281       |
|    total_timesteps    | 268500    |
| train/                |           |
|    entropy_loss       | -0.0126   |
|    explained_variance | 0.245     |
|    learning_rate      | 0.0007    |
|    n_updates          | 53699     |
|    policy_loss        | -2.23e-05 |
|    value_loss         | 0.000326  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 7.98     |
| time/                 |          |
|    fps                | 953      |
|    iterations         | 53800    |
|    time_elapsed       | 282      |
|    total_timesteps    | 269000   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 8.53     |
| time/                 |          |
|    fps                | 954      |
|    iterations         | 55000    |
|    time_elapsed       | 288      |
|    total_timesteps    | 275000   |
| train/                |          |
|    entropy_loss       | -0.0601  |
|    explained_variance | -0.209   |
|    learning_rate      | 0.0007   |
|    n_updates          | 54999    |
|    policy_loss        | 0.00226  |
|    value_loss         | 0.0221   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 8.53     |
| time/                 |          |
|    fps                | 954      |
|    iterations         | 55100    |
|    time_elapsed       | 288      |
|    total_timesteps    | 275500   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 9.41     |
| time/                 |          |
|    fps                | 955      |
|    iterations         | 56300    |
|    time_elapsed       | 294      |
|    total_timesteps    | 281500   |
| train/                |          |
|    entropy_loss       | -0.00371 |
|    explained_variance | -2.48    |
|    learning_rate      | 0.0007   |
|    n_updates          | 56299    |
|    policy_loss        | 2.21e-05 |
|    value_loss         | 0.00207  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 9.41      |
| time/                 |           |
|    fps                | 955       |
|    iterations         | 56400     |
|    time_elapsed       | 295       |
|    total_timesteps    | 282000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 10.1      |
| time/                 |           |
|    fps                | 955       |
|    iterations         | 57600     |
|    time_elapsed       | 301       |
|    total_timesteps    | 288000    |
| train/                |           |
|    entropy_loss       | -0.098    |
|    explained_variance | 0.417     |
|    learning_rate      | 0.0007    |
|    n_updates          | 57599     |
|    policy_loss        | -0.000446 |
|    value_loss         | 0.000326  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 10.1      |
| time/                 |           |
|    fps                | 955       |
|    iterations         | 57700     |
|    time_elapsed       | 301       |
|    total_timesteps    | 288500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 10.9      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 58900     |
|    time_elapsed       | 307       |
|    total_timesteps    | 294500    |
| train/                |           |
|    entropy_loss       | -7.63e-05 |
|    explained_variance | 0.414     |
|    learning_rate      | 0.0007    |
|    n_updates          | 58899     |
|    policy_loss        | -7.55e-09 |
|    value_loss         | 3.5e-06   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 10.9     |
| time/                 |          |
|    fps                | 956      |
|    iterations         | 59000    |
|    time_elapsed       | 308      |
|    total_timesteps    | 295000   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 11.7     |
| time/                 |          |
|    fps                | 956      |
|    iterations         | 60200    |
|    time_elapsed       | 314      |
|    total_timesteps    | 301000   |
| train/                |          |
|    entropy_loss       | -0.0372  |
|    explained_variance | -0.0339  |
|    learning_rate      | 0.0007   |
|    n_updates          | 60199    |
|    policy_loss        | -0.00157 |
|    value_loss         | 0.00656  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 11.7     |
| time/                 |          |
|    fps                | 956      |
|    iterations         | 60300    |
|    time_elapsed       | 315      |
|    total_timesteps    | 301500   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 12.3     |
| time/                 |          |
|    fps                | 957      |
|    iterations         | 61500    |
|    time_elapsed       | 321      |
|    total_timesteps    | 307500   |
| train/                |          |
|    entropy_loss       | -0.374   |
|    explained_variance | -0.0274  |
|    learning_rate      | 0.0007   |
|    n_updates          | 61499    |
|    policy_loss        | 0.00417  |
|    value_loss         | 0.000699 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 12.3     |
| time/                 |          |
|    fps                | 957      |
|    iterations         | 61600    |
|    time_elapsed       | 321      |
|    total_timesteps    | 308000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 12.3     |
| time/                 |          |
|    fps                | 957      |
|    iterations         | 62800    |
|    time_elapsed       | 327      |
|    total_timesteps    | 314000   |
| train/                |          |
|    entropy_loss       | -0.382   |
|    explained_variance | 0.169    |
|    learning_rate      | 0.0007   |
|    n_updates          | 62799    |
|    policy_loss        | 0.00538  |
|    value_loss         | 0.000119 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 13.2     |
| time/                 |          |
|    fps                | 957      |
|    iterations         | 62900    |
|    time_elapsed       | 328      |
|    total_timesteps    | 314500   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 13.2      |
| time/                 |           |
|    fps                | 957       |
|    iterations         | 64100     |
|    time_elapsed       | 334       |
|    total_timesteps    | 320500    |
| train/                |           |
|    entropy_loss       | -0.00749  |
|    explained_variance | 0.0274    |
|    learning_rate      | 0.0007    |
|    n_updates          | 64099     |
|    policy_loss        | -4.04e-05 |
|    value_loss         | 0.000616  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 13.2      |
| time/                 |           |
|    fps                | 957       |
|    iterations         | 64200     |
|    time_elapsed       | 335       |
|    total_timesteps    | 321000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 13.8     |
| time/                 |          |
|    fps                | 958      |
|    iterations         | 65400    |
|    time_elapsed       | 341      |
|    total_timesteps    | 327000   |
| train/                |          |
|    entropy_loss       | -0.367   |
|    explained_variance | -2.73    |
|    learning_rate      | 0.0007   |
|    n_updates          | 65399    |
|    policy_loss        | -0.0256  |
|    value_loss         | 0.000905 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 13.8      |
| time/                 |           |
|    fps                | 958       |
|    iterations         | 65500     |
|    time_elapsed       | 341       |
|    total_timesteps    | 327500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 14.3      |
| time/                 |           |
|    fps                | 958       |
|    iterations         | 66700     |
|    time_elapsed       | 347       |
|    total_timesteps    | 333500    |
| train/                |           |
|    entropy_loss       | -0.0458   |
|    explained_variance | -2.38     |
|    learning_rate      | 0.0007    |
|    n_updates          | 66699     |
|    policy_loss        | -3.82e-05 |
|    value_loss         | 9.26e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 14.3      |
| time/                 |           |
|    fps                | 958       |
|    iterations         | 66800     |
|    time_elapsed       | 348       |
|    total_timesteps    | 334000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 14.9      |
| time/                 |           |
|    fps                | 958       |
|    iterations         | 68000     |
|    time_elapsed       | 354       |
|    total_timesteps    | 340000    |
| train/                |           |
|    entropy_loss       | -0.000218 |
|    explained_variance | -13.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 67999     |
|    policy_loss        | 3.82e-07  |
|    value_loss         | 0.00044   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 14.9      |
| time/                 |           |
|    fps                | 958       |
|    iterations         | 68100     |
|    time_elapsed       | 355       |
|    total_timesteps    | 340500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 15.3      |
| time/                 |           |
|    fps                | 959       |
|    iterations         | 69300     |
|    time_elapsed       | 361       |
|    total_timesteps    | 346500    |
| train/                |           |
|    entropy_loss       | -0.0158   |
|    explained_variance | -0.234    |
|    learning_rate      | 0.0007    |
|    n_updates          | 69299     |
|    policy_loss        | -5.43e-05 |
|    value_loss         | 0.000164  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 15.3      |
| time/                 |           |
|    fps                | 959       |
|    iterations         | 69400     |
|    time_elapsed       | 361       |
|    total_timesteps    | 347000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 15.4      |
| time/                 |           |
|    fps                | 959       |
|    iterations         | 70600     |
|    time_elapsed       | 367       |
|    total_timesteps    | 353000    |
| train/                |           |
|    entropy_loss       | -0.0324   |
|    explained_variance | 0.0803    |
|    learning_rate      | 0.0007    |
|    n_updates          | 70599     |
|    policy_loss        | -4.99e-05 |
|    value_loss         | 0.000226  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 15.4     |
| time/                 |          |
|    fps                | 959      |
|    iterations         | 70700    |
|    time_elapsed       | 368      |
|    total_timesteps    | 353500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 15.8      |
| time/                 |           |
|    fps                | 959       |
|    iterations         | 71900     |
|    time_elapsed       | 374       |
|    total_timesteps    | 359500    |
| train/                |           |
|    entropy_loss       | -1.55e-05 |
|    explained_variance | 0.32      |
|    learning_rate      | 0.0007    |
|    n_updates          | 71899     |
|    policy_loss        | 1.01e-08  |
|    value_loss         | 0.000486  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 15.8      |
| time/                 |           |
|    fps                | 959       |
|    iterations         | 72000     |
|    time_elapsed       | 375       |
|    total_timesteps    | 360000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 15.9      |
| time/                 |           |
|    fps                | 960       |
|    iterations         | 73200     |
|    time_elapsed       | 381       |
|    total_timesteps    | 366000    |
| train/                |           |
|    entropy_loss       | -7.73e-06 |
|    explained_variance | -2.12     |
|    learning_rate      | 0.0007    |
|    n_updates          | 73199     |
|    policy_loss        | 2.62e-08  |
|    value_loss         | 0.0024    |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 15.9     |
| time/                 |          |
|    fps                | 960      |
|    iterations         | 73300    |
|    time_elapsed       | 381      |
|    total_timesteps    | 366500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 16.3      |
| time/                 |           |
|    fps                | 960       |
|    iterations         | 74500     |
|    time_elapsed       | 387       |
|    total_timesteps    | 372500    |
| train/                |           |
|    entropy_loss       | -0.0912   |
|    explained_variance | -0.75     |
|    learning_rate      | 0.0007    |
|    n_updates          | 74499     |
|    policy_loss        | -0.000245 |
|    value_loss         | 0.00107   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 16.3      |
| time/                 |           |
|    fps                | 960       |
|    iterations         | 74600     |
|    time_elapsed       | 388       |
|    total_timesteps    | 373000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 16.7      |
| time/                 |           |
|    fps                | 960       |
|    iterations         | 75800     |
|    time_elapsed       | 394       |
|    total_timesteps    | 379000    |
| train/                |           |
|    entropy_loss       | -0.000617 |
|    explained_variance | -310      |
|    learning_rate      | 0.0007    |
|    n_updates          | 75799     |
|    policy_loss        | 1.17e-06  |
|    value_loss         | 0.000262  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 16.7     |
| time/                 |          |
|    fps                | 960      |
|    iterations         | 75900    |
|    time_elapsed       | 395      |
|    total_timesteps    | 379500   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 16.7     |
| time/                 |          |
|    fps                | 960      |
|    iterations         | 77100    |
|    time_elapsed       | 401      |
|    total_timesteps    | 385500   |
| train/                |          |
|    entropy_loss       | -0.00161 |
|    explained_variance | 0.308    |
|    learning_rate      | 0.0007   |
|    n_updates          | 77099    |
|    policy_loss        | 2.24e-05 |
|    value_loss         | 0.00765  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 17.3     |
| time/                 |          |
|    fps                | 960      |
|    iterations         | 77200    |
|    time_elapsed       | 401      |
|    total_timesteps    | 386000   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 17.3      |
| time/                 |           |
|    fps                | 960       |
|    iterations         | 78400     |
|    time_elapsed       | 407       |
|    total_timesteps    | 392000    |
| train/                |           |
|    entropy_loss       | -6.07e-06 |
|    explained_variance | -1.08     |
|    learning_rate      | 0.0007    |
|    n_updates          | 78399     |
|    policy_loss        | -1.6e-09  |
|    value_loss         | 0.000163  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 17.3      |
| time/                 |           |
|    fps                | 961       |
|    iterations         | 78500     |
|    time_elapsed       | 408       |
|    total_timesteps    | 392500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 17.7     |
| time/                 |          |
|    fps                | 961      |
|    iterations         | 79700    |
|    time_elapsed       | 414      |
|    total_timesteps    | 398500   |
| train/                |          |
|    entropy_loss       | -0.0185  |
|    explained_variance | 0.326    |
|    learning_rate      | 0.0007   |
|    n_updates          | 79699    |
|    policy_loss        | 0.000569 |
|    value_loss         | 0.0124   |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 17.7      |
| time/                 |           |
|    fps                | 961       |
|    iterations         | 79800     |
|    time_elapsed       | 415       |
|    total_timesteps    | 399000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 18.1      |
| time/                 |           |
|    fps                | 961       |
|    iterations         | 81000     |
|    time_elapsed       | 421       |
|    total_timesteps    | 405000    |
| train/                |           |
|    entropy_loss       | -0.0031   |
|    explained_variance | -8.68     |
|    learning_rate      | 0.0007    |
|    n_updates          | 80999     |
|    policy_loss        | -2.63e-05 |
|    value_loss         | 0.00295   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 18.1     |
| time/                 |          |
|    fps                | 961      |
|    iterations         | 81100    |
|    time_elapsed       | 421      |
|    total_timesteps    | 405500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 18.4      |
| time/                 |           |
|    fps                | 961       |
|    iterations         | 82300     |
|    time_elapsed       | 428       |
|    total_timesteps    | 411500    |
| train/                |           |
|    entropy_loss       | -0.000261 |
|    explained_variance | -0.413    |
|    learning_rate      | 0.0007    |
|    n_updates          | 82299     |
|    policy_loss        | -3.58e-07 |
|    value_loss         | 0.000392  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 18.4      |
| time/                 |           |
|    fps                | 961       |
|    iterations         | 82400     |
|    time_elapsed       | 428       |
|    total_timesteps    | 412000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 18.7     |
| time/                 |          |
|    fps                | 961      |
|    iterations         | 83600    |
|    time_elapsed       | 434      |
|    total_timesteps    | 418000   |
| train/                |          |
|    entropy_loss       | -0.25    |
|    explained_variance | -3.62    |
|    learning_rate      | 0.0007   |
|    n_updates          | 83599    |
|    policy_loss        | 0.00915  |
|    value_loss         | 0.00132  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 18.7     |
| time/                 |          |
|    fps                | 961      |
|    iterations         | 83700    |
|    time_elapsed       | 435      |
|    total_timesteps    | 418500   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 19       |
| time/                 |          |
|    fps                | 961      |
|    iterations         | 84900    |
|    time_elapsed       | 441      |
|    total_timesteps    | 424500   |
| train/                |          |
|    entropy_loss       | -0.164   |
|    explained_variance | 0.572    |
|    learning_rate      | 0.0007   |
|    n_updates          | 84899    |
|    policy_loss        | -0.00286 |
|    value_loss         | 0.000271 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 19        |
| time/                 |           |
|    fps                | 961       |
|    iterations         | 85000     |
|    time_elapsed       | 441       |
|    total_timesteps    | 425000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 19.4      |
| time/                 |           |
|    fps                | 961       |
|    iterations         | 86200     |
|    time_elapsed       | 448       |
|    total_timesteps    | 431000    |
| train/                |           |
|    entropy_loss       | -0.0933   |
|    explained_variance | 0.905     |
|    learning_rate      | 0.0007    |
|    n_updates          | 86199     |
|    policy_loss        | -0.000446 |
|    value_loss         | 0.00032   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 19.4     |
| time/                 |          |
|    fps                | 961      |
|    iterations         | 86300    |
|    time_elapsed       | 448      |
|    total_timesteps    | 431500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 19.7      |
| time/                 |           |
|    fps                | 961       |
|    iterations         | 87500     |
|    time_elapsed       | 454       |
|    total_timesteps    | 437500    |
| train/                |           |
|    entropy_loss       | -0.164    |
|    explained_variance | -0.0394   |
|    learning_rate      | 0.0007    |
|    n_updates          | 87499     |
|    policy_loss        | -0.000885 |
|    value_loss         | 0.000556  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 19.7      |
| time/                 |           |
|    fps                | 962       |
|    iterations         | 87600     |
|    time_elapsed       | 455       |
|    total_timesteps    | 438000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.1      |
| time/                 |           |
|    fps                | 962       |
|    iterations         | 88800     |
|    time_elapsed       | 461       |
|    total_timesteps    | 444000    |
| train/                |           |
|    entropy_loss       | -0.000945 |
|    explained_variance | 0.293     |
|    learning_rate      | 0.0007    |
|    n_updates          | 88799     |
|    policy_loss        | -1.2e-07  |
|    value_loss         | 0.00304   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.1      |
| time/                 |           |
|    fps                | 962       |
|    iterations         | 88900     |
|    time_elapsed       | 461       |
|    total_timesteps    | 444500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.3      |
| time/                 |           |
|    fps                | 962       |
|    iterations         | 90100     |
|    time_elapsed       | 468       |
|    total_timesteps    | 450500    |
| train/                |           |
|    entropy_loss       | -2.11e-06 |
|    explained_variance | -9.92     |
|    learning_rate      | 0.0007    |
|    n_updates          | 90099     |
|    policy_loss        | -0        |
|    value_loss         | 5.6e-07   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.3      |
| time/                 |           |
|    fps                | 962       |
|    iterations         | 90200     |
|    time_elapsed       | 468       |
|    total_timesteps    | 451000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.3      |
| time/                 |           |
|    fps                | 960       |
|    iterations         | 91400     |
|    time_elapsed       | 475       |
|    total_timesteps    | 457000    |
| train/                |           |
|    entropy_loss       | -2.75e-06 |
|    explained_variance | -216      |
|    learning_rate      | 0.0007    |
|    n_updates          | 91399     |
|    policy_loss        | -0        |
|    value_loss         | 6.39e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 20.2     |
| time/                 |          |
|    fps                | 959      |
|    iterations         | 91500    |
|    time_elapsed       | 476      |
|    total_timesteps    | 457500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.2      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 92700     |
|    time_elapsed       | 484       |
|    total_timesteps    | 463500    |
| train/                |           |
|    entropy_loss       | -8.89e-06 |
|    explained_variance | -1.69     |
|    learning_rate      | 0.0007    |
|    n_updates          | 92699     |
|    policy_loss        | -4.08e-09 |
|    value_loss         | 0.00059   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.2      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 92800     |
|    time_elapsed       | 485       |
|    total_timesteps    | 464000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.4      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 94000     |
|    time_elapsed       | 491       |
|    total_timesteps    | 470000    |
| train/                |           |
|    entropy_loss       | -2.55e-06 |
|    explained_variance | 0.36      |
|    learning_rate      | 0.0007    |
|    n_updates          | 93999     |
|    policy_loss        | -0        |
|    value_loss         | 0.0032    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.4      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 94100     |
|    time_elapsed       | 491       |
|    total_timesteps    | 470500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.7      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 95300     |
|    time_elapsed       | 498       |
|    total_timesteps    | 476500    |
| train/                |           |
|    entropy_loss       | -0.00181  |
|    explained_variance | -1.4      |
|    learning_rate      | 0.0007    |
|    n_updates          | 95299     |
|    policy_loss        | -2.19e-06 |
|    value_loss         | 0.000329  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 20.7      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 95400     |
|    time_elapsed       | 498       |
|    total_timesteps    | 477000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 21        |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 96600     |
|    time_elapsed       | 505       |
|    total_timesteps    | 483000    |
| train/                |           |
|    entropy_loss       | -0.00766  |
|    explained_variance | -4.3      |
|    learning_rate      | 0.0007    |
|    n_updates          | 96599     |
|    policy_loss        | -1.68e-05 |
|    value_loss         | 0.000454  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 21        |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 96700     |
|    time_elapsed       | 505       |
|    total_timesteps    | 483500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 21.2      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 97900     |
|    time_elapsed       | 511       |
|    total_timesteps    | 489500    |
| train/                |           |
|    entropy_loss       | -0.000782 |
|    explained_variance | -0.327    |
|    learning_rate      | 0.0007    |
|    n_updates          | 97899     |
|    policy_loss        | -4.97e-06 |
|    value_loss         | 0.00165   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 21.2      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 98000     |
|    time_elapsed       | 512       |
|    total_timesteps    | 490000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 21.5     |
| time/                 |          |
|    fps                | 956      |
|    iterations         | 99200    |
|    time_elapsed       | 518      |
|    total_timesteps    | 496000   |
| train/                |          |
|    entropy_loss       | -0.0214  |
|    explained_variance | -45.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99199    |
|    policy_loss        | 4.11e-05 |
|    value_loss         | 0.00472  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 21.5      |
| time/                 |           |
|    fps                | 956       |
|    iterations         | 99300     |
|    time_elapsed       | 519       |
|    total_timesteps    | 496500    |
| train/                |    

Step: 330, Action: 2, Reward: 0
Step: 331, Action: 2, Reward: 0
Step: 332, Action: 2, Reward: 0
Step: 333, Action: 2, Reward: 0
Step: 334, Action: 2, Reward: 0
Step: 335, Action: 2, Reward: 0
Step: 336, Action: 2, Reward: 0
Step: 337, Action: 2, Reward: 0
Step: 338, Action: 2, Reward: 0
Step: 339, Action: 2, Reward: 0
Step: 340, Action: 2, Reward: 0
Step: 341, Action: 2, Reward: 0
Step: 342, Action: 2, Reward: 0
Step: 343, Action: 2, Reward: 0
Step: 344, Action: 2, Reward: 0
Step: 345, Action: 2, Reward: 0
Step: 346, Action: 2, Reward: 0
Step: 347, Action: 2, Reward: 0
Step: 348, Action: 2, Reward: 0
Step: 349, Action: 2, Reward: 0
Step: 350, Action: 2, Reward: 0
Step: 351, Action: 2, Reward: 0
Step: 352, Action: 2, Reward: 0
Step: 353, Action: 2, Reward: 0
Step: 354, Action: 2, Reward: 0
Step: 355, Action: 2, Reward: 0
Step: 356, Action: 2, Reward: 0
Step: 357, Action: 2, Reward: 0
Step: 358, Action: 2, Reward: 0
Step: 359, Action: 2, Reward: 0
Step: 360, Action: 2, Reward: 0
Step: 36

Step: 736, Action: 2, Reward: 0
Step: 737, Action: 2, Reward: 0
Step: 738, Action: 2, Reward: 0
Step: 739, Action: 2, Reward: 0
Step: 740, Action: 2, Reward: 0
Step: 741, Action: 2, Reward: 0
Step: 742, Action: 2, Reward: 0
Step: 743, Action: 2, Reward: 0
Step: 744, Action: 2, Reward: 0
Step: 745, Action: 2, Reward: 0
Step: 746, Action: 2, Reward: 0
Step: 747, Action: 2, Reward: 0
Step: 748, Action: 2, Reward: 0
Step: 749, Action: 2, Reward: 0
Step: 750, Action: 2, Reward: 0
Step: 751, Action: 2, Reward: 0
Step: 752, Action: 2, Reward: 0
Step: 753, Action: 2, Reward: 0
Step: 754, Action: 2, Reward: 0
Step: 755, Action: 2, Reward: 0
Step: 756, Action: 2, Reward: 0
Step: 757, Action: 2, Reward: 0
Step: 758, Action: 2, Reward: 0
Step: 759, Action: 2, Reward: 0
Step: 760, Action: 2, Reward: 0
Step: 761, Action: 2, Reward: 0
Step: 762, Action: 2, Reward: 0
Step: 763, Action: 2, Reward: 0
Step: 764, Action: 2, Reward: 0
Step: 765, Action: 2, Reward: 0
Step: 766, Action: 2, Reward: 0
Step: 76

Step: 1167, Action: 2, Reward: 0
Step: 1168, Action: 0, Reward: 0
Step: 1169, Action: 2, Reward: 0
Step: 1170, Action: 2, Reward: 0
Step: 1171, Action: 2, Reward: 0
Step: 1172, Action: 2, Reward: 0
Step: 1173, Action: 2, Reward: 0
Step: 1174, Action: 1, Reward: 0.030876393576918182
Step: 1175, Action: 2, Reward: 0
Step: 1176, Action: 2, Reward: 0
Step: 1177, Action: 2, Reward: 0
Step: 1178, Action: 2, Reward: 0
Step: 1179, Action: 1, Reward: 0.025598745643345633
Step: 1180, Action: 2, Reward: 0
Step: 1181, Action: 2, Reward: 0
Step: 1182, Action: 1, Reward: -0.005981015337230389
Step: 1183, Action: 2, Reward: 0
Step: 1184, Action: 2, Reward: 0
Step: 1185, Action: 2, Reward: 0
Step: 1186, Action: 2, Reward: 0
Step: 1187, Action: 2, Reward: 0
Step: 1188, Action: 2, Reward: 0
Step: 1189, Action: 2, Reward: 0
Step: 1190, Action: 2, Reward: 0
Step: 1191, Action: 2, Reward: 0
Step: 1192, Action: 2, Reward: 0
Step: 1193, Action: 2, Reward: 0
Step: 1194, Action: 2, Reward: 0
Step: 1195, Action

Step: 1603, Action: 2, Reward: 0
Step: 1604, Action: 2, Reward: 0
Step: 1605, Action: 2, Reward: 0
Step: 1606, Action: 2, Reward: 0
Step: 1607, Action: 2, Reward: 0
Step: 1608, Action: 2, Reward: 0
Step: 1609, Action: 2, Reward: 0
Step: 1610, Action: 2, Reward: 0
Step: 1611, Action: 2, Reward: 0
Step: 1612, Action: 2, Reward: 0
Step: 1613, Action: 2, Reward: 0
Step: 1614, Action: 2, Reward: 0
Step: 1615, Action: 2, Reward: 0
Step: 1616, Action: 2, Reward: 0
Step: 1617, Action: 2, Reward: 0
Step: 1618, Action: 2, Reward: 0
Step: 1619, Action: 2, Reward: 0
Step: 1620, Action: 2, Reward: 0
Step: 1621, Action: 2, Reward: 0
Step: 1622, Action: 2, Reward: 0
Step: 1623, Action: 2, Reward: 0
Step: 1624, Action: 2, Reward: 0
Step: 1625, Action: 2, Reward: 0
Step: 1626, Action: 2, Reward: 0
Step: 1627, Action: 2, Reward: 0
Step: 1628, Action: 2, Reward: 0
Step: 1629, Action: 2, Reward: 0
Step: 1630, Action: 2, Reward: 0
Step: 1631, Action: 2, Reward: 0
Step: 1632, Action: 2, Reward: 0
Step: 1633

Step: 2033, Action: 1, Reward: 0
Step: 2034, Action: 1, Reward: 0
Step: 2035, Action: 1, Reward: 0
Step: 2036, Action: 2, Reward: 0.07057598097931761
Step: 2037, Action: 2, Reward: 0
Step: 2038, Action: 2, Reward: 0
Step: 2039, Action: 2, Reward: 0
Step: 2040, Action: 2, Reward: 0
Step: 2041, Action: 0, Reward: 0
Step: 2042, Action: 2, Reward: 0
Step: 2043, Action: 2, Reward: 0
Step: 2044, Action: 2, Reward: 0
Step: 2045, Action: 2, Reward: 0
Step: 2046, Action: 2, Reward: 0
Step: 2047, Action: 2, Reward: 0
Step: 2048, Action: 2, Reward: 0
Step: 2049, Action: 2, Reward: 0
Step: 2050, Action: 2, Reward: 0
Step: 2051, Action: 1, Reward: 0.6579116870953364
Step: 2052, Action: 2, Reward: 0
Step: 2053, Action: 2, Reward: 0
Step: 2054, Action: 2, Reward: 0
Step: 2055, Action: 2, Reward: 0
Step: 2056, Action: 1, Reward: -0.13397474355395927
Step: 2057, Action: 2, Reward: 0
Step: 2058, Action: 2, Reward: 0
Step: 2059, Action: 1, Reward: -0.04425951349550432
Step: 2060, Action: 2, Reward: 0
Ste

Step: 2451, Action: 2, Reward: 0
Step: 2452, Action: 2, Reward: 0
Step: 2453, Action: 1, Reward: 0.00035886092023418303
Step: 2454, Action: 1, Reward: 0
Step: 2455, Action: 2, Reward: -0.024163301962410316
Step: 2456, Action: 2, Reward: 0
Step: 2457, Action: 1, Reward: -0.007894940245144338
Step: 2458, Action: 1, Reward: 0
Step: 2459, Action: 2, Reward: 0.0050240528832736775
Step: 2460, Action: 0, Reward: 0
Step: 2461, Action: 2, Reward: 0
Step: 2462, Action: 2, Reward: 0
Step: 2463, Action: 1, Reward: 0.0017943046011688335
Step: 2464, Action: 0, Reward: 0
Step: 2465, Action: 2, Reward: 0
Step: 2466, Action: 1, Reward: 0.002392406134891986
Step: 2467, Action: 1, Reward: 0
Step: 2468, Action: 1, Reward: 0
Step: 2469, Action: 1, Reward: 0
Step: 2470, Action: 2, Reward: 0.3423533179030635
Step: 2471, Action: 2, Reward: 0
Step: 2472, Action: 1, Reward: 0.0666285108567452
Step: 2473, Action: 2, Reward: 0
Step: 2474, Action: 1, Reward: 0.012201271287949678
Step: 2475, Action: 2, Reward: 0
St

Step: 2873, Action: 1, Reward: 0
Step: 2874, Action: 1, Reward: 0
Step: 2875, Action: 0, Reward: 0
Step: 2876, Action: 0, Reward: 0
Step: 2877, Action: 1, Reward: 0
Step: 2878, Action: 1, Reward: 0
Step: 2879, Action: 2, Reward: 0.12180457354576288
Step: 2880, Action: 2, Reward: 0
Step: 2881, Action: 2, Reward: 0
Step: 2882, Action: 1, Reward: 0.03822466902023949
Step: 2883, Action: 1, Reward: 0
Step: 2884, Action: 2, Reward: 0.028963664872071276
Step: 2885, Action: 2, Reward: 0
Step: 2886, Action: 1, Reward: 0.0700986959554061
Step: 2887, Action: 1, Reward: 0
Step: 2888, Action: 1, Reward: 0
Step: 2889, Action: 1, Reward: 0
Step: 2890, Action: 1, Reward: 0
Step: 2891, Action: 0, Reward: 0
Step: 2892, Action: 1, Reward: 0
Step: 2893, Action: 1, Reward: 0
Step: 2894, Action: 1, Reward: 0
Step: 2895, Action: 2, Reward: 0.09725130938336513
Step: 2896, Action: 1, Reward: 0
Step: 2897, Action: 1, Reward: 0
Step: 2898, Action: 1, Reward: 0
Step: 2899, Action: 1, Reward: 0
Step: 2900, Action:

Step: 3295, Action: 2, Reward: 0
Step: 3296, Action: 2, Reward: 0
Step: 3297, Action: 1, Reward: -0.05251331466088216
Step: 3298, Action: 2, Reward: 0
Step: 3299, Action: 2, Reward: 0
Step: 3300, Action: 2, Reward: 0
Step: 3301, Action: 1, Reward: 0.023325959815198166
Step: 3302, Action: 2, Reward: 0
Step: 3303, Action: 2, Reward: 0
Step: 3304, Action: 2, Reward: 0
Step: 3305, Action: 2, Reward: 0
Step: 3306, Action: 1, Reward: 0.06016901429253682
Step: 3307, Action: 1, Reward: 0
Step: 3308, Action: 2, Reward: -0.0008971523005850968
Step: 3309, Action: 2, Reward: 0
Step: 3310, Action: 1, Reward: -0.01100506822050401
Step: 3311, Action: 2, Reward: 0
Step: 3312, Action: 1, Reward: -0.0029905076686151666
Step: 3313, Action: 2, Reward: 0
Step: 3314, Action: 2, Reward: 0
Step: 3315, Action: 2, Reward: 0
Step: 3316, Action: 2, Reward: 0
Step: 3317, Action: 0, Reward: 0
Step: 3318, Action: 2, Reward: 0
Step: 3319, Action: 2, Reward: 0
Step: 3320, Action: 0, Reward: 0
Step: 3321, Action: 2, Re

Step: 3718, Action: 1, Reward: 0
Step: 3719, Action: 1, Reward: 0
Step: 3720, Action: 1, Reward: 0
Step: 3721, Action: 1, Reward: 0
Step: 3722, Action: 1, Reward: 0
Step: 3723, Action: 2, Reward: -0.037560776317806355
Step: 3724, Action: 1, Reward: 0
Step: 3725, Action: 1, Reward: 0
Step: 3726, Action: 1, Reward: 0
Step: 3727, Action: 1, Reward: 0
Step: 3728, Action: 1, Reward: 0
Step: 3729, Action: 1, Reward: 0
Step: 3730, Action: 1, Reward: 0
Step: 3731, Action: 1, Reward: 0
Step: 3732, Action: 1, Reward: 0
Step: 3733, Action: 1, Reward: 0
Step: 3734, Action: 1, Reward: 0
Step: 3735, Action: 1, Reward: 0
Step: 3736, Action: 1, Reward: 0
Step: 3737, Action: 1, Reward: 0
Step: 3738, Action: 1, Reward: 0
Step: 3739, Action: 1, Reward: 0
Step: 3740, Action: 1, Reward: 0
Step: 3741, Action: 1, Reward: 0
Step: 3742, Action: 1, Reward: 0
Step: 3743, Action: 1, Reward: 0
Step: 3744, Action: 1, Reward: 0
Step: 3745, Action: 1, Reward: 0
Step: 3746, Action: 1, Reward: 0
Step: 3747, Action: 1, 

Step: 4146, Action: 1, Reward: 0
Step: 4147, Action: 1, Reward: 0
Step: 4148, Action: 1, Reward: 0
Step: 4149, Action: 1, Reward: 0
Step: 4150, Action: 1, Reward: 0
Step: 4151, Action: 1, Reward: 0
Step: 4152, Action: 1, Reward: 0
Step: 4153, Action: 1, Reward: 0
Step: 4154, Action: 1, Reward: 0
Step: 4155, Action: 1, Reward: 0
Step: 4156, Action: 1, Reward: 0
Step: 4157, Action: 1, Reward: 0
Step: 4158, Action: 1, Reward: 0
Step: 4159, Action: 1, Reward: 0
Step: 4160, Action: 1, Reward: 0
Step: 4161, Action: 1, Reward: 0
Step: 4162, Action: 1, Reward: 0
Step: 4163, Action: 1, Reward: 0
Step: 4164, Action: 1, Reward: 0
Step: 4165, Action: 1, Reward: 0
Step: 4166, Action: 1, Reward: 0
Step: 4167, Action: 1, Reward: 0
Step: 4168, Action: 1, Reward: 0
Step: 4169, Action: 1, Reward: 0
Step: 4170, Action: 1, Reward: 0
Step: 4171, Action: 1, Reward: 0
Step: 4172, Action: 1, Reward: 0
Step: 4173, Action: 1, Reward: 0
Step: 4174, Action: 1, Reward: 0
Step: 4175, Action: 1, Reward: 0
Step: 4176

Step: 4576, Action: 1, Reward: 0
Step: 4577, Action: 1, Reward: 0
Step: 4578, Action: 1, Reward: 0
Step: 4579, Action: 1, Reward: 0
Step: 4580, Action: 1, Reward: 0
Step: 4581, Action: 1, Reward: 0
Step: 4582, Action: 1, Reward: 0
Step: 4583, Action: 1, Reward: 0
Step: 4584, Action: 1, Reward: 0
Step: 4585, Action: 1, Reward: 0
Step: 4586, Action: 1, Reward: 0
Step: 4587, Action: 1, Reward: 0
Step: 4588, Action: 1, Reward: 0
Step: 4589, Action: 1, Reward: 0
Step: 4590, Action: 1, Reward: 0
Step: 4591, Action: 1, Reward: 0
Step: 4592, Action: 1, Reward: 0
Step: 4593, Action: 1, Reward: 0
Step: 4594, Action: 1, Reward: 0
Step: 4595, Action: 1, Reward: 0
Step: 4596, Action: 1, Reward: 0
Step: 4597, Action: 1, Reward: 0
Step: 4598, Action: 1, Reward: 0
Step: 4599, Action: 1, Reward: 0
Step: 4600, Action: 1, Reward: 0
Step: 4601, Action: 1, Reward: 0
Step: 4602, Action: 1, Reward: 0
Step: 4603, Action: 1, Reward: 0
Step: 4604, Action: 1, Reward: 0
Step: 4605, Action: 2, Reward: 0.1704589371

Step: 5004, Action: 0, Reward: 0
Step: 5005, Action: 1, Reward: -0.00023924061348901127
Step: 5006, Action: 1, Reward: 0
Step: 5007, Action: 2, Reward: 0.048685464845054816
Step: 5008, Action: 2, Reward: 0
Step: 5009, Action: 2, Reward: 0
Step: 5010, Action: 2, Reward: 0
Step: 5011, Action: 1, Reward: 0.03744115601106182
Step: 5012, Action: 2, Reward: 0
Step: 5013, Action: 1, Reward: 0.024401346372832172
Step: 5014, Action: 1, Reward: 0
Step: 5015, Action: 2, Reward: -0.005742970926808477
Step: 5016, Action: 1, Reward: 0
Step: 5017, Action: 1, Reward: 0
Step: 5018, Action: 1, Reward: 0
Step: 5019, Action: 1, Reward: 0
Step: 5020, Action: 2, Reward: 0.0594512924520698
Step: 5021, Action: 1, Reward: 0
Step: 5022, Action: 1, Reward: 0
Step: 5023, Action: 2, Reward: 0.029905076686151652
Step: 5024, Action: 1, Reward: 0
Step: 5025, Action: 2, Reward: -0.008611465882544175
Step: 5026, Action: 2, Reward: 0
Step: 5027, Action: 2, Reward: 0
Step: 5028, Action: 2, Reward: 0
Step: 5029, Action: 2

Step: 5425, Action: 1, Reward: 0
Step: 5426, Action: 1, Reward: 0
Step: 5427, Action: 1, Reward: 0
Step: 5428, Action: 1, Reward: 0
Step: 5429, Action: 1, Reward: 0
Step: 5430, Action: 1, Reward: 0
Step: 5431, Action: 1, Reward: 0
Step: 5432, Action: 1, Reward: 0
Step: 5433, Action: 1, Reward: 0
Step: 5434, Action: 1, Reward: 0
Step: 5435, Action: 1, Reward: 0
Step: 5436, Action: 1, Reward: 0
Step: 5437, Action: 1, Reward: 0
Step: 5438, Action: 1, Reward: 0
Step: 5439, Action: 1, Reward: 0
Step: 5440, Action: 1, Reward: 0
Step: 5441, Action: 1, Reward: 0
Step: 5442, Action: 1, Reward: 0
Step: 5443, Action: 1, Reward: 0
Step: 5444, Action: 1, Reward: 0
Step: 5445, Action: 1, Reward: 0
Step: 5446, Action: 1, Reward: 0
Step: 5447, Action: 1, Reward: 0
Step: 5448, Action: 1, Reward: 0
Step: 5449, Action: 1, Reward: 0
Step: 5450, Action: 1, Reward: 0
Step: 5451, Action: 1, Reward: 0
Step: 5452, Action: 1, Reward: 0
Step: 5453, Action: 1, Reward: 0
Step: 5454, Action: 1, Reward: 0
Step: 5455

Step: 5854, Action: 1, Reward: 0
Step: 5855, Action: 1, Reward: 0
Step: 5856, Action: 1, Reward: 0
Step: 5857, Action: 1, Reward: 0
Step: 5858, Action: 1, Reward: 0
Step: 5859, Action: 1, Reward: 0
Step: 5860, Action: 1, Reward: 0
Step: 5861, Action: 1, Reward: 0
Step: 5862, Action: 1, Reward: 0
Step: 5863, Action: 1, Reward: 0
Step: 5864, Action: 1, Reward: 0
Step: 5865, Action: 1, Reward: 0
Step: 5866, Action: 1, Reward: 0
Step: 5867, Action: 1, Reward: 0
Step: 5868, Action: 1, Reward: 0
Step: 5869, Action: 1, Reward: 0
Step: 5870, Action: 1, Reward: 0
Step: 5871, Action: 1, Reward: 0
Step: 5872, Action: 1, Reward: 0
Step: 5873, Action: 1, Reward: 0
Step: 5874, Action: 1, Reward: 0
Step: 5875, Action: 1, Reward: 0
Step: 5876, Action: 1, Reward: 0
Step: 5877, Action: 1, Reward: 0
Step: 5878, Action: 1, Reward: 0
Step: 5879, Action: 1, Reward: 0
Step: 5880, Action: 1, Reward: 0
Step: 5881, Action: 1, Reward: 0
Step: 5882, Action: 2, Reward: 0.0972513093833651
Step: 5883, Action: 1, Rew

Step: 6293, Action: 1, Reward: 0
Step: 6294, Action: 1, Reward: 0
Step: 6295, Action: 1, Reward: 0
Step: 6296, Action: 1, Reward: 0
Step: 6297, Action: 1, Reward: 0
Step: 6298, Action: 2, Reward: 0.8105471985014542
Step: 6299, Action: 1, Reward: 0
Step: 6300, Action: 1, Reward: 0
Step: 6301, Action: 1, Reward: 0
Step: 6302, Action: 1, Reward: 0
Step: 6303, Action: 1, Reward: 0
Step: 6304, Action: 1, Reward: 0
Step: 6305, Action: 1, Reward: 0
Step: 6306, Action: 1, Reward: 0
Step: 6307, Action: 1, Reward: 0
Step: 6308, Action: 1, Reward: 0
Step: 6309, Action: 1, Reward: 0
Step: 6310, Action: 1, Reward: 0
Step: 6311, Action: 1, Reward: 0
Step: 6312, Action: 1, Reward: 0
Step: 6313, Action: 1, Reward: 0
Step: 6314, Action: 1, Reward: 0
Step: 6315, Action: 1, Reward: 0
Step: 6316, Action: 1, Reward: 0
Step: 6317, Action: 1, Reward: 0
Step: 6318, Action: 1, Reward: 0
Step: 6319, Action: 1, Reward: 0
Step: 6320, Action: 1, Reward: 0
Step: 6321, Action: 1, Reward: 0
Step: 6322, Action: 1, Rew

Step: 6730, Action: 1, Reward: 0
Step: 6731, Action: 1, Reward: 0
Step: 6732, Action: 1, Reward: 0
Step: 6733, Action: 1, Reward: 0
Step: 6734, Action: 1, Reward: 0
Step: 6735, Action: 1, Reward: 0
Step: 6736, Action: 1, Reward: 0
Step: 6737, Action: 1, Reward: 0
Step: 6738, Action: 1, Reward: 0
Step: 6739, Action: 1, Reward: 0
Step: 6740, Action: 1, Reward: 0
Step: 6741, Action: 1, Reward: 0
Step: 6742, Action: 1, Reward: 0
Step: 6743, Action: 1, Reward: 0
Step: 6744, Action: 1, Reward: 0
Step: 6745, Action: 1, Reward: 0
Step: 6746, Action: 1, Reward: 0
Step: 6747, Action: 1, Reward: 0
Step: 6748, Action: 1, Reward: 0
Step: 6749, Action: 1, Reward: 0
Step: 6750, Action: 1, Reward: 0
Step: 6751, Action: 1, Reward: 0
Step: 6752, Action: 1, Reward: 0
Step: 6753, Action: 1, Reward: 0
Step: 6754, Action: 1, Reward: 0
Step: 6755, Action: 1, Reward: 0
Step: 6756, Action: 1, Reward: 0
Step: 6757, Action: 1, Reward: 0
Step: 6758, Action: 1, Reward: 0
Step: 6759, Action: 1, Reward: 0
Step: 6760

Step: 203, Action: 2, Reward: 0
Step: 204, Action: 2, Reward: 0
Step: 205, Action: 2, Reward: 0
Step: 206, Action: 2, Reward: 0
Step: 207, Action: 2, Reward: 0
Step: 208, Action: 2, Reward: 0
Step: 209, Action: 2, Reward: 0
Step: 210, Action: 2, Reward: 0
Step: 211, Action: 2, Reward: 0
Step: 212, Action: 1, Reward: 0.2881653189477569
Step: 213, Action: 2, Reward: 0
Step: 214, Action: 2, Reward: 0
Step: 215, Action: 2, Reward: 0
Step: 216, Action: 2, Reward: 0
Step: 217, Action: 2, Reward: 0
Step: 218, Action: 2, Reward: 0
Step: 219, Action: 2, Reward: 0
Step: 220, Action: 2, Reward: 0
Step: 221, Action: 2, Reward: 0
Step: 222, Action: 2, Reward: 0
Step: 223, Action: 2, Reward: 0
Step: 224, Action: 2, Reward: 0
Step: 225, Action: 2, Reward: 0
Step: 226, Action: 2, Reward: 0
Step: 227, Action: 2, Reward: 0
Step: 228, Action: 2, Reward: 0
Step: 229, Action: 2, Reward: 0
Step: 230, Action: 2, Reward: 0
Step: 231, Action: 2, Reward: 0
Step: 232, Action: 2, Reward: 0
Step: 233, Action: 2, R

Step: 631, Action: 2, Reward: 0
Step: 632, Action: 2, Reward: 0
Step: 633, Action: 2, Reward: 0
Step: 634, Action: 2, Reward: 0
Step: 635, Action: 2, Reward: 0
Step: 636, Action: 2, Reward: 0
Step: 637, Action: 2, Reward: 0
Step: 638, Action: 2, Reward: 0
Step: 639, Action: 2, Reward: 0
Step: 640, Action: 2, Reward: 0
Step: 641, Action: 2, Reward: 0
Step: 642, Action: 2, Reward: 0
Step: 643, Action: 2, Reward: 0
Step: 644, Action: 2, Reward: 0
Step: 645, Action: 2, Reward: 0
Step: 646, Action: 2, Reward: 0
Step: 647, Action: 2, Reward: 0
Step: 648, Action: 2, Reward: 0
Step: 649, Action: 2, Reward: 0
Step: 650, Action: 2, Reward: 0
Step: 651, Action: 2, Reward: 0
Step: 652, Action: 2, Reward: 0
Step: 653, Action: 2, Reward: 0
Step: 654, Action: 2, Reward: 0
Step: 655, Action: 2, Reward: 0
Step: 656, Action: 2, Reward: 0
Step: 657, Action: 2, Reward: 0
Step: 658, Action: 2, Reward: 0
Step: 659, Action: 2, Reward: 0
Step: 660, Action: 2, Reward: 0
Step: 661, Action: 2, Reward: 0
Step: 66

Step: 1068, Action: 2, Reward: 0
Step: 1069, Action: 2, Reward: 0
Step: 1070, Action: 2, Reward: 0
Step: 1071, Action: 2, Reward: 0
Step: 1072, Action: 2, Reward: 0
Step: 1073, Action: 2, Reward: 0
Step: 1074, Action: 2, Reward: 0
Step: 1075, Action: 2, Reward: 0
Step: 1076, Action: 2, Reward: 0
Step: 1077, Action: 2, Reward: 0
Step: 1078, Action: 2, Reward: 0
Step: 1079, Action: 2, Reward: 0
Step: 1080, Action: 2, Reward: 0
Step: 1081, Action: 2, Reward: 0
Step: 1082, Action: 2, Reward: 0
Step: 1083, Action: 2, Reward: 0
Step: 1084, Action: 2, Reward: 0
Step: 1085, Action: 2, Reward: 0
Step: 1086, Action: 2, Reward: 0
Step: 1087, Action: 2, Reward: 0
Step: 1088, Action: 2, Reward: 0
Step: 1089, Action: 2, Reward: 0
Step: 1090, Action: 2, Reward: 0
Step: 1091, Action: 2, Reward: 0
Step: 1092, Action: 2, Reward: 0
Step: 1093, Action: 2, Reward: 0
Step: 1094, Action: 2, Reward: 0
Step: 1095, Action: 2, Reward: 0
Step: 1096, Action: 2, Reward: 0
Step: 1097, Action: 2, Reward: 0
Step: 1098

Step: 1505, Action: 2, Reward: 0
Step: 1506, Action: 2, Reward: 0
Step: 1507, Action: 2, Reward: 0
Step: 1508, Action: 2, Reward: 0
Step: 1509, Action: 2, Reward: 0
Step: 1510, Action: 2, Reward: 0
Step: 1511, Action: 2, Reward: 0
Step: 1512, Action: 2, Reward: 0
Step: 1513, Action: 2, Reward: 0
Step: 1514, Action: 2, Reward: 0
Step: 1515, Action: 2, Reward: 0
Step: 1516, Action: 2, Reward: 0
Step: 1517, Action: 2, Reward: 0
Step: 1518, Action: 2, Reward: 0
Step: 1519, Action: 2, Reward: 0
Step: 1520, Action: 2, Reward: 0
Step: 1521, Action: 2, Reward: 0
Step: 1522, Action: 2, Reward: 0
Step: 1523, Action: 2, Reward: 0
Step: 1524, Action: 2, Reward: 0
Step: 1525, Action: 2, Reward: 0
Step: 1526, Action: 2, Reward: 0
Step: 1527, Action: 2, Reward: 0
Step: 1528, Action: 2, Reward: 0
Step: 1529, Action: 2, Reward: 0
Step: 1530, Action: 2, Reward: 0
Step: 1531, Action: 2, Reward: 0
Step: 1532, Action: 2, Reward: 0
Step: 1533, Action: 2, Reward: 0
Step: 1534, Action: 2, Reward: 0
Step: 1535

Step: 1942, Action: 2, Reward: 0
Step: 1943, Action: 2, Reward: 0
Step: 1944, Action: 2, Reward: 0
Step: 1945, Action: 2, Reward: 0
Step: 1946, Action: 2, Reward: 0
Step: 1947, Action: 2, Reward: 0
Step: 1948, Action: 2, Reward: 0
Step: 1949, Action: 2, Reward: 0
Step: 1950, Action: 2, Reward: 0
Step: 1951, Action: 2, Reward: 0
Step: 1952, Action: 2, Reward: 0
Step: 1953, Action: 2, Reward: 0
Step: 1954, Action: 2, Reward: 0
Step: 1955, Action: 2, Reward: 0
Step: 1956, Action: 2, Reward: 0
Step: 1957, Action: 2, Reward: 0
Step: 1958, Action: 2, Reward: 0
Step: 1959, Action: 2, Reward: 0
Step: 1960, Action: 2, Reward: 0
Step: 1961, Action: 2, Reward: 0
Step: 1962, Action: 2, Reward: 0
Step: 1963, Action: 2, Reward: 0
Step: 1964, Action: 2, Reward: 0
Step: 1965, Action: 2, Reward: 0
Step: 1966, Action: 2, Reward: 0
Step: 1967, Action: 2, Reward: 0
Step: 1968, Action: 2, Reward: 0
Step: 1969, Action: 2, Reward: 0
Step: 1970, Action: 2, Reward: 0
Step: 1971, Action: 2, Reward: 0
Step: 1972

Step: 2378, Action: 2, Reward: 0
Step: 2379, Action: 2, Reward: 0
Step: 2380, Action: 2, Reward: 0
Step: 2381, Action: 2, Reward: 0
Step: 2382, Action: 2, Reward: 0
Step: 2383, Action: 2, Reward: 0
Step: 2384, Action: 2, Reward: 0
Step: 2385, Action: 2, Reward: 0
Step: 2386, Action: 2, Reward: 0
Step: 2387, Action: 2, Reward: 0
Step: 2388, Action: 2, Reward: 0
Step: 2389, Action: 2, Reward: 0
Step: 2390, Action: 2, Reward: 0
Step: 2391, Action: 2, Reward: 0
Step: 2392, Action: 2, Reward: 0
Step: 2393, Action: 2, Reward: 0
Step: 2394, Action: 2, Reward: 0
Step: 2395, Action: 2, Reward: 0
Step: 2396, Action: 2, Reward: 0
Step: 2397, Action: 2, Reward: 0
Step: 2398, Action: 2, Reward: 0
Step: 2399, Action: 2, Reward: 0
Step: 2400, Action: 2, Reward: 0
Step: 2401, Action: 2, Reward: 0
Step: 2402, Action: 2, Reward: 0
Step: 2403, Action: 2, Reward: 0
Step: 2404, Action: 2, Reward: 0
Step: 2405, Action: 2, Reward: 0
Step: 2406, Action: 2, Reward: 0
Step: 2407, Action: 2, Reward: 0
Step: 2408

Step: 2819, Action: 2, Reward: 0
Step: 2820, Action: 2, Reward: 0
Step: 2821, Action: 2, Reward: 0
Step: 2822, Action: 2, Reward: 0
Step: 2823, Action: 2, Reward: 0
Step: 2824, Action: 2, Reward: 0
Step: 2825, Action: 2, Reward: 0
Step: 2826, Action: 2, Reward: 0
Step: 2827, Action: 2, Reward: 0
Step: 2828, Action: 2, Reward: 0
Step: 2829, Action: 2, Reward: 0
Step: 2830, Action: 2, Reward: 0
Step: 2831, Action: 2, Reward: 0
Step: 2832, Action: 2, Reward: 0
Step: 2833, Action: 2, Reward: 0
Step: 2834, Action: 2, Reward: 0
Step: 2835, Action: 2, Reward: 0
Step: 2836, Action: 2, Reward: 0
Step: 2837, Action: 2, Reward: 0
Step: 2838, Action: 2, Reward: 0
Step: 2839, Action: 2, Reward: 0
Step: 2840, Action: 2, Reward: 0
Step: 2841, Action: 2, Reward: 0
Step: 2842, Action: 2, Reward: 0
Step: 2843, Action: 2, Reward: 0
Step: 2844, Action: 2, Reward: 0
Step: 2845, Action: 2, Reward: 0
Step: 2846, Action: 2, Reward: 0
Step: 2847, Action: 2, Reward: 0
Step: 2848, Action: 2, Reward: 0
Step: 2849

Step: 3255, Action: 2, Reward: 0
Step: 3256, Action: 2, Reward: 0
Step: 3257, Action: 2, Reward: 0
Step: 3258, Action: 2, Reward: 0
Step: 3259, Action: 2, Reward: 0
Step: 3260, Action: 2, Reward: 0
Step: 3261, Action: 2, Reward: 0
Step: 3262, Action: 2, Reward: 0
Step: 3263, Action: 2, Reward: 0
Step: 3264, Action: 2, Reward: 0
Step: 3265, Action: 2, Reward: 0
Step: 3266, Action: 2, Reward: 0
Step: 3267, Action: 2, Reward: 0
Step: 3268, Action: 2, Reward: 0
Step: 3269, Action: 2, Reward: 0
Step: 3270, Action: 2, Reward: 0
Step: 3271, Action: 2, Reward: 0
Step: 3272, Action: 2, Reward: 0
Step: 3273, Action: 2, Reward: 0
Step: 3274, Action: 2, Reward: 0
Step: 3275, Action: 2, Reward: 0
Step: 3276, Action: 2, Reward: 0
Step: 3277, Action: 2, Reward: 0
Step: 3278, Action: 2, Reward: 0
Step: 3279, Action: 2, Reward: 0
Step: 3280, Action: 2, Reward: 0
Step: 3281, Action: 2, Reward: 0
Step: 3282, Action: 2, Reward: 0
Step: 3283, Action: 2, Reward: 0
Step: 3284, Action: 2, Reward: 0
Step: 3285

Step: 3693, Action: 2, Reward: 0
Step: 3694, Action: 2, Reward: 0
Step: 3695, Action: 2, Reward: 0
Step: 3696, Action: 2, Reward: 0
Step: 3697, Action: 2, Reward: 0
Step: 3698, Action: 2, Reward: 0
Step: 3699, Action: 2, Reward: 0
Step: 3700, Action: 2, Reward: 0
Step: 3701, Action: 2, Reward: 0
Step: 3702, Action: 2, Reward: 0
Step: 3703, Action: 2, Reward: 0
Step: 3704, Action: 2, Reward: 0
Step: 3705, Action: 2, Reward: 0
Step: 3706, Action: 2, Reward: 0
Step: 3707, Action: 2, Reward: 0
Step: 3708, Action: 2, Reward: 0
Step: 3709, Action: 2, Reward: 0
Step: 3710, Action: 2, Reward: 0
Step: 3711, Action: 2, Reward: 0
Step: 3712, Action: 2, Reward: 0
Step: 3713, Action: 2, Reward: 0
Step: 3714, Action: 2, Reward: 0
Step: 3715, Action: 2, Reward: 0
Step: 3716, Action: 2, Reward: 0
Step: 3717, Action: 2, Reward: 0
Step: 3718, Action: 2, Reward: 0
Step: 3719, Action: 2, Reward: 0
Step: 3720, Action: 2, Reward: 0
Step: 3721, Action: 2, Reward: 0
Step: 3722, Action: 2, Reward: 0
Step: 3723

In [4]:
#ensemble learning single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO, DQN, A2C
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Ensemble model function
def ensemble_predict(actions):
    # Convert numpy arrays to integers for each action
    actions = [int(action) for action in actions]
    # Perform a majority vote among the actions (hold, buy, sell)
    action_counts = Counter(actions)
    return action_counts.most_common(1)[0][0]

# Train and evaluate the ensemble model
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'NFLX_TRAINING.csv'
    test_file = 'NFLX_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize each model and train them separately
    ppo_model = PPO("MlpPolicy", env_train, verbose=1)
    dqn_model = DQN("MlpPolicy", env_train, verbose=1)
    a2c_model = A2C("MlpPolicy", env_train, verbose=1)

    # Train each model
    ppo_model.learn(total_timesteps=50000)
    dqn_model.learn(total_timesteps=50000)
    a2c_model.learn(total_timesteps=50000)

    # Test the ensemble model on the training data
    obs, _ = env_train.reset()
    done = False
    while not done:
        # Get predictions from each model
        ppo_action, _ = ppo_model.predict(obs)
        dqn_action, _ = dqn_model.predict(obs)
        a2c_action, _ = a2c_model.predict(obs)

        # Aggregate the actions through majority voting
        final_action = ensemble_predict([ppo_action, dqn_action, a2c_action])

        # Step the environment with the final action
        obs, reward, done, truncated, info = env_train.step(final_action)

    # Calculate and display training metrics
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Test the ensemble model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        # Get predictions from each model
        ppo_action, _ = ppo_model.predict(obs)
        dqn_action, _ = dqn_model.predict(obs)
        a2c_action, _ = a2c_model.predict(obs)

        # Aggregate the actions through majority voting
        final_action = ensemble_predict([ppo_action, dqn_action, a2c_action])

        # Step the environment with the final action
        obs, reward, done, truncated, info = env_test.step(final_action)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display testing metrics
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1844 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1567        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006843879 |
|    clip_fraction        | 0.0286      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -4.07       |
|   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.14e+03    |
|    ep_rew_mean          | 19.6        |
| time/                   |             |
|    fps                  | 1354        |
|    iterations           | 12          |
|    time_elapsed         | 18          |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.008893795 |
|    clip_fraction        | 0.0889      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.936      |
|    explained_variance   | 0.433       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00932     |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00802    |
|    value_loss           | 0.0111      |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.14e+

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.14e+03     |
|    ep_rew_mean          | 37.4         |
| time/                   |              |
|    fps                  | 1341         |
|    iterations           | 22           |
|    time_elapsed         | 33           |
|    total_timesteps      | 45056        |
| train/                  |              |
|    approx_kl            | 0.0068851635 |
|    clip_fraction        | 0.0796       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.652       |
|    explained_variance   | 0.556        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.012       |
|    n_updates            | 210          |
|    policy_gradient_loss | -0.00316     |
|    value_loss           | 0.00951      |
------------------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean 

------------------------------------
| time/                 |          |
|    fps                | 959      |
|    iterations         | 1000     |
|    time_elapsed       | 5        |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.08    |
|    explained_variance | -24.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 999      |
|    policy_loss        | -0.0694  |
|    value_loss         | 0.00653  |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 960      |
|    iterations         | 1100     |
|    time_elapsed       | 5        |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -0.895   |
|    explained_variance | 0.0584   |
|    learning_rate      | 0.0007   |
|    n_updates          | 1099     |
|    policy_loss        | -0.043   |
|    value_loss         | 0.00353  |
-

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 4.2       |
| time/                 |           |
|    fps                | 958       |
|    iterations         | 2400      |
|    time_elapsed       | 12        |
|    total_timesteps    | 12000     |
| train/                |           |
|    entropy_loss       | -1.02     |
|    explained_variance | -1.54e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 2399      |
|    policy_loss        | 0.0482    |
|    value_loss         | 0.00231   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 4.2      |
| time/                 |          |
|    fps                | 958      |
|    iterations         | 2500     |
|    time_elapsed       | 13       |
|    total_timesteps    | 12500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 2.4      |
| time/                 |          |
|    fps                | 960      |
|    iterations         | 3700     |
|    time_elapsed       | 19       |
|    total_timesteps    | 18500    |
| train/                |          |
|    entropy_loss       | -1.08    |
|    explained_variance | 0.118    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3699     |
|    policy_loss        | -0.0375  |
|    value_loss         | 0.00197  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 2.4      |
| time/                 |          |
|    fps                | 960      |
|    iterations         | 3800     |
|    time_elapsed       | 19       |
|    total_timesteps    | 19000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 3.32     |
| time/                 |          |
|    fps                | 961      |
|    iterations         | 5000     |
|    time_elapsed       | 26       |
|    total_timesteps    | 25000    |
| train/                |          |
|    entropy_loss       | -1.03    |
|    explained_variance | -1.84    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4999     |
|    policy_loss        | -0.0579  |
|    value_loss         | 0.00406  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 3.32     |
| time/                 |          |
|    fps                | 961      |
|    iterations         | 5100     |
|    time_elapsed       | 26       |
|    total_timesteps    | 25500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 3.52     |
| time/                 |          |
|    fps                | 962      |
|    iterations         | 6300     |
|    time_elapsed       | 32       |
|    total_timesteps    | 31500    |
| train/                |          |
|    entropy_loss       | -0.532   |
|    explained_variance | -3.91    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6299     |
|    policy_loss        | 6.69e-05 |
|    value_loss         | 3.14e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 3.52     |
| time/                 |          |
|    fps                | 962      |
|    iterations         | 6400     |
|    time_elapsed       | 33       |
|    total_timesteps    | 32000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 3.37      |
| time/                 |           |
|    fps                | 964       |
|    iterations         | 7600      |
|    time_elapsed       | 39        |
|    total_timesteps    | 38000     |
| train/                |           |
|    entropy_loss       | -0.659    |
|    explained_variance | -1.81     |
|    learning_rate      | 0.0007    |
|    n_updates          | 7599      |
|    policy_loss        | -0.000235 |
|    value_loss         | 0.000118  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 3.37     |
| time/                 |          |
|    fps                | 964      |
|    iterations         | 7700     |
|    time_elapsed       | 39       |
|    total_timesteps    | 38500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.14e+03 |
|    ep_rew_mean        | 3.39     |
| time/                 |          |
|    fps                | 966      |
|    iterations         | 8900     |
|    time_elapsed       | 46       |
|    total_timesteps    | 44500    |
| train/                |          |
|    entropy_loss       | -0.00908 |
|    explained_variance | -5.96    |
|    learning_rate      | 0.0007   |
|    n_updates          | 8899     |
|    policy_loss        | 1.43e-07 |
|    value_loss         | 1.11e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.14e+03  |
|    ep_rew_mean        | 3.39      |
| time/                 |           |
|    fps                | 965       |
|    iterations         | 9000      |
|    time_elapsed       | 46        |
|    total_timesteps    | 45000     |
| train/                |    