In [5]:
#ppo algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Train and evaluate the model with all metrics
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'LPL_TRAINING.csv'
    test_file = 'LPL_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the PPO model and train
    model = PPO("MlpPolicy", env_train, verbose=1)
    model.learn(total_timesteps=100000)

    # Test the model on the training data
    obs, _ = env_train.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_train.step(action)

    # Generate report for the training session
    env_train.generate_report()

    # Calculate and display metrics for the training period
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Test the model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_test.step(action)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display metrics for the testing period
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1647 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1336        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009475296 |
|    clip_fraction        | 0.0437      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -1.93       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00911     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00702    |
|    value_loss         

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 4.78e+03     |
|    ep_rew_mean          | 10.3         |
| time/                   |              |
|    fps                  | 1144         |
|    iterations           | 12           |
|    time_elapsed         | 21           |
|    total_timesteps      | 24576        |
| train/                  |              |
|    approx_kl            | 0.0066246865 |
|    clip_fraction        | 0.0726       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.94        |
|    explained_variance   | 0.171        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00499     |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00915     |
|    value_loss           | 0.0105       |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 4.78e+03     |
|    ep_rew_mean          | 19           |
| time/                   |              |
|    fps                  | 1153         |
|    iterations           | 22           |
|    time_elapsed         | 39           |
|    total_timesteps      | 45056        |
| train/                  |              |
|    approx_kl            | 0.0065078484 |
|    clip_fraction        | 0.0958       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.803       |
|    explained_variance   | 0.166        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0244      |
|    n_updates            | 210          |
|    policy_gradient_loss | -0.00702     |
|    value_loss           | 0.00911      |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.78e+03    |
|    ep_rew_mean          | 26.2        |
| time/                   |             |
|    fps                  | 1179        |
|    iterations           | 32          |
|    time_elapsed         | 55          |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.011029489 |
|    clip_fraction        | 0.118       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.749      |
|    explained_variance   | 0.43        |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00759     |
|    n_updates            | 310         |
|    policy_gradient_loss | -0.00418    |
|    value_loss           | 0.00997     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.78e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.78e+03    |
|    ep_rew_mean          | 31.6        |
| time/                   |             |
|    fps                  | 1181        |
|    iterations           | 42          |
|    time_elapsed         | 72          |
|    total_timesteps      | 86016       |
| train/                  |             |
|    approx_kl            | 0.027166732 |
|    clip_fraction        | 0.116       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.628      |
|    explained_variance   | 0.324       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0341     |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.00138    |
|    value_loss           | 0.00781     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 4.78e+


--- Agent Report ---
Agent starts with 0 holdings (neutral position), Initial Balance: 10000
Agent buys at 10.15, Current Balance: 10000, Holdings: 1 Long
Agent closes long at 10.72, profit: 0.5700000000000003, Current Balance: 10000.57, Holdings: 0
Agent buys at 10.6, Current Balance: 10000.57, Holdings: 1 Long
Agent closes long at 10.7, profit: 0.09999999999999964, Current Balance: 10000.67, Holdings: 0
Agent sells (short) at 10.705, Current Balance: 10000.67, Holdings: 1 Short
Agent closes short at 10.25, profit: 0.45500000000000007, Current Balance: 10001.125, Holdings: 0
Agent sells (short) at 10.21, Current Balance: 10001.125, Holdings: 1 Short
Agent closes short at 10.215, profit: -0.004999999999999005, Current Balance: 10001.12, Holdings: 0
Agent buys at 10.245, Current Balance: 10001.12, Holdings: 1 Long
Agent closes long at 10.24, profit: -0.004999999999999005, Current Balance: 10001.115000000002, Holdings: 0
Agent sells (short) at 10.24, Current Balance: 10001.115000000002,

In [6]:
#dqn algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Train and evaluate the model with all metrics
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'LPL_TRAINING.csv'
    test_file = 'LPL_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the DQN model and train
    model = DQN("MlpPolicy", env_train, verbose=1)
    model.learn(total_timesteps=100000)

    # Test the model on the training data
    obs, _ = env_train.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_train.step(action)

    # Generate report for the training session
    env_train.generate_report()

    # Calculate and display metrics for the training period
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Test the model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_test.step(action)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display metrics for the testing period
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 4.78e+03 |
|    ep_rew_mean      | 2.13     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 917      |
|    time_elapsed     | 20       |
|    total_timesteps  | 19112    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000156 |
|    n_updates        | 4752     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 4.78e+03 |
|    ep_rew_mean      | 3.42     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1037     |
|    time_elapsed     | 36       |
|    total_timesteps  | 38224    |
| train/              |        


--- Agent Report ---
Agent starts with 0 holdings (neutral position), Initial Balance: 10000
Agent sells (short) at 10.15, Current Balance: 10000, Holdings: 1 Short
Agent closes short at 10.25, profit: -0.09999999999999964, Current Balance: 9999.9, Holdings: 0
Agent buys at 10.24, Current Balance: 9999.9, Holdings: 1 Long
Agent closes long at 10.72, profit: 0.4800000000000004, Current Balance: 10000.38, Holdings: 0
Agent sells (short) at 10.6, Current Balance: 10000.38, Holdings: 1 Short
Agent closes short at 10.59, profit: 0.009999999999999787, Current Balance: 10000.39, Holdings: 0
Agent sells (short) at 10.7, Current Balance: 10000.39, Holdings: 1 Short
Agent closes short at 10.705, profit: -0.005000000000000782, Current Balance: 10000.385, Holdings: 0
Agent buys at 10.675, Current Balance: 10000.385, Holdings: 1 Long
Agent closes long at 10.65, profit: -0.025000000000000355, Current Balance: 10000.36, Holdings: 0
Agent sells (short) at 10.55, Current Balance: 10000.36, Holdings: 1

In [7]:
#a2c algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import A2C
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Train and evaluate the model with action logging for debugging
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'LPL_TRAINING.csv'
    test_file = 'LPL_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the A2C model and train with more timesteps
    model = A2C("MlpPolicy", env_train, verbose=1)
    model.learn(total_timesteps=500000)  # Increased timesteps

    # Testing on the training data with action logging
    obs, _ = env_train.reset()
    done = False
    print("\n--- Training Session ---")
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_train.step(action)
        # Log each action and reward for debugging
        print(f"Step: {env_train.current_step}, Action: {action}, Reward: {reward}")

    # Generate report and metrics for the training session
    env_train.generate_report()
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Testing on the testing data with action logging
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    print("\n--- Testing Session ---")
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_test.step(action)
        # Log each action and reward for debugging
        print(f"Step: {env_test.current_step}, Action: {action}, Reward: {reward}")

    # Generate report and metrics for the testing session
    env_test.generate_report()
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")



# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 879      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.06    |
|    explained_variance | -18.5    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0413  |
|    value_loss         | 0.00555  |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 914      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.04    |
|    explained_variance | -456     |
|    learning_rate      | 0.0007   |
|    n_updates    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | -0.241    |
| time/                 |           |
|    fps                | 725       |
|    iterations         | 1600      |
|    time_elapsed       | 11        |
|    total_timesteps    | 8000      |
| train/                |           |
|    entropy_loss       | -0.837    |
|    explained_variance | -1.36e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1599      |
|    policy_loss        | -0.0663   |
|    value_loss         | 0.00782   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.241   |
| time/                 |          |
|    fps                | 730      |
|    iterations         | 1700     |
|    time_elapsed       | 11       |
|    total_timesteps    | 8500     |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.359   |
| time/                 |          |
|    fps                | 816      |
|    iterations         | 2900     |
|    time_elapsed       | 17       |
|    total_timesteps    | 14500    |
| train/                |          |
|    entropy_loss       | -0.457   |
|    explained_variance | -0.264   |
|    learning_rate      | 0.0007   |
|    n_updates          | 2899     |
|    policy_loss        | 0.025    |
|    value_loss         | 0.00186  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.359   |
| time/                 |          |
|    fps                | 820      |
|    iterations         | 3000     |
|    time_elapsed       | 18       |
|    total_timesteps    | 15000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.332   |
| time/                 |          |
|    fps                | 858      |
|    iterations         | 4200     |
|    time_elapsed       | 24       |
|    total_timesteps    | 21000    |
| train/                |          |
|    entropy_loss       | -0.0278  |
|    explained_variance | 0.611    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4199     |
|    policy_loss        | 2.49e-05 |
|    value_loss         | 4.32e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.332   |
| time/                 |          |
|    fps                | 861      |
|    iterations         | 4300     |
|    time_elapsed       | 24       |
|    total_timesteps    | 21500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.311   |
| time/                 |          |
|    fps                | 865      |
|    iterations         | 5500     |
|    time_elapsed       | 31       |
|    total_timesteps    | 27500    |
| train/                |          |
|    entropy_loss       | -0.586   |
|    explained_variance | -18.6    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5499     |
|    policy_loss        | 0.000317 |
|    value_loss         | 5.25e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.311   |
| time/                 |          |
|    fps                | 855      |
|    iterations         | 5600     |
|    time_elapsed       | 32       |
|    total_timesteps    | 28000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | -0.299    |
| time/                 |           |
|    fps                | 867       |
|    iterations         | 6800      |
|    time_elapsed       | 39        |
|    total_timesteps    | 34000     |
| train/                |           |
|    entropy_loss       | -0.0396   |
|    explained_variance | -6.43e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 6799      |
|    policy_loss        | 1.6e-06   |
|    value_loss         | 1.87e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | -0.299    |
| time/                 |           |
|    fps                | 868       |
|    iterations         | 6900      |
|    time_elapsed       | 39        |
|    total_timesteps    | 34500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.173   |
| time/                 |          |
|    fps                | 884      |
|    iterations         | 8100     |
|    time_elapsed       | 45       |
|    total_timesteps    | 40500    |
| train/                |          |
|    entropy_loss       | -0.668   |
|    explained_variance | -727     |
|    learning_rate      | 0.0007   |
|    n_updates          | 8099     |
|    policy_loss        | -0.00154 |
|    value_loss         | 1.23e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.173   |
| time/                 |          |
|    fps                | 885      |
|    iterations         | 8200     |
|    time_elapsed       | 46       |
|    total_timesteps    | 41000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | -0.108   |
| time/                 |          |
|    fps                | 896      |
|    iterations         | 9400     |
|    time_elapsed       | 52       |
|    total_timesteps    | 47000    |
| train/                |          |
|    entropy_loss       | -0.41    |
|    explained_variance | -0.251   |
|    learning_rate      | 0.0007   |
|    n_updates          | 9399     |
|    policy_loss        | -0.00519 |
|    value_loss         | 0.00197  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | -0.108    |
| time/                 |           |
|    fps                | 897       |
|    iterations         | 9500      |
|    time_elapsed       | 52        |
|    total_timesteps    | 47500     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.0675    |
| time/                 |           |
|    fps                | 906       |
|    iterations         | 10700     |
|    time_elapsed       | 59        |
|    total_timesteps    | 53500     |
| train/                |           |
|    entropy_loss       | -0.00484  |
|    explained_variance | -124      |
|    learning_rate      | 0.0007    |
|    n_updates          | 10699     |
|    policy_loss        | -5.64e-07 |
|    value_loss         | 2.92e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.0675   |
| time/                 |          |
|    fps                | 906      |
|    iterations         | 10800    |
|    time_elapsed       | 59       |
|    total_timesteps    | 54000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.0644    |
| time/                 |           |
|    fps                | 914       |
|    iterations         | 12000     |
|    time_elapsed       | 65        |
|    total_timesteps    | 60000     |
| train/                |           |
|    entropy_loss       | -0.423    |
|    explained_variance | -3.02e+05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 11999     |
|    policy_loss        | 9.52e-06  |
|    value_loss         | 2.36e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.0644   |
| time/                 |          |
|    fps                | 914      |
|    iterations         | 12100    |
|    time_elapsed       | 66       |
|    total_timesteps    | 60500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.13     |
| time/                 |          |
|    fps                | 901      |
|    iterations         | 13300    |
|    time_elapsed       | 73       |
|    total_timesteps    | 66500    |
| train/                |          |
|    entropy_loss       | -0.678   |
|    explained_variance | -8.91    |
|    learning_rate      | 0.0007   |
|    n_updates          | 13299    |
|    policy_loss        | -0.00036 |
|    value_loss         | 7.95e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.161     |
| time/                 |           |
|    fps                | 899       |
|    iterations         | 13400     |
|    time_elapsed       | 74        |
|    total_timesteps    | 67000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.204    |
| time/                 |          |
|    fps                | 898      |
|    iterations         | 14600    |
|    time_elapsed       | 81       |
|    total_timesteps    | 73000    |
| train/                |          |
|    entropy_loss       | -0.00088 |
|    explained_variance | -9.85    |
|    learning_rate      | 0.0007   |
|    n_updates          | 14599    |
|    policy_loss        | 3.85e-09 |
|    value_loss         | 8.87e-09 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.204     |
| time/                 |           |
|    fps                | 897       |
|    iterations         | 14700     |
|    time_elapsed       | 81        |
|    total_timesteps    | 73500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.217    |
| time/                 |          |
|    fps                | 893      |
|    iterations         | 15900    |
|    time_elapsed       | 88       |
|    total_timesteps    | 79500    |
| train/                |          |
|    entropy_loss       | -0.389   |
|    explained_variance | -15.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 15899    |
|    policy_loss        | 7.07e-05 |
|    value_loss         | 6.17e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.217     |
| time/                 |           |
|    fps                | 893       |
|    iterations         | 16000     |
|    time_elapsed       | 89        |
|    total_timesteps    | 80000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.221    |
| time/                 |          |
|    fps                | 878      |
|    iterations         | 17200    |
|    time_elapsed       | 97       |
|    total_timesteps    | 86000    |
| train/                |          |
|    entropy_loss       | -0.304   |
|    explained_variance | -419     |
|    learning_rate      | 0.0007   |
|    n_updates          | 17199    |
|    policy_loss        | 2.48e-05 |
|    value_loss         | 4.51e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.25     |
| time/                 |          |
|    fps                | 877      |
|    iterations         | 17300    |
|    time_elapsed       | 98       |
|    total_timesteps    | 86500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.288    |
| time/                 |          |
|    fps                | 875      |
|    iterations         | 18500    |
|    time_elapsed       | 105      |
|    total_timesteps    | 92500    |
| train/                |          |
|    entropy_loss       | -0.0102  |
|    explained_variance | -1.15    |
|    learning_rate      | 0.0007   |
|    n_updates          | 18499    |
|    policy_loss        | 3.53e-06 |
|    value_loss         | 9.15e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.288     |
| time/                 |           |
|    fps                | 875       |
|    iterations         | 18600     |
|    time_elapsed       | 106       |
|    total_timesteps    | 93000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.312    |
| time/                 |          |
|    fps                | 874      |
|    iterations         | 19800    |
|    time_elapsed       | 113      |
|    total_timesteps    | 99000    |
| train/                |          |
|    entropy_loss       | -0.00312 |
|    explained_variance | -7.2     |
|    learning_rate      | 0.0007   |
|    n_updates          | 19799    |
|    policy_loss        | 2.78e-08 |
|    value_loss         | 8.19e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.312     |
| time/                 |           |
|    fps                | 874       |
|    iterations         | 19900     |
|    time_elapsed       | 113       |
|    total_timesteps    | 99500     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.346     |
| time/                 |           |
|    fps                | 872       |
|    iterations         | 21100     |
|    time_elapsed       | 120       |
|    total_timesteps    | 105500    |
| train/                |           |
|    entropy_loss       | -0.00364  |
|    explained_variance | -48.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 21099     |
|    policy_loss        | -7.89e-08 |
|    value_loss         | 7.23e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.346     |
| time/                 |           |
|    fps                | 872       |
|    iterations         | 21200     |
|    time_elapsed       | 121       |
|    total_timesteps    | 106000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.368     |
| time/                 |           |
|    fps                | 871       |
|    iterations         | 22400     |
|    time_elapsed       | 128       |
|    total_timesteps    | 112000    |
| train/                |           |
|    entropy_loss       | -0.0246   |
|    explained_variance | -1.19     |
|    learning_rate      | 0.0007    |
|    n_updates          | 22399     |
|    policy_loss        | -0.000112 |
|    value_loss         | 0.000914  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.368    |
| time/                 |          |
|    fps                | 871      |
|    iterations         | 22500    |
|    time_elapsed       | 129      |
|    total_timesteps    | 112500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.382     |
| time/                 |           |
|    fps                | 868       |
|    iterations         | 23700     |
|    time_elapsed       | 136       |
|    total_timesteps    | 118500    |
| train/                |           |
|    entropy_loss       | -0.0197   |
|    explained_variance | -1.57e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 23699     |
|    policy_loss        | 3.18e-07  |
|    value_loss         | 1.16e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.382     |
| time/                 |           |
|    fps                | 868       |
|    iterations         | 23800     |
|    time_elapsed       | 137       |
|    total_timesteps    | 119000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.425    |
| time/                 |          |
|    fps                | 865      |
|    iterations         | 25000    |
|    time_elapsed       | 144      |
|    total_timesteps    | 125000   |
| train/                |          |
|    entropy_loss       | -0.00266 |
|    explained_variance | -95.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 24999    |
|    policy_loss        | 1.46e-08 |
|    value_loss         | 6.59e-09 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.425    |
| time/                 |          |
|    fps                | 865      |
|    iterations         | 25100    |
|    time_elapsed       | 145      |
|    total_timesteps    | 125500   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.492    |
| time/                 |          |
|    fps                | 858      |
|    iterations         | 26300    |
|    time_elapsed       | 153      |
|    total_timesteps    | 131500   |
| train/                |          |
|    entropy_loss       | -0.236   |
|    explained_variance | -0.232   |
|    learning_rate      | 0.0007   |
|    n_updates          | 26299    |
|    policy_loss        | 0.0071   |
|    value_loss         | 0.00101  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.492    |
| time/                 |          |
|    fps                | 858      |
|    iterations         | 26400    |
|    time_elapsed       | 153      |
|    total_timesteps    | 132000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.614    |
| time/                 |          |
|    fps                | 855      |
|    iterations         | 27600    |
|    time_elapsed       | 161      |
|    total_timesteps    | 138000   |
| train/                |          |
|    entropy_loss       | -0.253   |
|    explained_variance | -5.12    |
|    learning_rate      | 0.0007   |
|    n_updates          | 27599    |
|    policy_loss        | 0.00115  |
|    value_loss         | 1.36e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.614    |
| time/                 |          |
|    fps                | 855      |
|    iterations         | 27700    |
|    time_elapsed       | 161      |
|    total_timesteps    | 138500   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.613    |
| time/                 |          |
|    fps                | 854      |
|    iterations         | 28900    |
|    time_elapsed       | 169      |
|    total_timesteps    | 144500   |
| train/                |          |
|    entropy_loss       | -0.00254 |
|    explained_variance | -19      |
|    learning_rate      | 0.0007   |
|    n_updates          | 28899    |
|    policy_loss        | 2.05e-07 |
|    value_loss         | 6.15e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.613     |
| time/                 |           |
|    fps                | 854       |
|    iterations         | 29000     |
|    time_elapsed       | 169       |
|    total_timesteps    | 145000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.625     |
| time/                 |           |
|    fps                | 849       |
|    iterations         | 30200     |
|    time_elapsed       | 177       |
|    total_timesteps    | 151000    |
| train/                |           |
|    entropy_loss       | -0.172    |
|    explained_variance | 0.00528   |
|    learning_rate      | 0.0007    |
|    n_updates          | 30199     |
|    policy_loss        | -0.000138 |
|    value_loss         | 0.0014    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.625     |
| time/                 |           |
|    fps                | 847       |
|    iterations         | 30300     |
|    time_elapsed       | 178       |
|    total_timesteps    | 151500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.749    |
| time/                 |          |
|    fps                | 840      |
|    iterations         | 31500    |
|    time_elapsed       | 187      |
|    total_timesteps    | 157500   |
| train/                |          |
|    entropy_loss       | -0.23    |
|    explained_variance | 0.144    |
|    learning_rate      | 0.0007   |
|    n_updates          | 31499    |
|    policy_loss        | 0.000822 |
|    value_loss         | 0.000104 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.781     |
| time/                 |           |
|    fps                | 840       |
|    iterations         | 31600     |
|    time_elapsed       | 187       |
|    total_timesteps    | 158000    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.849    |
| time/                 |          |
|    fps                | 838      |
|    iterations         | 32800    |
|    time_elapsed       | 195      |
|    total_timesteps    | 164000   |
| train/                |          |
|    entropy_loss       | -0.0115  |
|    explained_variance | 0.344    |
|    learning_rate      | 0.0007   |
|    n_updates          | 32799    |
|    policy_loss        | 6.59e-06 |
|    value_loss         | 0.000105 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 0.849    |
| time/                 |          |
|    fps                | 837      |
|    iterations         | 32900    |
|    time_elapsed       | 196      |
|    total_timesteps    | 164500   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.996     |
| time/                 |           |
|    fps                | 829       |
|    iterations         | 34100     |
|    time_elapsed       | 205       |
|    total_timesteps    | 170500    |
| train/                |           |
|    entropy_loss       | -0.000893 |
|    explained_variance | -7.78     |
|    learning_rate      | 0.0007    |
|    n_updates          | 34099     |
|    policy_loss        | 6.26e-09  |
|    value_loss         | 7.89e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 0.996     |
| time/                 |           |
|    fps                | 828       |
|    iterations         | 34200     |
|    time_elapsed       | 206       |
|    total_timesteps    | 171000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.28     |
| time/                 |          |
|    fps                | 823      |
|    iterations         | 35400    |
|    time_elapsed       | 214      |
|    total_timesteps    | 177000   |
| train/                |          |
|    entropy_loss       | -0.186   |
|    explained_variance | 0.019    |
|    learning_rate      | 0.0007   |
|    n_updates          | 35399    |
|    policy_loss        | 0.0125   |
|    value_loss         | 0.00507  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.28     |
| time/                 |          |
|    fps                | 822      |
|    iterations         | 35500    |
|    time_elapsed       | 215      |
|    total_timesteps    | 177500   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.3       |
| time/                 |           |
|    fps                | 813       |
|    iterations         | 36700     |
|    time_elapsed       | 225       |
|    total_timesteps    | 183500    |
| train/                |           |
|    entropy_loss       | -0.00136  |
|    explained_variance | -60       |
|    learning_rate      | 0.0007    |
|    n_updates          | 36699     |
|    policy_loss        | -6.42e-08 |
|    value_loss         | 3.18e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.3      |
| time/                 |          |
|    fps                | 812      |
|    iterations         | 36800    |
|    time_elapsed       | 226      |
|    total_timesteps    | 184000   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.29      |
| time/                 |           |
|    fps                | 807       |
|    iterations         | 38000     |
|    time_elapsed       | 235       |
|    total_timesteps    | 190000    |
| train/                |           |
|    entropy_loss       | -0.000697 |
|    explained_variance | -651      |
|    learning_rate      | 0.0007    |
|    n_updates          | 37999     |
|    policy_loss        | -6.91e-08 |
|    value_loss         | 2.4e-06   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.29     |
| time/                 |          |
|    fps                | 806      |
|    iterations         | 38100    |
|    time_elapsed       | 236      |
|    total_timesteps    | 190500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.28      |
| time/                 |           |
|    fps                | 799       |
|    iterations         | 39300     |
|    time_elapsed       | 245       |
|    total_timesteps    | 196500    |
| train/                |           |
|    entropy_loss       | -0.000515 |
|    explained_variance | -3.65e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 39299     |
|    policy_loss        | -6.37e-09 |
|    value_loss         | 7.37e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.28      |
| time/                 |           |
|    fps                | 798       |
|    iterations         | 39400     |
|    time_elapsed       | 246       |
|    total_timesteps    | 197000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.28     |
| time/                 |          |
|    fps                | 791      |
|    iterations         | 40600    |
|    time_elapsed       | 256      |
|    total_timesteps    | 203000   |
| train/                |          |
|    entropy_loss       | -0.00301 |
|    explained_variance | -421     |
|    learning_rate      | 0.0007   |
|    n_updates          | 40599    |
|    policy_loss        | 4.66e-07 |
|    value_loss         | 6.09e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.28      |
| time/                 |           |
|    fps                | 790       |
|    iterations         | 40700     |
|    time_elapsed       | 257       |
|    total_timesteps    | 203500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.28     |
| time/                 |          |
|    fps                | 787      |
|    iterations         | 41900    |
|    time_elapsed       | 266      |
|    total_timesteps    | 209500   |
| train/                |          |
|    entropy_loss       | -0.176   |
|    explained_variance | -12      |
|    learning_rate      | 0.0007   |
|    n_updates          | 41899    |
|    policy_loss        | -0.00578 |
|    value_loss         | 0.000231 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.28      |
| time/                 |           |
|    fps                | 787       |
|    iterations         | 42000     |
|    time_elapsed       | 266       |
|    total_timesteps    | 210000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.29      |
| time/                 |           |
|    fps                | 777       |
|    iterations         | 43200     |
|    time_elapsed       | 277       |
|    total_timesteps    | 216000    |
| train/                |           |
|    entropy_loss       | -0.000315 |
|    explained_variance | -81.3     |
|    learning_rate      | 0.0007    |
|    n_updates          | 43199     |
|    policy_loss        | 4.44e-09  |
|    value_loss         | 1.73e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.29      |
| time/                 |           |
|    fps                | 778       |
|    iterations         | 43300     |
|    time_elapsed       | 278       |
|    total_timesteps    | 216500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.28     |
| time/                 |          |
|    fps                | 772      |
|    iterations         | 44500    |
|    time_elapsed       | 288      |
|    total_timesteps    | 222500   |
| train/                |          |
|    entropy_loss       | -0.0026  |
|    explained_variance | 0.249    |
|    learning_rate      | 0.0007   |
|    n_updates          | 44499    |
|    policy_loss        | -1e-06   |
|    value_loss         | 1.25e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.28     |
| time/                 |          |
|    fps                | 772      |
|    iterations         | 44600    |
|    time_elapsed       | 288      |
|    total_timesteps    | 223000   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.28      |
| time/                 |           |
|    fps                | 772       |
|    iterations         | 45800     |
|    time_elapsed       | 296       |
|    total_timesteps    | 229000    |
| train/                |           |
|    entropy_loss       | -0.00086  |
|    explained_variance | -0.0469   |
|    learning_rate      | 0.0007    |
|    n_updates          | 45799     |
|    policy_loss        | -1.43e-07 |
|    value_loss         | 3.71e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.28      |
| time/                 |           |
|    fps                | 772       |
|    iterations         | 45900     |
|    time_elapsed       | 297       |
|    total_timesteps    | 229500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.29      |
| time/                 |           |
|    fps                | 772       |
|    iterations         | 47100     |
|    time_elapsed       | 304       |
|    total_timesteps    | 235500    |
| train/                |           |
|    entropy_loss       | -0.000508 |
|    explained_variance | -40.5     |
|    learning_rate      | 0.0007    |
|    n_updates          | 47099     |
|    policy_loss        | -5.23e-10 |
|    value_loss         | 1.3e-07   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.29      |
| time/                 |           |
|    fps                | 772       |
|    iterations         | 47200     |
|    time_elapsed       | 305       |
|    total_timesteps    | 236000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.3      |
| time/                 |          |
|    fps                | 774      |
|    iterations         | 48400    |
|    time_elapsed       | 312      |
|    total_timesteps    | 242000   |
| train/                |          |
|    entropy_loss       | -0.00213 |
|    explained_variance | -72      |
|    learning_rate      | 0.0007   |
|    n_updates          | 48399    |
|    policy_loss        | 4.03e-08 |
|    value_loss         | 2.62e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.3       |
| time/                 |           |
|    fps                | 774       |
|    iterations         | 48500     |
|    time_elapsed       | 313       |
|    total_timesteps    | 242500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.31      |
| time/                 |           |
|    fps                | 774       |
|    iterations         | 49700     |
|    time_elapsed       | 320       |
|    total_timesteps    | 248500    |
| train/                |           |
|    entropy_loss       | -0.000856 |
|    explained_variance | -0.539    |
|    learning_rate      | 0.0007    |
|    n_updates          | 49699     |
|    policy_loss        | -4.78e-07 |
|    value_loss         | 4.11e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.31      |
| time/                 |           |
|    fps                | 774       |
|    iterations         | 49800     |
|    time_elapsed       | 321       |
|    total_timesteps    | 249000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.31     |
| time/                 |          |
|    fps                | 771      |
|    iterations         | 51000    |
|    time_elapsed       | 330      |
|    total_timesteps    | 255000   |
| train/                |          |
|    entropy_loss       | -0.00443 |
|    explained_variance | -64.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 50999    |
|    policy_loss        | 4.66e-07 |
|    value_loss         | 3.1e-06  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.31      |
| time/                 |           |
|    fps                | 771       |
|    iterations         | 51100     |
|    time_elapsed       | 331       |
|    total_timesteps    | 255500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.31     |
| time/                 |          |
|    fps                | 770      |
|    iterations         | 52300    |
|    time_elapsed       | 339      |
|    total_timesteps    | 261500   |
| train/                |          |
|    entropy_loss       | -0.00021 |
|    explained_variance | -5.73    |
|    learning_rate      | 0.0007   |
|    n_updates          | 52299    |
|    policy_loss        | 1.5e-09  |
|    value_loss         | 1.28e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.31      |
| time/                 |           |
|    fps                | 770       |
|    iterations         | 52400     |
|    time_elapsed       | 340       |
|    total_timesteps    | 262000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.32      |
| time/                 |           |
|    fps                | 769       |
|    iterations         | 53600     |
|    time_elapsed       | 348       |
|    total_timesteps    | 268000    |
| train/                |           |
|    entropy_loss       | -0.00305  |
|    explained_variance | -121      |
|    learning_rate      | 0.0007    |
|    n_updates          | 53599     |
|    policy_loss        | -1.18e-07 |
|    value_loss         | 2.16e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.32      |
| time/                 |           |
|    fps                | 769       |
|    iterations         | 53700     |
|    time_elapsed       | 348       |
|    total_timesteps    | 268500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.33      |
| time/                 |           |
|    fps                | 769       |
|    iterations         | 54900     |
|    time_elapsed       | 356       |
|    total_timesteps    | 274500    |
| train/                |           |
|    entropy_loss       | -0.000609 |
|    explained_variance | -906      |
|    learning_rate      | 0.0007    |
|    n_updates          | 54899     |
|    policy_loss        | -9.48e-07 |
|    value_loss         | 0.00271   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.33      |
| time/                 |           |
|    fps                | 769       |
|    iterations         | 55000     |
|    time_elapsed       | 357       |
|    total_timesteps    | 275000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.34     |
| time/                 |          |
|    fps                | 767      |
|    iterations         | 56200    |
|    time_elapsed       | 365      |
|    total_timesteps    | 281000   |
| train/                |          |
|    entropy_loss       | -0.159   |
|    explained_variance | -6.61    |
|    learning_rate      | 0.0007   |
|    n_updates          | 56199    |
|    policy_loss        | 0.00133  |
|    value_loss         | 4.28e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 767       |
|    iterations         | 56300     |
|    time_elapsed       | 366       |
|    total_timesteps    | 281500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 765       |
|    iterations         | 57500     |
|    time_elapsed       | 375       |
|    total_timesteps    | 287500    |
| train/                |           |
|    entropy_loss       | -0.000111 |
|    explained_variance | -10.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 57499     |
|    policy_loss        | 1.08e-09  |
|    value_loss         | 1.06e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 765       |
|    iterations         | 57600     |
|    time_elapsed       | 376       |
|    total_timesteps    | 288000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 764       |
|    iterations         | 58800     |
|    time_elapsed       | 384       |
|    total_timesteps    | 294000    |
| train/                |           |
|    entropy_loss       | -0.0046   |
|    explained_variance | -2.67     |
|    learning_rate      | 0.0007    |
|    n_updates          | 58799     |
|    policy_loss        | -9.72e-08 |
|    value_loss         | 7.25e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 764       |
|    iterations         | 58900     |
|    time_elapsed       | 385       |
|    total_timesteps    | 294500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.35     |
| time/                 |          |
|    fps                | 765      |
|    iterations         | 60100    |
|    time_elapsed       | 392      |
|    total_timesteps    | 300500   |
| train/                |          |
|    entropy_loss       | -0.0483  |
|    explained_variance | 0.0383   |
|    learning_rate      | 0.0007   |
|    n_updates          | 60099    |
|    policy_loss        | 0.000337 |
|    value_loss         | 0.000636 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 764       |
|    iterations         | 60200     |
|    time_elapsed       | 393       |
|    total_timesteps    | 301000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.36      |
| time/                 |           |
|    fps                | 762       |
|    iterations         | 61400     |
|    time_elapsed       | 402       |
|    total_timesteps    | 307000    |
| train/                |           |
|    entropy_loss       | -0.000103 |
|    explained_variance | -0.411    |
|    learning_rate      | 0.0007    |
|    n_updates          | 61399     |
|    policy_loss        | 1.14e-09  |
|    value_loss         | 2.55e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.36      |
| time/                 |           |
|    fps                | 762       |
|    iterations         | 61500     |
|    time_elapsed       | 403       |
|    total_timesteps    | 307500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.37     |
| time/                 |          |
|    fps                | 762      |
|    iterations         | 62700    |
|    time_elapsed       | 411      |
|    total_timesteps    | 313500   |
| train/                |          |
|    entropy_loss       | -0.0012  |
|    explained_variance | -57.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 62699    |
|    policy_loss        | 2.36e-08 |
|    value_loss         | 1.11e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.37      |
| time/                 |           |
|    fps                | 762       |
|    iterations         | 62800     |
|    time_elapsed       | 412       |
|    total_timesteps    | 314000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.38      |
| time/                 |           |
|    fps                | 760       |
|    iterations         | 64000     |
|    time_elapsed       | 420       |
|    total_timesteps    | 320000    |
| train/                |           |
|    entropy_loss       | -0.000875 |
|    explained_variance | -53.8     |
|    learning_rate      | 0.0007    |
|    n_updates          | 63999     |
|    policy_loss        | -1.75e-08 |
|    value_loss         | 4.23e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.38     |
| time/                 |          |
|    fps                | 761      |
|    iterations         | 64100    |
|    time_elapsed       | 421      |
|    total_timesteps    | 320500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.38      |
| time/                 |           |
|    fps                | 761       |
|    iterations         | 65300     |
|    time_elapsed       | 428       |
|    total_timesteps    | 326500    |
| train/                |           |
|    entropy_loss       | -0.000805 |
|    explained_variance | -1.69     |
|    learning_rate      | 0.0007    |
|    n_updates          | 65299     |
|    policy_loss        | 4.7e-08   |
|    value_loss         | 8.74e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.38      |
| time/                 |           |
|    fps                | 761       |
|    iterations         | 65400     |
|    time_elapsed       | 429       |
|    total_timesteps    | 327000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.37      |
| time/                 |           |
|    fps                | 758       |
|    iterations         | 66600     |
|    time_elapsed       | 438       |
|    total_timesteps    | 333000    |
| train/                |           |
|    entropy_loss       | -0.000145 |
|    explained_variance | -5.58e+07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 66599     |
|    policy_loss        | -1.92e-09 |
|    value_loss         | 1.12e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.37      |
| time/                 |           |
|    fps                | 758       |
|    iterations         | 66700     |
|    time_elapsed       | 439       |
|    total_timesteps    | 333500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.35     |
| time/                 |          |
|    fps                | 755      |
|    iterations         | 67900    |
|    time_elapsed       | 449      |
|    total_timesteps    | 339500   |
| train/                |          |
|    entropy_loss       | -0.00209 |
|    explained_variance | -5.5     |
|    learning_rate      | 0.0007   |
|    n_updates          | 67899    |
|    policy_loss        | 8.35e-08 |
|    value_loss         | 2.19e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 755       |
|    iterations         | 68000     |
|    time_elapsed       | 450       |
|    total_timesteps    | 340000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 754       |
|    iterations         | 69200     |
|    time_elapsed       | 458       |
|    total_timesteps    | 346000    |
| train/                |           |
|    entropy_loss       | -0.00109  |
|    explained_variance | -75       |
|    learning_rate      | 0.0007    |
|    n_updates          | 69199     |
|    policy_loss        | -2.84e-07 |
|    value_loss         | 9.03e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 754       |
|    iterations         | 69300     |
|    time_elapsed       | 459       |
|    total_timesteps    | 346500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 751       |
|    iterations         | 70500     |
|    time_elapsed       | 469       |
|    total_timesteps    | 352500    |
| train/                |           |
|    entropy_loss       | -0.000715 |
|    explained_variance | -7.38     |
|    learning_rate      | 0.0007    |
|    n_updates          | 70499     |
|    policy_loss        | -2.28e-09 |
|    value_loss         | 1.06e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 751       |
|    iterations         | 70600     |
|    time_elapsed       | 469       |
|    total_timesteps    | 353000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.33      |
| time/                 |           |
|    fps                | 749       |
|    iterations         | 71800     |
|    time_elapsed       | 479       |
|    total_timesteps    | 359000    |
| train/                |           |
|    entropy_loss       | -0.000745 |
|    explained_variance | -112      |
|    learning_rate      | 0.0007    |
|    n_updates          | 71799     |
|    policy_loss        | -1.48e-08 |
|    value_loss         | 1.24e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.33      |
| time/                 |           |
|    fps                | 748       |
|    iterations         | 71900     |
|    time_elapsed       | 480       |
|    total_timesteps    | 359500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.33     |
| time/                 |          |
|    fps                | 747      |
|    iterations         | 73100    |
|    time_elapsed       | 488      |
|    total_timesteps    | 365500   |
| train/                |          |
|    entropy_loss       | -0.00029 |
|    explained_variance | -19.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 73099    |
|    policy_loss        | 8.14e-08 |
|    value_loss         | 1.22e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.33     |
| time/                 |          |
|    fps                | 747      |
|    iterations         | 73200    |
|    time_elapsed       | 489      |
|    total_timesteps    | 366000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.33     |
| time/                 |          |
|    fps                | 745      |
|    iterations         | 74400    |
|    time_elapsed       | 498      |
|    total_timesteps    | 372000   |
| train/                |          |
|    entropy_loss       | -0.0734  |
|    explained_variance | -0.0629  |
|    learning_rate      | 0.0007   |
|    n_updates          | 74399    |
|    policy_loss        | 0.000232 |
|    value_loss         | 0.000138 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.33      |
| time/                 |           |
|    fps                | 745       |
|    iterations         | 74500     |
|    time_elapsed       | 499       |
|    total_timesteps    | 372500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 744       |
|    iterations         | 75700     |
|    time_elapsed       | 508       |
|    total_timesteps    | 378500    |
| train/                |           |
|    entropy_loss       | -0.000876 |
|    explained_variance | -19.9     |
|    learning_rate      | 0.0007    |
|    n_updates          | 75699     |
|    policy_loss        | 8.05e-09  |
|    value_loss         | 1.77e-08  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.34     |
| time/                 |          |
|    fps                | 744      |
|    iterations         | 75800    |
|    time_elapsed       | 509      |
|    total_timesteps    | 379000   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 740       |
|    iterations         | 77000     |
|    time_elapsed       | 519       |
|    total_timesteps    | 385000    |
| train/                |           |
|    entropy_loss       | -8.81e-05 |
|    explained_variance | -5.33     |
|    learning_rate      | 0.0007    |
|    n_updates          | 76999     |
|    policy_loss        | -3.34e-09 |
|    value_loss         | 3.48e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 740       |
|    iterations         | 77100     |
|    time_elapsed       | 520       |
|    total_timesteps    | 385500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 739       |
|    iterations         | 78300     |
|    time_elapsed       | 529       |
|    total_timesteps    | 391500    |
| train/                |           |
|    entropy_loss       | -2.73e-05 |
|    explained_variance | -170      |
|    learning_rate      | 0.0007    |
|    n_updates          | 78299     |
|    policy_loss        | -1.1e-08  |
|    value_loss         | 3.38e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 739       |
|    iterations         | 78400     |
|    time_elapsed       | 530       |
|    total_timesteps    | 392000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.35     |
| time/                 |          |
|    fps                | 739      |
|    iterations         | 79600    |
|    time_elapsed       | 538      |
|    total_timesteps    | 398000   |
| train/                |          |
|    entropy_loss       | -0.0365  |
|    explained_variance | -6.11    |
|    learning_rate      | 0.0007   |
|    n_updates          | 79599    |
|    policy_loss        | 7.12e-06 |
|    value_loss         | 1.22e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 739       |
|    iterations         | 79700     |
|    time_elapsed       | 538       |
|    total_timesteps    | 398500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 740       |
|    iterations         | 80900     |
|    time_elapsed       | 546       |
|    total_timesteps    | 404500    |
| train/                |           |
|    entropy_loss       | -3.57e-05 |
|    explained_variance | -45.7     |
|    learning_rate      | 0.0007    |
|    n_updates          | 80899     |
|    policy_loss        | -6.85e-10 |
|    value_loss         | 9.39e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 740       |
|    iterations         | 81000     |
|    time_elapsed       | 546       |
|    total_timesteps    | 405000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 741       |
|    iterations         | 82200     |
|    time_elapsed       | 554       |
|    total_timesteps    | 411000    |
| train/                |           |
|    entropy_loss       | -0.0108   |
|    explained_variance | 0.294     |
|    learning_rate      | 0.0007    |
|    n_updates          | 82199     |
|    policy_loss        | -3.01e-06 |
|    value_loss         | 5.53e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 741       |
|    iterations         | 82300     |
|    time_elapsed       | 555       |
|    total_timesteps    | 411500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.35     |
| time/                 |          |
|    fps                | 742      |
|    iterations         | 83500    |
|    time_elapsed       | 562      |
|    total_timesteps    | 417500   |
| train/                |          |
|    entropy_loss       | -0.00817 |
|    explained_variance | -5.34    |
|    learning_rate      | 0.0007   |
|    n_updates          | 83499    |
|    policy_loss        | 2.13e-06 |
|    value_loss         | 4.37e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 742       |
|    iterations         | 83600     |
|    time_elapsed       | 562       |
|    total_timesteps    | 418000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 742       |
|    iterations         | 84800     |
|    time_elapsed       | 570       |
|    total_timesteps    | 424000    |
| train/                |           |
|    entropy_loss       | -3.75e-05 |
|    explained_variance | -2.26e+05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 84799     |
|    policy_loss        | -1.55e-09 |
|    value_loss         | 4.83e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.34     |
| time/                 |          |
|    fps                | 742      |
|    iterations         | 84900    |
|    time_elapsed       | 571      |
|    total_timesteps    | 424500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 743       |
|    iterations         | 86100     |
|    time_elapsed       | 579       |
|    total_timesteps    | 430500    |
| train/                |           |
|    entropy_loss       | -0.00934  |
|    explained_variance | -4.45e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 86099     |
|    policy_loss        | 3.71e-07  |
|    value_loss         | 1.5e-07   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 743       |
|    iterations         | 86200     |
|    time_elapsed       | 579       |
|    total_timesteps    | 431000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.34     |
| time/                 |          |
|    fps                | 742      |
|    iterations         | 87400    |
|    time_elapsed       | 588      |
|    total_timesteps    | 437000   |
| train/                |          |
|    entropy_loss       | -0.0331  |
|    explained_variance | 0.555    |
|    learning_rate      | 0.0007   |
|    n_updates          | 87399    |
|    policy_loss        | 1.19e-06 |
|    value_loss         | 4.53e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.34      |
| time/                 |           |
|    fps                | 742       |
|    iterations         | 87500     |
|    time_elapsed       | 588       |
|    total_timesteps    | 437500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 742       |
|    iterations         | 88700     |
|    time_elapsed       | 597       |
|    total_timesteps    | 443500    |
| train/                |           |
|    entropy_loss       | -0.0171   |
|    explained_variance | -0.23     |
|    learning_rate      | 0.0007    |
|    n_updates          | 88699     |
|    policy_loss        | -6.57e-05 |
|    value_loss         | 0.000595  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.35      |
| time/                 |           |
|    fps                | 742       |
|    iterations         | 88800     |
|    time_elapsed       | 597       |
|    total_timesteps    | 444000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.36     |
| time/                 |          |
|    fps                | 743      |
|    iterations         | 90000    |
|    time_elapsed       | 605      |
|    total_timesteps    | 450000   |
| train/                |          |
|    entropy_loss       | -0.00455 |
|    explained_variance | -4.77    |
|    learning_rate      | 0.0007   |
|    n_updates          | 89999    |
|    policy_loss        | 1.56e-07 |
|    value_loss         | 9e-08    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.36     |
| time/                 |          |
|    fps                | 743      |
|    iterations         | 90100    |
|    time_elapsed       | 605      |
|    total_timesteps    | 450500   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.36      |
| time/                 |           |
|    fps                | 741       |
|    iterations         | 91300     |
|    time_elapsed       | 615       |
|    total_timesteps    | 456500    |
| train/                |           |
|    entropy_loss       | -0.000549 |
|    explained_variance | -24.7     |
|    learning_rate      | 0.0007    |
|    n_updates          | 91299     |
|    policy_loss        | 4.43e-08  |
|    value_loss         | 7.94e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.36     |
| time/                 |          |
|    fps                | 741      |
|    iterations         | 91400    |
|    time_elapsed       | 616      |
|    total_timesteps    | 457000   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.38     |
| time/                 |          |
|    fps                | 740      |
|    iterations         | 92600    |
|    time_elapsed       | 625      |
|    total_timesteps    | 463000   |
| train/                |          |
|    entropy_loss       | -0.293   |
|    explained_variance | -2.02    |
|    learning_rate      | 0.0007   |
|    n_updates          | 92599    |
|    policy_loss        | 0.0251   |
|    value_loss         | 0.000363 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.38      |
| time/                 |           |
|    fps                | 740       |
|    iterations         | 92700     |
|    time_elapsed       | 626       |
|    total_timesteps    | 463500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.38      |
| time/                 |           |
|    fps                | 738       |
|    iterations         | 93900     |
|    time_elapsed       | 635       |
|    total_timesteps    | 469500    |
| train/                |           |
|    entropy_loss       | -0.0103   |
|    explained_variance | -0.949    |
|    learning_rate      | 0.0007    |
|    n_updates          | 93899     |
|    policy_loss        | -2.02e-07 |
|    value_loss         | 2.61e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.38      |
| time/                 |           |
|    fps                | 738       |
|    iterations         | 94000     |
|    time_elapsed       | 636       |
|    total_timesteps    | 470000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.38      |
| time/                 |           |
|    fps                | 737       |
|    iterations         | 95200     |
|    time_elapsed       | 645       |
|    total_timesteps    | 476000    |
| train/                |           |
|    entropy_loss       | -0.000118 |
|    explained_variance | -31.7     |
|    learning_rate      | 0.0007    |
|    n_updates          | 95199     |
|    policy_loss        | -1.14e-09 |
|    value_loss         | 2.27e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.38      |
| time/                 |           |
|    fps                | 737       |
|    iterations         | 95300     |
|    time_elapsed       | 645       |
|    total_timesteps    | 476500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.39      |
| time/                 |           |
|    fps                | 735       |
|    iterations         | 96500     |
|    time_elapsed       | 655       |
|    total_timesteps    | 482500    |
| train/                |           |
|    entropy_loss       | -0.0325   |
|    explained_variance | -84.3     |
|    learning_rate      | 0.0007    |
|    n_updates          | 96499     |
|    policy_loss        | -1.12e-06 |
|    value_loss         | 1.05e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.41      |
| time/                 |           |
|    fps                | 735       |
|    iterations         | 96600     |
|    time_elapsed       | 656       |
|    total_timesteps    | 483000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.42     |
| time/                 |          |
|    fps                | 734      |
|    iterations         | 97800    |
|    time_elapsed       | 665      |
|    total_timesteps    | 489000   |
| train/                |          |
|    entropy_loss       | -0.0157  |
|    explained_variance | -38.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 97799    |
|    policy_loss        | 5.52e-06 |
|    value_loss         | 7.24e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.42      |
| time/                 |           |
|    fps                | 734       |
|    iterations         | 97900     |
|    time_elapsed       | 666       |
|    total_timesteps    | 489500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 4.78e+03 |
|    ep_rew_mean        | 1.45     |
| time/                 |          |
|    fps                | 732      |
|    iterations         | 99100    |
|    time_elapsed       | 676      |
|    total_timesteps    | 495500   |
| train/                |          |
|    entropy_loss       | -0.00351 |
|    explained_variance | -18.6    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99099    |
|    policy_loss        | 7.65e-08 |
|    value_loss         | 5.69e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 4.78e+03  |
|    ep_rew_mean        | 1.45      |
| time/                 |           |
|    fps                | 732       |
|    iterations         | 99200     |
|    time_elapsed       | 677       |
|    total_timesteps    | 496000    |
| train/                |    

Step: 301, Action: 1, Reward: 0
Step: 302, Action: 1, Reward: 0
Step: 303, Action: 1, Reward: 0
Step: 304, Action: 1, Reward: 0
Step: 305, Action: 1, Reward: 0
Step: 306, Action: 1, Reward: 0
Step: 307, Action: 1, Reward: 0
Step: 308, Action: 1, Reward: 0
Step: 309, Action: 1, Reward: 0
Step: 310, Action: 1, Reward: 0
Step: 311, Action: 1, Reward: 0
Step: 312, Action: 1, Reward: 0
Step: 313, Action: 1, Reward: 0
Step: 314, Action: 1, Reward: 0
Step: 315, Action: 1, Reward: 0
Step: 316, Action: 1, Reward: 0
Step: 317, Action: 1, Reward: 0
Step: 318, Action: 1, Reward: 0
Step: 319, Action: 1, Reward: 0
Step: 320, Action: 1, Reward: 0
Step: 321, Action: 1, Reward: 0
Step: 322, Action: 1, Reward: 0
Step: 323, Action: 1, Reward: 0
Step: 324, Action: 1, Reward: 0
Step: 325, Action: 1, Reward: 0
Step: 326, Action: 1, Reward: 0
Step: 327, Action: 1, Reward: 0
Step: 328, Action: 1, Reward: 0
Step: 329, Action: 1, Reward: 0
Step: 330, Action: 1, Reward: 0
Step: 331, Action: 1, Reward: 0
Step: 33

Step: 558, Action: 1, Reward: 0
Step: 559, Action: 1, Reward: 0
Step: 560, Action: 1, Reward: 0
Step: 561, Action: 1, Reward: 0
Step: 562, Action: 1, Reward: 0
Step: 563, Action: 1, Reward: 0
Step: 564, Action: 1, Reward: 0
Step: 565, Action: 1, Reward: 0
Step: 566, Action: 1, Reward: 0
Step: 567, Action: 1, Reward: 0
Step: 568, Action: 1, Reward: 0
Step: 569, Action: 1, Reward: 0
Step: 570, Action: 1, Reward: 0
Step: 571, Action: 1, Reward: 0
Step: 572, Action: 1, Reward: 0
Step: 573, Action: 1, Reward: 0
Step: 574, Action: 1, Reward: 0
Step: 575, Action: 1, Reward: 0
Step: 576, Action: 1, Reward: 0
Step: 577, Action: 1, Reward: 0
Step: 578, Action: 1, Reward: 0
Step: 579, Action: 1, Reward: 0
Step: 580, Action: 1, Reward: 0
Step: 581, Action: 1, Reward: 0
Step: 582, Action: 1, Reward: 0
Step: 583, Action: 1, Reward: 0
Step: 584, Action: 1, Reward: 0
Step: 585, Action: 1, Reward: 0
Step: 586, Action: 1, Reward: 0
Step: 587, Action: 1, Reward: 0
Step: 588, Action: 1, Reward: 0
Step: 58

Step: 834, Action: 1, Reward: 0
Step: 835, Action: 1, Reward: 0
Step: 836, Action: 1, Reward: 0
Step: 837, Action: 1, Reward: 0
Step: 838, Action: 1, Reward: 0
Step: 839, Action: 1, Reward: 0
Step: 840, Action: 1, Reward: 0
Step: 841, Action: 1, Reward: 0
Step: 842, Action: 1, Reward: 0
Step: 843, Action: 1, Reward: 0
Step: 844, Action: 1, Reward: 0
Step: 845, Action: 1, Reward: 0
Step: 846, Action: 1, Reward: 0
Step: 847, Action: 1, Reward: 0
Step: 848, Action: 1, Reward: 0
Step: 849, Action: 1, Reward: 0
Step: 850, Action: 1, Reward: 0
Step: 851, Action: 1, Reward: 0
Step: 852, Action: 1, Reward: 0
Step: 853, Action: 1, Reward: 0
Step: 854, Action: 1, Reward: 0
Step: 855, Action: 1, Reward: 0
Step: 856, Action: 1, Reward: 0
Step: 857, Action: 1, Reward: 0
Step: 858, Action: 1, Reward: 0
Step: 859, Action: 1, Reward: 0
Step: 860, Action: 1, Reward: 0
Step: 861, Action: 1, Reward: 0
Step: 862, Action: 1, Reward: 0
Step: 863, Action: 1, Reward: 0
Step: 864, Action: 1, Reward: 0
Step: 86

Step: 1344, Action: 1, Reward: 0
Step: 1345, Action: 1, Reward: 0
Step: 1346, Action: 1, Reward: 0
Step: 1347, Action: 1, Reward: 0
Step: 1348, Action: 1, Reward: 0
Step: 1349, Action: 1, Reward: 0
Step: 1350, Action: 1, Reward: 0
Step: 1351, Action: 1, Reward: 0
Step: 1352, Action: 1, Reward: 0
Step: 1353, Action: 1, Reward: 0
Step: 1354, Action: 1, Reward: 0
Step: 1355, Action: 1, Reward: 0
Step: 1356, Action: 1, Reward: 0
Step: 1357, Action: 1, Reward: 0
Step: 1358, Action: 1, Reward: 0
Step: 1359, Action: 1, Reward: 0
Step: 1360, Action: 1, Reward: 0
Step: 1361, Action: 1, Reward: 0
Step: 1362, Action: 1, Reward: 0
Step: 1363, Action: 1, Reward: 0
Step: 1364, Action: 1, Reward: 0
Step: 1365, Action: 1, Reward: 0
Step: 1366, Action: 1, Reward: 0
Step: 1367, Action: 1, Reward: 0
Step: 1368, Action: 1, Reward: 0
Step: 1369, Action: 1, Reward: 0
Step: 1370, Action: 1, Reward: 0
Step: 1371, Action: 1, Reward: 0
Step: 1372, Action: 1, Reward: 0
Step: 1373, Action: 1, Reward: 0
Step: 1374

Step: 1655, Action: 1, Reward: 0
Step: 1656, Action: 0, Reward: 0
Step: 1657, Action: 1, Reward: 0
Step: 1658, Action: 1, Reward: 0
Step: 1659, Action: 1, Reward: 0
Step: 1660, Action: 0, Reward: 0
Step: 1661, Action: 1, Reward: 0
Step: 1662, Action: 1, Reward: 0
Step: 1663, Action: 1, Reward: 0
Step: 1664, Action: 1, Reward: 0
Step: 1665, Action: 1, Reward: 0
Step: 1666, Action: 1, Reward: 0
Step: 1667, Action: 1, Reward: 0
Step: 1668, Action: 1, Reward: 0
Step: 1669, Action: 1, Reward: 0
Step: 1670, Action: 1, Reward: 0
Step: 1671, Action: 1, Reward: 0
Step: 1672, Action: 1, Reward: 0
Step: 1673, Action: 1, Reward: 0
Step: 1674, Action: 1, Reward: 0
Step: 1675, Action: 1, Reward: 0
Step: 1676, Action: 1, Reward: 0
Step: 1677, Action: 1, Reward: 0
Step: 1678, Action: 1, Reward: 0
Step: 1679, Action: 1, Reward: 0
Step: 1680, Action: 1, Reward: 0
Step: 1681, Action: 1, Reward: 0
Step: 1682, Action: 1, Reward: 0
Step: 1683, Action: 1, Reward: 0
Step: 1684, Action: 1, Reward: 0
Step: 1685

Step: 1949, Action: 1, Reward: 0
Step: 1950, Action: 1, Reward: 0
Step: 1951, Action: 1, Reward: 0
Step: 1952, Action: 1, Reward: 0
Step: 1953, Action: 1, Reward: 0
Step: 1954, Action: 1, Reward: 0
Step: 1955, Action: 1, Reward: 0
Step: 1956, Action: 1, Reward: 0
Step: 1957, Action: 1, Reward: 0
Step: 1958, Action: 1, Reward: 0
Step: 1959, Action: 1, Reward: 0
Step: 1960, Action: 1, Reward: 0
Step: 1961, Action: 1, Reward: 0
Step: 1962, Action: 1, Reward: 0
Step: 1963, Action: 1, Reward: 0
Step: 1964, Action: 1, Reward: 0
Step: 1965, Action: 1, Reward: 0
Step: 1966, Action: 1, Reward: 0
Step: 1967, Action: 1, Reward: 0
Step: 1968, Action: 1, Reward: 0
Step: 1969, Action: 1, Reward: 0
Step: 1970, Action: 1, Reward: 0
Step: 1971, Action: 1, Reward: 0
Step: 1972, Action: 1, Reward: 0
Step: 1973, Action: 1, Reward: 0
Step: 1974, Action: 1, Reward: 0
Step: 1975, Action: 1, Reward: 0
Step: 1976, Action: 1, Reward: 0
Step: 1977, Action: 1, Reward: 0
Step: 1978, Action: 1, Reward: 0
Step: 1979

Step: 2257, Action: 0, Reward: 0
Step: 2258, Action: 0, Reward: 0
Step: 2259, Action: 0, Reward: 0
Step: 2260, Action: 2, Reward: 0
Step: 2261, Action: 2, Reward: 0
Step: 2262, Action: 2, Reward: 0
Step: 2263, Action: 0, Reward: 0
Step: 2264, Action: 2, Reward: 0
Step: 2265, Action: 2, Reward: 0
Step: 2266, Action: 2, Reward: 0
Step: 2267, Action: 2, Reward: 0
Step: 2268, Action: 2, Reward: 0
Step: 2269, Action: 2, Reward: 0
Step: 2270, Action: 2, Reward: 0
Step: 2271, Action: 2, Reward: 0
Step: 2272, Action: 2, Reward: 0
Step: 2273, Action: 2, Reward: 0
Step: 2274, Action: 2, Reward: 0
Step: 2275, Action: 0, Reward: 0
Step: 2276, Action: 2, Reward: 0
Step: 2277, Action: 2, Reward: 0
Step: 2278, Action: 2, Reward: 0
Step: 2279, Action: 2, Reward: 0
Step: 2280, Action: 2, Reward: 0
Step: 2281, Action: 0, Reward: 0
Step: 2282, Action: 2, Reward: 0
Step: 2283, Action: 2, Reward: 0
Step: 2284, Action: 0, Reward: 0
Step: 2285, Action: 0, Reward: 0
Step: 2286, Action: 2, Reward: 0
Step: 2287

Step: 2559, Action: 0, Reward: 0
Step: 2560, Action: 2, Reward: 0
Step: 2561, Action: 0, Reward: 0
Step: 2562, Action: 0, Reward: 0
Step: 2563, Action: 2, Reward: 0
Step: 2564, Action: 2, Reward: 0
Step: 2565, Action: 2, Reward: 0
Step: 2566, Action: 0, Reward: 0
Step: 2567, Action: 0, Reward: 0
Step: 2568, Action: 0, Reward: 0
Step: 2569, Action: 0, Reward: 0
Step: 2570, Action: 2, Reward: 0
Step: 2571, Action: 2, Reward: 0
Step: 2572, Action: 0, Reward: 0
Step: 2573, Action: 2, Reward: 0
Step: 2574, Action: 2, Reward: 0
Step: 2575, Action: 2, Reward: 0
Step: 2576, Action: 2, Reward: 0
Step: 2577, Action: 2, Reward: 0
Step: 2578, Action: 2, Reward: 0
Step: 2579, Action: 2, Reward: 0
Step: 2580, Action: 2, Reward: 0
Step: 2581, Action: 2, Reward: 0
Step: 2582, Action: 2, Reward: 0
Step: 2583, Action: 2, Reward: 0
Step: 2584, Action: 2, Reward: 0
Step: 2585, Action: 2, Reward: 0
Step: 2586, Action: 2, Reward: 0
Step: 2587, Action: 2, Reward: 0
Step: 2588, Action: 2, Reward: 0
Step: 2589

Step: 2858, Action: 2, Reward: 0
Step: 2859, Action: 2, Reward: 0
Step: 2860, Action: 2, Reward: 0
Step: 2861, Action: 2, Reward: 0
Step: 2862, Action: 2, Reward: 0
Step: 2863, Action: 2, Reward: 0
Step: 2864, Action: 2, Reward: 0
Step: 2865, Action: 2, Reward: 0
Step: 2866, Action: 2, Reward: 0
Step: 2867, Action: 2, Reward: 0
Step: 2868, Action: 2, Reward: 0
Step: 2869, Action: 2, Reward: 0
Step: 2870, Action: 2, Reward: 0
Step: 2871, Action: 2, Reward: 0
Step: 2872, Action: 2, Reward: 0
Step: 2873, Action: 2, Reward: 0
Step: 2874, Action: 2, Reward: 0
Step: 2875, Action: 2, Reward: 0
Step: 2876, Action: 2, Reward: 0
Step: 2877, Action: 2, Reward: 0
Step: 2878, Action: 2, Reward: 0
Step: 2879, Action: 2, Reward: 0
Step: 2880, Action: 2, Reward: 0
Step: 2881, Action: 2, Reward: 0
Step: 2882, Action: 2, Reward: 0
Step: 2883, Action: 2, Reward: 0
Step: 2884, Action: 2, Reward: 0
Step: 2885, Action: 2, Reward: 0
Step: 2886, Action: 2, Reward: 0
Step: 2887, Action: 2, Reward: 0
Step: 2888

Step: 3163, Action: 2, Reward: 0
Step: 3164, Action: 2, Reward: 0
Step: 3165, Action: 2, Reward: 0
Step: 3166, Action: 2, Reward: 0
Step: 3167, Action: 2, Reward: 0
Step: 3168, Action: 2, Reward: 0
Step: 3169, Action: 2, Reward: 0
Step: 3170, Action: 2, Reward: 0
Step: 3171, Action: 2, Reward: 0
Step: 3172, Action: 2, Reward: 0
Step: 3173, Action: 2, Reward: 0
Step: 3174, Action: 2, Reward: 0
Step: 3175, Action: 2, Reward: 0
Step: 3176, Action: 2, Reward: 0
Step: 3177, Action: 2, Reward: 0
Step: 3178, Action: 2, Reward: 0
Step: 3179, Action: 2, Reward: 0
Step: 3180, Action: 2, Reward: 0
Step: 3181, Action: 2, Reward: 0
Step: 3182, Action: 2, Reward: 0
Step: 3183, Action: 2, Reward: 0
Step: 3184, Action: 2, Reward: 0
Step: 3185, Action: 2, Reward: 0
Step: 3186, Action: 2, Reward: 0
Step: 3187, Action: 2, Reward: 0
Step: 3188, Action: 2, Reward: 0
Step: 3189, Action: 2, Reward: 0
Step: 3190, Action: 2, Reward: 0
Step: 3191, Action: 2, Reward: 0
Step: 3192, Action: 2, Reward: 0
Step: 3193

Step: 3463, Action: 2, Reward: 0
Step: 3464, Action: 2, Reward: 0
Step: 3465, Action: 2, Reward: 0
Step: 3466, Action: 2, Reward: 0
Step: 3467, Action: 2, Reward: 0
Step: 3468, Action: 2, Reward: 0
Step: 3469, Action: 2, Reward: 0
Step: 3470, Action: 2, Reward: 0
Step: 3471, Action: 2, Reward: 0
Step: 3472, Action: 2, Reward: 0
Step: 3473, Action: 2, Reward: 0
Step: 3474, Action: 2, Reward: 0
Step: 3475, Action: 2, Reward: 0
Step: 3476, Action: 2, Reward: 0
Step: 3477, Action: 2, Reward: 0
Step: 3478, Action: 2, Reward: 0
Step: 3479, Action: 2, Reward: 0
Step: 3480, Action: 2, Reward: 0
Step: 3481, Action: 2, Reward: 0
Step: 3482, Action: 2, Reward: 0
Step: 3483, Action: 2, Reward: 0
Step: 3484, Action: 2, Reward: 0
Step: 3485, Action: 2, Reward: 0
Step: 3486, Action: 2, Reward: 0
Step: 3487, Action: 2, Reward: 0
Step: 3488, Action: 2, Reward: 0
Step: 3489, Action: 2, Reward: 0
Step: 3490, Action: 2, Reward: 0
Step: 3491, Action: 2, Reward: 0
Step: 3492, Action: 0, Reward: 0
Step: 3493

Step: 3774, Action: 2, Reward: 0
Step: 3775, Action: 2, Reward: 0
Step: 3776, Action: 0, Reward: 0
Step: 3777, Action: 0, Reward: 0
Step: 3778, Action: 2, Reward: 0
Step: 3779, Action: 2, Reward: 0
Step: 3780, Action: 2, Reward: 0
Step: 3781, Action: 2, Reward: 0
Step: 3782, Action: 0, Reward: 0
Step: 3783, Action: 0, Reward: 0
Step: 3784, Action: 0, Reward: 0
Step: 3785, Action: 0, Reward: 0
Step: 3786, Action: 2, Reward: 0
Step: 3787, Action: 0, Reward: 0
Step: 3788, Action: 0, Reward: 0
Step: 3789, Action: 2, Reward: 0
Step: 3790, Action: 2, Reward: 0
Step: 3791, Action: 0, Reward: 0
Step: 3792, Action: 2, Reward: 0
Step: 3793, Action: 2, Reward: 0
Step: 3794, Action: 0, Reward: 0
Step: 3795, Action: 0, Reward: 0
Step: 3796, Action: 0, Reward: 0
Step: 3797, Action: 2, Reward: 0
Step: 3798, Action: 2, Reward: 0
Step: 3799, Action: 0, Reward: 0
Step: 3800, Action: 0, Reward: 0
Step: 3801, Action: 0, Reward: 0
Step: 3802, Action: 2, Reward: 0
Step: 3803, Action: 2, Reward: 0
Step: 3804

Step: 4069, Action: 0, Reward: 0
Step: 4070, Action: 1, Reward: 0
Step: 4071, Action: 1, Reward: 0
Step: 4072, Action: 1, Reward: 0
Step: 4073, Action: 1, Reward: 0
Step: 4074, Action: 1, Reward: 0
Step: 4075, Action: 2, Reward: 0.11707883665645975
Step: 4076, Action: 2, Reward: 0
Step: 4077, Action: 0, Reward: 0
Step: 4078, Action: 1, Reward: -0.018732613865033165
Step: 4079, Action: 0, Reward: 0
Step: 4080, Action: 0, Reward: 0
Step: 4081, Action: 2, Reward: 0
Step: 4082, Action: 0, Reward: 0
Step: 4083, Action: 1, Reward: 0.011707883665645319
Step: 4084, Action: 2, Reward: 0
Step: 4085, Action: 2, Reward: 0
Step: 4086, Action: 0, Reward: 0
Step: 4087, Action: 1, Reward: 0.060880995061359425
Step: 4088, Action: 0, Reward: 0
Step: 4089, Action: 0, Reward: 0
Step: 4090, Action: 2, Reward: 0
Step: 4091, Action: 1, Reward: -0.02575734406442101
Step: 4092, Action: 1, Reward: 0
Step: 4093, Action: 1, Reward: 0
Step: 4094, Action: 1, Reward: 0
Step: 4095, Action: 1, Reward: 0
Step: 4096, Ac

Step: 4363, Action: 0, Reward: 0
Step: 4364, Action: 2, Reward: 0
Step: 4365, Action: 2, Reward: 0
Step: 4366, Action: 2, Reward: 0
Step: 4367, Action: 0, Reward: 0
Step: 4368, Action: 0, Reward: 0
Step: 4369, Action: 2, Reward: 0
Step: 4370, Action: 2, Reward: 0
Step: 4371, Action: 0, Reward: 0
Step: 4372, Action: 0, Reward: 0
Step: 4373, Action: 2, Reward: 0
Step: 4374, Action: 2, Reward: 0
Step: 4375, Action: 2, Reward: 0
Step: 4376, Action: 2, Reward: 0
Step: 4377, Action: 2, Reward: 0
Step: 4378, Action: 2, Reward: 0
Step: 4379, Action: 2, Reward: 0
Step: 4380, Action: 2, Reward: 0
Step: 4381, Action: 2, Reward: 0
Step: 4382, Action: 2, Reward: 0
Step: 4383, Action: 2, Reward: 0
Step: 4384, Action: 2, Reward: 0
Step: 4385, Action: 2, Reward: 0
Step: 4386, Action: 2, Reward: 0
Step: 4387, Action: 0, Reward: 0
Step: 4388, Action: 2, Reward: 0
Step: 4389, Action: 2, Reward: 0
Step: 4390, Action: 2, Reward: 0
Step: 4391, Action: 0, Reward: 0
Step: 4392, Action: 0, Reward: 0
Step: 4393

Step: 4661, Action: 2, Reward: 0
Step: 4662, Action: 2, Reward: 0
Step: 4663, Action: 2, Reward: 0
Step: 4664, Action: 2, Reward: 0
Step: 4665, Action: 2, Reward: 0
Step: 4666, Action: 0, Reward: 0
Step: 4667, Action: 2, Reward: 0
Step: 4668, Action: 2, Reward: 0
Step: 4669, Action: 2, Reward: 0
Step: 4670, Action: 0, Reward: 0
Step: 4671, Action: 2, Reward: 0
Step: 4672, Action: 2, Reward: 0
Step: 4673, Action: 2, Reward: 0
Step: 4674, Action: 2, Reward: 0
Step: 4675, Action: 2, Reward: 0
Step: 4676, Action: 2, Reward: 0
Step: 4677, Action: 2, Reward: 0
Step: 4678, Action: 2, Reward: 0
Step: 4679, Action: 2, Reward: 0
Step: 4680, Action: 2, Reward: 0
Step: 4681, Action: 2, Reward: 0
Step: 4682, Action: 2, Reward: 0
Step: 4683, Action: 0, Reward: 0
Step: 4684, Action: 0, Reward: 0
Step: 4685, Action: 2, Reward: 0
Step: 4686, Action: 2, Reward: 0
Step: 4687, Action: 0, Reward: 0
Step: 4688, Action: 2, Reward: 0
Step: 4689, Action: 2, Reward: 0
Step: 4690, Action: 2, Reward: 0
Step: 4691

Step: 185, Action: 0, Reward: 0
Step: 186, Action: 1, Reward: 0.6322257179448825
Step: 187, Action: 0, Reward: 0
Step: 188, Action: 1, Reward: 0
Step: 189, Action: 1, Reward: 0
Step: 190, Action: 1, Reward: 0
Step: 191, Action: 1, Reward: 0
Step: 192, Action: 0, Reward: 0
Step: 193, Action: 1, Reward: 0
Step: 194, Action: 1, Reward: 0
Step: 195, Action: 1, Reward: 0
Step: 196, Action: 1, Reward: 0
Step: 197, Action: 0, Reward: 0
Step: 198, Action: 1, Reward: 0
Step: 199, Action: 1, Reward: 0
Step: 200, Action: 1, Reward: 0
Step: 201, Action: 1, Reward: 0
Step: 202, Action: 0, Reward: 0
Step: 203, Action: 0, Reward: 0
Step: 204, Action: 1, Reward: 0
Step: 205, Action: 0, Reward: 0
Step: 206, Action: 0, Reward: 0
Step: 207, Action: 0, Reward: 0
Step: 208, Action: 1, Reward: 0
Step: 209, Action: 1, Reward: 0
Step: 210, Action: 1, Reward: 0
Step: 211, Action: 1, Reward: 0
Step: 212, Action: 1, Reward: 0
Step: 213, Action: 0, Reward: 0
Step: 214, Action: 0, Reward: 0
Step: 215, Action: 1, R

Step: 485, Action: 1, Reward: 0
Step: 486, Action: 1, Reward: 0
Step: 487, Action: 1, Reward: 0
Step: 488, Action: 1, Reward: 0
Step: 489, Action: 1, Reward: 0
Step: 490, Action: 1, Reward: 0
Step: 491, Action: 1, Reward: 0
Step: 492, Action: 1, Reward: 0
Step: 493, Action: 1, Reward: 0
Step: 494, Action: 1, Reward: 0
Step: 495, Action: 1, Reward: 0
Step: 496, Action: 1, Reward: 0
Step: 497, Action: 1, Reward: 0
Step: 498, Action: 1, Reward: 0
Step: 499, Action: 1, Reward: 0
Step: 500, Action: 1, Reward: 0
Step: 501, Action: 1, Reward: 0
Step: 502, Action: 1, Reward: 0
Step: 503, Action: 1, Reward: 0
Step: 504, Action: 1, Reward: 0
Step: 505, Action: 1, Reward: 0
Step: 506, Action: 1, Reward: 0
Step: 507, Action: 1, Reward: 0
Step: 508, Action: 1, Reward: 0
Step: 509, Action: 1, Reward: 0
Step: 510, Action: 1, Reward: 0
Step: 511, Action: 1, Reward: 0
Step: 512, Action: 1, Reward: 0
Step: 513, Action: 2, Reward: 0.1241035668558476
Step: 514, Action: 2, Reward: 0
Step: 515, Action: 0, R

Step: 791, Action: 1, Reward: 0
Step: 792, Action: 1, Reward: 0
Step: 793, Action: 1, Reward: 0
Step: 794, Action: 1, Reward: 0
Step: 795, Action: 1, Reward: 0
Step: 796, Action: 1, Reward: 0
Step: 797, Action: 1, Reward: 0
Step: 798, Action: 1, Reward: 0
Step: 799, Action: 1, Reward: 0
Step: 800, Action: 1, Reward: 0
Step: 801, Action: 1, Reward: 0
Step: 802, Action: 1, Reward: 0
Step: 803, Action: 1, Reward: 0
Step: 804, Action: 1, Reward: 0
Step: 805, Action: 1, Reward: 0
Step: 806, Action: 1, Reward: 0
Step: 807, Action: 1, Reward: 0
Step: 808, Action: 1, Reward: 0
Step: 809, Action: 1, Reward: 0
Step: 810, Action: 1, Reward: 0
Step: 811, Action: 1, Reward: 0
Step: 812, Action: 1, Reward: 0
Step: 813, Action: 1, Reward: 0
Step: 814, Action: 1, Reward: 0
Step: 815, Action: 1, Reward: 0
Step: 816, Action: 1, Reward: 0
Step: 817, Action: 1, Reward: 0
Step: 818, Action: 1, Reward: 0
Step: 819, Action: 1, Reward: 0
Step: 820, Action: 1, Reward: 0
Step: 821, Action: 1, Reward: 0
Step: 82

Step: 1096, Action: 1, Reward: 0
Step: 1097, Action: 1, Reward: 0
Step: 1098, Action: 1, Reward: 0
Step: 1099, Action: 1, Reward: 0
Step: 1100, Action: 1, Reward: 0
Step: 1101, Action: 1, Reward: 0
Step: 1102, Action: 1, Reward: 0
Step: 1103, Action: 1, Reward: 0
Step: 1104, Action: 1, Reward: 0
Step: 1105, Action: 1, Reward: 0
Step: 1106, Action: 1, Reward: 0
Step: 1107, Action: 1, Reward: 0
Step: 1108, Action: 1, Reward: 0
Step: 1109, Action: 1, Reward: 0
Step: 1110, Action: 1, Reward: 0
Step: 1111, Action: 1, Reward: 0
Step: 1112, Action: 1, Reward: 0
Step: 1113, Action: 1, Reward: 0
Step: 1114, Action: 1, Reward: 0
Step: 1115, Action: 1, Reward: 0
Step: 1116, Action: 1, Reward: 0
Step: 1117, Action: 1, Reward: 0
Step: 1118, Action: 1, Reward: 0
Step: 1119, Action: 1, Reward: 0
Step: 1120, Action: 1, Reward: 0
Step: 1121, Action: 1, Reward: 0
Step: 1122, Action: 1, Reward: 0
Step: 1123, Action: 1, Reward: 0
Step: 1124, Action: 1, Reward: 0
Step: 1125, Action: 1, Reward: 0
Step: 1126

Step: 1408, Action: 1, Reward: 0
Step: 1409, Action: 1, Reward: 0
Step: 1410, Action: 1, Reward: 0
Step: 1411, Action: 1, Reward: 0
Step: 1412, Action: 1, Reward: 0
Step: 1413, Action: 1, Reward: 0
Step: 1414, Action: 1, Reward: 0
Step: 1415, Action: 1, Reward: 0
Step: 1416, Action: 1, Reward: 0
Step: 1417, Action: 1, Reward: 0
Step: 1418, Action: 1, Reward: 0
Step: 1419, Action: 1, Reward: 0
Step: 1420, Action: 1, Reward: 0
Step: 1421, Action: 1, Reward: 0
Step: 1422, Action: 1, Reward: 0
Step: 1423, Action: 1, Reward: 0
Step: 1424, Action: 1, Reward: 0
Step: 1425, Action: 1, Reward: 0
Step: 1426, Action: 1, Reward: 0
Step: 1427, Action: 1, Reward: 0
Step: 1428, Action: 1, Reward: 0
Step: 1429, Action: 1, Reward: 0
Step: 1430, Action: 1, Reward: 0
Step: 1431, Action: 1, Reward: 0
Step: 1432, Action: 1, Reward: 0
Step: 1433, Action: 1, Reward: 0
Step: 1434, Action: 1, Reward: 0
Step: 1435, Action: 1, Reward: 0
Step: 1436, Action: 1, Reward: 0
Step: 1437, Action: 1, Reward: 0
Step: 1438

Step: 1715, Action: 1, Reward: 0
Step: 1716, Action: 1, Reward: 0
Step: 1717, Action: 1, Reward: 0
Step: 1718, Action: 1, Reward: 0
Step: 1719, Action: 1, Reward: 0
Step: 1720, Action: 1, Reward: 0
Step: 1721, Action: 1, Reward: 0
Step: 1722, Action: 1, Reward: 0
Step: 1723, Action: 1, Reward: 0
Step: 1724, Action: 1, Reward: 0
Step: 1725, Action: 1, Reward: 0
Step: 1726, Action: 1, Reward: 0
Step: 1727, Action: 1, Reward: 0
Step: 1728, Action: 1, Reward: 0
Step: 1729, Action: 1, Reward: 0
Step: 1730, Action: 1, Reward: 0
Step: 1731, Action: 1, Reward: 0
Step: 1732, Action: 1, Reward: 0
Step: 1733, Action: 1, Reward: 0
Step: 1734, Action: 1, Reward: 0
Step: 1735, Action: 1, Reward: 0
Step: 1736, Action: 1, Reward: 0
Step: 1737, Action: 1, Reward: 0
Step: 1738, Action: 1, Reward: 0
Step: 1739, Action: 1, Reward: 0
Step: 1740, Action: 1, Reward: 0
Step: 1741, Action: 1, Reward: 0
Step: 1742, Action: 1, Reward: 0
Step: 1743, Action: 1, Reward: 0
Step: 1744, Action: 1, Reward: 0
Step: 1745

Step: 2007, Action: 1, Reward: 0
Step: 2008, Action: 1, Reward: 0
Step: 2009, Action: 1, Reward: 0
Step: 2010, Action: 1, Reward: 0
Step: 2011, Action: 1, Reward: 0
Step: 2012, Action: 1, Reward: 0
Step: 2013, Action: 1, Reward: 0
Step: 2014, Action: 1, Reward: 0
Step: 2015, Action: 1, Reward: 0
Step: 2016, Action: 1, Reward: 0
Step: 2017, Action: 1, Reward: 0
Step: 2018, Action: 1, Reward: 0
Step: 2019, Action: 1, Reward: 0
Step: 2020, Action: 1, Reward: 0
Step: 2021, Action: 1, Reward: 0
Step: 2022, Action: 1, Reward: 0
Step: 2023, Action: 1, Reward: 0
Step: 2024, Action: 1, Reward: 0
Step: 2025, Action: 1, Reward: 0
Step: 2026, Action: 1, Reward: 0
Step: 2027, Action: 1, Reward: 0
Step: 2028, Action: 1, Reward: 0
Step: 2029, Action: 1, Reward: 0
Step: 2030, Action: 1, Reward: 0
Step: 2031, Action: 1, Reward: 0
Step: 2032, Action: 1, Reward: 0
Step: 2033, Action: 1, Reward: 0
Step: 2034, Action: 1, Reward: 0
Step: 2035, Action: 1, Reward: 0
Step: 2036, Action: 1, Reward: 0
Step: 2037

Step: 2309, Action: 1, Reward: 0
Step: 2310, Action: 1, Reward: 0
Step: 2311, Action: 1, Reward: 0
Step: 2312, Action: 1, Reward: 0
Step: 2313, Action: 1, Reward: 0
Step: 2314, Action: 1, Reward: 0
Step: 2315, Action: 1, Reward: 0
Step: 2316, Action: 1, Reward: 0
Step: 2317, Action: 1, Reward: 0
Step: 2318, Action: 1, Reward: 0
Step: 2319, Action: 1, Reward: 0
Step: 2320, Action: 1, Reward: 0
Step: 2321, Action: 1, Reward: 0
Step: 2322, Action: 1, Reward: 0

--- Agent Report ---
Agent starts with 0 holdings (neutral position), Initial Balance: 10000
Agent sells (short) at 10.15, Current Balance: 10000, Holdings: 1 Short
Agent closes short at 8.8, profit: 1.3499999999999996, Current Balance: 10001.35, Holdings: 0
Agent buys at 8.7, Current Balance: 10001.35, Holdings: 1 Long
Agent closes long at 8.3, profit: -0.3999999999999986, Current Balance: 10000.95, Holdings: 0
Agent buys at 8.27, Current Balance: 10000.95, Holdings: 1 Long
Agent closes long at 8.27, profit: 0.0, Current Balance: 

In [8]:
#ensemble learning single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO, DQN, A2C
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Ensemble model function
def ensemble_predict(actions):
    # Convert numpy arrays to integers for each action
    actions = [int(action) for action in actions]
    # Perform a majority vote among the actions (hold, buy, sell)
    action_counts = Counter(actions)
    return action_counts.most_common(1)[0][0]

# Train and evaluate the ensemble model
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'LPL_TRAINING.csv'
    test_file = 'LPL_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize each model and train them separately
    ppo_model = PPO("MlpPolicy", env_train, verbose=1)
    dqn_model = DQN("MlpPolicy", env_train, verbose=1)
    a2c_model = A2C("MlpPolicy", env_train, verbose=1)

    # Train each model
    ppo_model.learn(total_timesteps=50000)
    dqn_model.learn(total_timesteps=50000)
    a2c_model.learn(total_timesteps=50000)

    # Test the ensemble model on the training data
    obs, _ = env_train.reset()
    done = False
    while not done:
        # Get predictions from each model
        ppo_action, _ = ppo_model.predict(obs)
        dqn_action, _ = dqn_model.predict(obs)
        a2c_action, _ = a2c_model.predict(obs)

        # Aggregate the actions through majority voting
        final_action = ensemble_predict([ppo_action, dqn_action, a2c_action])

        # Step the environment with the final action
        obs, reward, done, truncated, info = env_train.step(final_action)

    # Calculate and display training metrics
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Test the ensemble model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        # Get predictions from each model
        ppo_action, _ = ppo_model.predict(obs)
        dqn_action, _ = dqn_model.predict(obs)
        a2c_action, _ = a2c_model.predict(obs)

        # Aggregate the actions through majority voting
        final_action = ensemble_predict([ppo_action, dqn_action, a2c_action])

        # Step the environment with the final action
        obs, reward, done, truncated, info = env_test.step(final_action)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display testing metrics
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1623 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1389        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012837998 |
|    clip_fraction        | 0.115       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -20.9       |
|   