In [1]:
#ppo algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Train and evaluate the model with all metrics
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'MSFT_TRAINING.csv'
    test_file = 'MSFT_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the PPO model and train
    model = PPO("MlpPolicy", env_train, verbose=1)
    model.learn(total_timesteps=100000)

    # Test the model on the training data
    obs, _ = env_train.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_train.step(action)

    # Generate report for the training session
    env_train.generate_report()

    # Calculate and display metrics for the training period
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Test the model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_test.step(action)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display metrics for the testing period
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1391 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1066        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015277362 |
|    clip_fraction        | 0.215       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -18.8       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0386     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0146     |
|    value_loss         

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05e+03     |
|    ep_rew_mean          | 3.17         |
| time/                   |              |
|    fps                  | 930          |
|    iterations           | 12           |
|    time_elapsed         | 26           |
|    total_timesteps      | 24576        |
| train/                  |              |
|    approx_kl            | 0.0073110294 |
|    clip_fraction        | 0.066        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.984       |
|    explained_variance   | -0.123       |
|    learning_rate        | 0.0003       |
|    loss                 | 0.0017       |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.0102      |
|    value_loss           | 0.00182      |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05e+03     |
|    ep_rew_mean          | 6.11         |
| time/                   |              |
|    fps                  | 903          |
|    iterations           | 22           |
|    time_elapsed         | 49           |
|    total_timesteps      | 45056        |
| train/                  |              |
|    approx_kl            | 0.0103186015 |
|    clip_fraction        | 0.092        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.834       |
|    explained_variance   | 0.152        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00576     |
|    n_updates            | 210          |
|    policy_gradient_loss | -0.00934     |
|    value_loss           | 0.00377      |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 12.9        |
| time/                   |             |
|    fps                  | 879         |
|    iterations           | 32          |
|    time_elapsed         | 74          |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.008076161 |
|    clip_fraction        | 0.0958      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.844      |
|    explained_variance   | 0.76        |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00338     |
|    n_updates            | 310         |
|    policy_gradient_loss | -0.00435    |
|    value_loss           | 0.00185     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 16.7        |
| time/                   |             |
|    fps                  | 842         |
|    iterations           | 42          |
|    time_elapsed         | 102         |
|    total_timesteps      | 86016       |
| train/                  |             |
|    approx_kl            | 0.010348206 |
|    clip_fraction        | 0.0982      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.593      |
|    explained_variance   | 0.402       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00376    |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.00489    |
|    value_loss           | 0.0043      |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+


--- Agent Report ---
Agent starts with 0 holdings (neutral position), Initial Balance: 10000
Agent sells (short) at 336.7, Current Balance: 10000, Holdings: 1 Short
Agent closes short at 332.17, profit: 4.529999999999973, Current Balance: 10004.53, Holdings: 0
Agent sells (short) at 334.74, Current Balance: 10004.53, Holdings: 1 Short
Agent closes short at 334.89, profit: -0.14999999999997726, Current Balance: 10004.380000000001, Holdings: 0
Agent sells (short) at 334.65, Current Balance: 10004.380000000001, Holdings: 1 Short
Agent closes short at 328.45, profit: 6.199999999999989, Current Balance: 10010.580000000002, Holdings: 0
Agent sells (short) at 328.28, Current Balance: 10010.580000000002, Holdings: 1 Short
Agent closes short at 316.11, profit: 12.169999999999959, Current Balance: 10022.750000000002, Holdings: 0
Agent sells (short) at 316.34, Current Balance: 10022.750000000002, Holdings: 1 Short
Agent closes short at 314.78, profit: 1.5600000000000023, Current Balance: 10024.3

In [2]:
#dqn algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Train and evaluate the model with all metrics
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'MSFT_TRAINING.csv'
    test_file = 'MSFT_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the DQN model and train
    model = DQN("MlpPolicy", env_train, verbose=1)
    model.learn(total_timesteps=100000)

    # Test the model on the training data
    obs, _ = env_train.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_train.step(action)

    # Generate report for the training session
    env_train.generate_report()

    # Calculate and display metrics for the training period
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Test the model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_test.step(action)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display metrics for the testing period
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.05e+03 |
|    ep_rew_mean      | 6.32     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 818      |
|    time_elapsed     | 39       |
|    total_timesteps  | 32192    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000102 |
|    n_updates        | 8022     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.05e+03 |
|    ep_rew_mean      | 4.9      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 845      |
|    time_elapsed     | 76       |
|    total_timesteps  | 64384    |
| train/              |        


--- Agent Report ---
Agent starts with 0 holdings (neutral position), Initial Balance: 10000
Agent sells (short) at 336.7, Current Balance: 10000, Holdings: 1 Short
Agent closes short at 316.39, profit: 20.310000000000002, Current Balance: 10020.31, Holdings: 0
Agent sells (short) at 314.94, Current Balance: 10020.31, Holdings: 1 Short
Agent closes short at 317.71, profit: -2.769999999999982, Current Balance: 10017.539999999999, Holdings: 0
Agent buys at 320.31, Current Balance: 10017.539999999999, Holdings: 1 Long
Agent closes long at 318.71, profit: -1.6000000000000227, Current Balance: 10015.939999999999, Holdings: 0
Agent buys at 313.6243, Current Balance: 10015.939999999999, Holdings: 1 Long
Agent closes long at 304.94, profit: -8.684300000000007, Current Balance: 10007.255699999998, Holdings: 0
Agent sells (short) at 305.7, Current Balance: 10007.255699999998, Holdings: 1 Short
Agent closes short at 308.31, profit: -2.6100000000000136, Current Balance: 10004.645699999997, Holdin

In [3]:
#a2c algorithm single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import A2C
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate additional metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Train and evaluate the model with action logging for debugging
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'MSFT_TRAINING.csv'
    test_file = 'MSFT_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize the A2C model and train with more timesteps
    model = A2C("MlpPolicy", env_train, verbose=1)
    model.learn(total_timesteps=500000)  # Increased timesteps

    # Testing on the training data with action logging
    obs, _ = env_train.reset()
    done = False
    print("\n--- Training Session ---")
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_train.step(action)
        # Log each action and reward for debugging
        print(f"Step: {env_train.current_step}, Action: {action}, Reward: {reward}")

    # Generate report and metrics for the training session
    env_train.generate_report()
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Testing on the testing data with action logging
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    print("\n--- Testing Session ---")
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, truncated, info = env_test.step(action)
        # Log each action and reward for debugging
        print(f"Step: {env_test.current_step}, Action: {action}, Reward: {reward}")

    # Generate report and metrics for the testing session
    env_test.generate_report()
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")



# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 801      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | -6.65    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0328  |
|    value_loss         | 0.00257  |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 805      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.06    |
|    explained_variance | 0.0322   |
|    learning_rate      | 0.0007   |
|    n_updates    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.866    |
| time/                 |           |
|    fps                | 699       |
|    iterations         | 1700      |
|    time_elapsed       | 12        |
|    total_timesteps    | 8500      |
| train/                |           |
|    entropy_loss       | -0.718    |
|    explained_variance | -2.12e+05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1699      |
|    policy_loss        | 0.0429    |
|    value_loss         | 0.00271   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.866    |
| time/                 |           |
|    fps                | 699       |
|    iterations         | 1800      |
|    time_elapsed       | 12        |
|    total_timesteps    | 9000      |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.866   |
| time/                 |          |
|    fps                | 705      |
|    iterations         | 3000     |
|    time_elapsed       | 21       |
|    total_timesteps    | 15000    |
| train/                |          |
|    entropy_loss       | -0.887   |
|    explained_variance | 0.0128   |
|    learning_rate      | 0.0007   |
|    n_updates          | 2999     |
|    policy_loss        | 0.00806  |
|    value_loss         | 0.000385 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.866   |
| time/                 |          |
|    fps                | 689      |
|    iterations         | 3100     |
|    time_elapsed       | 22       |
|    total_timesteps    | 15500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.243   |
| time/                 |          |
|    fps                | 686      |
|    iterations         | 4400     |
|    time_elapsed       | 32       |
|    total_timesteps    | 22000    |
| train/                |          |
|    entropy_loss       | -0.133   |
|    explained_variance | -0.821   |
|    learning_rate      | 0.0007   |
|    n_updates          | 4399     |
|    policy_loss        | 0.00632  |
|    value_loss         | 3.28e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.243    |
| time/                 |           |
|    fps                | 688       |
|    iterations         | 4500      |
|    time_elapsed       | 32        |
|    total_timesteps    | 22500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.0611   |
| time/                 |          |
|    fps                | 712      |
|    iterations         | 5700     |
|    time_elapsed       | 40       |
|    total_timesteps    | 28500    |
| train/                |          |
|    entropy_loss       | -0.601   |
|    explained_variance | -1.24    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5699     |
|    policy_loss        | -0.0305  |
|    value_loss         | 0.0017   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.0611   |
| time/                 |          |
|    fps                | 714      |
|    iterations         | 5800     |
|    time_elapsed       | 40       |
|    total_timesteps    | 29000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.0454   |
| time/                 |           |
|    fps                | 697       |
|    iterations         | 7000      |
|    time_elapsed       | 50        |
|    total_timesteps    | 35000     |
| train/                |           |
|    entropy_loss       | -0.535    |
|    explained_variance | -15.9     |
|    learning_rate      | 0.0007    |
|    n_updates          | 6999      |
|    policy_loss        | -0.000488 |
|    value_loss         | 4.42e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.0454   |
| time/                 |           |
|    fps                | 698       |
|    iterations         | 7100      |
|    time_elapsed       | 50        |
|    total_timesteps    | 35500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.0219    |
| time/                 |           |
|    fps                | 704       |
|    iterations         | 8300      |
|    time_elapsed       | 58        |
|    total_timesteps    | 41500     |
| train/                |           |
|    entropy_loss       | -0.12     |
|    explained_variance | -40       |
|    learning_rate      | 0.0007    |
|    n_updates          | 8299      |
|    policy_loss        | -3.58e-05 |
|    value_loss         | 2.12e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.0219    |
| time/                 |           |
|    fps                | 705       |
|    iterations         | 8400      |
|    time_elapsed       | 59        |
|    total_timesteps    | 42000     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.0219   |
| time/                 |          |
|    fps                | 707      |
|    iterations         | 9600     |
|    time_elapsed       | 67       |
|    total_timesteps    | 48000    |
| train/                |          |
|    entropy_loss       | -0.103   |
|    explained_variance | -32.7    |
|    learning_rate      | 0.0007   |
|    n_updates          | 9599     |
|    policy_loss        | 1.77e-06 |
|    value_loss         | 7.47e-09 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.158    |
| time/                 |          |
|    fps                | 707      |
|    iterations         | 9700     |
|    time_elapsed       | 68       |
|    total_timesteps    | 48500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.158    |
| time/                 |          |
|    fps                | 713      |
|    iterations         | 10900    |
|    time_elapsed       | 76       |
|    total_timesteps    | 54500    |
| train/                |          |
|    entropy_loss       | -0.0409  |
|    explained_variance | -20.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 10899    |
|    policy_loss        | 5.54e-06 |
|    value_loss         | 9.63e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.158     |
| time/                 |           |
|    fps                | 714       |
|    iterations         | 11000     |
|    time_elapsed       | 76        |
|    total_timesteps    | 55000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.283     |
| time/                 |           |
|    fps                | 707       |
|    iterations         | 12200     |
|    time_elapsed       | 86        |
|    total_timesteps    | 61000     |
| train/                |           |
|    entropy_loss       | -0.0842   |
|    explained_variance | -3.63e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 12199     |
|    policy_loss        | -0.00145  |
|    value_loss         | 0.00591   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.283     |
| time/                 |           |
|    fps                | 707       |
|    iterations         | 12300     |
|    time_elapsed       | 86        |
|    total_timesteps    | 61500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.481    |
| time/                 |          |
|    fps                | 699      |
|    iterations         | 13500    |
|    time_elapsed       | 96       |
|    total_timesteps    | 67500    |
| train/                |          |
|    entropy_loss       | -0.537   |
|    explained_variance | -3.36    |
|    learning_rate      | 0.0007   |
|    n_updates          | 13499    |
|    policy_loss        | -0.00026 |
|    value_loss         | 9.12e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.481    |
| time/                 |          |
|    fps                | 699      |
|    iterations         | 13600    |
|    time_elapsed       | 97       |
|    total_timesteps    | 68000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.509     |
| time/                 |           |
|    fps                | 700       |
|    iterations         | 14800     |
|    time_elapsed       | 105       |
|    total_timesteps    | 74000     |
| train/                |           |
|    entropy_loss       | -0.268    |
|    explained_variance | -15.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 14799     |
|    policy_loss        | -2.83e-05 |
|    value_loss         | 4.36e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.509    |
| time/                 |          |
|    fps                | 701      |
|    iterations         | 14900    |
|    time_elapsed       | 106      |
|    total_timesteps    | 74500    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.437     |
| time/                 |           |
|    fps                | 692       |
|    iterations         | 16100     |
|    time_elapsed       | 116       |
|    total_timesteps    | 80500     |
| train/                |           |
|    entropy_loss       | -0.441    |
|    explained_variance | 0.326     |
|    learning_rate      | 0.0007    |
|    n_updates          | 16099     |
|    policy_loss        | -8.38e-05 |
|    value_loss         | 9.37e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.437     |
| time/                 |           |
|    fps                | 692       |
|    iterations         | 16200     |
|    time_elapsed       | 117       |
|    total_timesteps    | 81000     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.437    |
| time/                 |          |
|    fps                | 690      |
|    iterations         | 17400    |
|    time_elapsed       | 126      |
|    total_timesteps    | 87000    |
| train/                |          |
|    entropy_loss       | -0.0192  |
|    explained_variance | -44.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 17399    |
|    policy_loss        | 3.46e-08 |
|    value_loss         | 2.27e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.437     |
| time/                 |           |
|    fps                | 689       |
|    iterations         | 17500     |
|    time_elapsed       | 126       |
|    total_timesteps    | 87500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.569    |
| time/                 |          |
|    fps                | 688      |
|    iterations         | 18700    |
|    time_elapsed       | 135      |
|    total_timesteps    | 93500    |
| train/                |          |
|    entropy_loss       | -0.37    |
|    explained_variance | -160     |
|    learning_rate      | 0.0007   |
|    n_updates          | 18699    |
|    policy_loss        | -0.0278  |
|    value_loss         | 0.00243  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.569    |
| time/                 |          |
|    fps                | 688      |
|    iterations         | 18800    |
|    time_elapsed       | 136      |
|    total_timesteps    | 94000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.567    |
| time/                 |          |
|    fps                | 686      |
|    iterations         | 20000    |
|    time_elapsed       | 145      |
|    total_timesteps    | 100000   |
| train/                |          |
|    entropy_loss       | -0.217   |
|    explained_variance | 0.0404   |
|    learning_rate      | 0.0007   |
|    n_updates          | 19999    |
|    policy_loss        | -0.0275  |
|    value_loss         | 0.0174   |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.567     |
| time/                 |           |
|    fps                | 686       |
|    iterations         | 20100     |
|    time_elapsed       | 146       |
|    total_timesteps    | 100500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.628    |
| time/                 |          |
|    fps                | 686      |
|    iterations         | 21300    |
|    time_elapsed       | 155      |
|    total_timesteps    | 106500   |
| train/                |          |
|    entropy_loss       | -0.342   |
|    explained_variance | -1.23    |
|    learning_rate      | 0.0007   |
|    n_updates          | 21299    |
|    policy_loss        | 7.11e-05 |
|    value_loss         | 7.75e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.628    |
| time/                 |          |
|    fps                | 686      |
|    iterations         | 21400    |
|    time_elapsed       | 155      |
|    total_timesteps    | 107000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.664    |
| time/                 |          |
|    fps                | 679      |
|    iterations         | 22600    |
|    time_elapsed       | 166      |
|    total_timesteps    | 113000   |
| train/                |          |
|    entropy_loss       | -0.00892 |
|    explained_variance | -1.24    |
|    learning_rate      | 0.0007   |
|    n_updates          | 22599    |
|    policy_loss        | 4.43e-08 |
|    value_loss         | 1.75e-08 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.664    |
| time/                 |          |
|    fps                | 679      |
|    iterations         | 22700    |
|    time_elapsed       | 166      |
|    total_timesteps    | 113500   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.664    |
| time/                 |          |
|    fps                | 679      |
|    iterations         | 23900    |
|    time_elapsed       | 175      |
|    total_timesteps    | 119500   |
| train/                |          |
|    entropy_loss       | -0.127   |
|    explained_variance | -25.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 23899    |
|    policy_loss        | 2.94e-07 |
|    value_loss         | 2.16e-08 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.664    |
| time/                 |          |
|    fps                | 679      |
|    iterations         | 24000    |
|    time_elapsed       | 176      |
|    total_timesteps    | 120000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.64     |
| time/                 |          |
|    fps                | 682      |
|    iterations         | 25200    |
|    time_elapsed       | 184      |
|    total_timesteps    | 126000   |
| train/                |          |
|    entropy_loss       | -0.346   |
|    explained_variance | -431     |
|    learning_rate      | 0.0007   |
|    n_updates          | 25199    |
|    policy_loss        | 0.000334 |
|    value_loss         | 1.51e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.64      |
| time/                 |           |
|    fps                | 682       |
|    iterations         | 25300     |
|    time_elapsed       | 185       |
|    total_timesteps    | 126500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.659    |
| time/                 |          |
|    fps                | 686      |
|    iterations         | 26500    |
|    time_elapsed       | 192      |
|    total_timesteps    | 132500   |
| train/                |          |
|    entropy_loss       | -0.0331  |
|    explained_variance | -87.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 26499    |
|    policy_loss        | 1.59e-05 |
|    value_loss         | 8.13e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.659    |
| time/                 |          |
|    fps                | 687      |
|    iterations         | 26600    |
|    time_elapsed       | 193      |
|    total_timesteps    | 133000   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.69      |
| time/                 |           |
|    fps                | 691       |
|    iterations         | 27800     |
|    time_elapsed       | 201       |
|    total_timesteps    | 139000    |
| train/                |           |
|    entropy_loss       | -0.0355   |
|    explained_variance | -4.19     |
|    learning_rate      | 0.0007    |
|    n_updates          | 27799     |
|    policy_loss        | -5.42e-07 |
|    value_loss         | 8.34e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.69      |
| time/                 |           |
|    fps                | 691       |
|    iterations         | 27900     |
|    time_elapsed       | 201       |
|    total_timesteps    | 139500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.724     |
| time/                 |           |
|    fps                | 695       |
|    iterations         | 29100     |
|    time_elapsed       | 209       |
|    total_timesteps    | 145500    |
| train/                |           |
|    entropy_loss       | -0.0235   |
|    explained_variance | -121      |
|    learning_rate      | 0.0007    |
|    n_updates          | 29099     |
|    policy_loss        | -8.78e-07 |
|    value_loss         | 1.55e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.724     |
| time/                 |           |
|    fps                | 696       |
|    iterations         | 29200     |
|    time_elapsed       | 209       |
|    total_timesteps    | 146000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.724     |
| time/                 |           |
|    fps                | 700       |
|    iterations         | 30400     |
|    time_elapsed       | 216       |
|    total_timesteps    | 152000    |
| train/                |           |
|    entropy_loss       | -0.18     |
|    explained_variance | -32.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 30399     |
|    policy_loss        | -2.56e-06 |
|    value_loss         | 1.09e-08  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.724    |
| time/                 |          |
|    fps                | 700      |
|    iterations         | 30500    |
|    time_elapsed       | 217      |
|    total_timesteps    | 152500   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.667    |
| time/                 |          |
|    fps                | 701      |
|    iterations         | 31700    |
|    time_elapsed       | 225      |
|    total_timesteps    | 158500   |
| train/                |          |
|    entropy_loss       | -0.0339  |
|    explained_variance | -161     |
|    learning_rate      | 0.0007   |
|    n_updates          | 31699    |
|    policy_loss        | 5.79e-06 |
|    value_loss         | 2.49e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.667     |
| time/                 |           |
|    fps                | 702       |
|    iterations         | 31800     |
|    time_elapsed       | 226       |
|    total_timesteps    | 159000    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.708    |
| time/                 |          |
|    fps                | 703      |
|    iterations         | 33000    |
|    time_elapsed       | 234      |
|    total_timesteps    | 165000   |
| train/                |          |
|    entropy_loss       | -0.0803  |
|    explained_variance | -319     |
|    learning_rate      | 0.0007   |
|    n_updates          | 32999    |
|    policy_loss        | -2.4e-05 |
|    value_loss         | 2.15e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.708     |
| time/                 |           |
|    fps                | 703       |
|    iterations         | 33100     |
|    time_elapsed       | 235       |
|    total_timesteps    | 165500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.745    |
| time/                 |          |
|    fps                | 703      |
|    iterations         | 34300    |
|    time_elapsed       | 243      |
|    total_timesteps    | 171500   |
| train/                |          |
|    entropy_loss       | -0.286   |
|    explained_variance | -1.29    |
|    learning_rate      | 0.0007   |
|    n_updates          | 34299    |
|    policy_loss        | 1.45e-05 |
|    value_loss         | 1.22e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.745    |
| time/                 |          |
|    fps                | 703      |
|    iterations         | 34400    |
|    time_elapsed       | 244      |
|    total_timesteps    | 172000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.768    |
| time/                 |          |
|    fps                | 707      |
|    iterations         | 35600    |
|    time_elapsed       | 251      |
|    total_timesteps    | 178000   |
| train/                |          |
|    entropy_loss       | -0.162   |
|    explained_variance | 0.0595   |
|    learning_rate      | 0.0007   |
|    n_updates          | 35599    |
|    policy_loss        | 4.4e-05  |
|    value_loss         | 1.54e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.768    |
| time/                 |          |
|    fps                | 707      |
|    iterations         | 35700    |
|    time_elapsed       | 252      |
|    total_timesteps    | 178500   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.768    |
| time/                 |          |
|    fps                | 704      |
|    iterations         | 36900    |
|    time_elapsed       | 261      |
|    total_timesteps    | 184500   |
| train/                |          |
|    entropy_loss       | -0.0276  |
|    explained_variance | -33.8    |
|    learning_rate      | 0.0007   |
|    n_updates          | 36899    |
|    policy_loss        | 3.58e-07 |
|    value_loss         | 8.39e-09 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.768     |
| time/                 |           |
|    fps                | 704       |
|    iterations         | 37000     |
|    time_elapsed       | 262       |
|    total_timesteps    | 185000    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.807    |
| time/                 |          |
|    fps                | 702      |
|    iterations         | 38200    |
|    time_elapsed       | 271      |
|    total_timesteps    | 191000   |
| train/                |          |
|    entropy_loss       | -0.16    |
|    explained_variance | -12.6    |
|    learning_rate      | 0.0007   |
|    n_updates          | 38199    |
|    policy_loss        | 0.000687 |
|    value_loss         | 6.15e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.807     |
| time/                 |           |
|    fps                | 702       |
|    iterations         | 38300     |
|    time_elapsed       | 272       |
|    total_timesteps    | 191500    |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.821    |
| time/                 |          |
|    fps                | 703      |
|    iterations         | 39500    |
|    time_elapsed       | 280      |
|    total_timesteps    | 197500   |
| train/                |          |
|    entropy_loss       | -0.124   |
|    explained_variance | -209     |
|    learning_rate      | 0.0007   |
|    n_updates          | 39499    |
|    policy_loss        | 0.000595 |
|    value_loss         | 0.00143  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.821     |
| time/                 |           |
|    fps                | 703       |
|    iterations         | 39600     |
|    time_elapsed       | 281       |
|    total_timesteps    | 198000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.835     |
| time/                 |           |
|    fps                | 700       |
|    iterations         | 40800     |
|    time_elapsed       | 291       |
|    total_timesteps    | 204000    |
| train/                |           |
|    entropy_loss       | -0.00189  |
|    explained_variance | -133      |
|    learning_rate      | 0.0007    |
|    n_updates          | 40799     |
|    policy_loss        | -2.84e-07 |
|    value_loss         | 3.49e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.835     |
| time/                 |           |
|    fps                | 700       |
|    iterations         | 40900     |
|    time_elapsed       | 291       |
|    total_timesteps    | 204500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.853    |
| time/                 |          |
|    fps                | 698      |
|    iterations         | 42100    |
|    time_elapsed       | 301      |
|    total_timesteps    | 210500   |
| train/                |          |
|    entropy_loss       | -0.00538 |
|    explained_variance | 0.027    |
|    learning_rate      | 0.0007   |
|    n_updates          | 42099    |
|    policy_loss        | 1.74e-07 |
|    value_loss         | 9.87e-08 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.853    |
| time/                 |          |
|    fps                | 698      |
|    iterations         | 42200    |
|    time_elapsed       | 302      |
|    total_timesteps    | 211000   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.853     |
| time/                 |           |
|    fps                | 698       |
|    iterations         | 43400     |
|    time_elapsed       | 310       |
|    total_timesteps    | 217000    |
| train/                |           |
|    entropy_loss       | -0.0148   |
|    explained_variance | -31.3     |
|    learning_rate      | 0.0007    |
|    n_updates          | 43399     |
|    policy_loss        | -9.32e-08 |
|    value_loss         | 2.06e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.871     |
| time/                 |           |
|    fps                | 698       |
|    iterations         | 43500     |
|    time_elapsed       | 311       |
|    total_timesteps    | 217500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.871    |
| time/                 |          |
|    fps                | 693      |
|    iterations         | 44700    |
|    time_elapsed       | 322      |
|    total_timesteps    | 223500   |
| train/                |          |
|    entropy_loss       | -0.00692 |
|    explained_variance | -273     |
|    learning_rate      | 0.0007   |
|    n_updates          | 44699    |
|    policy_loss        | 5.17e-07 |
|    value_loss         | 4.21e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.871    |
| time/                 |          |
|    fps                | 693      |
|    iterations         | 44800    |
|    time_elapsed       | 322      |
|    total_timesteps    | 224000   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.867    |
| time/                 |          |
|    fps                | 694      |
|    iterations         | 46000    |
|    time_elapsed       | 331      |
|    total_timesteps    | 230000   |
| train/                |          |
|    entropy_loss       | -0.0463  |
|    explained_variance | -184     |
|    learning_rate      | 0.0007   |
|    n_updates          | 45999    |
|    policy_loss        | -0.00553 |
|    value_loss         | 0.000215 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.867     |
| time/                 |           |
|    fps                | 694       |
|    iterations         | 46100     |
|    time_elapsed       | 331       |
|    total_timesteps    | 230500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.872     |
| time/                 |           |
|    fps                | 698       |
|    iterations         | 47300     |
|    time_elapsed       | 338       |
|    total_timesteps    | 236500    |
| train/                |           |
|    entropy_loss       | -0.00572  |
|    explained_variance | -4.55     |
|    learning_rate      | 0.0007    |
|    n_updates          | 47299     |
|    policy_loss        | -5.52e-07 |
|    value_loss         | 7.71e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.872    |
| time/                 |          |
|    fps                | 698      |
|    iterations         | 47400    |
|    time_elapsed       | 339      |
|    total_timesteps    | 237000   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.867    |
| time/                 |          |
|    fps                | 703      |
|    iterations         | 48600    |
|    time_elapsed       | 345      |
|    total_timesteps    | 243000   |
| train/                |          |
|    entropy_loss       | -0.00534 |
|    explained_variance | 0.298    |
|    learning_rate      | 0.0007   |
|    n_updates          | 48599    |
|    policy_loss        | 2.2e-07  |
|    value_loss         | 1.31e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.867     |
| time/                 |           |
|    fps                | 703       |
|    iterations         | 48700     |
|    time_elapsed       | 346       |
|    total_timesteps    | 243500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.864     |
| time/                 |           |
|    fps                | 706       |
|    iterations         | 49900     |
|    time_elapsed       | 353       |
|    total_timesteps    | 249500    |
| train/                |           |
|    entropy_loss       | -0.011    |
|    explained_variance | -0.0823   |
|    learning_rate      | 0.0007    |
|    n_updates          | 49899     |
|    policy_loss        | -8.49e-07 |
|    value_loss         | 4.29e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.864     |
| time/                 |           |
|    fps                | 706       |
|    iterations         | 50000     |
|    time_elapsed       | 353       |
|    total_timesteps    | 250000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.864    |
| time/                 |          |
|    fps                | 704      |
|    iterations         | 51200    |
|    time_elapsed       | 363      |
|    total_timesteps    | 256000   |
| train/                |          |
|    entropy_loss       | -0.00488 |
|    explained_variance | -9.02    |
|    learning_rate      | 0.0007   |
|    n_updates          | 51199    |
|    policy_loss        | 1.13e-07 |
|    value_loss         | 5.88e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.864     |
| time/                 |           |
|    fps                | 704       |
|    iterations         | 51300     |
|    time_elapsed       | 364       |
|    total_timesteps    | 256500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.869     |
| time/                 |           |
|    fps                | 706       |
|    iterations         | 52500     |
|    time_elapsed       | 371       |
|    total_timesteps    | 262500    |
| train/                |           |
|    entropy_loss       | -0.0513   |
|    explained_variance | -4.57e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 52499     |
|    policy_loss        | 6.43e-05  |
|    value_loss         | 0.000153  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.869     |
| time/                 |           |
|    fps                | 706       |
|    iterations         | 52600     |
|    time_elapsed       | 372       |
|    total_timesteps    | 263000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.878    |
| time/                 |          |
|    fps                | 708      |
|    iterations         | 53800    |
|    time_elapsed       | 379      |
|    total_timesteps    | 269000   |
| train/                |          |
|    entropy_loss       | -0.00536 |
|    explained_variance | -1.47    |
|    learning_rate      | 0.0007   |
|    n_updates          | 53799    |
|    policy_loss        | 4.84e-07 |
|    value_loss         | 8.29e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.878     |
| time/                 |           |
|    fps                | 708       |
|    iterations         | 53900     |
|    time_elapsed       | 380       |
|    total_timesteps    | 269500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.88      |
| time/                 |           |
|    fps                | 711       |
|    iterations         | 55100     |
|    time_elapsed       | 387       |
|    total_timesteps    | 275500    |
| train/                |           |
|    entropy_loss       | -0.00667  |
|    explained_variance | -0.579    |
|    learning_rate      | 0.0007    |
|    n_updates          | 55099     |
|    policy_loss        | -2.74e-07 |
|    value_loss         | 1.35e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.88     |
| time/                 |          |
|    fps                | 711      |
|    iterations         | 55200    |
|    time_elapsed       | 387      |
|    total_timesteps    | 276000   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.888     |
| time/                 |           |
|    fps                | 715       |
|    iterations         | 56400     |
|    time_elapsed       | 394       |
|    total_timesteps    | 282000    |
| train/                |           |
|    entropy_loss       | -0.00669  |
|    explained_variance | 0.395     |
|    learning_rate      | 0.0007    |
|    n_updates          | 56399     |
|    policy_loss        | -1.88e-07 |
|    value_loss         | 1.45e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.888    |
| time/                 |          |
|    fps                | 715      |
|    iterations         | 56500    |
|    time_elapsed       | 394      |
|    total_timesteps    | 282500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.888     |
| time/                 |           |
|    fps                | 718       |
|    iterations         | 57700     |
|    time_elapsed       | 401       |
|    total_timesteps    | 288500    |
| train/                |           |
|    entropy_loss       | -0.00317  |
|    explained_variance | -18.4     |
|    learning_rate      | 0.0007    |
|    n_updates          | 57699     |
|    policy_loss        | -9.33e-08 |
|    value_loss         | 8.89e-08  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.888    |
| time/                 |          |
|    fps                | 716      |
|    iterations         | 57800    |
|    time_elapsed       | 403      |
|    total_timesteps    | 289000   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.888    |
| time/                 |          |
|    fps                | 719      |
|    iterations         | 59000    |
|    time_elapsed       | 410      |
|    total_timesteps    | 295000   |
| train/                |          |
|    entropy_loss       | -0.00132 |
|    explained_variance | -436     |
|    learning_rate      | 0.0007   |
|    n_updates          | 58999    |
|    policy_loss        | 9.72e-08 |
|    value_loss         | 1.33e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.888     |
| time/                 |           |
|    fps                | 719       |
|    iterations         | 59100     |
|    time_elapsed       | 410       |
|    total_timesteps    | 295500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.891     |
| time/                 |           |
|    fps                | 723       |
|    iterations         | 60300     |
|    time_elapsed       | 416       |
|    total_timesteps    | 301500    |
| train/                |           |
|    entropy_loss       | -0.0454   |
|    explained_variance | -6.24     |
|    learning_rate      | 0.0007    |
|    n_updates          | 60299     |
|    policy_loss        | -6.51e-06 |
|    value_loss         | 7.34e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.891     |
| time/                 |           |
|    fps                | 723       |
|    iterations         | 60400     |
|    time_elapsed       | 417       |
|    total_timesteps    | 302000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.895     |
| time/                 |           |
|    fps                | 726       |
|    iterations         | 61600     |
|    time_elapsed       | 423       |
|    total_timesteps    | 308000    |
| train/                |           |
|    entropy_loss       | -0.0223   |
|    explained_variance | -0.647    |
|    learning_rate      | 0.0007    |
|    n_updates          | 61599     |
|    policy_loss        | -1.03e-06 |
|    value_loss         | 1.32e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.895    |
| time/                 |          |
|    fps                | 727      |
|    iterations         | 61700    |
|    time_elapsed       | 424      |
|    total_timesteps    | 308500   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.905    |
| time/                 |          |
|    fps                | 730      |
|    iterations         | 62900    |
|    time_elapsed       | 430      |
|    total_timesteps    | 314500   |
| train/                |          |
|    entropy_loss       | -0.00364 |
|    explained_variance | 0.458    |
|    learning_rate      | 0.0007   |
|    n_updates          | 62899    |
|    policy_loss        | 3.52e-07 |
|    value_loss         | 7.28e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.905    |
| time/                 |          |
|    fps                | 731      |
|    iterations         | 63000    |
|    time_elapsed       | 430      |
|    total_timesteps    | 315000   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.905     |
| time/                 |           |
|    fps                | 734       |
|    iterations         | 64200     |
|    time_elapsed       | 437       |
|    total_timesteps    | 321000    |
| train/                |           |
|    entropy_loss       | -0.00788  |
|    explained_variance | -4.15     |
|    learning_rate      | 0.0007    |
|    n_updates          | 64199     |
|    policy_loss        | -2.04e-07 |
|    value_loss         | 4.45e-08  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.905    |
| time/                 |          |
|    fps                | 734      |
|    iterations         | 64300    |
|    time_elapsed       | 437      |
|    total_timesteps    | 321500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.909     |
| time/                 |           |
|    fps                | 737       |
|    iterations         | 65500     |
|    time_elapsed       | 443       |
|    total_timesteps    | 327500    |
| train/                |           |
|    entropy_loss       | -0.00728  |
|    explained_variance | -93       |
|    learning_rate      | 0.0007    |
|    n_updates          | 65499     |
|    policy_loss        | -2.18e-06 |
|    value_loss         | 5.91e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.909    |
| time/                 |          |
|    fps                | 738      |
|    iterations         | 65600    |
|    time_elapsed       | 444      |
|    total_timesteps    | 328000   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.913    |
| time/                 |          |
|    fps                | 741      |
|    iterations         | 66800    |
|    time_elapsed       | 450      |
|    total_timesteps    | 334000   |
| train/                |          |
|    entropy_loss       | -0.00093 |
|    explained_variance | -319     |
|    learning_rate      | 0.0007   |
|    n_updates          | 66799    |
|    policy_loss        | 2.36e-07 |
|    value_loss         | 7.68e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.913     |
| time/                 |           |
|    fps                | 741       |
|    iterations         | 66900     |
|    time_elapsed       | 450       |
|    total_timesteps    | 334500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.922     |
| time/                 |           |
|    fps                | 744       |
|    iterations         | 68100     |
|    time_elapsed       | 457       |
|    total_timesteps    | 340500    |
| train/                |           |
|    entropy_loss       | -0.000853 |
|    explained_variance | -6.42     |
|    learning_rate      | 0.0007    |
|    n_updates          | 68099     |
|    policy_loss        | 2.15e-08  |
|    value_loss         | 8.99e-08  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.922    |
| time/                 |          |
|    fps                | 745      |
|    iterations         | 68200    |
|    time_elapsed       | 457      |
|    total_timesteps    | 341000   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.925    |
| time/                 |          |
|    fps                | 748      |
|    iterations         | 69400    |
|    time_elapsed       | 463      |
|    total_timesteps    | 347000   |
| train/                |          |
|    entropy_loss       | -0.00131 |
|    explained_variance | -2.61    |
|    learning_rate      | 0.0007   |
|    n_updates          | 69399    |
|    policy_loss        | 4.39e-09 |
|    value_loss         | 7.12e-09 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.925    |
| time/                 |          |
|    fps                | 748      |
|    iterations         | 69500    |
|    time_elapsed       | 464      |
|    total_timesteps    | 347500   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.925     |
| time/                 |           |
|    fps                | 750       |
|    iterations         | 70700     |
|    time_elapsed       | 471       |
|    total_timesteps    | 353500    |
| train/                |           |
|    entropy_loss       | -0.00265  |
|    explained_variance | -4.88     |
|    learning_rate      | 0.0007    |
|    n_updates          | 70699     |
|    policy_loss        | -8.68e-08 |
|    value_loss         | 1.25e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.925    |
| time/                 |          |
|    fps                | 750      |
|    iterations         | 70800    |
|    time_elapsed       | 471      |
|    total_timesteps    | 354000   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.93     |
| time/                 |          |
|    fps                | 753      |
|    iterations         | 72000    |
|    time_elapsed       | 477      |
|    total_timesteps    | 360000   |
| train/                |          |
|    entropy_loss       | -0.00309 |
|    explained_variance | -38      |
|    learning_rate      | 0.0007   |
|    n_updates          | 71999    |
|    policy_loss        | 3.73e-07 |
|    value_loss         | 1.63e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.93      |
| time/                 |           |
|    fps                | 753       |
|    iterations         | 72100     |
|    time_elapsed       | 478       |
|    total_timesteps    | 360500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.935     |
| time/                 |           |
|    fps                | 756       |
|    iterations         | 73300     |
|    time_elapsed       | 484       |
|    total_timesteps    | 366500    |
| train/                |           |
|    entropy_loss       | -0.0237   |
|    explained_variance | -7.32e+04 |
|    learning_rate      | 0.0007    |
|    n_updates          | 73299     |
|    policy_loss        | 1.03e-05  |
|    value_loss         | 0.000839  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.935    |
| time/                 |          |
|    fps                | 756      |
|    iterations         | 73400    |
|    time_elapsed       | 485      |
|    total_timesteps    | 367000   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.94      |
| time/                 |           |
|    fps                | 759       |
|    iterations         | 74600     |
|    time_elapsed       | 491       |
|    total_timesteps    | 373000    |
| train/                |           |
|    entropy_loss       | -0.000233 |
|    explained_variance | -29.1     |
|    learning_rate      | 0.0007    |
|    n_updates          | 74599     |
|    policy_loss        | 2.21e-08  |
|    value_loss         | 1.6e-06   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.94      |
| time/                 |           |
|    fps                | 759       |
|    iterations         | 74700     |
|    time_elapsed       | 491       |
|    total_timesteps    | 373500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.936     |
| time/                 |           |
|    fps                | 761       |
|    iterations         | 75900     |
|    time_elapsed       | 498       |
|    total_timesteps    | 379500    |
| train/                |           |
|    entropy_loss       | -0.000185 |
|    explained_variance | -0.299    |
|    learning_rate      | 0.0007    |
|    n_updates          | 75899     |
|    policy_loss        | 8.75e-10  |
|    value_loss         | 7.64e-09  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.936     |
| time/                 |           |
|    fps                | 761       |
|    iterations         | 76000     |
|    time_elapsed       | 498       |
|    total_timesteps    | 380000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.936     |
| time/                 |           |
|    fps                | 473       |
|    iterations         | 77200     |
|    time_elapsed       | 814       |
|    total_timesteps    | 386000    |
| train/                |           |
|    entropy_loss       | -0.00348  |
|    explained_variance | -8.52e+04 |
|    learning_rate      | 0.0007    |
|    n_updates          | 77199     |
|    policy_loss        | -3.83e-08 |
|    value_loss         | 1.07e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.938     |
| time/                 |           |
|    fps                | 474       |
|    iterations         | 77300     |
|    time_elapsed       | 815       |
|    total_timesteps    | 386500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.938     |
| time/                 |           |
|    fps                | 430       |
|    iterations         | 78500     |
|    time_elapsed       | 912       |
|    total_timesteps    | 392500    |
| train/                |           |
|    entropy_loss       | -0.00197  |
|    explained_variance | -272      |
|    learning_rate      | 0.0007    |
|    n_updates          | 78499     |
|    policy_loss        | -9.06e-08 |
|    value_loss         | 2.23e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.938     |
| time/                 |           |
|    fps                | 430       |
|    iterations         | 78600     |
|    time_elapsed       | 912       |
|    total_timesteps    | 393000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.937    |
| time/                 |          |
|    fps                | 379      |
|    iterations         | 79800    |
|    time_elapsed       | 1050     |
|    total_timesteps    | 399000   |
| train/                |          |
|    entropy_loss       | -0.127   |
|    explained_variance | -476     |
|    learning_rate      | 0.0007   |
|    n_updates          | 79799    |
|    policy_loss        | 0.000479 |
|    value_loss         | 0.000694 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.937     |
| time/                 |           |
|    fps                | 380       |
|    iterations         | 79900     |
|    time_elapsed       | 1050      |
|    total_timesteps    | 399500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.939     |
| time/                 |           |
|    fps                | 287       |
|    iterations         | 81100     |
|    time_elapsed       | 1409      |
|    total_timesteps    | 405500    |
| train/                |           |
|    entropy_loss       | -0.000163 |
|    explained_variance | -0.322    |
|    learning_rate      | 0.0007    |
|    n_updates          | 81099     |
|    policy_loss        | 1.05e-09  |
|    value_loss         | 8.94e-09  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.939     |
| time/                 |           |
|    fps                | 288       |
|    iterations         | 81200     |
|    time_elapsed       | 1409      |
|    total_timesteps    | 406000    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.939     |
| time/                 |           |
|    fps                | 290       |
|    iterations         | 82400     |
|    time_elapsed       | 1416      |
|    total_timesteps    | 412000    |
| train/                |           |
|    entropy_loss       | -0.000213 |
|    explained_variance | 0.0976    |
|    learning_rate      | 0.0007    |
|    n_updates          | 82399     |
|    policy_loss        | 3.55e-09  |
|    value_loss         | 4.35e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.939     |
| time/                 |           |
|    fps                | 291       |
|    iterations         | 82500     |
|    time_elapsed       | 1417      |
|    total_timesteps    | 412500    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.939    |
| time/                 |          |
|    fps                | 293      |
|    iterations         | 83700    |
|    time_elapsed       | 1424     |
|    total_timesteps    | 418500   |
| train/                |          |
|    entropy_loss       | -0.00359 |
|    explained_variance | -8.66    |
|    learning_rate      | 0.0007   |
|    n_updates          | 83699    |
|    policy_loss        | 1.17e-08 |
|    value_loss         | 9.03e-10 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.939     |
| time/                 |           |
|    fps                | 294       |
|    iterations         | 83800     |
|    time_elapsed       | 1424      |
|    total_timesteps    | 419000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.939     |
| time/                 |           |
|    fps                | 296       |
|    iterations         | 85000     |
|    time_elapsed       | 1432      |
|    total_timesteps    | 425000    |
| train/                |           |
|    entropy_loss       | -0.00164  |
|    explained_variance | -393      |
|    learning_rate      | 0.0007    |
|    n_updates          | 84999     |
|    policy_loss        | -2.39e-09 |
|    value_loss         | 5.21e-09  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.939    |
| time/                 |          |
|    fps                | 296      |
|    iterations         | 85100    |
|    time_elapsed       | 1433     |
|    total_timesteps    | 425500   |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.94     |
| time/                 |          |
|    fps                | 299      |
|    iterations         | 86300    |
|    time_elapsed       | 1439     |
|    total_timesteps    | 431500   |
| train/                |          |
|    entropy_loss       | -0.00277 |
|    explained_variance | -197     |
|    learning_rate      | 0.0007   |
|    n_updates          | 86299    |
|    policy_loss        | 5.14e-06 |
|    value_loss         | 0.000376 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.94      |
| time/                 |           |
|    fps                | 299       |
|    iterations         | 86400     |
|    time_elapsed       | 1440      |
|    total_timesteps    | 432000    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.939     |
| time/                 |           |
|    fps                | 302       |
|    iterations         | 87600     |
|    time_elapsed       | 1449      |
|    total_timesteps    | 438000    |
| train/                |           |
|    entropy_loss       | -0.000425 |
|    explained_variance | -5.13     |
|    learning_rate      | 0.0007    |
|    n_updates          | 87599     |
|    policy_loss        | 1.39e-08  |
|    value_loss         | 5.85e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.939     |
| time/                 |           |
|    fps                | 302       |
|    iterations         | 87700     |
|    time_elapsed       | 1450      |
|    total_timesteps    | 438500    |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.942     |
| time/                 |           |
|    fps                | 304       |
|    iterations         | 88900     |
|    time_elapsed       | 1457      |
|    total_timesteps    | 444500    |
| train/                |           |
|    entropy_loss       | -0.000316 |
|    explained_variance | -2.71     |
|    learning_rate      | 0.0007    |
|    n_updates          | 88899     |
|    policy_loss        | 4.99e-09  |
|    value_loss         | 4.18e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.942     |
| time/                 |           |
|    fps                | 305       |
|    iterations         | 89000     |
|    time_elapsed       | 1458      |
|    total_timesteps    | 445000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.942    |
| time/                 |          |
|    fps                | 307      |
|    iterations         | 90200    |
|    time_elapsed       | 1465     |
|    total_timesteps    | 451000   |
| train/                |          |
|    entropy_loss       | -0.00159 |
|    explained_variance | -0.742   |
|    learning_rate      | 0.0007   |
|    n_updates          | 90199    |
|    policy_loss        | 2.98e-07 |
|    value_loss         | 4.47e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.942    |
| time/                 |          |
|    fps                | 308      |
|    iterations         | 90300    |
|    time_elapsed       | 1465     |
|    total_timesteps    | 451500   |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.942    |
| time/                 |          |
|    fps                | 310      |
|    iterations         | 91500    |
|    time_elapsed       | 1471     |
|    total_timesteps    | 457500   |
| train/                |          |
|    entropy_loss       | -0.0349  |
|    explained_variance | -5.38    |
|    learning_rate      | 0.0007   |
|    n_updates          | 91499    |
|    policy_loss        | 4.62e-07 |
|    value_loss         | 2.12e-08 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.942    |
| time/                 |          |
|    fps                | 311      |
|    iterations         | 91600    |
|    time_elapsed       | 1472     |
|    total_timesteps    | 458000   |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.941     |
| time/                 |           |
|    fps                | 313       |
|    iterations         | 92800     |
|    time_elapsed       | 1478      |
|    total_timesteps    | 464000    |
| train/                |           |
|    entropy_loss       | -0.0271   |
|    explained_variance | -411      |
|    learning_rate      | 0.0007    |
|    n_updates          | 92799     |
|    policy_loss        | -2.82e-06 |
|    value_loss         | 8.45e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.941    |
| time/                 |          |
|    fps                | 314      |
|    iterations         | 92900    |
|    time_elapsed       | 1478     |
|    total_timesteps    | 464500   |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.94      |
| time/                 |           |
|    fps                | 316       |
|    iterations         | 94100     |
|    time_elapsed       | 1484      |
|    total_timesteps    | 470500    |
| train/                |           |
|    entropy_loss       | -0.000266 |
|    explained_variance | -8.1e+03  |
|    learning_rate      | 0.0007    |
|    n_updates          | 94099     |
|    policy_loss        | 2.58e-08  |
|    value_loss         | 1.71e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.94      |
| time/                 |           |
|    fps                | 317       |
|    iterations         | 94200     |
|    time_elapsed       | 1485      |
|    total_timesteps    | 471000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.943    |
| time/                 |          |
|    fps                | 319      |
|    iterations         | 95400    |
|    time_elapsed       | 1491     |
|    total_timesteps    | 477000   |
| train/                |          |
|    entropy_loss       | -0.00044 |
|    explained_variance | -8.08    |
|    learning_rate      | 0.0007   |
|    n_updates          | 95399    |
|    policy_loss        | 4.48e-09 |
|    value_loss         | 3.35e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.943     |
| time/                 |           |
|    fps                | 320       |
|    iterations         | 95500     |
|    time_elapsed       | 1492      |
|    total_timesteps    | 477500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.945     |
| time/                 |           |
|    fps                | 322       |
|    iterations         | 96700     |
|    time_elapsed       | 1498      |
|    total_timesteps    | 483500    |
| train/                |           |
|    entropy_loss       | -0.000553 |
|    explained_variance | -4.69     |
|    learning_rate      | 0.0007    |
|    n_updates          | 96699     |
|    policy_loss        | 1.2e-08   |
|    value_loss         | 1.3e-07   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.945     |
| time/                 |           |
|    fps                | 322       |
|    iterations         | 96800     |
|    time_elapsed       | 1498      |
|    total_timesteps    | 484000    |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.945    |
| time/                 |          |
|    fps                | 325      |
|    iterations         | 98000    |
|    time_elapsed       | 1504     |
|    total_timesteps    | 490000   |
| train/                |          |
|    entropy_loss       | -0.11    |
|    explained_variance | -29      |
|    learning_rate      | 0.0007   |
|    n_updates          | 97999    |
|    policy_loss        | 5.93e-06 |
|    value_loss         | 8.11e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.945     |
| time/                 |           |
|    fps                | 325       |
|    iterations         | 98100     |
|    time_elapsed       | 1505      |
|    total_timesteps    | 490500    |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.951     |
| time/                 |           |
|    fps                | 328       |
|    iterations         | 99300     |
|    time_elapsed       | 1511      |
|    total_timesteps    | 496500    |
| train/                |           |
|    entropy_loss       | -0.0276   |
|    explained_variance | -3.39     |
|    learning_rate      | 0.0007    |
|    n_updates          | 99299     |
|    policy_loss        | -6.37e-06 |
|    value_loss         | 2.28e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.951     |
| time/                 |           |
|    fps                | 328       |
|    iterations         | 99400     |
|    time_elapsed       | 1512      |
|    total_timesteps    | 497000    |
| train/    

Step: 292, Action: 1, Reward: 0
Step: 293, Action: 1, Reward: 0
Step: 294, Action: 1, Reward: 0
Step: 295, Action: 1, Reward: 0
Step: 296, Action: 1, Reward: 0
Step: 297, Action: 1, Reward: 0
Step: 298, Action: 1, Reward: 0
Step: 299, Action: 1, Reward: 0
Step: 300, Action: 1, Reward: 0
Step: 301, Action: 1, Reward: 0
Step: 302, Action: 1, Reward: 0
Step: 303, Action: 1, Reward: 0
Step: 304, Action: 1, Reward: 0
Step: 305, Action: 1, Reward: 0
Step: 306, Action: 1, Reward: 0
Step: 307, Action: 1, Reward: 0
Step: 308, Action: 1, Reward: 0
Step: 309, Action: 1, Reward: 0
Step: 310, Action: 1, Reward: 0
Step: 311, Action: 1, Reward: 0
Step: 312, Action: 1, Reward: 0
Step: 313, Action: 1, Reward: 0
Step: 314, Action: 1, Reward: 0
Step: 315, Action: 1, Reward: 0
Step: 316, Action: 1, Reward: 0
Step: 317, Action: 1, Reward: 0
Step: 318, Action: 1, Reward: 0
Step: 319, Action: 1, Reward: 0
Step: 320, Action: 1, Reward: 0
Step: 321, Action: 1, Reward: 0
Step: 322, Action: 1, Reward: 0
Step: 32

Step: 625, Action: 1, Reward: 0
Step: 626, Action: 1, Reward: 0
Step: 627, Action: 1, Reward: 0
Step: 628, Action: 1, Reward: 0
Step: 629, Action: 1, Reward: 0
Step: 630, Action: 1, Reward: 0
Step: 631, Action: 1, Reward: 0
Step: 632, Action: 1, Reward: 0
Step: 633, Action: 1, Reward: 0
Step: 634, Action: 1, Reward: 0
Step: 635, Action: 1, Reward: 0
Step: 636, Action: 1, Reward: 0
Step: 637, Action: 1, Reward: 0
Step: 638, Action: 1, Reward: 0
Step: 639, Action: 1, Reward: 0
Step: 640, Action: 1, Reward: 0
Step: 641, Action: 1, Reward: 0
Step: 642, Action: 1, Reward: 0
Step: 643, Action: 1, Reward: 0
Step: 644, Action: 1, Reward: 0
Step: 645, Action: 1, Reward: 0
Step: 646, Action: 1, Reward: 0
Step: 647, Action: 1, Reward: 0
Step: 648, Action: 1, Reward: 0
Step: 649, Action: 1, Reward: 0
Step: 650, Action: 1, Reward: 0
Step: 651, Action: 1, Reward: 0
Step: 652, Action: 1, Reward: 0
Step: 653, Action: 1, Reward: 0
Step: 654, Action: 1, Reward: 0
Step: 655, Action: 1, Reward: 0
Step: 65

Step: 1037, Action: 1, Reward: 0
Step: 1038, Action: 1, Reward: 0
Step: 1039, Action: 1, Reward: 0
Step: 1040, Action: 1, Reward: 0
Step: 1041, Action: 1, Reward: 0
Step: 1042, Action: 1, Reward: 0
Step: 1043, Action: 1, Reward: 0
Step: 1044, Action: 1, Reward: 0
Step: 1045, Action: 1, Reward: 0
Step: 1046, Action: 1, Reward: 0
Step: 1047, Action: 1, Reward: 0
Step: 1048, Action: 1, Reward: 0
Step: 1049, Action: 1, Reward: 0
Step: 1050, Action: 1, Reward: 0
Step: 1051, Action: 1, Reward: 0
Step: 1052, Action: 1, Reward: 0
Step: 1053, Action: 1, Reward: 0
Step: 1054, Action: 1, Reward: 0
Step: 1055, Action: 1, Reward: 0
Step: 1056, Action: 1, Reward: 0
Step: 1057, Action: 1, Reward: 0
Step: 1058, Action: 1, Reward: 0
Step: 1059, Action: 1, Reward: 0
Step: 1060, Action: 1, Reward: 0
Step: 1061, Action: 1, Reward: 0
Step: 1062, Action: 1, Reward: 0
Step: 1063, Action: 1, Reward: 0
Step: 1064, Action: 1, Reward: 0
Step: 1065, Action: 1, Reward: 0
Step: 1066, Action: 1, Reward: 0
Step: 1067

Step: 1457, Action: 1, Reward: 0
Step: 1458, Action: 1, Reward: 0
Step: 1459, Action: 1, Reward: 0
Step: 1460, Action: 1, Reward: 0
Step: 1461, Action: 1, Reward: 0
Step: 1462, Action: 1, Reward: 0
Step: 1463, Action: 1, Reward: 0
Step: 1464, Action: 1, Reward: 0
Step: 1465, Action: 1, Reward: 0
Step: 1466, Action: 1, Reward: 0
Step: 1467, Action: 1, Reward: 0
Step: 1468, Action: 1, Reward: 0
Step: 1469, Action: 1, Reward: 0
Step: 1470, Action: 1, Reward: 0
Step: 1471, Action: 1, Reward: 0
Step: 1472, Action: 1, Reward: 0
Step: 1473, Action: 1, Reward: 0
Step: 1474, Action: 1, Reward: 0
Step: 1475, Action: 1, Reward: 0
Step: 1476, Action: 1, Reward: 0
Step: 1477, Action: 1, Reward: 0
Step: 1478, Action: 1, Reward: 0
Step: 1479, Action: 1, Reward: 0
Step: 1480, Action: 1, Reward: 0
Step: 1481, Action: 1, Reward: 0
Step: 1482, Action: 1, Reward: 0
Step: 1483, Action: 1, Reward: 0
Step: 1484, Action: 1, Reward: 0
Step: 1485, Action: 1, Reward: 0
Step: 1486, Action: 1, Reward: 0
Step: 1487

Step: 1871, Action: 1, Reward: 0
Step: 1872, Action: 1, Reward: 0
Step: 1873, Action: 1, Reward: 0
Step: 1874, Action: 1, Reward: 0
Step: 1875, Action: 1, Reward: 0
Step: 1876, Action: 1, Reward: 0
Step: 1877, Action: 1, Reward: 0
Step: 1878, Action: 1, Reward: 0
Step: 1879, Action: 1, Reward: 0
Step: 1880, Action: 1, Reward: 0
Step: 1881, Action: 1, Reward: 0
Step: 1882, Action: 1, Reward: 0
Step: 1883, Action: 1, Reward: 0
Step: 1884, Action: 1, Reward: 0
Step: 1885, Action: 1, Reward: 0
Step: 1886, Action: 1, Reward: 0
Step: 1887, Action: 1, Reward: 0
Step: 1888, Action: 1, Reward: 0
Step: 1889, Action: 1, Reward: 0
Step: 1890, Action: 1, Reward: 0
Step: 1891, Action: 1, Reward: 0
Step: 1892, Action: 1, Reward: 0
Step: 1893, Action: 1, Reward: 0
Step: 1894, Action: 1, Reward: 0
Step: 1895, Action: 1, Reward: 0
Step: 1896, Action: 1, Reward: 0
Step: 1897, Action: 1, Reward: 0
Step: 1898, Action: 1, Reward: 0
Step: 1899, Action: 1, Reward: 0
Step: 1900, Action: 1, Reward: 0
Step: 1901

Step: 2273, Action: 1, Reward: 0
Step: 2274, Action: 1, Reward: 0
Step: 2275, Action: 1, Reward: 0
Step: 2276, Action: 1, Reward: 0
Step: 2277, Action: 1, Reward: 0
Step: 2278, Action: 1, Reward: 0
Step: 2279, Action: 1, Reward: 0
Step: 2280, Action: 1, Reward: 0
Step: 2281, Action: 1, Reward: 0
Step: 2282, Action: 1, Reward: 0
Step: 2283, Action: 1, Reward: 0
Step: 2284, Action: 1, Reward: 0
Step: 2285, Action: 1, Reward: 0
Step: 2286, Action: 1, Reward: 0
Step: 2287, Action: 1, Reward: 0
Step: 2288, Action: 1, Reward: 0
Step: 2289, Action: 1, Reward: 0
Step: 2290, Action: 1, Reward: 0
Step: 2291, Action: 1, Reward: 0
Step: 2292, Action: 1, Reward: 0
Step: 2293, Action: 1, Reward: 0
Step: 2294, Action: 1, Reward: 0
Step: 2295, Action: 1, Reward: 0
Step: 2296, Action: 1, Reward: 0
Step: 2297, Action: 1, Reward: 0
Step: 2298, Action: 1, Reward: 0
Step: 2299, Action: 1, Reward: 0
Step: 2300, Action: 1, Reward: 0
Step: 2301, Action: 1, Reward: 0
Step: 2302, Action: 1, Reward: 0
Step: 2303

Step: 2682, Action: 1, Reward: 0
Step: 2683, Action: 1, Reward: 0
Step: 2684, Action: 1, Reward: 0
Step: 2685, Action: 1, Reward: 0
Step: 2686, Action: 2, Reward: 0.03064423305091521
Step: 2687, Action: 2, Reward: 0
Step: 2688, Action: 1, Reward: -0.01726162494640121
Step: 2689, Action: 1, Reward: 0
Step: 2690, Action: 1, Reward: 0
Step: 2691, Action: 1, Reward: 0
Step: 2692, Action: 1, Reward: 0
Step: 2693, Action: 1, Reward: 0
Step: 2694, Action: 1, Reward: 0
Step: 2695, Action: 1, Reward: 0
Step: 2696, Action: 1, Reward: 0
Step: 2697, Action: 1, Reward: 0
Step: 2698, Action: 1, Reward: 0
Step: 2699, Action: 2, Reward: 0.007951984525870273
Step: 2700, Action: 2, Reward: 0
Step: 2701, Action: 1, Reward: -0.003491115157699315
Step: 2702, Action: 2, Reward: 0
Step: 2703, Action: 2, Reward: 0
Step: 2704, Action: 2, Reward: 0
Step: 2705, Action: 2, Reward: 0
Step: 2706, Action: 0, Reward: 0
Step: 2707, Action: 1, Reward: -0.006400377789115397
Step: 2708, Action: 1, Reward: 0
Step: 2709, A

Step: 3091, Action: 1, Reward: 0
Step: 3092, Action: 1, Reward: 0
Step: 3093, Action: 1, Reward: 0
Step: 3094, Action: 1, Reward: 0
Step: 3095, Action: 1, Reward: 0
Step: 3096, Action: 1, Reward: 0
Step: 3097, Action: 1, Reward: 0
Step: 3098, Action: 1, Reward: 0
Step: 3099, Action: 1, Reward: 0
Step: 3100, Action: 1, Reward: 0
Step: 3101, Action: 1, Reward: 0
Step: 3102, Action: 1, Reward: 0
Step: 3103, Action: 1, Reward: 0
Step: 3104, Action: 1, Reward: 0
Step: 3105, Action: 1, Reward: 0
Step: 3106, Action: 1, Reward: 0
Step: 3107, Action: 1, Reward: 0
Step: 3108, Action: 1, Reward: 0
Step: 3109, Action: 1, Reward: 0
Step: 3110, Action: 1, Reward: 0
Step: 3111, Action: 1, Reward: 0
Step: 3112, Action: 1, Reward: 0
Step: 3113, Action: 1, Reward: 0
Step: 3114, Action: 1, Reward: 0
Step: 3115, Action: 1, Reward: 0
Step: 3116, Action: 1, Reward: 0
Step: 3117, Action: 1, Reward: 0
Step: 3118, Action: 1, Reward: 0
Step: 3119, Action: 1, Reward: 0
Step: 3120, Action: 1, Reward: 0
Step: 3121

Step: 3505, Action: 1, Reward: 0
Step: 3506, Action: 1, Reward: 0
Step: 3507, Action: 1, Reward: 0
Step: 3508, Action: 1, Reward: 0
Step: 3509, Action: 1, Reward: 0
Step: 3510, Action: 1, Reward: 0
Step: 3511, Action: 1, Reward: 0
Step: 3512, Action: 1, Reward: 0
Step: 3513, Action: 1, Reward: 0
Step: 3514, Action: 1, Reward: 0
Step: 3515, Action: 1, Reward: 0
Step: 3516, Action: 1, Reward: 0
Step: 3517, Action: 1, Reward: 0
Step: 3518, Action: 1, Reward: 0
Step: 3519, Action: 1, Reward: 0
Step: 3520, Action: 1, Reward: 0
Step: 3521, Action: 1, Reward: 0
Step: 3522, Action: 1, Reward: 0
Step: 3523, Action: 1, Reward: 0
Step: 3524, Action: 1, Reward: 0
Step: 3525, Action: 1, Reward: 0
Step: 3526, Action: 1, Reward: 0
Step: 3527, Action: 1, Reward: 0
Step: 3528, Action: 1, Reward: 0
Step: 3529, Action: 1, Reward: 0
Step: 3530, Action: 1, Reward: 0
Step: 3531, Action: 1, Reward: 0
Step: 3532, Action: 1, Reward: 0
Step: 3533, Action: 1, Reward: 0
Step: 3534, Action: 1, Reward: 0
Step: 3535

Step: 3915, Action: 1, Reward: 0
Step: 3916, Action: 1, Reward: 0
Step: 3917, Action: 1, Reward: 0
Step: 3918, Action: 1, Reward: 0
Step: 3919, Action: 1, Reward: 0
Step: 3920, Action: 1, Reward: 0
Step: 3921, Action: 1, Reward: 0
Step: 3922, Action: 1, Reward: 0
Step: 3923, Action: 1, Reward: 0
Step: 3924, Action: 1, Reward: 0
Step: 3925, Action: 1, Reward: 0
Step: 3926, Action: 1, Reward: 0
Step: 3927, Action: 1, Reward: 0
Step: 3928, Action: 1, Reward: 0
Step: 3929, Action: 1, Reward: 0
Step: 3930, Action: 1, Reward: 0
Step: 3931, Action: 1, Reward: 0
Step: 3932, Action: 1, Reward: 0
Step: 3933, Action: 1, Reward: 0
Step: 3934, Action: 1, Reward: 0
Step: 3935, Action: 1, Reward: 0
Step: 3936, Action: 1, Reward: 0
Step: 3937, Action: 1, Reward: 0
Step: 3938, Action: 1, Reward: 0
Step: 3939, Action: 1, Reward: 0
Step: 3940, Action: 1, Reward: 0
Step: 3941, Action: 1, Reward: 0
Step: 3942, Action: 1, Reward: 0
Step: 3943, Action: 1, Reward: 0
Step: 3944, Action: 1, Reward: 0
Step: 3945

Step: 4316, Action: 0, Reward: 0
Step: 4317, Action: 0, Reward: 0
Step: 4318, Action: 0, Reward: 0
Step: 4319, Action: 0, Reward: 0
Step: 4320, Action: 2, Reward: 0
Step: 4321, Action: 2, Reward: 0
Step: 4322, Action: 2, Reward: 0
Step: 4323, Action: 2, Reward: 0
Step: 4324, Action: 2, Reward: 0
Step: 4325, Action: 2, Reward: 0
Step: 4326, Action: 2, Reward: 0
Step: 4327, Action: 2, Reward: 0
Step: 4328, Action: 0, Reward: 0
Step: 4329, Action: 0, Reward: 0
Step: 4330, Action: 0, Reward: 0
Step: 4331, Action: 1, Reward: -0.04693610378684422
Step: 4332, Action: 1, Reward: 0
Step: 4333, Action: 1, Reward: 0
Step: 4334, Action: 1, Reward: 0
Step: 4335, Action: 1, Reward: 0
Step: 4336, Action: 2, Reward: -0.009794517525767044
Step: 4337, Action: 2, Reward: 0
Step: 4338, Action: 2, Reward: 0
Step: 4339, Action: 2, Reward: 0
Step: 4340, Action: 2, Reward: 0
Step: 4341, Action: 2, Reward: 0
Step: 4342, Action: 2, Reward: 0
Step: 4343, Action: 2, Reward: 0
Step: 4344, Action: 2, Reward: 0
Step

Step: 4729, Action: 1, Reward: 0
Step: 4730, Action: 1, Reward: 0
Step: 4731, Action: 1, Reward: 0
Step: 4732, Action: 1, Reward: 0
Step: 4733, Action: 1, Reward: 0
Step: 4734, Action: 1, Reward: 0
Step: 4735, Action: 1, Reward: 0
Step: 4736, Action: 2, Reward: -0.042669185260767475
Step: 4737, Action: 2, Reward: 0
Step: 4738, Action: 2, Reward: 0
Step: 4739, Action: 2, Reward: 0
Step: 4740, Action: 2, Reward: 0
Step: 4741, Action: 2, Reward: 0
Step: 4742, Action: 2, Reward: 0
Step: 4743, Action: 2, Reward: 0
Step: 4744, Action: 0, Reward: 0
Step: 4745, Action: 0, Reward: 0
Step: 4746, Action: 0, Reward: 0
Step: 4747, Action: 0, Reward: 0
Step: 4748, Action: 0, Reward: 0
Step: 4749, Action: 0, Reward: 0
Step: 4750, Action: 0, Reward: 0
Step: 4751, Action: 0, Reward: 0
Step: 4752, Action: 2, Reward: 0
Step: 4753, Action: 2, Reward: 0
Step: 4754, Action: 2, Reward: 0
Step: 4755, Action: 2, Reward: 0
Step: 4756, Action: 2, Reward: 0
Step: 4757, Action: 2, Reward: 0
Step: 4758, Action: 2, 

Step: 5133, Action: 2, Reward: 0
Step: 5134, Action: 2, Reward: 0
Step: 5135, Action: 2, Reward: 0
Step: 5136, Action: 2, Reward: 0
Step: 5137, Action: 2, Reward: 0
Step: 5138, Action: 2, Reward: 0
Step: 5139, Action: 2, Reward: 0
Step: 5140, Action: 2, Reward: 0
Step: 5141, Action: 2, Reward: 0
Step: 5142, Action: 2, Reward: 0
Step: 5143, Action: 2, Reward: 0
Step: 5144, Action: 2, Reward: 0
Step: 5145, Action: 2, Reward: 0
Step: 5146, Action: 2, Reward: 0
Step: 5147, Action: 2, Reward: 0
Step: 5148, Action: 2, Reward: 0
Step: 5149, Action: 2, Reward: 0
Step: 5150, Action: 2, Reward: 0
Step: 5151, Action: 2, Reward: 0
Step: 5152, Action: 2, Reward: 0
Step: 5153, Action: 2, Reward: 0
Step: 5154, Action: 2, Reward: 0
Step: 5155, Action: 2, Reward: 0
Step: 5156, Action: 2, Reward: 0
Step: 5157, Action: 2, Reward: 0
Step: 5158, Action: 2, Reward: 0
Step: 5159, Action: 2, Reward: 0
Step: 5160, Action: 2, Reward: 0
Step: 5161, Action: 2, Reward: 0
Step: 5162, Action: 2, Reward: 0
Step: 5163

Step: 5555, Action: 2, Reward: 0
Step: 5556, Action: 2, Reward: 0
Step: 5557, Action: 2, Reward: 0
Step: 5558, Action: 2, Reward: 0
Step: 5559, Action: 2, Reward: 0
Step: 5560, Action: 2, Reward: 0
Step: 5561, Action: 2, Reward: 0
Step: 5562, Action: 2, Reward: 0
Step: 5563, Action: 2, Reward: 0
Step: 5564, Action: 2, Reward: 0
Step: 5565, Action: 2, Reward: 0
Step: 5566, Action: 2, Reward: 0
Step: 5567, Action: 2, Reward: 0
Step: 5568, Action: 2, Reward: 0
Step: 5569, Action: 2, Reward: 0
Step: 5570, Action: 2, Reward: 0
Step: 5571, Action: 2, Reward: 0
Step: 5572, Action: 2, Reward: 0
Step: 5573, Action: 2, Reward: 0
Step: 5574, Action: 2, Reward: 0
Step: 5575, Action: 2, Reward: 0
Step: 5576, Action: 2, Reward: 0
Step: 5577, Action: 2, Reward: 0
Step: 5578, Action: 2, Reward: 0
Step: 5579, Action: 2, Reward: 0
Step: 5580, Action: 2, Reward: 0
Step: 5581, Action: 2, Reward: 0
Step: 5582, Action: 2, Reward: 0
Step: 5583, Action: 2, Reward: 0
Step: 5584, Action: 2, Reward: 0
Step: 5585

Step: 5958, Action: 2, Reward: 0
Step: 5959, Action: 2, Reward: 0
Step: 5960, Action: 2, Reward: 0
Step: 5961, Action: 2, Reward: 0
Step: 5962, Action: 2, Reward: 0
Step: 5963, Action: 2, Reward: 0
Step: 5964, Action: 2, Reward: 0
Step: 5965, Action: 2, Reward: 0
Step: 5966, Action: 2, Reward: 0
Step: 5967, Action: 2, Reward: 0
Step: 5968, Action: 2, Reward: 0
Step: 5969, Action: 2, Reward: 0
Step: 5970, Action: 2, Reward: 0
Step: 5971, Action: 2, Reward: 0
Step: 5972, Action: 2, Reward: 0
Step: 5973, Action: 2, Reward: 0
Step: 5974, Action: 2, Reward: 0
Step: 5975, Action: 2, Reward: 0
Step: 5976, Action: 2, Reward: 0
Step: 5977, Action: 2, Reward: 0
Step: 5978, Action: 2, Reward: 0
Step: 5979, Action: 2, Reward: 0
Step: 5980, Action: 2, Reward: 0
Step: 5981, Action: 2, Reward: 0
Step: 5982, Action: 2, Reward: 0
Step: 5983, Action: 2, Reward: 0
Step: 5984, Action: 2, Reward: 0
Step: 5985, Action: 2, Reward: 0
Step: 5986, Action: 2, Reward: 0
Step: 5987, Action: 2, Reward: 0
Step: 5988

Step: 6380, Action: 2, Reward: 0
Step: 6381, Action: 2, Reward: 0
Step: 6382, Action: 2, Reward: 0
Step: 6383, Action: 2, Reward: 0
Step: 6384, Action: 2, Reward: 0
Step: 6385, Action: 2, Reward: 0
Step: 6386, Action: 2, Reward: 0
Step: 6387, Action: 2, Reward: 0
Step: 6388, Action: 2, Reward: 0
Step: 6389, Action: 2, Reward: 0
Step: 6390, Action: 2, Reward: 0
Step: 6391, Action: 2, Reward: 0
Step: 6392, Action: 2, Reward: 0
Step: 6393, Action: 2, Reward: 0
Step: 6394, Action: 2, Reward: 0
Step: 6395, Action: 2, Reward: 0
Step: 6396, Action: 2, Reward: 0
Step: 6397, Action: 2, Reward: 0
Step: 6398, Action: 2, Reward: 0
Step: 6399, Action: 2, Reward: 0
Step: 6400, Action: 2, Reward: 0
Step: 6401, Action: 2, Reward: 0
Step: 6402, Action: 2, Reward: 0
Step: 6403, Action: 2, Reward: 0
Step: 6404, Action: 2, Reward: 0
Step: 6405, Action: 2, Reward: 0
Step: 6406, Action: 2, Reward: 0
Step: 6407, Action: 2, Reward: 0
Step: 6408, Action: 2, Reward: 0
Step: 6409, Action: 2, Reward: 0
Step: 6410

Step: 6809, Action: 2, Reward: 0
Step: 6810, Action: 2, Reward: 0
Step: 6811, Action: 2, Reward: 0
Step: 6812, Action: 2, Reward: 0
Step: 6813, Action: 2, Reward: 0
Step: 6814, Action: 2, Reward: 0
Step: 6815, Action: 2, Reward: 0
Step: 6816, Action: 2, Reward: 0
Step: 6817, Action: 2, Reward: 0
Step: 6818, Action: 2, Reward: 0
Step: 6819, Action: 2, Reward: 0
Step: 6820, Action: 2, Reward: 0
Step: 6821, Action: 2, Reward: 0
Step: 6822, Action: 2, Reward: 0
Step: 6823, Action: 2, Reward: 0
Step: 6824, Action: 2, Reward: 0
Step: 6825, Action: 2, Reward: 0
Step: 6826, Action: 2, Reward: 0
Step: 6827, Action: 2, Reward: 0
Step: 6828, Action: 2, Reward: 0
Step: 6829, Action: 2, Reward: 0
Step: 6830, Action: 2, Reward: 0
Step: 6831, Action: 2, Reward: 0
Step: 6832, Action: 2, Reward: 0
Step: 6833, Action: 2, Reward: 0
Step: 6834, Action: 2, Reward: 0
Step: 6835, Action: 2, Reward: 0
Step: 6836, Action: 2, Reward: 0
Step: 6837, Action: 2, Reward: 0
Step: 6838, Action: 2, Reward: 0
Step: 6839

Step: 7222, Action: 2, Reward: 0
Step: 7223, Action: 2, Reward: 0
Step: 7224, Action: 2, Reward: 0
Step: 7225, Action: 2, Reward: 0
Step: 7226, Action: 2, Reward: 0
Step: 7227, Action: 2, Reward: 0
Step: 7228, Action: 2, Reward: 0
Step: 7229, Action: 2, Reward: 0
Step: 7230, Action: 2, Reward: 0
Step: 7231, Action: 2, Reward: 0
Step: 7232, Action: 2, Reward: 0
Step: 7233, Action: 2, Reward: 0
Step: 7234, Action: 2, Reward: 0
Step: 7235, Action: 2, Reward: 0
Step: 7236, Action: 2, Reward: 0
Step: 7237, Action: 2, Reward: 0
Step: 7238, Action: 2, Reward: 0
Step: 7239, Action: 2, Reward: 0
Step: 7240, Action: 2, Reward: 0
Step: 7241, Action: 2, Reward: 0
Step: 7242, Action: 2, Reward: 0
Step: 7243, Action: 2, Reward: 0
Step: 7244, Action: 2, Reward: 0
Step: 7245, Action: 2, Reward: 0
Step: 7246, Action: 2, Reward: 0
Step: 7247, Action: 2, Reward: 0
Step: 7248, Action: 2, Reward: 0
Step: 7249, Action: 2, Reward: 0
Step: 7250, Action: 2, Reward: 0
Step: 7251, Action: 2, Reward: 0
Step: 7252

Step: 7648, Action: 2, Reward: 0
Step: 7649, Action: 2, Reward: 0
Step: 7650, Action: 2, Reward: 0
Step: 7651, Action: 2, Reward: 0
Step: 7652, Action: 2, Reward: 0
Step: 7653, Action: 2, Reward: 0
Step: 7654, Action: 2, Reward: 0
Step: 7655, Action: 2, Reward: 0
Step: 7656, Action: 2, Reward: 0
Step: 7657, Action: 2, Reward: 0
Step: 7658, Action: 2, Reward: 0
Step: 7659, Action: 2, Reward: 0
Step: 7660, Action: 2, Reward: 0
Step: 7661, Action: 2, Reward: 0
Step: 7662, Action: 2, Reward: 0
Step: 7663, Action: 2, Reward: 0
Step: 7664, Action: 2, Reward: 0
Step: 7665, Action: 2, Reward: 0
Step: 7666, Action: 2, Reward: 0
Step: 7667, Action: 2, Reward: 0
Step: 7668, Action: 2, Reward: 0
Step: 7669, Action: 2, Reward: 0
Step: 7670, Action: 2, Reward: 0
Step: 7671, Action: 2, Reward: 0
Step: 7672, Action: 2, Reward: 0
Step: 7673, Action: 2, Reward: 0
Step: 7674, Action: 2, Reward: 0
Step: 7675, Action: 2, Reward: 0
Step: 7676, Action: 2, Reward: 0
Step: 7677, Action: 2, Reward: 0
Step: 7678

Step: 3, Action: 2, Reward: 0
Step: 4, Action: 2, Reward: 0
Step: 5, Action: 2, Reward: 0
Step: 6, Action: 2, Reward: 0
Step: 7, Action: 2, Reward: 0
Step: 8, Action: 2, Reward: 0
Step: 9, Action: 0, Reward: 0
Step: 10, Action: 2, Reward: 0
Step: 11, Action: 2, Reward: 0
Step: 12, Action: 2, Reward: 0
Step: 13, Action: 2, Reward: 0
Step: 14, Action: 2, Reward: 0
Step: 15, Action: 2, Reward: 0
Step: 16, Action: 2, Reward: 0
Step: 17, Action: 2, Reward: 0
Step: 18, Action: 2, Reward: 0
Step: 19, Action: 2, Reward: 0
Step: 20, Action: 2, Reward: 0
Step: 21, Action: 2, Reward: 0
Step: 22, Action: 2, Reward: 0
Step: 23, Action: 2, Reward: 0
Step: 24, Action: 2, Reward: 0
Step: 25, Action: 2, Reward: 0
Step: 26, Action: 2, Reward: 0
Step: 27, Action: 2, Reward: 0
Step: 28, Action: 2, Reward: 0
Step: 29, Action: 2, Reward: 0
Step: 30, Action: 2, Reward: 0
Step: 31, Action: 2, Reward: 0
Step: 32, Action: 2, Reward: 0
Step: 33, Action: 2, Reward: 0
Step: 34, Action: 2, Reward: 0
Step: 35, Actio

Step: 427, Action: 2, Reward: 0
Step: 428, Action: 2, Reward: 0
Step: 429, Action: 2, Reward: 0
Step: 430, Action: 2, Reward: 0
Step: 431, Action: 2, Reward: 0
Step: 432, Action: 2, Reward: 0
Step: 433, Action: 2, Reward: 0
Step: 434, Action: 2, Reward: 0
Step: 435, Action: 2, Reward: 0
Step: 436, Action: 2, Reward: 0
Step: 437, Action: 2, Reward: 0
Step: 438, Action: 2, Reward: 0
Step: 439, Action: 2, Reward: 0
Step: 440, Action: 2, Reward: 0
Step: 441, Action: 2, Reward: 0
Step: 442, Action: 2, Reward: 0
Step: 443, Action: 2, Reward: 0
Step: 444, Action: 2, Reward: 0
Step: 445, Action: 2, Reward: 0
Step: 446, Action: 2, Reward: 0
Step: 447, Action: 2, Reward: 0
Step: 448, Action: 2, Reward: 0
Step: 449, Action: 2, Reward: 0
Step: 450, Action: 2, Reward: 0
Step: 451, Action: 2, Reward: 0
Step: 452, Action: 2, Reward: 0
Step: 453, Action: 2, Reward: 0
Step: 454, Action: 2, Reward: 0
Step: 455, Action: 2, Reward: 0
Step: 456, Action: 2, Reward: 0
Step: 457, Action: 2, Reward: 0
Step: 45

Step: 857, Action: 2, Reward: 0
Step: 858, Action: 2, Reward: 0
Step: 859, Action: 2, Reward: 0
Step: 860, Action: 2, Reward: 0
Step: 861, Action: 2, Reward: 0
Step: 862, Action: 2, Reward: 0
Step: 863, Action: 2, Reward: 0
Step: 864, Action: 2, Reward: 0
Step: 865, Action: 2, Reward: 0
Step: 866, Action: 2, Reward: 0
Step: 867, Action: 2, Reward: 0
Step: 868, Action: 2, Reward: 0
Step: 869, Action: 2, Reward: 0
Step: 870, Action: 2, Reward: 0
Step: 871, Action: 2, Reward: 0
Step: 872, Action: 2, Reward: 0
Step: 873, Action: 2, Reward: 0
Step: 874, Action: 2, Reward: 0
Step: 875, Action: 2, Reward: 0
Step: 876, Action: 2, Reward: 0
Step: 877, Action: 2, Reward: 0
Step: 878, Action: 2, Reward: 0
Step: 879, Action: 2, Reward: 0
Step: 880, Action: 2, Reward: 0
Step: 881, Action: 2, Reward: 0
Step: 882, Action: 2, Reward: 0
Step: 883, Action: 2, Reward: 0
Step: 884, Action: 2, Reward: 0
Step: 885, Action: 2, Reward: 0
Step: 886, Action: 2, Reward: 0
Step: 887, Action: 2, Reward: 0
Step: 88

Step: 1292, Action: 2, Reward: 0
Step: 1293, Action: 2, Reward: 0
Step: 1294, Action: 2, Reward: 0
Step: 1295, Action: 2, Reward: 0
Step: 1296, Action: 2, Reward: 0
Step: 1297, Action: 2, Reward: 0
Step: 1298, Action: 2, Reward: 0
Step: 1299, Action: 2, Reward: 0
Step: 1300, Action: 2, Reward: 0
Step: 1301, Action: 2, Reward: 0
Step: 1302, Action: 2, Reward: 0
Step: 1303, Action: 2, Reward: 0
Step: 1304, Action: 2, Reward: 0
Step: 1305, Action: 2, Reward: 0
Step: 1306, Action: 2, Reward: 0
Step: 1307, Action: 2, Reward: 0
Step: 1308, Action: 2, Reward: 0
Step: 1309, Action: 2, Reward: 0
Step: 1310, Action: 2, Reward: 0
Step: 1311, Action: 2, Reward: 0
Step: 1312, Action: 2, Reward: 0
Step: 1313, Action: 2, Reward: 0
Step: 1314, Action: 2, Reward: 0
Step: 1315, Action: 2, Reward: 0
Step: 1316, Action: 2, Reward: 0
Step: 1317, Action: 2, Reward: 0
Step: 1318, Action: 2, Reward: 0
Step: 1319, Action: 2, Reward: 0
Step: 1320, Action: 2, Reward: 0
Step: 1321, Action: 2, Reward: 0
Step: 1322

Step: 1727, Action: 2, Reward: 0
Step: 1728, Action: 2, Reward: 0
Step: 1729, Action: 2, Reward: 0
Step: 1730, Action: 2, Reward: 0
Step: 1731, Action: 2, Reward: 0
Step: 1732, Action: 2, Reward: 0
Step: 1733, Action: 2, Reward: 0
Step: 1734, Action: 2, Reward: 0
Step: 1735, Action: 2, Reward: 0
Step: 1736, Action: 2, Reward: 0
Step: 1737, Action: 2, Reward: 0
Step: 1738, Action: 2, Reward: 0
Step: 1739, Action: 2, Reward: 0
Step: 1740, Action: 2, Reward: 0
Step: 1741, Action: 2, Reward: 0
Step: 1742, Action: 2, Reward: 0
Step: 1743, Action: 2, Reward: 0
Step: 1744, Action: 2, Reward: 0
Step: 1745, Action: 2, Reward: 0
Step: 1746, Action: 2, Reward: 0
Step: 1747, Action: 2, Reward: 0
Step: 1748, Action: 2, Reward: 0
Step: 1749, Action: 2, Reward: 0
Step: 1750, Action: 2, Reward: 0
Step: 1751, Action: 2, Reward: 0
Step: 1752, Action: 2, Reward: 0
Step: 1753, Action: 2, Reward: 0
Step: 1754, Action: 2, Reward: 0
Step: 1755, Action: 2, Reward: 0
Step: 1756, Action: 2, Reward: 0
Step: 1757

Step: 2161, Action: 2, Reward: 0
Step: 2162, Action: 2, Reward: 0
Step: 2163, Action: 2, Reward: 0
Step: 2164, Action: 2, Reward: 0
Step: 2165, Action: 2, Reward: 0
Step: 2166, Action: 2, Reward: 0
Step: 2167, Action: 2, Reward: 0
Step: 2168, Action: 2, Reward: 0
Step: 2169, Action: 2, Reward: 0
Step: 2170, Action: 2, Reward: 0
Step: 2171, Action: 2, Reward: 0
Step: 2172, Action: 2, Reward: 0
Step: 2173, Action: 2, Reward: 0
Step: 2174, Action: 2, Reward: 0
Step: 2175, Action: 2, Reward: 0
Step: 2176, Action: 2, Reward: 0
Step: 2177, Action: 2, Reward: 0
Step: 2178, Action: 2, Reward: 0
Step: 2179, Action: 2, Reward: 0
Step: 2180, Action: 2, Reward: 0
Step: 2181, Action: 2, Reward: 0
Step: 2182, Action: 2, Reward: 0
Step: 2183, Action: 2, Reward: 0
Step: 2184, Action: 2, Reward: 0
Step: 2185, Action: 2, Reward: 0
Step: 2186, Action: 2, Reward: 0
Step: 2187, Action: 2, Reward: 0
Step: 2188, Action: 2, Reward: 0
Step: 2189, Action: 2, Reward: 0
Step: 2190, Action: 2, Reward: 0
Step: 2191

Step: 2572, Action: 2, Reward: 0
Step: 2573, Action: 2, Reward: 0
Step: 2574, Action: 2, Reward: 0
Step: 2575, Action: 2, Reward: 0
Step: 2576, Action: 2, Reward: 0
Step: 2577, Action: 2, Reward: 0
Step: 2578, Action: 2, Reward: 0
Step: 2579, Action: 2, Reward: 0
Step: 2580, Action: 2, Reward: 0
Step: 2581, Action: 2, Reward: 0
Step: 2582, Action: 2, Reward: 0
Step: 2583, Action: 2, Reward: 0
Step: 2584, Action: 2, Reward: 0
Step: 2585, Action: 2, Reward: 0
Step: 2586, Action: 2, Reward: 0
Step: 2587, Action: 2, Reward: 0
Step: 2588, Action: 2, Reward: 0
Step: 2589, Action: 2, Reward: 0
Step: 2590, Action: 2, Reward: 0
Step: 2591, Action: 2, Reward: 0
Step: 2592, Action: 2, Reward: 0
Step: 2593, Action: 2, Reward: 0
Step: 2594, Action: 2, Reward: 0
Step: 2595, Action: 2, Reward: 0
Step: 2596, Action: 2, Reward: 0
Step: 2597, Action: 2, Reward: 0
Step: 2598, Action: 2, Reward: 0
Step: 2599, Action: 2, Reward: 0
Step: 2600, Action: 2, Reward: 0
Step: 2601, Action: 2, Reward: 0
Step: 2602

Step: 2984, Action: 0, Reward: 0
Step: 2985, Action: 0, Reward: 0
Step: 2986, Action: 0, Reward: 0
Step: 2987, Action: 2, Reward: 0
Step: 2988, Action: 2, Reward: 0
Step: 2989, Action: 2, Reward: 0
Step: 2990, Action: 2, Reward: 0
Step: 2991, Action: 2, Reward: 0
Step: 2992, Action: 2, Reward: 0
Step: 2993, Action: 2, Reward: 0
Step: 2994, Action: 2, Reward: 0
Step: 2995, Action: 2, Reward: 0
Step: 2996, Action: 0, Reward: 0
Step: 2997, Action: 0, Reward: 0
Step: 2998, Action: 0, Reward: 0
Step: 2999, Action: 0, Reward: 0
Step: 3000, Action: 0, Reward: 0
Step: 3001, Action: 1, Reward: 1.9746135233630724
Step: 3002, Action: 1, Reward: 0
Step: 3003, Action: 2, Reward: 0.018619280841062423
Step: 3004, Action: 2, Reward: 0
Step: 3005, Action: 2, Reward: 0
Step: 3006, Action: 2, Reward: 0
Step: 3007, Action: 2, Reward: 0
Step: 3008, Action: 2, Reward: 0
Step: 3009, Action: 2, Reward: 0
Step: 3010, Action: 2, Reward: 0
Step: 3011, Action: 2, Reward: 0
Step: 3012, Action: 0, Reward: 0
Step: 3

Step: 3389, Action: 1, Reward: 0
Step: 3390, Action: 1, Reward: 0
Step: 3391, Action: 1, Reward: 0
Step: 3392, Action: 1, Reward: 0
Step: 3393, Action: 1, Reward: 0
Step: 3394, Action: 1, Reward: 0
Step: 3395, Action: 1, Reward: 0
Step: 3396, Action: 1, Reward: 0
Step: 3397, Action: 1, Reward: 0
Step: 3398, Action: 1, Reward: 0
Step: 3399, Action: 1, Reward: 0
Step: 3400, Action: 1, Reward: 0
Step: 3401, Action: 1, Reward: 0
Step: 3402, Action: 1, Reward: 0
Step: 3403, Action: 1, Reward: 0
Step: 3404, Action: 1, Reward: 0
Step: 3405, Action: 1, Reward: 0
Step: 3406, Action: 1, Reward: 0
Step: 3407, Action: 1, Reward: 0
Step: 3408, Action: 1, Reward: 0
Step: 3409, Action: 1, Reward: 0
Step: 3410, Action: 1, Reward: 0
Step: 3411, Action: 1, Reward: 0
Step: 3412, Action: 1, Reward: 0
Step: 3413, Action: 1, Reward: 0
Step: 3414, Action: 1, Reward: 0
Step: 3415, Action: 1, Reward: 0
Step: 3416, Action: 1, Reward: 0
Step: 3417, Action: 1, Reward: 0
Step: 3418, Action: 1, Reward: 0
Step: 3419

Step: 3800, Action: 2, Reward: 0
Step: 3801, Action: 2, Reward: 0
Step: 3802, Action: 2, Reward: 0
Step: 3803, Action: 2, Reward: 0
Step: 3804, Action: 2, Reward: 0
Step: 3805, Action: 2, Reward: 0
Step: 3806, Action: 2, Reward: 0
Step: 3807, Action: 2, Reward: 0
Step: 3808, Action: 2, Reward: 0
Step: 3809, Action: 2, Reward: 0
Step: 3810, Action: 2, Reward: 0
Step: 3811, Action: 2, Reward: 0
Step: 3812, Action: 2, Reward: 0
Step: 3813, Action: 2, Reward: 0
Step: 3814, Action: 2, Reward: 0
Step: 3815, Action: 2, Reward: 0
Step: 3816, Action: 2, Reward: 0
Step: 3817, Action: 2, Reward: 0
Step: 3818, Action: 2, Reward: 0
Step: 3819, Action: 2, Reward: 0
Step: 3820, Action: 2, Reward: 0
Step: 3821, Action: 2, Reward: 0
Step: 3822, Action: 2, Reward: 0
Step: 3823, Action: 2, Reward: 0
Step: 3824, Action: 2, Reward: 0
Step: 3825, Action: 2, Reward: 0
Step: 3826, Action: 2, Reward: 0
Step: 3827, Action: 2, Reward: 0
Step: 3828, Action: 2, Reward: 0
Step: 3829, Action: 2, Reward: 0
Step: 3830

In [4]:
#ensemble learning single agent
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO, DQN, A2C
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.log = []  # Log for detailed reporting
        self.scaler = scaler  # Store the scaler for inverse scaling

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        self.log = []  # Reset log

        # Log initial holdings
        self.log.append(f"Agent starts with 0 holdings (neutral position), Initial Balance: {self.balance}")
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        return obs

    def inverse_scale_price(self, price):
        # Inverse transform the scaled price to get the original value (considering the 'close' column)
        inverse_scaled = self.scaler.inverse_transform([[0, 0, 0, price, 0]])[0][3]
        return inverse_scaled

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        original_price = self.inverse_scale_price(current_price)  # Get original (inverse-scaled) price

        # If agent buys
        if action == 1:
            if self.position == 0:  # Only buy if neutral
                self.position = 1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent buys at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Long")
            elif self.position == -1:  # Close short position
                reward = self.entry_price - current_price  # Scaled reward
                original_reward = self.inverse_scale_price(self.entry_price) - original_price
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes short at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        # If agent sells
        elif action == 2:
            if self.position == 0:  # Only sell if neutral
                self.position = -1
                self.entry_price = current_price
                original_entry_price = self.inverse_scale_price(current_price)
                self.log.append(f"Agent sells (short) at {original_entry_price}, Current Balance: {self.balance}, Holdings: 1 Short")
            elif self.position == 1:  # Close long position
                reward = current_price - self.entry_price  # Scaled reward
                original_reward = original_price - self.inverse_scale_price(self.entry_price)
                self.balance += original_reward
                self.position = 0
                self.trades.append(original_reward)
                self.log.append(f"Agent closes long at {original_price}, profit: {original_reward}, Current Balance: {self.balance}, Holdings: 0")

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

    def generate_report(self):
        print("\n--- Agent Report ---")
        for log in self.log:
            print(log)
        print(f"Final Balance: {self.balance}")
        print(f"Total Profit: {self.balance - self.initial_balance}")
        print(f"Number of Holdings (Long): {1 if self.position == 1 else 0}, Short: {1 if self.position == -1 else 0}")
        print("-" * 40)

# Function to calculate metrics
def calculate_metrics(trades, initial_balance, final_balance):
    # Total Profit
    total_profit = final_balance - initial_balance

    # Cumulative Return
    cumulative_return = (final_balance - initial_balance) / initial_balance

    # Win Rate
    positive_trades = [trade for trade in trades if trade > 0]
    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Profit Factor
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf

    # Sharpe Ratio
    returns = np.array(trades)
    mean_return = np.mean(returns)
    std_return = np.std(returns)
    sharpe_ratio = mean_return / std_return if std_return != 0 else 0

    # Sortino Ratio (using only downside standard deviation)
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = mean_return / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    balance_series = np.cumsum([initial_balance] + trades)  # Series of balance over time
    peak_balance = np.maximum.accumulate(balance_series)
    drawdowns = (peak_balance - balance_series) / peak_balance
    max_drawdown = np.max(drawdowns) if drawdowns.size > 0 else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Ensemble model function
def ensemble_predict(actions):
    # Convert numpy arrays to integers for each action
    actions = [int(action) for action in actions]
    # Perform a majority vote among the actions (hold, buy, sell)
    action_counts = Counter(actions)
    return action_counts.most_common(1)[0][0]

# Train and evaluate the ensemble model
def train_and_evaluate():
    # Load and normalize the data
    train_file = 'MSFT_TRAINING.csv'
    test_file = 'MSFT_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Create the environment using the training data
    env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)

    # Initialize each model and train them separately
    ppo_model = PPO("MlpPolicy", env_train, verbose=1)
    dqn_model = DQN("MlpPolicy", env_train, verbose=1)
    a2c_model = A2C("MlpPolicy", env_train, verbose=1)

    # Train each model
    ppo_model.learn(total_timesteps=50000)
    dqn_model.learn(total_timesteps=50000)
    a2c_model.learn(total_timesteps=50000)

    # Test the ensemble model on the training data
    obs, _ = env_train.reset()
    done = False
    while not done:
        # Get predictions from each model
        ppo_action, _ = ppo_model.predict(obs)
        dqn_action, _ = dqn_model.predict(obs)
        a2c_action, _ = a2c_model.predict(obs)

        # Aggregate the actions through majority voting
        final_action = ensemble_predict([ppo_action, dqn_action, a2c_action])

        # Step the environment with the final action
        obs, reward, done, truncated, info = env_train.step(final_action)

    # Calculate and display training metrics
    training_metrics = calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance)
    print("\n--- Training Metrics ---")
    for metric, value in training_metrics.items():
        print(f"{metric}: {value}")

    # Test the ensemble model on the testing data
    env_test = SingleAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        # Get predictions from each model
        ppo_action, _ = ppo_model.predict(obs)
        dqn_action, _ = dqn_model.predict(obs)
        a2c_action, _ = a2c_model.predict(obs)

        # Aggregate the actions through majority voting
        final_action = ensemble_predict([ppo_action, dqn_action, a2c_action])

        # Step the environment with the final action
        obs, reward, done, truncated, info = env_test.step(final_action)

    # Generate report for the testing session
    env_test.generate_report()

    # Calculate and display testing metrics
    testing_metrics = calculate_metrics(env_test.trades, env_test.initial_balance, env_test.balance)
    print("\n--- Testing Metrics ---")
    for metric, value in testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1540 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1395         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0076965587 |
|    clip_fraction        | 0.0474       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance   | -7.59  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 8.05e+03   |
|    ep_rew_mean          | 3.83       |
| time/                   |            |
|    fps                  | 1229       |
|    iterations           | 12         |
|    time_elapsed         | 19         |
|    total_timesteps      | 24576      |
| train/                  |            |
|    approx_kl            | 0.00782019 |
|    clip_fraction        | 0.0675     |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.02      |
|    explained_variance   | -0.185     |
|    learning_rate        | 0.0003     |
|    loss                 | -0.00783   |
|    n_updates            | 110        |
|    policy_gradient_loss | -0.0111    |
|    value_loss           | 0.00178    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 8.94        |
| time/                   |             |
|    fps                  | 1244        |
|    iterations           | 22          |
|    time_elapsed         | 36          |
|    total_timesteps      | 45056       |
| train/                  |             |
|    approx_kl            | 0.006597498 |
|    clip_fraction        | 0.0979      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.905      |
|    explained_variance   | 0.478       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0206     |
|    n_updates            | 210         |
|    policy_gradient_loss | -0.00631    |
|    value_loss           | 0.00501     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+

------------------------------------
| time/                 |          |
|    fps                | 970      |
|    iterations         | 900      |
|    time_elapsed       | 4        |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -0.969   |
|    explained_variance | -18      |
|    learning_rate      | 0.0007   |
|    n_updates          | 899      |
|    policy_loss        | 0.00845  |
|    value_loss         | 0.000262 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 968       |
|    iterations         | 1000      |
|    time_elapsed       | 5         |
|    total_timesteps    | 5000      |
| train/                |           |
|    entropy_loss       | -0.861    |
|    explained_variance | -1.85e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 999       |
|    policy_loss        | -0.0424   |
|    value_loss         | 

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 1.05      |
| time/                 |           |
|    fps                | 969       |
|    iterations         | 2400      |
|    time_elapsed       | 12        |
|    total_timesteps    | 12000     |
| train/                |           |
|    entropy_loss       | -0.503    |
|    explained_variance | -4.96e+06 |
|    learning_rate      | 0.0007    |
|    n_updates          | 2399      |
|    policy_loss        | -0.00438  |
|    value_loss         | 0.000994  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 1.05     |
| time/                 |          |
|    fps                | 969      |
|    iterations         | 2500     |
|    time_elapsed       | 12       |
|    total_timesteps    | 12500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -1.38    |
| time/                 |          |
|    fps                | 968      |
|    iterations         | 3700     |
|    time_elapsed       | 19       |
|    total_timesteps    | 18500    |
| train/                |          |
|    entropy_loss       | -0.326   |
|    explained_variance | -21.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3699     |
|    policy_loss        | 0.000658 |
|    value_loss         | 0.000128 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -1.38    |
| time/                 |          |
|    fps                | 968      |
|    iterations         | 3800     |
|    time_elapsed       | 19       |
|    total_timesteps    | 19000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -1.17    |
| time/                 |          |
|    fps                | 963      |
|    iterations         | 5000     |
|    time_elapsed       | 25       |
|    total_timesteps    | 25000    |
| train/                |          |
|    entropy_loss       | -0.247   |
|    explained_variance | -1.38    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4999     |
|    policy_loss        | 3.14e-05 |
|    value_loss         | 4.65e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -1.17    |
| time/                 |          |
|    fps                | 963      |
|    iterations         | 5100     |
|    time_elapsed       | 26       |
|    total_timesteps    | 25500    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -1.17     |
| time/                 |           |
|    fps                | 957       |
|    iterations         | 6300      |
|    time_elapsed       | 32        |
|    total_timesteps    | 31500     |
| train/                |           |
|    entropy_loss       | -0.0355   |
|    explained_variance | -22       |
|    learning_rate      | 0.0007    |
|    n_updates          | 6299      |
|    policy_loss        | -7.91e-07 |
|    value_loss         | 6.41e-08  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -1.17    |
| time/                 |          |
|    fps                | 958      |
|    iterations         | 6400     |
|    time_elapsed       | 33       |
|    total_timesteps    | 32000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -1.04    |
| time/                 |          |
|    fps                | 958      |
|    iterations         | 7600     |
|    time_elapsed       | 39       |
|    total_timesteps    | 38000    |
| train/                |          |
|    entropy_loss       | -0.00464 |
|    explained_variance | 0.57     |
|    learning_rate      | 0.0007   |
|    n_updates          | 7599     |
|    policy_loss        | 5.27e-07 |
|    value_loss         | 1.24e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -1.04     |
| time/                 |           |
|    fps                | 958       |
|    iterations         | 7700      |
|    time_elapsed       | 40        |
|    total_timesteps    | 38500     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.778    |
| time/                 |           |
|    fps                | 961       |
|    iterations         | 8900      |
|    time_elapsed       | 46        |
|    total_timesteps    | 44500     |
| train/                |           |
|    entropy_loss       | -0.028    |
|    explained_variance | -331      |
|    learning_rate      | 0.0007    |
|    n_updates          | 8899      |
|    policy_loss        | -4.76e-05 |
|    value_loss         | 0.000174  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.778   |
| time/                 |          |
|    fps                | 961      |
|    iterations         | 9000     |
|    time_elapsed       | 46       |
|    total_timesteps    | 45000    |
| train/             