In [1]:
#multiagent ppo
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment for Training
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.entry_price = 0
        self.trades = []
        self.scaler = scaler

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)
        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, **kwargs):
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        return self._get_observation(), {}

    def _get_observation(self):
        return self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        if action == 1:  # Buy
            if self.position == 0:
                self.position = 1
                self.entry_price = current_price
            elif self.position == -1:  # Close short
                reward = self.entry_price - current_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0
        elif action == 2:  # Sell
            if self.position == 0:
                self.position = -1
                self.entry_price = current_price
            elif self.position == 1:  # Close long
                reward = current_price - self.entry_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

# Mixed Multi-Agent Testing Environment
class MixedMultiAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(MixedMultiAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.scaler = scaler

        # Two teams with two agents each
        self.num_teams = 2
        self.agents_per_team = 2
        self.num_agents = self.num_teams * self.agents_per_team

        # Initialize agent balances, positions, and trades
        self.balances = [initial_balance] * self.num_agents
        self.positions = [0] * self.num_agents
        self.entry_prices = [0] * self.num_agents
        self.trades = [[] for _ in range(self.num_agents)]

    def reset(self, **kwargs):
        self.current_step = 0
        self.balances = [self.initial_balance] * self.num_agents
        self.positions = [0] * self.num_agents
        self.entry_prices = [0] * self.num_agents
        self.trades = [[] for _ in range(self.num_agents)]
        return [self._get_observation() for _ in range(self.num_agents)], {}

    def _get_observation(self):
        return self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)

    def step(self, actions):
        rewards = [0] * self.num_agents
        current_price = self.data.iloc[self.current_step]['close']
        for i, action in enumerate(actions):
            if action == 1:  # Buy
                if self.positions[i] == 0:
                    self.positions[i] = 1
                    self.entry_prices[i] = current_price
                elif self.positions[i] == -1:  # Close short
                    reward = self.entry_prices[i] - current_price
                    self.balances[i] += reward
                    rewards[i] = reward
                    self.positions[i] = 0
                    self.trades[i].append(reward)
            elif action == 2:  # Sell
                if self.positions[i] == 0:
                    self.positions[i] = -1
                    self.entry_prices[i] = current_price
                elif self.positions[i] == 1:  # Close long
                    reward = current_price - self.entry_prices[i]
                    self.balances[i] += reward
                    rewards[i] = reward
                    self.positions[i] = 0
                    self.trades[i].append(reward)

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return [self._get_observation() for _ in range(self.num_agents)], rewards, terminated, truncated, {}

# Calculate individual metrics
def calculate_metrics(trades, initial_balance, final_balance):
    total_profit = final_balance - initial_balance
    cumulative_return = total_profit / initial_balance
    
    # Calculate positive and negative trades for profit factor
    positive_trades = [trade for trade in trades if trade > 0]
    negative_trades = [trade for trade in trades if trade < 0]
    profit_factor = sum(positive_trades) / abs(sum(negative_trades)) if negative_trades else float('inf')

    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Calculate Sharpe Ratio
    sharpe_ratio = np.mean(trades) / np.std(trades) if np.std(trades) != 0 else 0

    # Sortino Ratio (uses only negative trades as downside deviation)
    downside_std = np.std([trade for trade in trades if trade < 0])
    sortino_ratio = np.mean(trades) / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    cumulative_balance = np.cumsum(trades)
    running_max = np.maximum.accumulate(cumulative_balance)
    drawdown = running_max - cumulative_balance
    max_drawdown = np.max(drawdown) if len(drawdown) > 0 else 0

    return {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }

# Sharpe Ratio-weighted aggregation
def aggregate_metrics_sharpe_weighted(metrics_list):
    # Filter out agents with non-positive Sharpe Ratios
    positive_sharpe_metrics = [m for m in metrics_list if m["Sharpe Ratio"] > 0]
    total_sharpe = sum(m["Sharpe Ratio"] for m in positive_sharpe_metrics)
    
    # If no agents have a positive Sharpe Ratio, return zeros for all metrics
    if total_sharpe == 0:
        return {metric: 0 for metric in metrics_list[0]}
    
    combined_metrics = {
        "Total Profit": sum(m["Total Profit"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Cumulative Return": sum(m["Cumulative Return"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Win Rate": sum(m["Win Rate"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Profit Factor": sum(m["Profit Factor"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Sharpe Ratio": sum(m["Sharpe Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Sortino Ratio": sum(m["Sortino Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Maximum Drawdown": sum(m["Maximum Drawdown"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
    }
    return combined_metrics

# Train and evaluate each agent
def train_and_evaluate():
    train_file = 'NVDA_TRAINING.csv'
    test_file = 'NVDA_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Track training metrics
    training_metrics = []

    # Train each agent independently in single-agent environments
    models = []
    for i in range(4):  # 4 agents
        env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)
        model = PPO("MlpPolicy", env_train, verbose=1)
        model.learn(total_timesteps=50000)
        models.append(model)

        # Calculate training metrics for each agent
        final_balance = env_train.balance
        metrics = calculate_metrics(env_train.trades, env_train.initial_balance, final_balance)
        training_metrics.append(metrics)
        print(f"\n--- Agent {i+1} Training Metrics ---")
        for metric, value in metrics.items():
            print(f"{metric}: {value}")

    # Aggregate training metrics with Sharpe Ratio weighting
    combined_training_metrics = aggregate_metrics_sharpe_weighted(training_metrics)
    print("\n=== Combined Training Metrics for All Agents (Sharpe Ratio Weighted) ===")
    for metric, value in combined_training_metrics.items():
        print(f"{metric}: {value}")

    # Test in the mixed multi-agent environment
    env_test = MixedMultiAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        actions = [model.predict(obs[i])[0] for i, model in enumerate(models)]
        obs, rewards, done, truncated, _ = env_test.step(actions)

    # Calculate and display testing metrics for each agent in the testing environment
    testing_metrics = []
    for i in range(4):  # 4 agents
        final_balance = env_test.balances[i]
        metrics = calculate_metrics(env_test.trades[i], env_test.initial_balance, final_balance)
        testing_metrics.append(metrics)
        print(f"\n--- Agent {i+1} Testing Metrics ---")
        for metric, value in metrics.items():
            print(f"{metric}: {value}")

    # Aggregate testing metrics with Sharpe Ratio weighting
    combined_testing_metrics = aggregate_metrics_sharpe_weighted(testing_metrics)
    print("\n=== Combined Testing Metrics for All Agents (Sharpe Ratio Weighted) ===")
    for metric, value in combined_testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 859  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 814          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0115012415 |
|    clip_fraction        | 0.0614       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    explained_variance   | -18.4        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.000696     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00736     |
|    val

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.93e+03     |
|    ep_rew_mean          | 1.62         |
| time/                   |              |
|    fps                  | 938          |
|    iterations           | 12           |
|    time_elapsed         | 26           |
|    total_timesteps      | 24576        |
| train/                  |              |
|    approx_kl            | 0.0036267336 |
|    clip_fraction        | 0.0516       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.989       |
|    explained_variance   | 0.269        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0169      |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00868     |
|    value_loss           | 0.00161      |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 3.21        |
| time/                   |             |
|    fps                  | 929         |
|    iterations           | 22          |
|    time_elapsed         | 48          |
|    total_timesteps      | 45056       |
| train/                  |             |
|    approx_kl            | 0.009275574 |
|    clip_fraction        | 0.0977      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.789      |
|    explained_variance   | -0.565      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0136     |
|    n_updates            | 210         |
|    policy_gradient_loss | -0.0126     |
|    value_loss           | 0.000814    |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.93

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 1.05        |
| time/                   |             |
|    fps                  | 993         |
|    iterations           | 7           |
|    time_elapsed         | 14          |
|    total_timesteps      | 14336       |
| train/                  |             |
|    approx_kl            | 0.008415882 |
|    clip_fraction        | 0.0641      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.06       |
|    explained_variance   | 0.0639      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0244     |
|    n_updates            | 60          |
|    policy_gradient_loss | -0.0105     |
|    value_loss           | 0.00181     |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.93

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 2.36        |
| time/                   |             |
|    fps                  | 863         |
|    iterations           | 17          |
|    time_elapsed         | 40          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008283861 |
|    clip_fraction        | 0.0751      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.998      |
|    explained_variance   | 0.301       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0144     |
|    n_updates            | 160         |
|    policy_gradient_loss | -0.00916    |
|    value_loss           | 0.00471     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+

-----------------------------
| time/              |      |
|    fps             | 881  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 764         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009144236 |
|    clip_fraction        | 0.0421      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -11.3       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00548    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0056     |
|    value_loss           | 0.00919     |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 1.68        |
| time/                   |             |
|    fps                  | 784         |
|    iterations           | 12          |
|    time_elapsed         | 31          |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.007996848 |
|    clip_fraction        | 0.07        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.04       |
|    explained_variance   | -0.0929     |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00313     |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00986    |
|    value_loss           | 0.00154     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 3.79        |
| time/                   |             |
|    fps                  | 821         |
|    iterations           | 22          |
|    time_elapsed         | 54          |
|    total_timesteps      | 45056       |
| train/                  |             |
|    approx_kl            | 0.007927354 |
|    clip_fraction        | 0.0678      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.994      |
|    explained_variance   | -0.484      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0133     |
|    n_updates            | 210         |
|    policy_gradient_loss | -0.00776    |
|    value_loss           | 0.00103     |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.93

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | -1.68       |
| time/                   |             |
|    fps                  | 769         |
|    iterations           | 7           |
|    time_elapsed         | 18          |
|    total_timesteps      | 14336       |
| train/                  |             |
|    approx_kl            | 0.007809998 |
|    clip_fraction        | 0.0484      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.05       |
|    explained_variance   | -0.239      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0135     |
|    n_updates            | 60          |
|    policy_gradient_loss | -0.0064     |
|    value_loss           | 0.00176     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.93e+03     |
|    ep_rew_mean          | 2.11         |
| time/                   |              |
|    fps                  | 823          |
|    iterations           | 17           |
|    time_elapsed         | 42           |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0073111774 |
|    clip_fraction        | 0.0648       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.01        |
|    explained_variance   | 0.632        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0196      |
|    n_updates            | 160          |
|    policy_gradient_loss | -0.00637     |
|    value_loss           | 0.00417      |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len


--- Agent 1 Testing Metrics ---
Total Profit: 14.662891536683674
Cumulative Return: 0.0014662891536683673
Win Rate: 0.6056338028169014
Profit Factor: 2.8804466756159646
Sharpe Ratio: 0.31746287522127536
Sortino Ratio: 0.6070888744828945
Maximum Drawdown: 0.49286116978835004

--- Agent 2 Testing Metrics ---
Total Profit: 10.52940020764072
Cumulative Return: 0.001052940020764072
Win Rate: 0.5748792270531401
Profit Factor: 2.1309575405538963
Sharpe Ratio: 0.24234208137457372
Sortino Ratio: 0.45049020113699756
Maximum Drawdown: 0.43084592033460156

--- Agent 3 Testing Metrics ---
Total Profit: 18.51808105841883
Cumulative Return: 0.001851808105841883
Win Rate: 0.6206088992974239
Profit Factor: 3.8185695604975933
Sharpe Ratio: 0.3806844811058458
Sortino Ratio: 0.9216651156865749
Maximum Drawdown: 0.21299595280572703

--- Agent 4 Testing Metrics ---
Total Profit: 15.753809011315752
Cumulative Return: 0.0015753809011315753
Win Rate: 0.6572379367720466
Profit Factor: 4.554107391793602
Sharpe 

In [2]:
#multiagent DQN
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.scaler = scaler

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        return self._get_observation(), {}

    def _get_observation(self):
        return self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']

        if action == 1:  # Buy
            if self.position == 0:
                self.position = 1
                self.entry_price = current_price
            elif self.position == -1:
                reward = self.entry_price - current_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        elif action == 2:  # Sell
            if self.position == 0:
                self.position = -1
                self.entry_price = current_price
            elif self.position == 1:
                reward = current_price - self.entry_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

# Multi-Agent Trading Environment
class MultiAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None, num_agents=4):
        super(MultiAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.scaler = scaler
        self.num_agents = num_agents
        self.agents = [SingleAgentEnv(data, window_size, initial_balance, scaler) for _ in range(num_agents)]

    def reset(self):
        obs = []
        for agent in self.agents:
            agent_obs, _ = agent.reset()
            obs.append(agent_obs)
        return obs

    def step(self, actions):
        obs, rewards, terminated, truncated, infos = [], [], [], [], []
        for agent, action in zip(self.agents, actions):
            agent_obs, reward, done, truncate, info = agent.step(action)
            obs.append(agent_obs)
            rewards.append(reward)
            terminated.append(done)
            truncated.append(truncate)
            infos.append(info)
        return obs, rewards, any(terminated), any(truncated), infos

# Function to calculate metrics for each agent
def calculate_metrics(trades, initial_balance, final_balance):
    total_profit = final_balance - initial_balance
    cumulative_return = (final_balance - initial_balance) / initial_balance
    win_rate = len([trade for trade in trades if trade > 0]) / len(trades) if trades else 0
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf
    returns = np.array(trades)
    sharpe_ratio = np.mean(returns) / np.std(returns) if np.std(returns) != 0 else 0
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = np.mean(returns) / downside_std if downside_std != 0 else 0
    max_drawdown = np.max(np.maximum.accumulate(np.cumsum(trades)) - np.cumsum(trades)) if trades else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Sharpe Ratio-weighted aggregation for combined metrics
def aggregate_metrics_sharpe_weighted(metrics_list):
    positive_metrics = [m for m in metrics_list if m["Sharpe Ratio"] > 0]
    total_sharpe = sum(m["Sharpe Ratio"] for m in positive_metrics)
    if total_sharpe == 0:
        return {metric: 0 for metric in metrics_list[0]}
    
    combined_metrics = {
        "Total Profit": sum(m["Total Profit"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Cumulative Return": sum(m["Cumulative Return"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Win Rate": sum(m["Win Rate"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Profit Factor": sum(m["Profit Factor"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sharpe Ratio": sum(m["Sharpe Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sortino Ratio": sum(m["Sortino Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Maximum Drawdown": sum(m["Maximum Drawdown"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
    }
    return combined_metrics

# Train and evaluate each agent
def train_and_evaluate():
    train_file = 'NVDA_TRAINING.csv'
    test_file = 'NVDA_TESTING.csv'
    df_train, df_test, scaler = load_and_normalize_data(train_file, test_file)

    training_metrics = []
    models = []

    # Train each agent independently
    for i in range(4):  # 4 agents
        env_train = SingleAgentEnv(df_train, window_size=10, scaler=scaler)
        model = DQN("MlpPolicy", env_train, verbose=1)
        model.learn(total_timesteps=50000)
        models.append(model)

        # Record training metrics
        training_metrics.append(calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance))

    combined_training_metrics = aggregate_metrics_sharpe_weighted(training_metrics)
    print("\n=== Combined Training Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_training_metrics.items():
        print(f"{metric}: {value}")

    # Test in the multi-agent environment
    env_test = MultiAgentEnv(df_test, window_size=10, scaler=scaler, num_agents=4)
    obs = env_test.reset()
    done = False
    while not done:
        actions = [model.predict(obs[i])[0] for i, model in enumerate(models)]
        obs, rewards, done, truncated, _ = env_test.step(actions)

    # Calculate and display testing metrics
    testing_metrics = []
    for agent in env_test.agents:
        testing_metrics.append(calculate_metrics(agent.trades, agent.initial_balance, agent.balance))

    combined_testing_metrics = aggregate_metrics_sharpe_weighted(testing_metrics)
    print("\n=== Combined Testing Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.93e+03 |
|    ep_rew_mean      | -2.61    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1096     |
|    time_elapsed     | 28       |
|    total_timesteps  | 31732    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 6.32e-05 |
|    n_updates        | 7907     |
----------------------------------
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 7.93e+03 |
|    ep_rew_mean      | 0.108    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1105     |
|    ti

In [3]:
#multiagent a2c
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import A2C
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment with Modified Reward Structure
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.scaler = scaler

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Normalized stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        obs -= np.min(obs, axis=0)
        obs /= np.max(obs, axis=0) + 1e-8  # Normalizing to [0,1]
        return obs

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']

        if action == 1:  # Buy
            if self.position == 0:
                self.position = 1
                self.entry_price = current_price
            elif self.position == -1:
                reward = self.entry_price - current_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        elif action == 2:  # Sell
            if self.position == 0:
                self.position = -1
                self.entry_price = current_price
            elif self.position == 1:
                reward = current_price - self.entry_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        # Normalize reward and add transaction cost penalty
        reward = (reward / self.initial_balance) - 0.001  # Small penalty for holding a position

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), np.clip(reward, -1, 1), terminated, truncated, {}

# Multi-Agent Trading Environment
class MultiAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None, num_agents=4):
        super(MultiAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.scaler = scaler
        self.num_agents = num_agents
        self.agents = [SingleAgentEnv(data, window_size, initial_balance, scaler) for _ in range(num_agents)]

    def reset(self):
        obs = []
        for agent in self.agents:
            agent_obs, _ = agent.reset()
            obs.append(agent_obs)
        return obs

    def step(self, actions):
        obs, rewards, terminated, truncated, infos = [], [], [], [], []
        for agent, action in zip(self.agents, actions):
            agent_obs, reward, done, truncate, info = agent.step(action)
            obs.append(agent_obs)
            rewards.append(reward)
            terminated.append(done)
            truncated.append(truncate)
            infos.append(info)
        return obs, rewards, any(terminated), any(truncated), infos

# Function to calculate metrics for each agent
def calculate_metrics(trades, initial_balance, final_balance):
    total_profit = final_balance - initial_balance
    cumulative_return = (final_balance - initial_balance) / initial_balance
    win_rate = len([trade for trade in trades if trade > 0]) / len(trades) if trades else 0
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf
    returns = np.array(trades)
    sharpe_ratio = np.mean(returns) / np.std(returns) if np.std(returns) != 0 else 0
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = np.mean(returns) / downside_std if downside_std != 0 else 0
    max_drawdown = np.max(np.maximum.accumulate(np.cumsum(trades)) - np.cumsum(trades)) if trades else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Sharpe Ratio-weighted aggregation for combined metrics
def aggregate_metrics_sharpe_weighted(metrics_list):
    positive_metrics = [m for m in metrics_list if m["Sharpe Ratio"] > 0]
    total_sharpe = sum(m["Sharpe Ratio"] for m in positive_metrics)
    if total_sharpe == 0:
        return {metric: 0 for metric in metrics_list[0]}
    
    combined_metrics = {
        "Total Profit": sum(m["Total Profit"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Cumulative Return": sum(m["Cumulative Return"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Win Rate": sum(m["Win Rate"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Profit Factor": sum(m["Profit Factor"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sharpe Ratio": sum(m["Sharpe Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sortino Ratio": sum(m["Sortino Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Maximum Drawdown": sum(m["Maximum Drawdown"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
    }
    return combined_metrics

# Train and evaluate each agent
def train_and_evaluate():
    train_file = 'NVDA_TRAINING.csv'
    test_file = 'NVDA_TESTING.csv'
    df_train, df_test, scaler = load_and_normalize_data(train_file, test_file)

    training_metrics = []
    models = []

    # Train each agent independently
    for i in range(4):  # 4 agents
        env_train = SingleAgentEnv(df_train, window_size=10, scaler=scaler)
        model = A2C("MlpPolicy", env_train, verbose=1)
        model.learn(total_timesteps=100000)  # Increased timesteps
        models.append(model)

        # Record training metrics
        training_metrics.append(calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance))

    combined_training_metrics = aggregate_metrics_sharpe_weighted(training_metrics)
    print("\n=== Combined Training Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_training_metrics.items():
        print(f"{metric}: {value}")

    # Test in the multi-agent environment
    env_test = MultiAgentEnv(df_test, window_size=10, scaler=scaler, num_agents=4)
    obs = env_test.reset()
    done = False
    while not done:
        actions = [model.predict(obs[i])[0] for i, model in enumerate(models)]
        obs, rewards, done, truncated, _ = env_test.step(actions)

    # Calculate and display testing metrics
    testing_metrics = []
    for agent in env_test.agents:
        testing_metrics.append(calculate_metrics(agent.trades, agent.initial_balance, agent.balance))

    combined_testing_metrics = aggregate_metrics_sharpe_weighted(testing_metrics)
    print("\n=== Combined Testing Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-------------------------------------
| time/                 |           |
|    fps                | 854       |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -0.836    |
|    explained_variance | -4.26e+04 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | 0.0283    |
|    value_loss         | 0.00971   |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 784       |
|    iterations         | 200       |
|    time_elapsed       | 1         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -0.723    |
|    explained_variance | -2.56e+04 |
|    learning_rate      | 0.0007

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 746      |
|    iterations         | 1700     |
|    time_elapsed       | 11       |
|    total_timesteps    | 8500     |
| train/                |          |
|    entropy_loss       | -0.593   |
|    explained_variance | -92.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1699     |
|    policy_loss        | -0.00593 |
|    value_loss         | 8.81e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 750      |
|    iterations         | 1800     |
|    time_elapsed       | 11       |
|    total_timesteps    | 9000     |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 737      |
|    iterations         | 3000     |
|    time_elapsed       | 20       |
|    total_timesteps    | 15000    |
| train/                |          |
|    entropy_loss       | -0.263   |
|    explained_variance | -27.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2999     |
|    policy_loss        | 9.98e-05 |
|    value_loss         | 8.6e-06  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 739      |
|    iterations         | 3100     |
|    time_elapsed       | 20       |
|    total_timesteps    | 15500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 759      |
|    iterations         | 4300     |
|    time_elapsed       | 28       |
|    total_timesteps    | 21500    |
| train/                |          |
|    entropy_loss       | -0.287   |
|    explained_variance | -196     |
|    learning_rate      | 0.0007   |
|    n_updates          | 4299     |
|    policy_loss        | -0.00287 |
|    value_loss         | 2.07e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 761      |
|    iterations         | 4400     |
|    time_elapsed       | 28       |
|    total_timesteps    | 22000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 774       |
|    iterations         | 5600      |
|    time_elapsed       | 36        |
|    total_timesteps    | 28000     |
| train/                |           |
|    entropy_loss       | -0.11     |
|    explained_variance | -43.6     |
|    learning_rate      | 0.0007    |
|    n_updates          | 5599      |
|    policy_loss        | -2.72e-05 |
|    value_loss         | 6.21e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 775      |
|    iterations         | 5700     |
|    time_elapsed       | 36       |
|    total_timesteps    | 28500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 779      |
|    iterations         | 6900     |
|    time_elapsed       | 44       |
|    total_timesteps    | 34500    |
| train/                |          |
|    entropy_loss       | -0.276   |
|    explained_variance | -82.6    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6899     |
|    policy_loss        | 3.27e-06 |
|    value_loss         | 6.38e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 775      |
|    iterations         | 7000     |
|    time_elapsed       | 45       |
|    total_timesteps    | 35000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 760       |
|    iterations         | 8200      |
|    time_elapsed       | 53        |
|    total_timesteps    | 41000     |
| train/                |           |
|    entropy_loss       | -0.0349   |
|    explained_variance | -1.03e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 8199      |
|    policy_loss        | -1.35e-05 |
|    value_loss         | 9.89e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 760       |
|    iterations         | 8300      |
|    time_elapsed       | 54        |
|    total_timesteps    | 41500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 754       |
|    iterations         | 9500      |
|    time_elapsed       | 62        |
|    total_timesteps    | 47500     |
| train/                |           |
|    entropy_loss       | -0.0268   |
|    explained_variance | -1.29e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 9499      |
|    policy_loss        | -5.92e-06 |
|    value_loss         | 3.68e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 755      |
|    iterations         | 9600     |
|    time_elapsed       | 63       |
|    total_timesteps    | 48000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 759       |
|    iterations         | 10800     |
|    time_elapsed       | 71        |
|    total_timesteps    | 54000     |
| train/                |           |
|    entropy_loss       | -0.0245   |
|    explained_variance | -16.1     |
|    learning_rate      | 0.0007    |
|    n_updates          | 10799     |
|    policy_loss        | -8.99e-06 |
|    value_loss         | 8.17e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 760      |
|    iterations         | 10900    |
|    time_elapsed       | 71       |
|    total_timesteps    | 54500    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 740       |
|    iterations         | 12100     |
|    time_elapsed       | 81        |
|    total_timesteps    | 60500     |
| train/                |           |
|    entropy_loss       | -0.00692  |
|    explained_variance | -14.1     |
|    learning_rate      | 0.0007    |
|    n_updates          | 12099     |
|    policy_loss        | -7.52e-07 |
|    value_loss         | 1.12e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 738      |
|    iterations         | 12200    |
|    time_elapsed       | 82       |
|    total_timesteps    | 61000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 737       |
|    iterations         | 13400     |
|    time_elapsed       | 90        |
|    total_timesteps    | 67000     |
| train/                |           |
|    entropy_loss       | -0.0178   |
|    explained_variance | -10.3     |
|    learning_rate      | 0.0007    |
|    n_updates          | 13399     |
|    policy_loss        | -4.26e-07 |
|    value_loss         | 5.74e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 737       |
|    iterations         | 13500     |
|    time_elapsed       | 91        |
|    total_timesteps    | 67500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 746       |
|    iterations         | 14700     |
|    time_elapsed       | 98        |
|    total_timesteps    | 73500     |
| train/                |           |
|    entropy_loss       | -0.00865  |
|    explained_variance | -51       |
|    learning_rate      | 0.0007    |
|    n_updates          | 14699     |
|    policy_loss        | -3.55e-07 |
|    value_loss         | 2.22e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 744       |
|    iterations         | 14800     |
|    time_elapsed       | 99        |
|    total_timesteps    | 74000     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 748      |
|    iterations         | 16000    |
|    time_elapsed       | 106      |
|    total_timesteps    | 80000    |
| train/                |          |
|    entropy_loss       | -0.00942 |
|    explained_variance | -27      |
|    learning_rate      | 0.0007   |
|    n_updates          | 15999    |
|    policy_loss        | 4.03e-06 |
|    value_loss         | 1.54e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 749       |
|    iterations         | 16100     |
|    time_elapsed       | 107       |
|    total_timesteps    | 80500     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 754       |
|    iterations         | 17300     |
|    time_elapsed       | 114       |
|    total_timesteps    | 86500     |
| train/                |           |
|    entropy_loss       | -0.0196   |
|    explained_variance | -3.03     |
|    learning_rate      | 0.0007    |
|    n_updates          | 17299     |
|    policy_loss        | -4.81e-06 |
|    value_loss         | 3.86e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 755      |
|    iterations         | 17400    |
|    time_elapsed       | 115      |
|    total_timesteps    | 87000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 758       |
|    iterations         | 18600     |
|    time_elapsed       | 122       |
|    total_timesteps    | 93000     |
| train/                |           |
|    entropy_loss       | -0.0121   |
|    explained_variance | -5.84     |
|    learning_rate      | 0.0007    |
|    n_updates          | 18599     |
|    policy_loss        | -6.42e-07 |
|    value_loss         | 2.03e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 758      |
|    iterations         | 18700    |
|    time_elapsed       | 123      |
|    total_timesteps    | 93500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 765      |
|    iterations         | 19900    |
|    time_elapsed       | 130      |
|    total_timesteps    | 99500    |
| train/                |          |
|    entropy_loss       | -0.0304  |
|    explained_variance | -21.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 19899    |
|    policy_loss        | 3.8e-07  |
|    value_loss         | 1.26e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 765      |
|    iterations         | 20000    |
|    time_elapsed       | 130      |
|    total_timesteps    | 100000   |
| train/                |          |
|

------------------------------------
| time/                 |          |
|    fps                | 851      |
|    iterations         | 1500     |
|    time_elapsed       | 8        |
|    total_timesteps    | 7500     |
| train/                |          |
|    entropy_loss       | -0.732   |
|    explained_variance | -661     |
|    learning_rate      | 0.0007   |
|    n_updates          | 1499     |
|    policy_loss        | -0.00123 |
|    value_loss         | 9.33e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 849      |
|    iterations         | 1600     |
|    time_elapsed       | 9        |
|    total_timesteps    | 8000     |
| train/                |          |
|    entropy_loss       | -0.748   |
|    explained_variance | -1e+03   |
|    learning_rate      | 0.0007   |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 860      |
|    iterations         | 2900     |
|    time_elapsed       | 16       |
|    total_timesteps    | 14500    |
| train/                |          |
|    entropy_loss       | -0.826   |
|    explained_variance | -66.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2899     |
|    policy_loss        | -0.00354 |
|    value_loss         | 3.65e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 862      |
|    iterations         | 3000     |
|    time_elapsed       | 17       |
|    total_timesteps    | 15000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 855      |
|    iterations         | 4200     |
|    time_elapsed       | 24       |
|    total_timesteps    | 21000    |
| train/                |          |
|    entropy_loss       | -1.01    |
|    explained_variance | -37.8    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4199     |
|    policy_loss        | 0.00117  |
|    value_loss         | 4.78e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 856      |
|    iterations         | 4300     |
|    time_elapsed       | 25       |
|    total_timesteps    | 21500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 863      |
|    iterations         | 5500     |
|    time_elapsed       | 31       |
|    total_timesteps    | 27500    |
| train/                |          |
|    entropy_loss       | -0.372   |
|    explained_variance | -53      |
|    learning_rate      | 0.0007   |
|    n_updates          | 5499     |
|    policy_loss        | 0.00048  |
|    value_loss         | 1.84e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 862      |
|    iterations         | 5600     |
|    time_elapsed       | 32       |
|    total_timesteps    | 28000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 864       |
|    iterations         | 6800      |
|    time_elapsed       | 39        |
|    total_timesteps    | 34000     |
| train/                |           |
|    entropy_loss       | -0.0217   |
|    explained_variance | -2.21     |
|    learning_rate      | 0.0007    |
|    n_updates          | 6799      |
|    policy_loss        | -6.78e-07 |
|    value_loss         | 2.24e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 863      |
|    iterations         | 6900     |
|    time_elapsed       | 39       |
|    total_timesteps    | 34500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 857      |
|    iterations         | 8100     |
|    time_elapsed       | 47       |
|    total_timesteps    | 40500    |
| train/                |          |
|    entropy_loss       | -0.0465  |
|    explained_variance | -13.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 8099     |
|    policy_loss        | -1.3e-05 |
|    value_loss         | 3.48e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 857      |
|    iterations         | 8200     |
|    time_elapsed       | 47       |
|    total_timesteps    | 41000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 847       |
|    iterations         | 9400      |
|    time_elapsed       | 55        |
|    total_timesteps    | 47000     |
| train/                |           |
|    entropy_loss       | -0.0103   |
|    explained_variance | -708      |
|    learning_rate      | 0.0007    |
|    n_updates          | 9399      |
|    policy_loss        | -3.69e-06 |
|    value_loss         | 1.19e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 845       |
|    iterations         | 9500      |
|    time_elapsed       | 56        |
|    total_timesteps    | 47500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 838       |
|    iterations         | 10700     |
|    time_elapsed       | 63        |
|    total_timesteps    | 53500     |
| train/                |           |
|    entropy_loss       | -0.0769   |
|    explained_variance | -49.7     |
|    learning_rate      | 0.0007    |
|    n_updates          | 10699     |
|    policy_loss        | -4.15e-06 |
|    value_loss         | 7.29e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 837      |
|    iterations         | 10800    |
|    time_elapsed       | 64       |
|    total_timesteps    | 54000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 834       |
|    iterations         | 12000     |
|    time_elapsed       | 71        |
|    total_timesteps    | 60000     |
| train/                |           |
|    entropy_loss       | -0.0201   |
|    explained_variance | -24.5     |
|    learning_rate      | 0.0007    |
|    n_updates          | 11999     |
|    policy_loss        | -2.41e-06 |
|    value_loss         | 1.74e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 834       |
|    iterations         | 12100     |
|    time_elapsed       | 72        |
|    total_timesteps    | 60500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 832       |
|    iterations         | 13300     |
|    time_elapsed       | 79        |
|    total_timesteps    | 66500     |
| train/                |           |
|    entropy_loss       | -0.0133   |
|    explained_variance | -6.92     |
|    learning_rate      | 0.0007    |
|    n_updates          | 13299     |
|    policy_loss        | -2.31e-06 |
|    value_loss         | 1.73e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 832      |
|    iterations         | 13400    |
|    time_elapsed       | 80       |
|    total_timesteps    | 67000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 831       |
|    iterations         | 14600     |
|    time_elapsed       | 87        |
|    total_timesteps    | 73000     |
| train/                |           |
|    entropy_loss       | -0.00586  |
|    explained_variance | -2.55e+07 |
|    learning_rate      | 0.0007    |
|    n_updates          | 14599     |
|    policy_loss        | -3.15e-07 |
|    value_loss         | 3.89e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 830      |
|    iterations         | 14700    |
|    time_elapsed       | 88       |
|    total_timesteps    | 73500    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 830       |
|    iterations         | 15900     |
|    time_elapsed       | 95        |
|    total_timesteps    | 79500     |
| train/                |           |
|    entropy_loss       | -0.00502  |
|    explained_variance | -43.7     |
|    learning_rate      | 0.0007    |
|    n_updates          | 15899     |
|    policy_loss        | -2.94e-06 |
|    value_loss         | 6.15e-05  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 830      |
|    iterations         | 16000    |
|    time_elapsed       | 96       |
|    total_timesteps    | 80000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 820       |
|    iterations         | 17200     |
|    time_elapsed       | 104       |
|    total_timesteps    | 86000     |
| train/                |           |
|    entropy_loss       | -0.00258  |
|    explained_variance | -5.38e+04 |
|    learning_rate      | 0.0007    |
|    n_updates          | 17199     |
|    policy_loss        | 1.23e-08  |
|    value_loss         | 2.64e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 820      |
|    iterations         | 17300    |
|    time_elapsed       | 105      |
|    total_timesteps    | 86500    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 816       |
|    iterations         | 18500     |
|    time_elapsed       | 113       |
|    total_timesteps    | 92500     |
| train/                |           |
|    entropy_loss       | -0.00242  |
|    explained_variance | -3.42     |
|    learning_rate      | 0.0007    |
|    n_updates          | 18499     |
|    policy_loss        | -3.09e-07 |
|    value_loss         | 1.8e-06   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 816       |
|    iterations         | 18600     |
|    time_elapsed       | 113       |
|    total_timesteps    | 93000     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 814       |
|    iterations         | 19800     |
|    time_elapsed       | 121       |
|    total_timesteps    | 99000     |
| train/                |           |
|    entropy_loss       | -0.00161  |
|    explained_variance | -79.8     |
|    learning_rate      | 0.0007    |
|    n_updates          | 19799     |
|    policy_loss        | -9.01e-08 |
|    value_loss         | 4.81e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 814       |
|    iterations         | 19900     |
|    time_elapsed       | 122       |
|    total_timesteps    | 99500     |
| train/    

-------------------------------------
| time/                 |           |
|    fps                | 780       |
|    iterations         | 1300      |
|    time_elapsed       | 8         |
|    total_timesteps    | 6500      |
| train/                |           |
|    entropy_loss       | -0.592    |
|    explained_variance | -4.47e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1299      |
|    policy_loss        | -0.000128 |
|    value_loss         | 0.000188  |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 785      |
|    iterations         | 1400     |
|    time_elapsed       | 8        |
|    total_timesteps    | 7000     |
| train/                |          |
|    entropy_loss       | -0.716   |
|    explained_variance | -390     |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | -0.00197 |
|    value_loss         

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 805      |
|    iterations         | 2700     |
|    time_elapsed       | 16       |
|    total_timesteps    | 13500    |
| train/                |          |
|    entropy_loss       | -0.671   |
|    explained_variance | -90.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2699     |
|    policy_loss        | -0.00489 |
|    value_loss         | 0.000102 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 806       |
|    iterations         | 2800      |
|    time_elapsed       | 17        |
|    total_timesteps    | 14000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 817      |
|    iterations         | 4000     |
|    time_elapsed       | 24       |
|    total_timesteps    | 20000    |
| train/                |          |
|    entropy_loss       | -0.468   |
|    explained_variance | -21.6    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3999     |
|    policy_loss        | 0.00209  |
|    value_loss         | 1.03e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 818       |
|    iterations         | 4100      |
|    time_elapsed       | 25        |
|    total_timesteps    | 20500     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 828       |
|    iterations         | 5300      |
|    time_elapsed       | 31        |
|    total_timesteps    | 26500     |
| train/                |           |
|    entropy_loss       | -0.554    |
|    explained_variance | -6.39     |
|    learning_rate      | 0.0007    |
|    n_updates          | 5299      |
|    policy_loss        | -0.000349 |
|    value_loss         | 1.28e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 828      |
|    iterations         | 5400     |
|    time_elapsed       | 32       |
|    total_timesteps    | 27000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 814      |
|    iterations         | 6600     |
|    time_elapsed       | 40       |
|    total_timesteps    | 33000    |
| train/                |          |
|    entropy_loss       | -0.72    |
|    explained_variance | -204     |
|    learning_rate      | 0.0007   |
|    n_updates          | 6599     |
|    policy_loss        | 0.000281 |
|    value_loss         | 5.01e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 815      |
|    iterations         | 6700     |
|    time_elapsed       | 41       |
|    total_timesteps    | 33500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 813      |
|    iterations         | 7900     |
|    time_elapsed       | 48       |
|    total_timesteps    | 39500    |
| train/                |          |
|    entropy_loss       | -0.389   |
|    explained_variance | -141     |
|    learning_rate      | 0.0007   |
|    n_updates          | 7899     |
|    policy_loss        | 0.000455 |
|    value_loss         | 1.23e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 814       |
|    iterations         | 8000      |
|    time_elapsed       | 49        |
|    total_timesteps    | 40000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 820      |
|    iterations         | 9200     |
|    time_elapsed       | 56       |
|    total_timesteps    | 46000    |
| train/                |          |
|    entropy_loss       | -0.489   |
|    explained_variance | -871     |
|    learning_rate      | 0.0007   |
|    n_updates          | 9199     |
|    policy_loss        | 0.00109  |
|    value_loss         | 3.46e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 820       |
|    iterations         | 9300      |
|    time_elapsed       | 56        |
|    total_timesteps    | 46500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 823      |
|    iterations         | 10500    |
|    time_elapsed       | 63       |
|    total_timesteps    | 52500    |
| train/                |          |
|    entropy_loss       | -0.24    |
|    explained_variance | -102     |
|    learning_rate      | 0.0007   |
|    n_updates          | 10499    |
|    policy_loss        | 0.00658  |
|    value_loss         | 5.15e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 823       |
|    iterations         | 10600     |
|    time_elapsed       | 64        |
|    total_timesteps    | 53000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 826      |
|    iterations         | 11800    |
|    time_elapsed       | 71       |
|    total_timesteps    | 59000    |
| train/                |          |
|    entropy_loss       | -0.149   |
|    explained_variance | -281     |
|    learning_rate      | 0.0007   |
|    n_updates          | 11799    |
|    policy_loss        | 1.12e-05 |
|    value_loss         | 3.7e-07  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 826       |
|    iterations         | 11900     |
|    time_elapsed       | 71        |
|    total_timesteps    | 59500     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 831       |
|    iterations         | 13100     |
|    time_elapsed       | 78        |
|    total_timesteps    | 65500     |
| train/                |           |
|    entropy_loss       | -0.605    |
|    explained_variance | -340      |
|    learning_rate      | 0.0007    |
|    n_updates          | 13099     |
|    policy_loss        | -0.000205 |
|    value_loss         | 1.93e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 832      |
|    iterations         | 13200    |
|    time_elapsed       | 79       |
|    total_timesteps    | 66000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 835      |
|    iterations         | 14400    |
|    time_elapsed       | 86       |
|    total_timesteps    | 72000    |
| train/                |          |
|    entropy_loss       | -0.266   |
|    explained_variance | -280     |
|    learning_rate      | 0.0007   |
|    n_updates          | 14399    |
|    policy_loss        | 3.7e-05  |
|    value_loss         | 4.2e-07  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 836      |
|    iterations         | 14500    |
|    time_elapsed       | 86       |
|    total_timesteps    | 72500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 838      |
|    iterations         | 15700    |
|    time_elapsed       | 93       |
|    total_timesteps    | 78500    |
| train/                |          |
|    entropy_loss       | -0.42    |
|    explained_variance | -1.6     |
|    learning_rate      | 0.0007   |
|    n_updates          | 15699    |
|    policy_loss        | 0.00014  |
|    value_loss         | 9.55e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 838       |
|    iterations         | 15800     |
|    time_elapsed       | 94        |
|    total_timesteps    | 79000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 837       |
|    iterations         | 17000     |
|    time_elapsed       | 101       |
|    total_timesteps    | 85000     |
| train/                |           |
|    entropy_loss       | -0.0187   |
|    explained_variance | -9.47     |
|    learning_rate      | 0.0007    |
|    n_updates          | 16999     |
|    policy_loss        | -2.83e-06 |
|    value_loss         | 1.51e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 837       |
|    iterations         | 17100     |
|    time_elapsed       | 102       |
|    total_timesteps    | 85500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 836      |
|    iterations         | 18300    |
|    time_elapsed       | 109      |
|    total_timesteps    | 91500    |
| train/                |          |
|    entropy_loss       | -0.0032  |
|    explained_variance | -32.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 18299    |
|    policy_loss        | 3.35e-07 |
|    value_loss         | 1.04e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 836      |
|    iterations         | 18400    |
|    time_elapsed       | 110      |
|    total_timesteps    | 92000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 834      |
|    iterations         | 19600    |
|    time_elapsed       | 117      |
|    total_timesteps    | 98000    |
| train/                |          |
|    entropy_loss       | -0.00299 |
|    explained_variance | -14      |
|    learning_rate      | 0.0007   |
|    n_updates          | 19599    |
|    policy_loss        | 8.69e-08 |
|    value_loss         | 2.08e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 834      |
|    iterations         | 19700    |
|    time_elapsed       | 118      |
|    total_timesteps    | 98500    |
| train/                |          |
|

------------------------------------
| time/                 |          |
|    fps                | 775      |
|    iterations         | 1100     |
|    time_elapsed       | 7        |
|    total_timesteps    | 5500     |
| train/                |          |
|    entropy_loss       | -1.05    |
|    explained_variance | -301     |
|    learning_rate      | 0.0007   |
|    n_updates          | 1099     |
|    policy_loss        | 0.00849  |
|    value_loss         | 0.000322 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 779      |
|    iterations         | 1200     |
|    time_elapsed       | 7        |
|    total_timesteps    | 6000     |
| train/                |          |
|    entropy_loss       | -1.09    |
|    explained_variance | -487     |
|    learning_rate      | 0.0007   |
|    n_updates          | 1199     |
|    policy_loss        | 0.00311  |
|    value_loss         | 0.000387 |
-

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 804      |
|    iterations         | 2500     |
|    time_elapsed       | 15       |
|    total_timesteps    | 12500    |
| train/                |          |
|    entropy_loss       | -0.831   |
|    explained_variance | -88.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2499     |
|    policy_loss        | -0.0027  |
|    value_loss         | 4.14e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 804       |
|    iterations         | 2600      |
|    time_elapsed       | 16        |
|    total_timesteps    | 13000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 813       |
|    iterations         | 3800      |
|    time_elapsed       | 23        |
|    total_timesteps    | 19000     |
| train/                |           |
|    entropy_loss       | -0.133    |
|    explained_variance | -2.15     |
|    learning_rate      | 0.0007    |
|    n_updates          | 3799      |
|    policy_loss        | -2.59e-05 |
|    value_loss         | 1.96e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 812      |
|    iterations         | 3900     |
|    time_elapsed       | 24       |
|    total_timesteps    | 19500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 812      |
|    iterations         | 5100     |
|    time_elapsed       | 31       |
|    total_timesteps    | 25500    |
| train/                |          |
|    entropy_loss       | -0.996   |
|    explained_variance | -5.31    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5099     |
|    policy_loss        | 0.000181 |
|    value_loss         | 2.82e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 812       |
|    iterations         | 5200      |
|    time_elapsed       | 31        |
|    total_timesteps    | 26000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 781       |
|    iterations         | 6400      |
|    time_elapsed       | 40        |
|    total_timesteps    | 32000     |
| train/                |           |
|    entropy_loss       | -0.902    |
|    explained_variance | -1.38e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 6399      |
|    policy_loss        | 0.00201   |
|    value_loss         | 7.22e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 777       |
|    iterations         | 6500      |
|    time_elapsed       | 41        |
|    total_timesteps    | 32500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 742      |
|    iterations         | 7700     |
|    time_elapsed       | 51       |
|    total_timesteps    | 38500    |
| train/                |          |
|    entropy_loss       | -0.983   |
|    explained_variance | -14.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 7699     |
|    policy_loss        | 0.000907 |
|    value_loss         | 1.59e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 742      |
|    iterations         | 7800     |
|    time_elapsed       | 52       |
|    total_timesteps    | 39000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 736       |
|    iterations         | 9000      |
|    time_elapsed       | 61        |
|    total_timesteps    | 45000     |
| train/                |           |
|    entropy_loss       | -0.982    |
|    explained_variance | -22.1     |
|    learning_rate      | 0.0007    |
|    n_updates          | 8999      |
|    policy_loss        | -5.71e-05 |
|    value_loss         | 7.49e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 735       |
|    iterations         | 9100      |
|    time_elapsed       | 61        |
|    total_timesteps    | 45500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 742       |
|    iterations         | 10300     |
|    time_elapsed       | 69        |
|    total_timesteps    | 51500     |
| train/                |           |
|    entropy_loss       | -0.355    |
|    explained_variance | -69.5     |
|    learning_rate      | 0.0007    |
|    n_updates          | 10299     |
|    policy_loss        | -3.91e-05 |
|    value_loss         | 4.79e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 743      |
|    iterations         | 10400    |
|    time_elapsed       | 69       |
|    total_timesteps    | 52000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 740      |
|    iterations         | 11600    |
|    time_elapsed       | 78       |
|    total_timesteps    | 58000    |
| train/                |          |
|    entropy_loss       | -0.166   |
|    explained_variance | -95.6    |
|    learning_rate      | 0.0007   |
|    n_updates          | 11599    |
|    policy_loss        | 1.66e-05 |
|    value_loss         | 8.94e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 738      |
|    iterations         | 11700    |
|    time_elapsed       | 79       |
|    total_timesteps    | 58500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 729      |
|    iterations         | 12900    |
|    time_elapsed       | 88       |
|    total_timesteps    | 64500    |
| train/                |          |
|    entropy_loss       | -0.103   |
|    explained_variance | -66      |
|    learning_rate      | 0.0007   |
|    n_updates          | 12899    |
|    policy_loss        | 3.76e-06 |
|    value_loss         | 1.72e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 730      |
|    iterations         | 13000    |
|    time_elapsed       | 89       |
|    total_timesteps    | 65000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 732       |
|    iterations         | 14200     |
|    time_elapsed       | 96        |
|    total_timesteps    | 71000     |
| train/                |           |
|    entropy_loss       | -0.24     |
|    explained_variance | -6.17     |
|    learning_rate      | 0.0007    |
|    n_updates          | 14199     |
|    policy_loss        | -7.89e-05 |
|    value_loss         | 3.23e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 732      |
|    iterations         | 14300    |
|    time_elapsed       | 97       |
|    total_timesteps    | 71500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 737      |
|    iterations         | 15500    |
|    time_elapsed       | 105      |
|    total_timesteps    | 77500    |
| train/                |          |
|    entropy_loss       | -0.159   |
|    explained_variance | -8.45    |
|    learning_rate      | 0.0007   |
|    n_updates          | 15499    |
|    policy_loss        | 1.84e-05 |
|    value_loss         | 4.24e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 738       |
|    iterations         | 15600     |
|    time_elapsed       | 105       |
|    total_timesteps    | 78000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 747      |
|    iterations         | 16800    |
|    time_elapsed       | 112      |
|    total_timesteps    | 84000    |
| train/                |          |
|    entropy_loss       | -0.0648  |
|    explained_variance | -4.46    |
|    learning_rate      | 0.0007   |
|    n_updates          | 16799    |
|    policy_loss        | 1.86e-05 |
|    value_loss         | 3.71e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 747      |
|    iterations         | 16900    |
|    time_elapsed       | 112      |
|    total_timesteps    | 84500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 755      |
|    iterations         | 18100    |
|    time_elapsed       | 119      |
|    total_timesteps    | 90500    |
| train/                |          |
|    entropy_loss       | -0.849   |
|    explained_variance | -6.7     |
|    learning_rate      | 0.0007   |
|    n_updates          | 18099    |
|    policy_loss        | 0.00108  |
|    value_loss         | 5.32e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 755      |
|    iterations         | 18200    |
|    time_elapsed       | 120      |
|    total_timesteps    | 91000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -7.93    |
| time/                 |          |
|    fps                | 762      |
|    iterations         | 19400    |
|    time_elapsed       | 127      |
|    total_timesteps    | 97000    |
| train/                |          |
|    entropy_loss       | -0.38    |
|    explained_variance | -19      |
|    learning_rate      | 0.0007   |
|    n_updates          | 19399    |
|    policy_loss        | 9.79e-05 |
|    value_loss         | 8.41e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -7.93     |
| time/                 |           |
|    fps                | 763       |
|    iterations         | 19500     |
|    time_elapsed       | 127       |
|    total_timesteps    | 97500     |
| train/                |    

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [4]:
#multiagent ensemble
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO, DQN, A2C
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.scaler = scaler

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        return self._get_observation(), {}

    def _get_observation(self):
        return self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']

        if action == 1:  # Buy
            if self.position == 0:
                self.position = 1
                self.entry_price = current_price
            elif self.position == -1:
                reward = self.entry_price - current_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        elif action == 2:  # Sell
            if self.position == 0:
                self.position = -1
                self.entry_price = current_price
            elif self.position == 1:
                reward = current_price - self.entry_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

# Multi-Agent Trading Environment
class MultiAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None, num_agents=4):
        super(MultiAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.scaler = scaler
        self.num_agents = num_agents
        self.agents = [SingleAgentEnv(data, window_size, initial_balance, scaler) for _ in range(num_agents)]

    def reset(self):
        obs = []
        for agent in self.agents:
            agent_obs, _ = agent.reset()
            obs.append(agent_obs)
        return obs

    def step(self, actions):
        obs, rewards, terminated, truncated, infos = [], [], [], [], []
        for agent, action in zip(self.agents, actions):
            agent_obs, reward, done, truncate, info = agent.step(action)
            obs.append(agent_obs)
            rewards.append(reward)
            terminated.append(done)
            truncated.append(truncate)
            infos.append(info)
        return obs, rewards, any(terminated), any(truncated), infos

# Ensemble model function
def ensemble_predict(actions):
    actions = [int(action) for action in actions]
    action_counts = Counter(actions)
    return action_counts.most_common(1)[0][0]

# Function to calculate metrics for each agent
def calculate_metrics(trades, initial_balance, final_balance):
    total_profit = final_balance - initial_balance
    cumulative_return = (final_balance - initial_balance) / initial_balance
    win_rate = len([trade for trade in trades if trade > 0]) / len(trades) if trades else 0
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf
    returns = np.array(trades)
    sharpe_ratio = np.mean(returns) / np.std(returns) if np.std(returns) != 0 else 0
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = np.mean(returns) / downside_std if downside_std != 0 else 0
    max_drawdown = np.max(np.maximum.accumulate(np.cumsum(trades)) - np.cumsum(trades)) if trades else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Sharpe Ratio-weighted aggregation for combined metrics
def aggregate_metrics_sharpe_weighted(metrics_list):
    positive_metrics = [m for m in metrics_list if m["Sharpe Ratio"] > 0]
    total_sharpe = sum(m["Sharpe Ratio"] for m in positive_metrics)
    if total_sharpe == 0:
        return {metric: 0 for metric in metrics_list[0]}
    
    combined_metrics = {
        "Total Profit": sum(m["Total Profit"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Cumulative Return": sum(m["Cumulative Return"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Win Rate": sum(m["Win Rate"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Profit Factor": sum(m["Profit Factor"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sharpe Ratio": sum(m["Sharpe Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sortino Ratio": sum(m["Sortino Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Maximum Drawdown": sum(m["Maximum Drawdown"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
    }
    return combined_metrics

# Train and evaluate each agent
def train_and_evaluate():
    train_file = 'NVDA_TRAINING.csv'
    test_file = 'NVDA_TESTING.csv'
    df_train, df_test, scaler = load_and_normalize_data(train_file, test_file)

    training_metrics = []
    ensemble_models = []

    # Train each agent independently
    for i in range(4):  # 4 agents
        env_train = SingleAgentEnv(df_train, window_size=10, scaler=scaler)

        # Initialize each model
        ppo_model = PPO("MlpPolicy", env_train, verbose=1)
        dqn_model = DQN("MlpPolicy", env_train, verbose=1)
        a2c_model = A2C("MlpPolicy", env_train, verbose=1)

        # Train each model
        ppo_model.learn(total_timesteps=50000)
        dqn_model.learn(total_timesteps=50000)
        a2c_model.learn(total_timesteps=50000)

        # Store trained models in a list
        ensemble_models.append((ppo_model, dqn_model, a2c_model))

        # Calculate training metrics
        training_metrics.append(calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance))

    combined_training_metrics = aggregate_metrics_sharpe_weighted(training_metrics)
    print("\n=== Combined Training Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_training_metrics.items():
        print(f"{metric}: {value}")

    # Test in the multi-agent environment
    env_test = MultiAgentEnv(df_test, window_size=10, scaler=scaler, num_agents=4)
    obs = env_test.reset()
    done = False
    while not done:
        actions = []
        for i, (ppo_model, dqn_model, a2c_model) in enumerate(ensemble_models):
            ppo_action, _ = ppo_model.predict(obs[i])
            dqn_action, _ = dqn_model.predict(obs[i])
            a2c_action, _ = a2c_model.predict(obs[i])
            final_action = ensemble_predict([ppo_action, dqn_action, a2c_action])
            actions.append(final_action)

        obs, rewards, done, truncated, _ = env_test.step(actions)

    # Calculate and display testing metrics
    testing_metrics = []
    for agent in env_test.agents:
        testing_metrics.append(calculate_metrics(agent.trades, agent.initial_balance, agent.balance))

    combined_testing_metrics = aggregate_metrics_sharpe_weighted(testing_metrics)
    print("\n=== Combined Testing Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 1696 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1339        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007874569 |
|    clip_fraction        | 0.0411      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -17         |
|   

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.93e+03     |
|    ep_rew_mean          | 0.129        |
| time/                   |              |
|    fps                  | 1138         |
|    iterations           | 11           |
|    time_elapsed         | 19           |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0060199364 |
|    clip_fraction        | 0.0462       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.02        |
|    explained_variance   | -0.216       |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0214      |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.00906     |
|    value_loss           | 0.00116      |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 3.71        |
| time/                   |             |
|    fps                  | 1052        |
|    iterations           | 21          |
|    time_elapsed         | 40          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.013371274 |
|    clip_fraction        | 0.143       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.909      |
|    explained_variance   | -0.431      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0127      |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.00917    |
|    value_loss           | 0.00288     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+

------------------------------------
| time/                 |          |
|    fps                | 869      |
|    iterations         | 800      |
|    time_elapsed       | 4        |
|    total_timesteps    | 4000     |
| train/                |          |
|    entropy_loss       | -0.902   |
|    explained_variance | -35      |
|    learning_rate      | 0.0007   |
|    n_updates          | 799      |
|    policy_loss        | 0.00813  |
|    value_loss         | 0.00013  |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 862      |
|    iterations         | 900      |
|    time_elapsed       | 5        |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -0.974   |
|    explained_variance | -1.63    |
|    learning_rate      | 0.0007   |
|    n_updates          | 899      |
|    policy_loss        | 0.00132  |
|    value_loss         | 0.00221  |
-

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.438   |
| time/                 |          |
|    fps                | 863      |
|    iterations         | 2300     |
|    time_elapsed       | 13       |
|    total_timesteps    | 11500    |
| train/                |          |
|    entropy_loss       | -0.864   |
|    explained_variance | -17.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2299     |
|    policy_loss        | -0.00204 |
|    value_loss         | 9.94e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.438   |
| time/                 |          |
|    fps                | 864      |
|    iterations         | 2400     |
|    time_elapsed       | 13       |
|    total_timesteps    | 12000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.203   |
| time/                 |          |
|    fps                | 867      |
|    iterations         | 3600     |
|    time_elapsed       | 20       |
|    total_timesteps    | 18000    |
| train/                |          |
|    entropy_loss       | -0.739   |
|    explained_variance | -24.8    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3599     |
|    policy_loss        | -0.00012 |
|    value_loss         | 1.21e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.203    |
| time/                 |           |
|    fps                | 867       |
|    iterations         | 3700      |
|    time_elapsed       | 21        |
|    total_timesteps    | 18500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.398   |
| time/                 |          |
|    fps                | 868      |
|    iterations         | 4900     |
|    time_elapsed       | 28       |
|    total_timesteps    | 24500    |
| train/                |          |
|    entropy_loss       | -0.606   |
|    explained_variance | -1.55    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4899     |
|    policy_loss        | -0.00607 |
|    value_loss         | 0.000118 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.398   |
| time/                 |          |
|    fps                | 869      |
|    iterations         | 5000     |
|    time_elapsed       | 28       |
|    total_timesteps    | 25000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.398    |
| time/                 |           |
|    fps                | 876       |
|    iterations         | 6200      |
|    time_elapsed       | 35        |
|    total_timesteps    | 31000     |
| train/                |           |
|    entropy_loss       | -0.135    |
|    explained_variance | -38.6     |
|    learning_rate      | 0.0007    |
|    n_updates          | 6199      |
|    policy_loss        | -5.36e-06 |
|    value_loss         | 3.89e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.398    |
| time/                 |           |
|    fps                | 877       |
|    iterations         | 6300      |
|    time_elapsed       | 35        |
|    total_timesteps    | 31500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.2     |
| time/                 |          |
|    fps                | 885      |
|    iterations         | 7500     |
|    time_elapsed       | 42       |
|    total_timesteps    | 37500    |
| train/                |          |
|    entropy_loss       | -0.193   |
|    explained_variance | -225     |
|    learning_rate      | 0.0007   |
|    n_updates          | 7499     |
|    policy_loss        | 6.21e-05 |
|    value_loss         | 1.28e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.2      |
| time/                 |           |
|    fps                | 886       |
|    iterations         | 7600      |
|    time_elapsed       | 42        |
|    total_timesteps    | 38000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.14    |
| time/                 |          |
|    fps                | 904      |
|    iterations         | 8800     |
|    time_elapsed       | 48       |
|    total_timesteps    | 44000    |
| train/                |          |
|    entropy_loss       | -0.714   |
|    explained_variance | -15.5    |
|    learning_rate      | 0.0007   |
|    n_updates          | 8799     |
|    policy_loss        | -0.002   |
|    value_loss         | 3.24e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.14     |
| time/                 |           |
|    fps                | 906       |
|    iterations         | 8900      |
|    time_elapsed       | 49        |
|    total_timesteps    | 44500     |
| train/                |    

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 2005 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1667        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005224838 |
|    clip_fraction        | 0.0705      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -8.41       |
|   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 2.25        |
| time/                   |             |
|    fps                  | 1437        |
|    iterations           | 11          |
|    time_elapsed         | 15          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.006959893 |
|    clip_fraction        | 0.0748      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.997      |
|    explained_variance   | -0.523      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.012      |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0109     |
|    value_loss           | 0.00134     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 3.48        |
| time/                   |             |
|    fps                  | 1417        |
|    iterations           | 21          |
|    time_elapsed         | 30          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.013795538 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.946      |
|    explained_variance   | -0.34       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0234     |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.00685    |
|    value_loss           | 0.00473     |
-----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 7.93e+03

------------------------------------
| time/                 |          |
|    fps                | 1045     |
|    iterations         | 800      |
|    time_elapsed       | 3        |
|    total_timesteps    | 4000     |
| train/                |          |
|    entropy_loss       | -0.694   |
|    explained_variance | -89.7    |
|    learning_rate      | 0.0007   |
|    n_updates          | 799      |
|    policy_loss        | 0.000108 |
|    value_loss         | 0.000247 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1045      |
|    iterations         | 900       |
|    time_elapsed       | 4         |
|    total_timesteps    | 4500      |
| train/                |           |
|    entropy_loss       | -0.509    |
|    explained_variance | -4.59e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 899       |
|    policy_loss        | 0.0477    |
|    value_loss         | 

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.768   |
| time/                 |          |
|    fps                | 1043     |
|    iterations         | 2300     |
|    time_elapsed       | 11       |
|    total_timesteps    | 11500    |
| train/                |          |
|    entropy_loss       | -0.818   |
|    explained_variance | -0.322   |
|    learning_rate      | 0.0007   |
|    n_updates          | 2299     |
|    policy_loss        | 0.0414   |
|    value_loss         | 0.00146  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.768   |
| time/                 |          |
|    fps                | 1043     |
|    iterations         | 2400     |
|    time_elapsed       | 11       |
|    total_timesteps    | 12000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 0.143    |
| time/                 |          |
|    fps                | 1044     |
|    iterations         | 3600     |
|    time_elapsed       | 17       |
|    total_timesteps    | 18000    |
| train/                |          |
|    entropy_loss       | -1.01    |
|    explained_variance | -0.163   |
|    learning_rate      | 0.0007   |
|    n_updates          | 3599     |
|    policy_loss        | 0.0182   |
|    value_loss         | 0.000499 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 0.143    |
| time/                 |          |
|    fps                | 1044     |
|    iterations         | 3700     |
|    time_elapsed       | 17       |
|    total_timesteps    | 18500    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.772    |
| time/                 |           |
|    fps                | 1044      |
|    iterations         | 4900      |
|    time_elapsed       | 23        |
|    total_timesteps    | 24500     |
| train/                |           |
|    entropy_loss       | -0.392    |
|    explained_variance | 0.0386    |
|    learning_rate      | 0.0007    |
|    n_updates          | 4899      |
|    policy_loss        | -0.000391 |
|    value_loss         | 6.91e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.772   |
| time/                 |          |
|    fps                | 1044     |
|    iterations         | 5000     |
|    time_elapsed       | 23       |
|    total_timesteps    | 25000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.772    |
| time/                 |           |
|    fps                | 1043      |
|    iterations         | 6200      |
|    time_elapsed       | 29        |
|    total_timesteps    | 31000     |
| train/                |           |
|    entropy_loss       | -0.0615   |
|    explained_variance | -96       |
|    learning_rate      | 0.0007    |
|    n_updates          | 6199      |
|    policy_loss        | -2.88e-06 |
|    value_loss         | 3.54e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.772    |
| time/                 |           |
|    fps                | 1043      |
|    iterations         | 6300      |
|    time_elapsed       | 30        |
|    total_timesteps    | 31500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -1.11    |
| time/                 |          |
|    fps                | 1043     |
|    iterations         | 7500     |
|    time_elapsed       | 35       |
|    total_timesteps    | 37500    |
| train/                |          |
|    entropy_loss       | -0.0158  |
|    explained_variance | -0.235   |
|    learning_rate      | 0.0007   |
|    n_updates          | 7499     |
|    policy_loss        | 6.47e-07 |
|    value_loss         | 1.86e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -1.11    |
| time/                 |          |
|    fps                | 1043     |
|    iterations         | 7600     |
|    time_elapsed       | 36       |
|    total_timesteps    | 38000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -1.19     |
| time/                 |           |
|    fps                | 1042      |
|    iterations         | 8800      |
|    time_elapsed       | 42        |
|    total_timesteps    | 44000     |
| train/                |           |
|    entropy_loss       | -0.0074   |
|    explained_variance | -0.953    |
|    learning_rate      | 0.0007    |
|    n_updates          | 8799      |
|    policy_loss        | -6.21e-06 |
|    value_loss         | 4.73e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -1.19     |
| time/                 |           |
|    fps                | 1042      |
|    iterations         | 8900      |
|    time_elapsed       | 42        |
|    total_timesteps    | 44500     |
| train/    

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 2009 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1690        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007148348 |
|    clip_fraction        | 0.0494      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -2.33       |
|   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | -0.163      |
| time/                   |             |
|    fps                  | 1431        |
|    iterations           | 11          |
|    time_elapsed         | 15          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.005635538 |
|    clip_fraction        | 0.0405      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.992      |
|    explained_variance   | -0.307      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0201     |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.00936    |
|    value_loss           | 0.0015      |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.93

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 7.93e+03   |
|    ep_rew_mean          | 3.08       |
| time/                   |            |
|    fps                  | 1409       |
|    iterations           | 21         |
|    time_elapsed         | 30         |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.00735286 |
|    clip_fraction        | 0.087      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.917     |
|    explained_variance   | -0.324     |
|    learning_rate        | 0.0003     |
|    loss                 | -0.00474   |
|    n_updates            | 200        |
|    policy_gradient_loss | -0.0063    |
|    value_loss           | 0.00368    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_m

------------------------------------
| time/                 |          |
|    fps                | 987      |
|    iterations         | 800      |
|    time_elapsed       | 4        |
|    total_timesteps    | 4000     |
| train/                |          |
|    entropy_loss       | -0.7     |
|    explained_variance | -0.647   |
|    learning_rate      | 0.0007   |
|    n_updates          | 799      |
|    policy_loss        | 0.0113   |
|    value_loss         | 0.000362 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 987      |
|    iterations         | 900      |
|    time_elapsed       | 4        |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -0.749   |
|    explained_variance | 0.338    |
|    learning_rate      | 0.0007   |
|    n_updates          | 899      |
|    policy_loss        | 0.0381   |
|    value_loss         | 0.00281  |
-

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.465   |
| time/                 |          |
|    fps                | 974      |
|    iterations         | 2300     |
|    time_elapsed       | 11       |
|    total_timesteps    | 11500    |
| train/                |          |
|    entropy_loss       | -0.628   |
|    explained_variance | -1.45    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2299     |
|    policy_loss        | 0.0015   |
|    value_loss         | 7.54e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.465   |
| time/                 |          |
|    fps                | 971      |
|    iterations         | 2400     |
|    time_elapsed       | 12       |
|    total_timesteps    | 12000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.373    |
| time/                 |           |
|    fps                | 955       |
|    iterations         | 3600      |
|    time_elapsed       | 18        |
|    total_timesteps    | 18000     |
| train/                |           |
|    entropy_loss       | -0.378    |
|    explained_variance | -1.4e+03  |
|    learning_rate      | 0.0007    |
|    n_updates          | 3599      |
|    policy_loss        | -0.000207 |
|    value_loss         | 3.13e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.373   |
| time/                 |          |
|    fps                | 956      |
|    iterations         | 3700     |
|    time_elapsed       | 19       |
|    total_timesteps    | 18500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.169   |
| time/                 |          |
|    fps                | 889      |
|    iterations         | 4900     |
|    time_elapsed       | 27       |
|    total_timesteps    | 24500    |
| train/                |          |
|    entropy_loss       | -0.183   |
|    explained_variance | -7.44    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4899     |
|    policy_loss        | 0.000108 |
|    value_loss         | 8.3e-06  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.169    |
| time/                 |           |
|    fps                | 890       |
|    iterations         | 5000      |
|    time_elapsed       | 28        |
|    total_timesteps    | 25000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.169   |
| time/                 |          |
|    fps                | 906      |
|    iterations         | 6200     |
|    time_elapsed       | 34       |
|    total_timesteps    | 31000    |
| train/                |          |
|    entropy_loss       | -0.415   |
|    explained_variance | -38.9    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6199     |
|    policy_loss        | 0.000951 |
|    value_loss         | 3.75e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.169    |
| time/                 |           |
|    fps                | 907       |
|    iterations         | 6300      |
|    time_elapsed       | 34        |
|    total_timesteps    | 31500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 0.141    |
| time/                 |          |
|    fps                | 920      |
|    iterations         | 7500     |
|    time_elapsed       | 40       |
|    total_timesteps    | 37500    |
| train/                |          |
|    entropy_loss       | -0.758   |
|    explained_variance | 0.00719  |
|    learning_rate      | 0.0007   |
|    n_updates          | 7499     |
|    policy_loss        | -0.0447  |
|    value_loss         | 0.00239  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 0.141    |
| time/                 |          |
|    fps                | 919      |
|    iterations         | 7600     |
|    time_elapsed       | 41       |
|    total_timesteps    | 38000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 0.302    |
| time/                 |          |
|    fps                | 915      |
|    iterations         | 8800     |
|    time_elapsed       | 48       |
|    total_timesteps    | 44000    |
| train/                |          |
|    entropy_loss       | -0.735   |
|    explained_variance | -223     |
|    learning_rate      | 0.0007   |
|    n_updates          | 8799     |
|    policy_loss        | 0.00201  |
|    value_loss         | 6.49e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | 0.302     |
| time/                 |           |
|    fps                | 916       |
|    iterations         | 8900      |
|    time_elapsed       | 48        |
|    total_timesteps    | 44500     |
| train/                |    

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 2002 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1675        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008269371 |
|    clip_fraction        | 0.0669      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -6.16       |
|   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 1.08        |
| time/                   |             |
|    fps                  | 1436        |
|    iterations           | 11          |
|    time_elapsed         | 15          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.007726073 |
|    clip_fraction        | 0.0416      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.973      |
|    explained_variance   | -0.392      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.027      |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.0104     |
|    value_loss           | 0.00118     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+03    |
|    ep_rew_mean          | 3.09        |
| time/                   |             |
|    fps                  | 1399        |
|    iterations           | 21          |
|    time_elapsed         | 30          |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007927672 |
|    clip_fraction        | 0.0975      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.91       |
|    explained_variance   | -0.0558     |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00871    |
|    n_updates            | 200         |
|    policy_gradient_loss | -0.00698    |
|    value_loss           | 0.00355     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.93e+

------------------------------------
| time/                 |          |
|    fps                | 1006     |
|    iterations         | 800      |
|    time_elapsed       | 3        |
|    total_timesteps    | 4000     |
| train/                |          |
|    entropy_loss       | -1.04    |
|    explained_variance | -14.8    |
|    learning_rate      | 0.0007   |
|    n_updates          | 799      |
|    policy_loss        | 0.0323   |
|    value_loss         | 0.00184  |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1002     |
|    iterations         | 900      |
|    time_elapsed       | 4        |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -1.07    |
|    explained_variance | -2.77    |
|    learning_rate      | 0.0007   |
|    n_updates          | 899      |
|    policy_loss        | -0.0645  |
|    value_loss         | 0.00552  |
-

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 1.85     |
| time/                 |          |
|    fps                | 923      |
|    iterations         | 2300     |
|    time_elapsed       | 12       |
|    total_timesteps    | 11500    |
| train/                |          |
|    entropy_loss       | -0.923   |
|    explained_variance | -112     |
|    learning_rate      | 0.0007   |
|    n_updates          | 2299     |
|    policy_loss        | -0.0183  |
|    value_loss         | 0.000434 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 1.85     |
| time/                 |          |
|    fps                | 916      |
|    iterations         | 2400     |
|    time_elapsed       | 13       |
|    total_timesteps    | 12000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 1.85     |
| time/                 |          |
|    fps                | 921      |
|    iterations         | 3600     |
|    time_elapsed       | 19       |
|    total_timesteps    | 18000    |
| train/                |          |
|    entropy_loss       | -0.769   |
|    explained_variance | -8.02    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3599     |
|    policy_loss        | 0.00442  |
|    value_loss         | 2.86e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 1.85     |
| time/                 |          |
|    fps                | 922      |
|    iterations         | 3700     |
|    time_elapsed       | 20       |
|    total_timesteps    | 18500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 0.329    |
| time/                 |          |
|    fps                | 931      |
|    iterations         | 5000     |
|    time_elapsed       | 26       |
|    total_timesteps    | 25000    |
| train/                |          |
|    entropy_loss       | -0.723   |
|    explained_variance | -6.57    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4999     |
|    policy_loss        | -0.00338 |
|    value_loss         | 1.09e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 0.329    |
| time/                 |          |
|    fps                | 932      |
|    iterations         | 5100     |
|    time_elapsed       | 27       |
|    total_timesteps    | 25500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | 0.329    |
| time/                 |          |
|    fps                | 934      |
|    iterations         | 6300     |
|    time_elapsed       | 33       |
|    total_timesteps    | 31500    |
| train/                |          |
|    entropy_loss       | -0.672   |
|    explained_variance | 0.42     |
|    learning_rate      | 0.0007   |
|    n_updates          | 6299     |
|    policy_loss        | 0.000296 |
|    value_loss         | 1.88e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.248    |
| time/                 |           |
|    fps                | 936       |
|    iterations         | 6400      |
|    time_elapsed       | 34        |
|    total_timesteps    | 32000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.248   |
| time/                 |          |
|    fps                | 936      |
|    iterations         | 7600     |
|    time_elapsed       | 40       |
|    total_timesteps    | 38000    |
| train/                |          |
|    entropy_loss       | -0.318   |
|    explained_variance | -6.86    |
|    learning_rate      | 0.0007   |
|    n_updates          | 7599     |
|    policy_loss        | 8e-05    |
|    value_loss         | 1.2e-08  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.248   |
| time/                 |          |
|    fps                | 936      |
|    iterations         | 7700     |
|    time_elapsed       | 41       |
|    total_timesteps    | 38500    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 7.93e+03  |
|    ep_rew_mean        | -0.376    |
| time/                 |           |
|    fps                | 936       |
|    iterations         | 8900      |
|    time_elapsed       | 47        |
|    total_timesteps    | 44500     |
| train/                |           |
|    entropy_loss       | -0.648    |
|    explained_variance | -0.175    |
|    learning_rate      | 0.0007    |
|    n_updates          | 8899      |
|    policy_loss        | -0.000909 |
|    value_loss         | 8.98e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 7.93e+03 |
|    ep_rew_mean        | -0.376   |
| time/                 |          |
|    fps                | 936      |
|    iterations         | 9000     |
|    time_elapsed       | 48       |
|    total_timesteps    | 45000    |
| train/             