In [1]:
#multiagent ppo
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment for Training
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.entry_price = 0
        self.trades = []
        self.scaler = scaler

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)
        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, **kwargs):
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        return self._get_observation(), {}

    def _get_observation(self):
        return self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']
        if action == 1:  # Buy
            if self.position == 0:
                self.position = 1
                self.entry_price = current_price
            elif self.position == -1:  # Close short
                reward = self.entry_price - current_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0
        elif action == 2:  # Sell
            if self.position == 0:
                self.position = -1
                self.entry_price = current_price
            elif self.position == 1:  # Close long
                reward = current_price - self.entry_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

# Mixed Multi-Agent Testing Environment
class MixedMultiAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(MixedMultiAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.scaler = scaler

        # Two teams with two agents each
        self.num_teams = 2
        self.agents_per_team = 2
        self.num_agents = self.num_teams * self.agents_per_team

        # Initialize agent balances, positions, and trades
        self.balances = [initial_balance] * self.num_agents
        self.positions = [0] * self.num_agents
        self.entry_prices = [0] * self.num_agents
        self.trades = [[] for _ in range(self.num_agents)]

    def reset(self, **kwargs):
        self.current_step = 0
        self.balances = [self.initial_balance] * self.num_agents
        self.positions = [0] * self.num_agents
        self.entry_prices = [0] * self.num_agents
        self.trades = [[] for _ in range(self.num_agents)]
        return [self._get_observation() for _ in range(self.num_agents)], {}

    def _get_observation(self):
        return self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)

    def step(self, actions):
        rewards = [0] * self.num_agents
        current_price = self.data.iloc[self.current_step]['close']
        for i, action in enumerate(actions):
            if action == 1:  # Buy
                if self.positions[i] == 0:
                    self.positions[i] = 1
                    self.entry_prices[i] = current_price
                elif self.positions[i] == -1:  # Close short
                    reward = self.entry_prices[i] - current_price
                    self.balances[i] += reward
                    rewards[i] = reward
                    self.positions[i] = 0
                    self.trades[i].append(reward)
            elif action == 2:  # Sell
                if self.positions[i] == 0:
                    self.positions[i] = -1
                    self.entry_prices[i] = current_price
                elif self.positions[i] == 1:  # Close long
                    reward = current_price - self.entry_prices[i]
                    self.balances[i] += reward
                    rewards[i] = reward
                    self.positions[i] = 0
                    self.trades[i].append(reward)

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return [self._get_observation() for _ in range(self.num_agents)], rewards, terminated, truncated, {}

# Calculate individual metrics
def calculate_metrics(trades, initial_balance, final_balance):
    total_profit = final_balance - initial_balance
    cumulative_return = total_profit / initial_balance
    
    # Calculate positive and negative trades for profit factor
    positive_trades = [trade for trade in trades if trade > 0]
    negative_trades = [trade for trade in trades if trade < 0]
    profit_factor = sum(positive_trades) / abs(sum(negative_trades)) if negative_trades else float('inf')

    win_rate = len(positive_trades) / len(trades) if trades else 0

    # Calculate Sharpe Ratio
    sharpe_ratio = np.mean(trades) / np.std(trades) if np.std(trades) != 0 else 0

    # Sortino Ratio (uses only negative trades as downside deviation)
    downside_std = np.std([trade for trade in trades if trade < 0])
    sortino_ratio = np.mean(trades) / downside_std if downside_std != 0 else 0

    # Maximum Drawdown
    cumulative_balance = np.cumsum(trades)
    running_max = np.maximum.accumulate(cumulative_balance)
    drawdown = running_max - cumulative_balance
    max_drawdown = np.max(drawdown) if len(drawdown) > 0 else 0

    return {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }

# Sharpe Ratio-weighted aggregation
def aggregate_metrics_sharpe_weighted(metrics_list):
    # Filter out agents with non-positive Sharpe Ratios
    positive_sharpe_metrics = [m for m in metrics_list if m["Sharpe Ratio"] > 0]
    total_sharpe = sum(m["Sharpe Ratio"] for m in positive_sharpe_metrics)
    
    # If no agents have a positive Sharpe Ratio, return zeros for all metrics
    if total_sharpe == 0:
        return {metric: 0 for metric in metrics_list[0]}
    
    combined_metrics = {
        "Total Profit": sum(m["Total Profit"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Cumulative Return": sum(m["Cumulative Return"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Win Rate": sum(m["Win Rate"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Profit Factor": sum(m["Profit Factor"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Sharpe Ratio": sum(m["Sharpe Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Sortino Ratio": sum(m["Sortino Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
        "Maximum Drawdown": sum(m["Maximum Drawdown"] * m["Sharpe Ratio"] / total_sharpe for m in positive_sharpe_metrics),
    }
    return combined_metrics

# Train and evaluate each agent
def train_and_evaluate():
    train_file = 'MSFT_TRAINING.csv'
    test_file = 'MSFT_TESTING.csv'
    df_train_normalized, df_test_normalized, scaler = load_and_normalize_data(train_file, test_file)

    # Track training metrics
    training_metrics = []

    # Train each agent independently in single-agent environments
    models = []
    for i in range(4):  # 4 agents
        env_train = SingleAgentEnv(df_train_normalized, window_size=10, scaler=scaler)
        model = PPO("MlpPolicy", env_train, verbose=1)
        model.learn(total_timesteps=50000)
        models.append(model)

        # Calculate training metrics for each agent
        final_balance = env_train.balance
        metrics = calculate_metrics(env_train.trades, env_train.initial_balance, final_balance)
        training_metrics.append(metrics)
        print(f"\n--- Agent {i+1} Training Metrics ---")
        for metric, value in metrics.items():
            print(f"{metric}: {value}")

    # Aggregate training metrics with Sharpe Ratio weighting
    combined_training_metrics = aggregate_metrics_sharpe_weighted(training_metrics)
    print("\n=== Combined Training Metrics for All Agents (Sharpe Ratio Weighted) ===")
    for metric, value in combined_training_metrics.items():
        print(f"{metric}: {value}")

    # Test in the mixed multi-agent environment
    env_test = MixedMultiAgentEnv(df_test_normalized, window_size=10, scaler=scaler)
    obs, _ = env_test.reset()
    done = False
    while not done:
        actions = [model.predict(obs[i])[0] for i, model in enumerate(models)]
        obs, rewards, done, truncated, _ = env_test.step(actions)

    # Calculate and display testing metrics for each agent in the testing environment
    testing_metrics = []
    for i in range(4):  # 4 agents
        final_balance = env_test.balances[i]
        metrics = calculate_metrics(env_test.trades[i], env_test.initial_balance, final_balance)
        testing_metrics.append(metrics)
        print(f"\n--- Agent {i+1} Testing Metrics ---")
        for metric, value in metrics.items():
            print(f"{metric}: {value}")

    # Aggregate testing metrics with Sharpe Ratio weighting
    combined_testing_metrics = aggregate_metrics_sharpe_weighted(testing_metrics)
    print("\n=== Combined Testing Metrics for All Agents (Sharpe Ratio Weighted) ===")
    for metric, value in combined_testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 920  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 873         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014197102 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -10.2       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00769    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0104     |
|    value_loss         

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 1.22        |
| time/                   |             |
|    fps                  | 812         |
|    iterations           | 12          |
|    time_elapsed         | 30          |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.008090118 |
|    clip_fraction        | 0.0771      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.968      |
|    explained_variance   | -0.131      |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00719     |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00998    |
|    value_loss           | 0.00185     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 5.33        |
| time/                   |             |
|    fps                  | 825         |
|    iterations           | 22          |
|    time_elapsed         | 54          |
|    total_timesteps      | 45056       |
| train/                  |             |
|    approx_kl            | 0.009754185 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.783      |
|    explained_variance   | 0.0556      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.000346   |
|    n_updates            | 210         |
|    policy_gradient_loss | -0.0108     |
|    value_loss           | 0.00355     |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05e+03     |
|    ep_rew_mean          | -3.11        |
| time/                   |              |
|    fps                  | 941          |
|    iterations           | 7            |
|    time_elapsed         | 15           |
|    total_timesteps      | 14336        |
| train/                  |              |
|    approx_kl            | 0.0077586262 |
|    clip_fraction        | 0.0637       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.03        |
|    explained_variance   | -0.461       |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0102      |
|    n_updates            | 60           |
|    policy_gradient_loss | -0.00707     |
|    value_loss           | 0.00212      |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 3           |
| time/                   |             |
|    fps                  | 903         |
|    iterations           | 17          |
|    time_elapsed         | 38          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.011588339 |
|    clip_fraction        | 0.127       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.911      |
|    explained_variance   | 0.0183      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0178     |
|    n_updates            | 160         |
|    policy_gradient_loss | -0.00803    |
|    value_loss           | 0.00263     |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05

-----------------------------
| time/              |      |
|    fps             | 1384 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 930         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009838453 |
|    clip_fraction        | 0.0563      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -13.4       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0288     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0101     |
|    value_loss           | 0.0105      |
-----------------------------------------
----------------------------------

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 1.99        |
| time/                   |             |
|    fps                  | 794         |
|    iterations           | 12          |
|    time_elapsed         | 30          |
|    total_timesteps      | 24576       |
| train/                  |             |
|    approx_kl            | 0.006279283 |
|    clip_fraction        | 0.0597      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.05       |
|    explained_variance   | -0.206      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00926    |
|    n_updates            | 110         |
|    policy_gradient_loss | -0.00864    |
|    value_loss           | 0.00155     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 8.05e+03   |
|    ep_rew_mean          | 7.22       |
| time/                   |            |
|    fps                  | 769        |
|    iterations           | 22         |
|    time_elapsed         | 58         |
|    total_timesteps      | 45056      |
| train/                  |            |
|    approx_kl            | 0.01099412 |
|    clip_fraction        | 0.101      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.867     |
|    explained_variance   | 0.273      |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0162     |
|    n_updates            | 210        |
|    policy_gradient_loss | -0.0101    |
|    value_loss           | 0.00358    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05e+03     |
|    ep_rew_mean          | -0.31        |
| time/                   |              |
|    fps                  | 797          |
|    iterations           | 7            |
|    time_elapsed         | 17           |
|    total_timesteps      | 14336        |
| train/                  |              |
|    approx_kl            | 0.0064631384 |
|    clip_fraction        | 0.0473       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.05        |
|    explained_variance   | -0.116       |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0436      |
|    n_updates            | 60           |
|    policy_gradient_loss | -0.00726     |
|    value_loss           | 0.00231      |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05e+03     |
|    ep_rew_mean          | 4.36         |
| time/                   |              |
|    fps                  | 745          |
|    iterations           | 17           |
|    time_elapsed         | 46           |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0073165507 |
|    clip_fraction        | 0.103        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.959       |
|    explained_variance   | 0.202        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0125      |
|    n_updates            | 160          |
|    policy_gradient_loss | -0.00492     |
|    value_loss           | 0.00239      |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m


--- Agent 1 Testing Metrics ---
Total Profit: 23.84214233815146
Cumulative Return: 0.002384214233815146
Win Rate: 0.7569060773480663
Profit Factor: 9.980678894283669
Sharpe Ratio: 0.6103618292865464
Sortino Ratio: 2.3905926564949347
Maximum Drawdown: 0.10298789715212564

--- Agent 2 Testing Metrics ---
Total Profit: 21.55795417815898
Cumulative Return: 0.002155795417815898
Win Rate: 0.6831683168316832
Profit Factor: 6.124384472076316
Sharpe Ratio: 0.47902340783062525
Sortino Ratio: 1.576620673195984
Maximum Drawdown: 0.17116549716514928

--- Agent 3 Testing Metrics ---
Total Profit: 23.56102610860944
Cumulative Return: 0.002356102610860944
Win Rate: 0.7068493150684931
Profit Factor: 7.292530094020669
Sharpe Ratio: 0.4962467354387061
Sortino Ratio: 1.4876400470535838
Maximum Drawdown: 0.23180034892912182

--- Agent 4 Testing Metrics ---
Total Profit: 21.096384145637785
Cumulative Return: 0.0021096384145637783
Win Rate: 0.6937716262975778
Profit Factor: 6.609494983667396
Sharpe Ratio: 0

In [2]:
#multiagent DQN
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import DQN
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.scaler = scaler

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        return self._get_observation(), {}

    def _get_observation(self):
        return self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']

        if action == 1:  # Buy
            if self.position == 0:
                self.position = 1
                self.entry_price = current_price
            elif self.position == -1:
                reward = self.entry_price - current_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        elif action == 2:  # Sell
            if self.position == 0:
                self.position = -1
                self.entry_price = current_price
            elif self.position == 1:
                reward = current_price - self.entry_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

# Multi-Agent Trading Environment
class MultiAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None, num_agents=4):
        super(MultiAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.scaler = scaler
        self.num_agents = num_agents
        self.agents = [SingleAgentEnv(data, window_size, initial_balance, scaler) for _ in range(num_agents)]

    def reset(self):
        obs = []
        for agent in self.agents:
            agent_obs, _ = agent.reset()
            obs.append(agent_obs)
        return obs

    def step(self, actions):
        obs, rewards, terminated, truncated, infos = [], [], [], [], []
        for agent, action in zip(self.agents, actions):
            agent_obs, reward, done, truncate, info = agent.step(action)
            obs.append(agent_obs)
            rewards.append(reward)
            terminated.append(done)
            truncated.append(truncate)
            infos.append(info)
        return obs, rewards, any(terminated), any(truncated), infos

# Function to calculate metrics for each agent
def calculate_metrics(trades, initial_balance, final_balance):
    total_profit = final_balance - initial_balance
    cumulative_return = (final_balance - initial_balance) / initial_balance
    win_rate = len([trade for trade in trades if trade > 0]) / len(trades) if trades else 0
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf
    returns = np.array(trades)
    sharpe_ratio = np.mean(returns) / np.std(returns) if np.std(returns) != 0 else 0
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = np.mean(returns) / downside_std if downside_std != 0 else 0
    max_drawdown = np.max(np.maximum.accumulate(np.cumsum(trades)) - np.cumsum(trades)) if trades else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Sharpe Ratio-weighted aggregation for combined metrics
def aggregate_metrics_sharpe_weighted(metrics_list):
    positive_metrics = [m for m in metrics_list if m["Sharpe Ratio"] > 0]
    total_sharpe = sum(m["Sharpe Ratio"] for m in positive_metrics)
    if total_sharpe == 0:
        return {metric: 0 for metric in metrics_list[0]}
    
    combined_metrics = {
        "Total Profit": sum(m["Total Profit"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Cumulative Return": sum(m["Cumulative Return"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Win Rate": sum(m["Win Rate"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Profit Factor": sum(m["Profit Factor"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sharpe Ratio": sum(m["Sharpe Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sortino Ratio": sum(m["Sortino Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Maximum Drawdown": sum(m["Maximum Drawdown"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
    }
    return combined_metrics

# Train and evaluate each agent
def train_and_evaluate():
    train_file = 'MSFT_TRAINING.csv'
    test_file = 'MSFT_TESTING.csv'
    df_train, df_test, scaler = load_and_normalize_data(train_file, test_file)

    training_metrics = []
    models = []

    # Train each agent independently
    for i in range(4):  # 4 agents
        env_train = SingleAgentEnv(df_train, window_size=10, scaler=scaler)
        model = DQN("MlpPolicy", env_train, verbose=1)
        model.learn(total_timesteps=50000)
        models.append(model)

        # Record training metrics
        training_metrics.append(calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance))

    combined_training_metrics = aggregate_metrics_sharpe_weighted(training_metrics)
    print("\n=== Combined Training Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_training_metrics.items():
        print(f"{metric}: {value}")

    # Test in the multi-agent environment
    env_test = MultiAgentEnv(df_test, window_size=10, scaler=scaler, num_agents=4)
    obs = env_test.reset()
    done = False
    while not done:
        actions = [model.predict(obs[i])[0] for i, model in enumerate(models)]
        obs, rewards, done, truncated, _ = env_test.step(actions)

    # Calculate and display testing metrics
    testing_metrics = []
    for agent in env_test.agents:
        testing_metrics.append(calculate_metrics(agent.trades, agent.initial_balance, agent.balance))

    combined_testing_metrics = aggregate_metrics_sharpe_weighted(testing_metrics)
    print("\n=== Combined Testing Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.05e+03 |
|    ep_rew_mean      | -2.11    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 920      |
|    time_elapsed     | 34       |
|    total_timesteps  | 32192    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000396 |
|    n_updates        | 8022     |
----------------------------------
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.05e+03 |
|    ep_rew_mean      | -0.249   |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 956      |
|    ti

In [3]:
#multiagent a2c
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import A2C
from sklearn.preprocessing import StandardScaler

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment with Modified Reward Structure
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.scaler = scaler

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Normalized stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        return self._get_observation(), {}

    def _get_observation(self):
        obs = self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)
        obs -= np.min(obs, axis=0)
        obs /= np.max(obs, axis=0) + 1e-8  # Normalizing to [0,1]
        return obs

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']

        if action == 1:  # Buy
            if self.position == 0:
                self.position = 1
                self.entry_price = current_price
            elif self.position == -1:
                reward = self.entry_price - current_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        elif action == 2:  # Sell
            if self.position == 0:
                self.position = -1
                self.entry_price = current_price
            elif self.position == 1:
                reward = current_price - self.entry_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        # Normalize reward and add transaction cost penalty
        reward = (reward / self.initial_balance) - 0.001  # Small penalty for holding a position

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), np.clip(reward, -1, 1), terminated, truncated, {}

# Multi-Agent Trading Environment
class MultiAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None, num_agents=4):
        super(MultiAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.scaler = scaler
        self.num_agents = num_agents
        self.agents = [SingleAgentEnv(data, window_size, initial_balance, scaler) for _ in range(num_agents)]

    def reset(self):
        obs = []
        for agent in self.agents:
            agent_obs, _ = agent.reset()
            obs.append(agent_obs)
        return obs

    def step(self, actions):
        obs, rewards, terminated, truncated, infos = [], [], [], [], []
        for agent, action in zip(self.agents, actions):
            agent_obs, reward, done, truncate, info = agent.step(action)
            obs.append(agent_obs)
            rewards.append(reward)
            terminated.append(done)
            truncated.append(truncate)
            infos.append(info)
        return obs, rewards, any(terminated), any(truncated), infos

# Function to calculate metrics for each agent
def calculate_metrics(trades, initial_balance, final_balance):
    total_profit = final_balance - initial_balance
    cumulative_return = (final_balance - initial_balance) / initial_balance
    win_rate = len([trade for trade in trades if trade > 0]) / len(trades) if trades else 0
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf
    returns = np.array(trades)
    sharpe_ratio = np.mean(returns) / np.std(returns) if np.std(returns) != 0 else 0
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = np.mean(returns) / downside_std if downside_std != 0 else 0
    max_drawdown = np.max(np.maximum.accumulate(np.cumsum(trades)) - np.cumsum(trades)) if trades else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Sharpe Ratio-weighted aggregation for combined metrics
def aggregate_metrics_sharpe_weighted(metrics_list):
    positive_metrics = [m for m in metrics_list if m["Sharpe Ratio"] > 0]
    total_sharpe = sum(m["Sharpe Ratio"] for m in positive_metrics)
    if total_sharpe == 0:
        return {metric: 0 for metric in metrics_list[0]}
    
    combined_metrics = {
        "Total Profit": sum(m["Total Profit"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Cumulative Return": sum(m["Cumulative Return"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Win Rate": sum(m["Win Rate"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Profit Factor": sum(m["Profit Factor"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sharpe Ratio": sum(m["Sharpe Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sortino Ratio": sum(m["Sortino Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Maximum Drawdown": sum(m["Maximum Drawdown"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
    }
    return combined_metrics

# Train and evaluate each agent
def train_and_evaluate():
    train_file = 'MSFT_TRAINING.csv'
    test_file = 'MSFT_TESTING.csv'
    df_train, df_test, scaler = load_and_normalize_data(train_file, test_file)

    training_metrics = []
    models = []

    # Train each agent independently
    for i in range(4):  # 4 agents
        env_train = SingleAgentEnv(df_train, window_size=10, scaler=scaler)
        model = A2C("MlpPolicy", env_train, verbose=1)
        model.learn(total_timesteps=100000)  # Increased timesteps
        models.append(model)

        # Record training metrics
        training_metrics.append(calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance))

    combined_training_metrics = aggregate_metrics_sharpe_weighted(training_metrics)
    print("\n=== Combined Training Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_training_metrics.items():
        print(f"{metric}: {value}")

    # Test in the multi-agent environment
    env_test = MultiAgentEnv(df_test, window_size=10, scaler=scaler, num_agents=4)
    obs = env_test.reset()
    done = False
    while not done:
        actions = [model.predict(obs[i])[0] for i, model in enumerate(models)]
        obs, rewards, done, truncated, _ = env_test.step(actions)

    # Calculate and display testing metrics
    testing_metrics = []
    for agent in env_test.agents:
        testing_metrics.append(calculate_metrics(agent.trades, agent.initial_balance, agent.balance))

    combined_testing_metrics = aggregate_metrics_sharpe_weighted(testing_metrics)
    print("\n=== Combined Testing Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| time/                 |          |
|    fps                | 683      |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.09    |
|    explained_variance | -505     |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0363  |
|    value_loss         | 0.0101   |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 746      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.1     |
|    explained_variance | -355     |
|    learning_rate      | 0.0007   |
|    n_updates    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 687       |
|    iterations         | 1700      |
|    time_elapsed       | 12        |
|    total_timesteps    | 8500      |
| train/                |           |
|    entropy_loss       | -0.455    |
|    explained_variance | -6.94e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1699      |
|    policy_loss        | 0.0041    |
|    value_loss         | 2.53e-05  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 692      |
|    iterations         | 1800     |
|    time_elapsed       | 12       |
|    total_timesteps    | 9000     |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 710       |
|    iterations         | 3000      |
|    time_elapsed       | 21        |
|    total_timesteps    | 15000     |
| train/                |           |
|    entropy_loss       | -0.691    |
|    explained_variance | -687      |
|    learning_rate      | 0.0007    |
|    n_updates          | 2999      |
|    policy_loss        | -0.000393 |
|    value_loss         | 8.11e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 713       |
|    iterations         | 3100      |
|    time_elapsed       | 21        |
|    total_timesteps    | 15500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 727      |
|    iterations         | 4300     |
|    time_elapsed       | 29       |
|    total_timesteps    | 21500    |
| train/                |          |
|    entropy_loss       | -0.408   |
|    explained_variance | -4.06    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4299     |
|    policy_loss        | 0.000606 |
|    value_loss         | 2.93e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 729       |
|    iterations         | 4400      |
|    time_elapsed       | 30        |
|    total_timesteps    | 22000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 748      |
|    iterations         | 5600     |
|    time_elapsed       | 37       |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -0.507   |
|    explained_variance | -7.17    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5599     |
|    policy_loss        | 0.00467  |
|    value_loss         | 4.4e-05  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 749      |
|    iterations         | 5700     |
|    time_elapsed       | 38       |
|    total_timesteps    | 28500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 760      |
|    iterations         | 6900     |
|    time_elapsed       | 45       |
|    total_timesteps    | 34500    |
| train/                |          |
|    entropy_loss       | -0.792   |
|    explained_variance | -5.5     |
|    learning_rate      | 0.0007   |
|    n_updates          | 6899     |
|    policy_loss        | -0.00121 |
|    value_loss         | 4.07e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 761       |
|    iterations         | 7000      |
|    time_elapsed       | 45        |
|    total_timesteps    | 35000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 767       |
|    iterations         | 8200      |
|    time_elapsed       | 53        |
|    total_timesteps    | 41000     |
| train/                |           |
|    entropy_loss       | -0.286    |
|    explained_variance | -210      |
|    learning_rate      | 0.0007    |
|    n_updates          | 8199      |
|    policy_loss        | -0.000243 |
|    value_loss         | 1.22e-05  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 768      |
|    iterations         | 8300     |
|    time_elapsed       | 54       |
|    total_timesteps    | 41500    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 768       |
|    iterations         | 9500      |
|    time_elapsed       | 61        |
|    total_timesteps    | 47500     |
| train/                |           |
|    entropy_loss       | -0.202    |
|    explained_variance | -9.18     |
|    learning_rate      | 0.0007    |
|    n_updates          | 9499      |
|    policy_loss        | -0.000179 |
|    value_loss         | 1.49e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 767       |
|    iterations         | 9600      |
|    time_elapsed       | 62        |
|    total_timesteps    | 48000     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 740       |
|    iterations         | 10800     |
|    time_elapsed       | 72        |
|    total_timesteps    | 54000     |
| train/                |           |
|    entropy_loss       | -0.0347   |
|    explained_variance | -1.64e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 10799     |
|    policy_loss        | -1.77e-06 |
|    value_loss         | 1.17e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 739      |
|    iterations         | 10900    |
|    time_elapsed       | 73       |
|    total_timesteps    | 54500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 744      |
|    iterations         | 12100    |
|    time_elapsed       | 81       |
|    total_timesteps    | 60500    |
| train/                |          |
|    entropy_loss       | -0.0081  |
|    explained_variance | -5.96    |
|    learning_rate      | 0.0007   |
|    n_updates          | 12099    |
|    policy_loss        | -1.3e-07 |
|    value_loss         | 2.52e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 743      |
|    iterations         | 12200    |
|    time_elapsed       | 81       |
|    total_timesteps    | 61000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 725       |
|    iterations         | 13400     |
|    time_elapsed       | 92        |
|    total_timesteps    | 67000     |
| train/                |           |
|    entropy_loss       | -0.00424  |
|    explained_variance | -18.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 13399     |
|    policy_loss        | -5.45e-08 |
|    value_loss         | 3.74e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 725      |
|    iterations         | 13500    |
|    time_elapsed       | 93       |
|    total_timesteps    | 67500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 715      |
|    iterations         | 14700    |
|    time_elapsed       | 102      |
|    total_timesteps    | 73500    |
| train/                |          |
|    entropy_loss       | -0.0154  |
|    explained_variance | -30.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 14699    |
|    policy_loss        | 1.79e-06 |
|    value_loss         | 2.92e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 712      |
|    iterations         | 14800    |
|    time_elapsed       | 103      |
|    total_timesteps    | 74000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 705      |
|    iterations         | 16000    |
|    time_elapsed       | 113      |
|    total_timesteps    | 80000    |
| train/                |          |
|    entropy_loss       | -0.0272  |
|    explained_variance | -23.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 15999    |
|    policy_loss        | 5.16e-06 |
|    value_loss         | 3.99e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 704      |
|    iterations         | 16100    |
|    time_elapsed       | 114      |
|    total_timesteps    | 80500    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 700       |
|    iterations         | 17300     |
|    time_elapsed       | 123       |
|    total_timesteps    | 86500     |
| train/                |           |
|    entropy_loss       | -0.0266   |
|    explained_variance | -462      |
|    learning_rate      | 0.0007    |
|    n_updates          | 17299     |
|    policy_loss        | -1.16e-05 |
|    value_loss         | 1.03e-05  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 700      |
|    iterations         | 17400    |
|    time_elapsed       | 124      |
|    total_timesteps    | 87000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 703       |
|    iterations         | 18600     |
|    time_elapsed       | 132       |
|    total_timesteps    | 93000     |
| train/                |           |
|    entropy_loss       | -0.0111   |
|    explained_variance | -1.14e+04 |
|    learning_rate      | 0.0007    |
|    n_updates          | 18599     |
|    policy_loss        | 2.15e-07  |
|    value_loss         | 1.27e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 704       |
|    iterations         | 18700     |
|    time_elapsed       | 132       |
|    total_timesteps    | 93500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 712      |
|    iterations         | 19900    |
|    time_elapsed       | 139      |
|    total_timesteps    | 99500    |
| train/                |          |
|    entropy_loss       | -0.00849 |
|    explained_variance | -1.96    |
|    learning_rate      | 0.0007   |
|    n_updates          | 19899    |
|    policy_loss        | 2.02e-06 |
|    value_loss         | 4.78e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 713       |
|    iterations         | 20000     |
|    time_elapsed       | 140       |
|    total_timesteps    | 100000    |
| train/                |    

------------------------------------
| time/                 |          |
|    fps                | 1012     |
|    iterations         | 1500     |
|    time_elapsed       | 7        |
|    total_timesteps    | 7500     |
| train/                |          |
|    entropy_loss       | -0.383   |
|    explained_variance | -2.9e+04 |
|    learning_rate      | 0.0007   |
|    n_updates          | 1499     |
|    policy_loss        | 0.000932 |
|    value_loss         | 0.000205 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1012      |
|    iterations         | 1600      |
|    time_elapsed       | 7         |
|    total_timesteps    | 8000      |
| train/                |           |
|    entropy_loss       | -0.351    |
|    explained_variance | -9.55     |
|    learning_rate      | 0.0007    |
|    n_updates          | 1599      |
|    policy_loss        | -0.000372 |
|    value_loss         | 

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 1013      |
|    iterations         | 2900      |
|    time_elapsed       | 14        |
|    total_timesteps    | 14500     |
| train/                |           |
|    entropy_loss       | -0.0993   |
|    explained_variance | -147      |
|    learning_rate      | 0.0007    |
|    n_updates          | 2899      |
|    policy_loss        | -0.000111 |
|    value_loss         | 6.29e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 1014      |
|    iterations         | 3000      |
|    time_elapsed       | 14        |
|    total_timesteps    | 15000     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 977       |
|    iterations         | 4200      |
|    time_elapsed       | 21        |
|    total_timesteps    | 21000     |
| train/                |           |
|    entropy_loss       | -0.0176   |
|    explained_variance | -120      |
|    learning_rate      | 0.0007    |
|    n_updates          | 4199      |
|    policy_loss        | -2.23e-07 |
|    value_loss         | 1.38e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 977       |
|    iterations         | 4300      |
|    time_elapsed       | 21        |
|    total_timesteps    | 21500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 979       |
|    iterations         | 5500      |
|    time_elapsed       | 28        |
|    total_timesteps    | 27500     |
| train/                |           |
|    entropy_loss       | -0.00577  |
|    explained_variance | -7.37     |
|    learning_rate      | 0.0007    |
|    n_updates          | 5499      |
|    policy_loss        | -5.25e-07 |
|    value_loss         | 7.62e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 979      |
|    iterations         | 5600     |
|    time_elapsed       | 28       |
|    total_timesteps    | 28000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 979       |
|    iterations         | 6800      |
|    time_elapsed       | 34        |
|    total_timesteps    | 34000     |
| train/                |           |
|    entropy_loss       | -0.0029   |
|    explained_variance | -72       |
|    learning_rate      | 0.0007    |
|    n_updates          | 6799      |
|    policy_loss        | -4.76e-07 |
|    value_loss         | 3.63e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 979       |
|    iterations         | 6900      |
|    time_elapsed       | 35        |
|    total_timesteps    | 34500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 979       |
|    iterations         | 8100      |
|    time_elapsed       | 41        |
|    total_timesteps    | 40500     |
| train/                |           |
|    entropy_loss       | -0.0033   |
|    explained_variance | -2.08e+04 |
|    learning_rate      | 0.0007    |
|    n_updates          | 8099      |
|    policy_loss        | 1.68e-06  |
|    value_loss         | 7.9e-05   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 979      |
|    iterations         | 8200     |
|    time_elapsed       | 41       |
|    total_timesteps    | 41000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 980      |
|    iterations         | 9400     |
|    time_elapsed       | 47       |
|    total_timesteps    | 47000    |
| train/                |          |
|    entropy_loss       | -0.0028  |
|    explained_variance | -31.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 9399     |
|    policy_loss        | 1.78e-07 |
|    value_loss         | 8.99e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 979      |
|    iterations         | 9500     |
|    time_elapsed       | 48       |
|    total_timesteps    | 47500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 978      |
|    iterations         | 10700    |
|    time_elapsed       | 54       |
|    total_timesteps    | 53500    |
| train/                |          |
|    entropy_loss       | -0.00252 |
|    explained_variance | -15.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 10699    |
|    policy_loss        | 3.95e-08 |
|    value_loss         | 3.49e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 978      |
|    iterations         | 10800    |
|    time_elapsed       | 55       |
|    total_timesteps    | 54000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 976       |
|    iterations         | 12000     |
|    time_elapsed       | 61        |
|    total_timesteps    | 60000     |
| train/                |           |
|    entropy_loss       | -0.000948 |
|    explained_variance | -93       |
|    learning_rate      | 0.0007    |
|    n_updates          | 11999     |
|    policy_loss        | 5.12e-08  |
|    value_loss         | 4.34e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 976       |
|    iterations         | 12100     |
|    time_elapsed       | 61        |
|    total_timesteps    | 60500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 967       |
|    iterations         | 13300     |
|    time_elapsed       | 68        |
|    total_timesteps    | 66500     |
| train/                |           |
|    entropy_loss       | -0.000555 |
|    explained_variance | -75.5     |
|    learning_rate      | 0.0007    |
|    n_updates          | 13299     |
|    policy_loss        | -4.4e-08  |
|    value_loss         | 1.46e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 967       |
|    iterations         | 13400     |
|    time_elapsed       | 69        |
|    total_timesteps    | 67000     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 964       |
|    iterations         | 14600     |
|    time_elapsed       | 75        |
|    total_timesteps    | 73000     |
| train/                |           |
|    entropy_loss       | -0.000606 |
|    explained_variance | -39.6     |
|    learning_rate      | 0.0007    |
|    n_updates          | 14599     |
|    policy_loss        | -1.33e-07 |
|    value_loss         | 8.05e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 964       |
|    iterations         | 14700     |
|    time_elapsed       | 76        |
|    total_timesteps    | 73500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 960      |
|    iterations         | 15900    |
|    time_elapsed       | 82       |
|    total_timesteps    | 79500    |
| train/                |          |
|    entropy_loss       | -0.00046 |
|    explained_variance | -30.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 15899    |
|    policy_loss        | 1.09e-08 |
|    value_loss         | 1.05e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 960       |
|    iterations         | 16000     |
|    time_elapsed       | 83        |
|    total_timesteps    | 80000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 957       |
|    iterations         | 17200     |
|    time_elapsed       | 89        |
|    total_timesteps    | 86000     |
| train/                |           |
|    entropy_loss       | -0.000445 |
|    explained_variance | -4.82     |
|    learning_rate      | 0.0007    |
|    n_updates          | 17199     |
|    policy_loss        | -1.32e-08 |
|    value_loss         | 2.2e-07   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 957       |
|    iterations         | 17300     |
|    time_elapsed       | 90        |
|    total_timesteps    | 86500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 951       |
|    iterations         | 18500     |
|    time_elapsed       | 97        |
|    total_timesteps    | 92500     |
| train/                |           |
|    entropy_loss       | -0.000433 |
|    explained_variance | -54.4     |
|    learning_rate      | 0.0007    |
|    n_updates          | 18499     |
|    policy_loss        | 1.65e-08  |
|    value_loss         | 3.8e-07   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 951       |
|    iterations         | 18600     |
|    time_elapsed       | 97        |
|    total_timesteps    | 93000     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 939       |
|    iterations         | 19800     |
|    time_elapsed       | 105       |
|    total_timesteps    | 99000     |
| train/                |           |
|    entropy_loss       | -0.000303 |
|    explained_variance | -3.34     |
|    learning_rate      | 0.0007    |
|    n_updates          | 19799     |
|    policy_loss        | -2.19e-08 |
|    value_loss         | 1.03e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 939       |
|    iterations         | 19900     |
|    time_elapsed       | 105       |
|    total_timesteps    | 99500     |
| train/    

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


-------------------------------------
| time/                 |           |
|    fps                | 878       |
|    iterations         | 100       |
|    time_elapsed       | 0         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.07     |
|    explained_variance | -9.24e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | 0.0346    |
|    value_loss         | 0.016     |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 914      |
|    iterations         | 200      |
|    time_elapsed       | 1        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -1.01    |
|    explained_variance | -371     |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | -0.0345  |
|    value_loss         

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 991       |
|    iterations         | 1700      |
|    time_elapsed       | 8         |
|    total_timesteps    | 8500      |
| train/                |           |
|    entropy_loss       | -0.828    |
|    explained_variance | -304      |
|    learning_rate      | 0.0007    |
|    n_updates          | 1699      |
|    policy_loss        | -0.000192 |
|    value_loss         | 4.3e-05   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 985       |
|    iterations         | 1800      |
|    time_elapsed       | 9         |
|    total_timesteps    | 9000      |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 986       |
|    iterations         | 3000      |
|    time_elapsed       | 15        |
|    total_timesteps    | 15000     |
| train/                |           |
|    entropy_loss       | -0.362    |
|    explained_variance | -25.8     |
|    learning_rate      | 0.0007    |
|    n_updates          | 2999      |
|    policy_loss        | -0.000544 |
|    value_loss         | 6.59e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 987       |
|    iterations         | 3100      |
|    time_elapsed       | 15        |
|    total_timesteps    | 15500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 993      |
|    iterations         | 4300     |
|    time_elapsed       | 21       |
|    total_timesteps    | 21500    |
| train/                |          |
|    entropy_loss       | -0.599   |
|    explained_variance | -444     |
|    learning_rate      | 0.0007   |
|    n_updates          | 4299     |
|    policy_loss        | 0.000146 |
|    value_loss         | 9.02e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 993      |
|    iterations         | 4400     |
|    time_elapsed       | 22       |
|    total_timesteps    | 22000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 997      |
|    iterations         | 5600     |
|    time_elapsed       | 28       |
|    total_timesteps    | 28000    |
| train/                |          |
|    entropy_loss       | -0.286   |
|    explained_variance | -81.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5599     |
|    policy_loss        | -0.00121 |
|    value_loss         | 3.22e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 997      |
|    iterations         | 5700     |
|    time_elapsed       | 28       |
|    total_timesteps    | 28500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 999      |
|    iterations         | 6900     |
|    time_elapsed       | 34       |
|    total_timesteps    | 34500    |
| train/                |          |
|    entropy_loss       | -0.116   |
|    explained_variance | -10.9    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6899     |
|    policy_loss        | 4.18e-05 |
|    value_loss         | 3.44e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 999       |
|    iterations         | 7000      |
|    time_elapsed       | 35        |
|    total_timesteps    | 35000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 993      |
|    iterations         | 8200     |
|    time_elapsed       | 41       |
|    total_timesteps    | 41000    |
| train/                |          |
|    entropy_loss       | -0.0111  |
|    explained_variance | -18.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 8199     |
|    policy_loss        | -4.2e-06 |
|    value_loss         | 1.09e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 993       |
|    iterations         | 8300      |
|    time_elapsed       | 41        |
|    total_timesteps    | 41500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 993      |
|    iterations         | 9500     |
|    time_elapsed       | 47       |
|    total_timesteps    | 47500    |
| train/                |          |
|    entropy_loss       | -0.0243  |
|    explained_variance | -15.1    |
|    learning_rate      | 0.0007   |
|    n_updates          | 9499     |
|    policy_loss        | 8.62e-06 |
|    value_loss         | 6.53e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 993      |
|    iterations         | 9600     |
|    time_elapsed       | 48       |
|    total_timesteps    | 48000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 994       |
|    iterations         | 10800     |
|    time_elapsed       | 54        |
|    total_timesteps    | 54000     |
| train/                |           |
|    entropy_loss       | -0.00528  |
|    explained_variance | -55.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 10799     |
|    policy_loss        | -1.18e-06 |
|    value_loss         | 5.12e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 994      |
|    iterations         | 10900    |
|    time_elapsed       | 54       |
|    total_timesteps    | 54500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 994      |
|    iterations         | 12100    |
|    time_elapsed       | 60       |
|    total_timesteps    | 60500    |
| train/                |          |
|    entropy_loss       | -0.00704 |
|    explained_variance | -101     |
|    learning_rate      | 0.0007   |
|    n_updates          | 12099    |
|    policy_loss        | 5.09e-07 |
|    value_loss         | 4.59e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 994      |
|    iterations         | 12200    |
|    time_elapsed       | 61       |
|    total_timesteps    | 61000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 995       |
|    iterations         | 13400     |
|    time_elapsed       | 67        |
|    total_timesteps    | 67000     |
| train/                |           |
|    entropy_loss       | -0.00435  |
|    explained_variance | -4.69     |
|    learning_rate      | 0.0007    |
|    n_updates          | 13399     |
|    policy_loss        | -1.89e-06 |
|    value_loss         | 1.94e-05  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 995       |
|    iterations         | 13500     |
|    time_elapsed       | 67        |
|    total_timesteps    | 67500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 995      |
|    iterations         | 14700    |
|    time_elapsed       | 73       |
|    total_timesteps    | 73500    |
| train/                |          |
|    entropy_loss       | -0.0049  |
|    explained_variance | -46      |
|    learning_rate      | 0.0007   |
|    n_updates          | 14699    |
|    policy_loss        | 3.19e-07 |
|    value_loss         | 4.29e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 995       |
|    iterations         | 14800     |
|    time_elapsed       | 74        |
|    total_timesteps    | 74000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 984       |
|    iterations         | 16000     |
|    time_elapsed       | 81        |
|    total_timesteps    | 80000     |
| train/                |           |
|    entropy_loss       | -0.00876  |
|    explained_variance | -579      |
|    learning_rate      | 0.0007    |
|    n_updates          | 15999     |
|    policy_loss        | -1.35e-07 |
|    value_loss         | 2.61e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 983      |
|    iterations         | 16100    |
|    time_elapsed       | 81       |
|    total_timesteps    | 80500    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 953       |
|    iterations         | 17300     |
|    time_elapsed       | 90        |
|    total_timesteps    | 86500     |
| train/                |           |
|    entropy_loss       | -0.035    |
|    explained_variance | -7.69     |
|    learning_rate      | 0.0007    |
|    n_updates          | 17299     |
|    policy_loss        | -6.58e-06 |
|    value_loss         | 1.8e-06   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 951      |
|    iterations         | 17400    |
|    time_elapsed       | 91       |
|    total_timesteps    | 87000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 916       |
|    iterations         | 18600     |
|    time_elapsed       | 101       |
|    total_timesteps    | 93000     |
| train/                |           |
|    entropy_loss       | -0.055    |
|    explained_variance | -1.91e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 18599     |
|    policy_loss        | 1.32e-05  |
|    value_loss         | 2.25e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 916       |
|    iterations         | 18700     |
|    time_elapsed       | 102       |
|    total_timesteps    | 93500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 910      |
|    iterations         | 19900    |
|    time_elapsed       | 109      |
|    total_timesteps    | 99500    |
| train/                |          |
|    entropy_loss       | -0.0205  |
|    explained_variance | -258     |
|    learning_rate      | 0.0007   |
|    n_updates          | 19899    |
|    policy_loss        | 2.49e-07 |
|    value_loss         | 1.09e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 910       |
|    iterations         | 20000     |
|    time_elapsed       | 109       |
|    total_timesteps    | 100000    |
| train/                |    

------------------------------------
| time/                 |          |
|    fps                | 998      |
|    iterations         | 1500     |
|    time_elapsed       | 7        |
|    total_timesteps    | 7500     |
| train/                |          |
|    entropy_loss       | -1.05    |
|    explained_variance | -225     |
|    learning_rate      | 0.0007   |
|    n_updates          | 1499     |
|    policy_loss        | -0.00544 |
|    value_loss         | 0.000118 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 998      |
|    iterations         | 1600     |
|    time_elapsed       | 8        |
|    total_timesteps    | 8000     |
| train/                |          |
|    entropy_loss       | -1.05    |
|    explained_variance | -38.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1599     |
|    policy_loss        | 0.00138  |
|    value_loss         | 1.36e-05 |
-

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 1001     |
|    iterations         | 2900     |
|    time_elapsed       | 14       |
|    total_timesteps    | 14500    |
| train/                |          |
|    entropy_loss       | -0.723   |
|    explained_variance | -12.3    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2899     |
|    policy_loss        | 0.00378  |
|    value_loss         | 8.78e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 1002     |
|    iterations         | 3000     |
|    time_elapsed       | 14       |
|    total_timesteps    | 15000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 993      |
|    iterations         | 4200     |
|    time_elapsed       | 21       |
|    total_timesteps    | 21000    |
| train/                |          |
|    entropy_loss       | -0.722   |
|    explained_variance | -20.8    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4199     |
|    policy_loss        | 0.000256 |
|    value_loss         | 1.5e-06  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 993       |
|    iterations         | 4300      |
|    time_elapsed       | 21        |
|    total_timesteps    | 21500     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 995       |
|    iterations         | 5500      |
|    time_elapsed       | 27        |
|    total_timesteps    | 27500     |
| train/                |           |
|    entropy_loss       | -0.342    |
|    explained_variance | -14.2     |
|    learning_rate      | 0.0007    |
|    n_updates          | 5499      |
|    policy_loss        | -0.000201 |
|    value_loss         | 3.94e-06  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 995      |
|    iterations         | 5600     |
|    time_elapsed       | 28       |
|    total_timesteps    | 28000    |
| train/             

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 996       |
|    iterations         | 6800      |
|    time_elapsed       | 34        |
|    total_timesteps    | 34000     |
| train/                |           |
|    entropy_loss       | -0.33     |
|    explained_variance | -1.9e+03  |
|    learning_rate      | 0.0007    |
|    n_updates          | 6799      |
|    policy_loss        | -7.96e-05 |
|    value_loss         | 7.45e-07  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 997       |
|    iterations         | 6900      |
|    time_elapsed       | 34        |
|    total_timesteps    | 34500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 998      |
|    iterations         | 8100     |
|    time_elapsed       | 40       |
|    total_timesteps    | 40500    |
| train/                |          |
|    entropy_loss       | -0.635   |
|    explained_variance | -178     |
|    learning_rate      | 0.0007   |
|    n_updates          | 8099     |
|    policy_loss        | 0.0273   |
|    value_loss         | 0.000198 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 998      |
|    iterations         | 8200     |
|    time_elapsed       | 41       |
|    total_timesteps    | 41000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 998      |
|    iterations         | 9400     |
|    time_elapsed       | 47       |
|    total_timesteps    | 47000    |
| train/                |          |
|    entropy_loss       | -0.873   |
|    explained_variance | -0.924   |
|    learning_rate      | 0.0007   |
|    n_updates          | 9399     |
|    policy_loss        | 0.000205 |
|    value_loss         | 1.02e-07 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 999      |
|    iterations         | 9500     |
|    time_elapsed       | 47       |
|    total_timesteps    | 47500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 997      |
|    iterations         | 10700    |
|    time_elapsed       | 53       |
|    total_timesteps    | 53500    |
| train/                |          |
|    entropy_loss       | -0.77    |
|    explained_variance | -0.253   |
|    learning_rate      | 0.0007   |
|    n_updates          | 10699    |
|    policy_loss        | -0.00158 |
|    value_loss         | 6.83e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 997       |
|    iterations         | 10800     |
|    time_elapsed       | 54        |
|    total_timesteps    | 54000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 997       |
|    iterations         | 12000     |
|    time_elapsed       | 60        |
|    total_timesteps    | 60000     |
| train/                |           |
|    entropy_loss       | -0.792    |
|    explained_variance | -42.3     |
|    learning_rate      | 0.0007    |
|    n_updates          | 11999     |
|    policy_loss        | -2.85e-05 |
|    value_loss         | 2.41e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 997      |
|    iterations         | 12100    |
|    time_elapsed       | 60       |
|    total_timesteps    | 60500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 995      |
|    iterations         | 13300    |
|    time_elapsed       | 66       |
|    total_timesteps    | 66500    |
| train/                |          |
|    entropy_loss       | -0.729   |
|    explained_variance | -170     |
|    learning_rate      | 0.0007   |
|    n_updates          | 13299    |
|    policy_loss        | 0.00285  |
|    value_loss         | 6.9e-06  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 995       |
|    iterations         | 13400     |
|    time_elapsed       | 67        |
|    total_timesteps    | 67000     |
| train/                |    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 995       |
|    iterations         | 14600     |
|    time_elapsed       | 73        |
|    total_timesteps    | 73000     |
| train/                |           |
|    entropy_loss       | -0.916    |
|    explained_variance | -6.96     |
|    learning_rate      | 0.0007    |
|    n_updates          | 14599     |
|    policy_loss        | -0.000269 |
|    value_loss         | 4.43e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 994       |
|    iterations         | 14700     |
|    time_elapsed       | 73        |
|    total_timesteps    | 73500     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 993       |
|    iterations         | 15900     |
|    time_elapsed       | 80        |
|    total_timesteps    | 79500     |
| train/                |           |
|    entropy_loss       | -0.512    |
|    explained_variance | -6.51     |
|    learning_rate      | 0.0007    |
|    n_updates          | 15899     |
|    policy_loss        | -0.000269 |
|    value_loss         | 6.68e-07  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 993      |
|    iterations         | 16000    |
|    time_elapsed       | 80       |
|    total_timesteps    | 80000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 991      |
|    iterations         | 17200    |
|    time_elapsed       | 86       |
|    total_timesteps    | 86000    |
| train/                |          |
|    entropy_loss       | -0.442   |
|    explained_variance | -9.82    |
|    learning_rate      | 0.0007   |
|    n_updates          | 17199    |
|    policy_loss        | 0.000434 |
|    value_loss         | 4.23e-06 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -8.05    |
| time/                 |          |
|    fps                | 989      |
|    iterations         | 17300    |
|    time_elapsed       | 87       |
|    total_timesteps    | 86500    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 984       |
|    iterations         | 18500     |
|    time_elapsed       | 93        |
|    total_timesteps    | 92500     |
| train/                |           |
|    entropy_loss       | -0.0394   |
|    explained_variance | -9.6      |
|    learning_rate      | 0.0007    |
|    n_updates          | 18499     |
|    policy_loss        | -9.92e-07 |
|    value_loss         | 2.93e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 984       |
|    iterations         | 18600     |
|    time_elapsed       | 94        |
|    total_timesteps    | 93000     |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 985       |
|    iterations         | 19800     |
|    time_elapsed       | 100       |
|    total_timesteps    | 99000     |
| train/                |           |
|    entropy_loss       | -0.0438   |
|    explained_variance | -2.07     |
|    learning_rate      | 0.0007    |
|    n_updates          | 19799     |
|    policy_loss        | -3.07e-07 |
|    value_loss         | 3.93e-08  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -8.05     |
| time/                 |           |
|    fps                | 985       |
|    iterations         | 19900     |
|    time_elapsed       | 100       |
|    total_timesteps    | 99500     |
| train/    

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [5]:
#multiagent ensemble
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO, DQN, A2C
from sklearn.preprocessing import StandardScaler
from collections import Counter

# Function to load and normalize data
def load_and_normalize_data(train_file, test_file):
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)

    df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])
    df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

    scaler = StandardScaler()
    columns_to_normalize = ['open', 'high', 'low', 'close', 'volume']

    df_train[columns_to_normalize] = scaler.fit_transform(df_train[columns_to_normalize])
    df_test[columns_to_normalize] = scaler.transform(df_test[columns_to_normalize])

    return df_train, df_test, scaler

# Single-Agent Trading Environment
class SingleAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None):
        super(SingleAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.current_step = 0
        self.initial_balance = initial_balance
        self.balance = initial_balance
        self.position = 0  # 0 = neutral, 1 = long, -1 = short
        self.trades = []
        self.entry_price = 0
        self.scaler = scaler

        # Action space: hold (0), buy (1), sell (2)
        self.action_space = spaces.Discrete(3)

        # Observation space: Stock prices (open, high, low, close, volume)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(window_size, 5), dtype=np.float32)

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = 0
        self.position = 0
        self.balance = self.initial_balance
        self.trades = []
        self.entry_price = 0
        return self._get_observation(), {}

    def _get_observation(self):
        return self.data.iloc[self.current_step:self.current_step + self.window_size][['open', 'high', 'low', 'close', 'volume']].values.astype(np.float32)

    def step(self, action):
        reward = 0
        current_price = self.data.iloc[self.current_step]['close']

        if action == 1:  # Buy
            if self.position == 0:
                self.position = 1
                self.entry_price = current_price
            elif self.position == -1:
                reward = self.entry_price - current_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        elif action == 2:  # Sell
            if self.position == 0:
                self.position = -1
                self.entry_price = current_price
            elif self.position == 1:
                reward = current_price - self.entry_price
                self.balance += reward
                self.trades.append(reward)
                self.position = 0

        self.current_step += 1
        terminated = self.current_step >= len(self.data) - self.window_size
        truncated = False
        return self._get_observation(), reward, terminated, truncated, {}

# Multi-Agent Trading Environment
class MultiAgentEnv(gym.Env):
    def __init__(self, data, window_size=10, initial_balance=10000, scaler=None, num_agents=4):
        super(MultiAgentEnv, self).__init__()
        self.data = data
        self.window_size = window_size
        self.initial_balance = initial_balance
        self.scaler = scaler
        self.num_agents = num_agents
        self.agents = [SingleAgentEnv(data, window_size, initial_balance, scaler) for _ in range(num_agents)]

    def reset(self):
        obs = []
        for agent in self.agents:
            agent_obs, _ = agent.reset()
            obs.append(agent_obs)
        return obs

    def step(self, actions):
        obs, rewards, terminated, truncated, infos = [], [], [], [], []
        for agent, action in zip(self.agents, actions):
            agent_obs, reward, done, truncate, info = agent.step(action)
            obs.append(agent_obs)
            rewards.append(reward)
            terminated.append(done)
            truncated.append(truncate)
            infos.append(info)
        return obs, rewards, any(terminated), any(truncated), infos

# Ensemble model function
def ensemble_predict(actions):
    actions = [int(action) for action in actions]
    action_counts = Counter(actions)
    return action_counts.most_common(1)[0][0]

# Function to calculate metrics for each agent
def calculate_metrics(trades, initial_balance, final_balance):
    total_profit = final_balance - initial_balance
    cumulative_return = (final_balance - initial_balance) / initial_balance
    win_rate = len([trade for trade in trades if trade > 0]) / len(trades) if trades else 0
    gross_profit = sum(trade for trade in trades if trade > 0)
    gross_loss = -sum(trade for trade in trades if trade < 0)
    profit_factor = gross_profit / gross_loss if gross_loss != 0 else np.inf
    returns = np.array(trades)
    sharpe_ratio = np.mean(returns) / np.std(returns) if np.std(returns) != 0 else 0
    downside_std = np.std([min(0, r) for r in returns])
    sortino_ratio = np.mean(returns) / downside_std if downside_std != 0 else 0
    max_drawdown = np.max(np.maximum.accumulate(np.cumsum(trades)) - np.cumsum(trades)) if trades else 0

    metrics = {
        "Total Profit": total_profit,
        "Cumulative Return": cumulative_return,
        "Win Rate": win_rate,
        "Profit Factor": profit_factor,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        "Maximum Drawdown": max_drawdown
    }
    return metrics

# Sharpe Ratio-weighted aggregation for combined metrics
def aggregate_metrics_sharpe_weighted(metrics_list):
    positive_metrics = [m for m in metrics_list if m["Sharpe Ratio"] > 0]
    total_sharpe = sum(m["Sharpe Ratio"] for m in positive_metrics)
    if total_sharpe == 0:
        return {metric: 0 for metric in metrics_list[0]}
    
    combined_metrics = {
        "Total Profit": sum(m["Total Profit"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Cumulative Return": sum(m["Cumulative Return"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Win Rate": sum(m["Win Rate"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Profit Factor": sum(m["Profit Factor"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sharpe Ratio": sum(m["Sharpe Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Sortino Ratio": sum(m["Sortino Ratio"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
        "Maximum Drawdown": sum(m["Maximum Drawdown"] * m["Sharpe Ratio"] / total_sharpe for m in positive_metrics),
    }
    return combined_metrics

# Train and evaluate each agent
def train_and_evaluate():
    train_file = 'MSFT_TRAINING.csv'
    test_file = 'MSFT_TESTING.csv'
    df_train, df_test, scaler = load_and_normalize_data(train_file, test_file)

    training_metrics = []
    ensemble_models = []

    # Train each agent independently
    for i in range(4):  # 4 agents
        env_train = SingleAgentEnv(df_train, window_size=10, scaler=scaler)

        # Initialize each model
        ppo_model = PPO("MlpPolicy", env_train, verbose=1)
        dqn_model = DQN("MlpPolicy", env_train, verbose=1)
        a2c_model = A2C("MlpPolicy", env_train, verbose=1)

        # Train each model
        ppo_model.learn(total_timesteps=50000)
        dqn_model.learn(total_timesteps=50000)
        a2c_model.learn(total_timesteps=50000)

        # Store trained models in a list
        ensemble_models.append((ppo_model, dqn_model, a2c_model))

        # Calculate training metrics
        training_metrics.append(calculate_metrics(env_train.trades, env_train.initial_balance, env_train.balance))

    combined_training_metrics = aggregate_metrics_sharpe_weighted(training_metrics)
    print("\n=== Combined Training Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_training_metrics.items():
        print(f"{metric}: {value}")

    # Test in the multi-agent environment
    env_test = MultiAgentEnv(df_test, window_size=10, scaler=scaler, num_agents=4)
    obs = env_test.reset()
    done = False
    while not done:
        actions = []
        for i, (ppo_model, dqn_model, a2c_model) in enumerate(ensemble_models):
            ppo_action, _ = ppo_model.predict(obs[i])
            dqn_action, _ = dqn_model.predict(obs[i])
            a2c_action, _ = a2c_model.predict(obs[i])
            final_action = ensemble_predict([ppo_action, dqn_action, a2c_action])
            actions.append(final_action)

        obs, rewards, done, truncated, _ = env_test.step(actions)

    # Calculate and display testing metrics
    testing_metrics = []
    for agent in env_test.agents:
        testing_metrics.append(calculate_metrics(agent.trades, agent.initial_balance, agent.balance))

    combined_testing_metrics = aggregate_metrics_sharpe_weighted(testing_metrics)
    print("\n=== Combined Testing Metrics (Sharpe Ratio Weighted) ===")
    for metric, value in combined_testing_metrics.items():
        print(f"{metric}: {value}")

# Run the training and evaluation
train_and_evaluate()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 2021 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1308        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014839342 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -11.5       |
|   

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 0.97        |
| time/                   |             |
|    fps                  | 1295        |
|    iterations           | 11          |
|    time_elapsed         | 17          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.008404506 |
|    clip_fraction        | 0.0618      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.06       |
|    explained_variance   | -0.344      |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0212     |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.011      |
|    value_loss           | 0.00264     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05e+03     |
|    ep_rew_mean          | 8.06         |
| time/                   |              |
|    fps                  | 1330         |
|    iterations           | 21           |
|    time_elapsed         | 32           |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0062535955 |
|    clip_fraction        | 0.104        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.877       |
|    explained_variance   | 0.227        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.00749     |
|    n_updates            | 200          |
|    policy_gradient_loss | -0.00187     |
|    value_loss           | 0.0033       |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-------------------------------------
| time/                 |           |
|    fps                | 1024      |
|    iterations         | 800       |
|    time_elapsed       | 3         |
|    total_timesteps    | 4000      |
| train/                |           |
|    entropy_loss       | -0.668    |
|    explained_variance | -1.76e+04 |
|    learning_rate      | 0.0007    |
|    n_updates          | 799       |
|    policy_loss        | -0.0352   |
|    value_loss         | 0.00205   |
-------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1025     |
|    iterations         | 900      |
|    time_elapsed       | 4        |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -1.02    |
|    explained_variance | -15.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 899      |
|    policy_loss        | 0.0191   |
|    value_loss         

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.135   |
| time/                 |          |
|    fps                | 1017     |
|    iterations         | 2300     |
|    time_elapsed       | 11       |
|    total_timesteps    | 11500    |
| train/                |          |
|    entropy_loss       | -0.258   |
|    explained_variance | -181     |
|    learning_rate      | 0.0007   |
|    n_updates          | 2299     |
|    policy_loss        | -0.00377 |
|    value_loss         | 0.00401  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.135   |
| time/                 |          |
|    fps                | 1018     |
|    iterations         | 2400     |
|    time_elapsed       | 11       |
|    total_timesteps    | 12000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.00121 |
| time/                 |          |
|    fps                | 1022     |
|    iterations         | 3600     |
|    time_elapsed       | 17       |
|    total_timesteps    | 18000    |
| train/                |          |
|    entropy_loss       | -0.0277  |
|    explained_variance | -69.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3599     |
|    policy_loss        | 1.64e-07 |
|    value_loss         | 1.6e-07  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.00121  |
| time/                 |           |
|    fps                | 1023      |
|    iterations         | 3700      |
|    time_elapsed       | 18        |
|    total_timesteps    | 18500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.0043  |
| time/                 |          |
|    fps                | 1024     |
|    iterations         | 4900     |
|    time_elapsed       | 23       |
|    total_timesteps    | 24500    |
| train/                |          |
|    entropy_loss       | -0.36    |
|    explained_variance | -77      |
|    learning_rate      | 0.0007   |
|    n_updates          | 4899     |
|    policy_loss        | 0.000121 |
|    value_loss         | 2.22e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.0043   |
| time/                 |           |
|    fps                | 1025      |
|    iterations         | 5000      |
|    time_elapsed       | 24        |
|    total_timesteps    | 25000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.0043  |
| time/                 |          |
|    fps                | 1027     |
|    iterations         | 6200     |
|    time_elapsed       | 30       |
|    total_timesteps    | 31000    |
| train/                |          |
|    entropy_loss       | -0.00382 |
|    explained_variance | -8.05    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6199     |
|    policy_loss        | 3.17e-08 |
|    value_loss         | 5.26e-08 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.0043   |
| time/                 |           |
|    fps                | 1027      |
|    iterations         | 6300      |
|    time_elapsed       | 30        |
|    total_timesteps    | 31500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.352    |
| time/                 |          |
|    fps                | 1027     |
|    iterations         | 7500     |
|    time_elapsed       | 36       |
|    total_timesteps    | 37500    |
| train/                |          |
|    entropy_loss       | -0.00311 |
|    explained_variance | -3.7e+03 |
|    learning_rate      | 0.0007   |
|    n_updates          | 7499     |
|    policy_loss        | 2.71e-07 |
|    value_loss         | 6.67e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.352     |
| time/                 |           |
|    fps                | 1027      |
|    iterations         | 7600      |
|    time_elapsed       | 36        |
|    total_timesteps    | 38000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.232    |
| time/                 |          |
|    fps                | 1028     |
|    iterations         | 8800     |
|    time_elapsed       | 42       |
|    total_timesteps    | 44000    |
| train/                |          |
|    entropy_loss       | -0.592   |
|    explained_variance | -67.7    |
|    learning_rate      | 0.0007   |
|    n_updates          | 8799     |
|    policy_loss        | 8.15e-05 |
|    value_loss         | 2.64e-08 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.232    |
| time/                 |          |
|    fps                | 1028     |
|    iterations         | 8900     |
|    time_elapsed       | 43       |
|    total_timesteps    | 44500    |
| train/                |          |
|

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


-----------------------------
| time/              |      |
|    fps             | 1997 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1678        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009781207 |
|    clip_fraction        | 0.0457      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -5.15       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00907    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00575    |
|    value_loss           | 0.0108      |
-----------------------------------------
----------------------------------

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05e+03     |
|    ep_rew_mean          | 2.64         |
| time/                   |              |
|    fps                  | 1446         |
|    iterations           | 12           |
|    time_elapsed         | 16           |
|    total_timesteps      | 24576        |
| train/                  |              |
|    approx_kl            | 0.0052640084 |
|    clip_fraction        | 0.0403       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.938       |
|    explained_variance   | 0.463        |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0208      |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00695     |
|    value_loss           | 0.0014       |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 5.37        |
| time/                   |             |
|    fps                  | 1431        |
|    iterations           | 22          |
|    time_elapsed         | 31          |
|    total_timesteps      | 45056       |
| train/                  |             |
|    approx_kl            | 0.008290682 |
|    clip_fraction        | 0.088       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.902      |
|    explained_variance   | 0.16        |
|    learning_rate        | 0.0003      |
|    loss                 | -0.016      |
|    n_updates            | 210         |
|    policy_gradient_loss | -0.0111     |
|    value_loss           | 0.00273     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+

------------------------------------
| time/                 |          |
|    fps                | 1017     |
|    iterations         | 900      |
|    time_elapsed       | 4        |
|    total_timesteps    | 4500     |
| train/                |          |
|    entropy_loss       | -0.657   |
|    explained_variance | -14.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 899      |
|    policy_loss        | -0.0112  |
|    value_loss         | 0.000255 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1017     |
|    iterations         | 1000     |
|    time_elapsed       | 4        |
|    total_timesteps    | 5000     |
| train/                |          |
|    entropy_loss       | -1.07    |
|    explained_variance | -405     |
|    learning_rate      | 0.0007   |
|    n_updates          | 999      |
|    policy_loss        | 0.0359   |
|    value_loss         | 0.00142  |
-

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 1.58     |
| time/                 |          |
|    fps                | 1014     |
|    iterations         | 2400     |
|    time_elapsed       | 11       |
|    total_timesteps    | 12000    |
| train/                |          |
|    entropy_loss       | -1.09    |
|    explained_variance | -9.6     |
|    learning_rate      | 0.0007   |
|    n_updates          | 2399     |
|    policy_loss        | 0.0402   |
|    value_loss         | 0.0015   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 1.58     |
| time/                 |          |
|    fps                | 1014     |
|    iterations         | 2500     |
|    time_elapsed       | 12       |
|    total_timesteps    | 12500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.724    |
| time/                 |          |
|    fps                | 978      |
|    iterations         | 3700     |
|    time_elapsed       | 18       |
|    total_timesteps    | 18500    |
| train/                |          |
|    entropy_loss       | -0.74    |
|    explained_variance | -0.052   |
|    learning_rate      | 0.0007   |
|    n_updates          | 3699     |
|    policy_loss        | -0.135   |
|    value_loss         | 0.00739  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.724    |
| time/                 |          |
|    fps                | 979      |
|    iterations         | 3800     |
|    time_elapsed       | 19       |
|    total_timesteps    | 19000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.28     |
| time/                 |          |
|    fps                | 974      |
|    iterations         | 5000     |
|    time_elapsed       | 25       |
|    total_timesteps    | 25000    |
| train/                |          |
|    entropy_loss       | -0.757   |
|    explained_variance | -24.2    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4999     |
|    policy_loss        | 0.00118  |
|    value_loss         | 1.93e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.28     |
| time/                 |          |
|    fps                | 969      |
|    iterations         | 5100     |
|    time_elapsed       | 26       |
|    total_timesteps    | 25500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.28     |
| time/                 |          |
|    fps                | 956      |
|    iterations         | 6300     |
|    time_elapsed       | 32       |
|    total_timesteps    | 31500    |
| train/                |          |
|    entropy_loss       | -0.816   |
|    explained_variance | -0.333   |
|    learning_rate      | 0.0007   |
|    n_updates          | 6299     |
|    policy_loss        | -0.0111  |
|    value_loss         | 0.000269 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.28     |
| time/                 |          |
|    fps                | 957      |
|    iterations         | 6400     |
|    time_elapsed       | 33       |
|    total_timesteps    | 32000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.494     |
| time/                 |           |
|    fps                | 942       |
|    iterations         | 7600      |
|    time_elapsed       | 40        |
|    total_timesteps    | 38000     |
| train/                |           |
|    entropy_loss       | -0.537    |
|    explained_variance | -2.44e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 7599      |
|    policy_loss        | -0.0141   |
|    value_loss         | 0.000715  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.494    |
| time/                 |          |
|    fps                | 943      |
|    iterations         | 7700     |
|    time_elapsed       | 40       |
|    total_timesteps    | 38500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.0449  |
| time/                 |          |
|    fps                | 949      |
|    iterations         | 8900     |
|    time_elapsed       | 46       |
|    total_timesteps    | 44500    |
| train/                |          |
|    entropy_loss       | -0.669   |
|    explained_variance | -146     |
|    learning_rate      | 0.0007   |
|    n_updates          | 8899     |
|    policy_loss        | -0.00214 |
|    value_loss         | 1.35e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.0449  |
| time/                 |          |
|    fps                | 947      |
|    iterations         | 9000     |
|    time_elapsed       | 47       |
|    total_timesteps    | 45000    |
| train/                |          |
|

-----------------------------------------
| time/                   |             |
|    fps                  | 1523        |
|    iterations           | 3           |
|    time_elapsed         | 4           |
|    total_timesteps      | 6144        |
| train/                  |             |
|    approx_kl            | 0.008038059 |
|    clip_fraction        | 0.0687      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -4.67       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.00872    |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.00509    |
|    value_loss           | 0.00547     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 4.18        |
| time/                   |             |
|    fps                  | 1482  

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 1.94        |
| time/                   |             |
|    fps                  | 1379        |
|    iterations           | 13          |
|    time_elapsed         | 19          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007378008 |
|    clip_fraction        | 0.0826      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.05       |
|    explained_variance   | 0.761       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0149     |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.00762    |
|    value_loss           | 0.00158     |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.05

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 4.78        |
| time/                   |             |
|    fps                  | 1363        |
|    iterations           | 23          |
|    time_elapsed         | 34          |
|    total_timesteps      | 47104       |
| train/                  |             |
|    approx_kl            | 0.010062035 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.93       |
|    explained_variance   | 0.222       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0272     |
|    n_updates            | 220         |
|    policy_gradient_loss | -0.011      |
|    value_loss           | 0.00221     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+

-------------------------------------
| time/                 |           |
|    fps                | 1015      |
|    iterations         | 1100      |
|    time_elapsed       | 5         |
|    total_timesteps    | 5500      |
| train/                |           |
|    entropy_loss       | -1.04     |
|    explained_variance | -1.59e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1099      |
|    policy_loss        | 0.015     |
|    value_loss         | 0.000641  |
-------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 1017      |
|    iterations         | 1200      |
|    time_elapsed       | 5         |
|    total_timesteps    | 6000      |
| train/                |           |
|    entropy_loss       | -0.655    |
|    explained_variance | -4.56e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1199      |
|    policy_loss        | 0.0137    |
|    value_l

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -2.53     |
| time/                 |           |
|    fps                | 1020      |
|    iterations         | 2600      |
|    time_elapsed       | 12        |
|    total_timesteps    | 13000     |
| train/                |           |
|    entropy_loss       | -1.02     |
|    explained_variance | -1.34e+04 |
|    learning_rate      | 0.0007    |
|    n_updates          | 2599      |
|    policy_loss        | 0.103     |
|    value_loss         | 0.00854   |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -2.53    |
| time/                 |          |
|    fps                | 1019     |
|    iterations         | 2700     |
|    time_elapsed       | 13       |
|    total_timesteps    | 13500    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -1.92    |
| time/                 |          |
|    fps                | 1020     |
|    iterations         | 3900     |
|    time_elapsed       | 19       |
|    total_timesteps    | 19500    |
| train/                |          |
|    entropy_loss       | -0.824   |
|    explained_variance | 0.0137   |
|    learning_rate      | 0.0007   |
|    n_updates          | 3899     |
|    policy_loss        | -0.0223  |
|    value_loss         | 0.000875 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -1.92    |
| time/                 |          |
|    fps                | 1020     |
|    iterations         | 4000     |
|    time_elapsed       | 19       |
|    total_timesteps    | 20000    |
| train/                |          |
|

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.944    |
| time/                 |           |
|    fps                | 1012      |
|    iterations         | 5200      |
|    time_elapsed       | 25        |
|    total_timesteps    | 26000     |
| train/                |           |
|    entropy_loss       | -0.124    |
|    explained_variance | -476      |
|    learning_rate      | 0.0007    |
|    n_updates          | 5199      |
|    policy_loss        | -9.67e-07 |
|    value_loss         | 2.27e-06  |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.944    |
| time/                 |           |
|    fps                | 1012      |
|    iterations         | 5300      |
|    time_elapsed       | 26        |
|    total_timesteps    | 26500     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.924   |
| time/                 |          |
|    fps                | 1015     |
|    iterations         | 6500     |
|    time_elapsed       | 31       |
|    total_timesteps    | 32500    |
| train/                |          |
|    entropy_loss       | -0.323   |
|    explained_variance | 0.187    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6499     |
|    policy_loss        | 0.1      |
|    value_loss         | 0.0279   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.924   |
| time/                 |          |
|    fps                | 1015     |
|    iterations         | 6600     |
|    time_elapsed       | 32       |
|    total_timesteps    | 33000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.924   |
| time/                 |          |
|    fps                | 1018     |
|    iterations         | 7800     |
|    time_elapsed       | 38       |
|    total_timesteps    | 39000    |
| train/                |          |
|    entropy_loss       | -0.0852  |
|    explained_variance | -59.7    |
|    learning_rate      | 0.0007   |
|    n_updates          | 7799     |
|    policy_loss        | 2.62e-06 |
|    value_loss         | 3.5e-08  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.924    |
| time/                 |           |
|    fps                | 1018      |
|    iterations         | 7900      |
|    time_elapsed       | 38        |
|    total_timesteps    | 39500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.694   |
| time/                 |          |
|    fps                | 1018     |
|    iterations         | 9100     |
|    time_elapsed       | 44       |
|    total_timesteps    | 45500    |
| train/                |          |
|    entropy_loss       | -0.037   |
|    explained_variance | 0.988    |
|    learning_rate      | 0.0007   |
|    n_updates          | 9099     |
|    policy_loss        | 0.00131  |
|    value_loss         | 1.28e-06 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.694    |
| time/                 |           |
|    fps                | 1018      |
|    iterations         | 9200      |
|    time_elapsed       | 45        |
|    total_timesteps    | 46000     |
| train/                |    

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 3.25        |
| time/                   |             |
|    fps                  | 1524        |
|    iterations           | 4           |
|    time_elapsed         | 5           |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.008010114 |
|    clip_fraction        | 0.052       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.08       |
|    explained_variance   | 0.736       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0244     |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.00428    |
|    value_loss           | 0.00443     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 8.05e+03   |
|    ep_rew_mean          | 2.42       |
| time/                   |            |
|    fps                  | 1428       |
|    iterations           | 14         |
|    time_elapsed         | 20         |
|    total_timesteps      | 28672      |
| train/                  |            |
|    approx_kl            | 0.00757991 |
|    clip_fraction        | 0.0762     |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.05      |
|    explained_variance   | 0.0815     |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0203    |
|    n_updates            | 130        |
|    policy_gradient_loss | -0.0103    |
|    value_loss           | 0.00458    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+03    |
|    ep_rew_mean          | 7.99        |
| time/                   |             |
|    fps                  | 1407        |
|    iterations           | 24          |
|    time_elapsed         | 34          |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.009543983 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.953      |
|    explained_variance   | 0.616       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0085     |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.00703    |
|    value_loss           | 0.00165     |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.05e+

------------------------------------
| time/                 |          |
|    fps                | 1036     |
|    iterations         | 1300     |
|    time_elapsed       | 6        |
|    total_timesteps    | 6500     |
| train/                |          |
|    entropy_loss       | -0.953   |
|    explained_variance | -27      |
|    learning_rate      | 0.0007   |
|    n_updates          | 1299     |
|    policy_loss        | 0.00126  |
|    value_loss         | 8.87e-06 |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 1020     |
|    iterations         | 1400     |
|    time_elapsed       | 6        |
|    total_timesteps    | 7000     |
| train/                |          |
|    entropy_loss       | -0.919   |
|    explained_variance | -303     |
|    learning_rate      | 0.0007   |
|    n_updates          | 1399     |
|    policy_loss        | 0.00434  |
|    value_loss         | 0.000779 |
-

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.538    |
| time/                 |           |
|    fps                | 990       |
|    iterations         | 2700      |
|    time_elapsed       | 13        |
|    total_timesteps    | 13500     |
| train/                |           |
|    entropy_loss       | -0.95     |
|    explained_variance | -4.15e+03 |
|    learning_rate      | 0.0007    |
|    n_updates          | 2699      |
|    policy_loss        | 0.00117   |
|    value_loss         | 9.77e-05  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.538   |
| time/                 |          |
|    fps                | 991      |
|    iterations         | 2800     |
|    time_elapsed       | 14       |
|    total_timesteps    | 14000    |
| train/             

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.607   |
| time/                 |          |
|    fps                | 995      |
|    iterations         | 4000     |
|    time_elapsed       | 20       |
|    total_timesteps    | 20000    |
| train/                |          |
|    entropy_loss       | -1.08    |
|    explained_variance | -30.6    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3999     |
|    policy_loss        | -0.0112  |
|    value_loss         | 0.00012  |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.607   |
| time/                 |          |
|    fps                | 995      |
|    iterations         | 4100     |
|    time_elapsed       | 20       |
|    total_timesteps    | 20500    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.421   |
| time/                 |          |
|    fps                | 988      |
|    iterations         | 5300     |
|    time_elapsed       | 26       |
|    total_timesteps    | 26500    |
| train/                |          |
|    entropy_loss       | -0.786   |
|    explained_variance | -0.0393  |
|    learning_rate      | 0.0007   |
|    n_updates          | 5299     |
|    policy_loss        | -0.0472  |
|    value_loss         | 0.00618  |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.421    |
| time/                 |           |
|    fps                | 989       |
|    iterations         | 5400      |
|    time_elapsed       | 27        |
|    total_timesteps    | 27000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.14    |
| time/                 |          |
|    fps                | 981      |
|    iterations         | 6600     |
|    time_elapsed       | 33       |
|    total_timesteps    | 33000    |
| train/                |          |
|    entropy_loss       | -0.606   |
|    explained_variance | 0.174    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6599     |
|    policy_loss        | -0.00636 |
|    value_loss         | 6.54e-05 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.14     |
| time/                 |           |
|    fps                | 981       |
|    iterations         | 6700      |
|    time_elapsed       | 34        |
|    total_timesteps    | 33500     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | -0.14    |
| time/                 |          |
|    fps                | 988      |
|    iterations         | 7900     |
|    time_elapsed       | 39       |
|    total_timesteps    | 39500    |
| train/                |          |
|    entropy_loss       | -0.0346  |
|    explained_variance | -70.7    |
|    learning_rate      | 0.0007   |
|    n_updates          | 7899     |
|    policy_loss        | 1.03e-06 |
|    value_loss         | 4.76e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | -0.14     |
| time/                 |           |
|    fps                | 987       |
|    iterations         | 8000      |
|    time_elapsed       | 40        |
|    total_timesteps    | 40000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 8.05e+03 |
|    ep_rew_mean        | 0.0256   |
| time/                 |          |
|    fps                | 991      |
|    iterations         | 9200     |
|    time_elapsed       | 46       |
|    total_timesteps    | 46000    |
| train/                |          |
|    entropy_loss       | -0.0984  |
|    explained_variance | -34.4    |
|    learning_rate      | 0.0007   |
|    n_updates          | 9199     |
|    policy_loss        | 1.23e-05 |
|    value_loss         | 7.37e-07 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 8.05e+03  |
|    ep_rew_mean        | 0.0256    |
| time/                 |           |
|    fps                | 991       |
|    iterations         | 9300      |
|    time_elapsed       | 46        |
|    total_timesteps    | 46500     |
| train/                |    