In [1]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env  import DummyVecEnv
import numpy as np

In [2]:
# Define a custom environment
class CustomEnvironment(gym.Env):
    """Single-step toy environment that scores an action against fixed weights.

    A weight vector (``best_list``) of ``num_elements`` random values is drawn
    once at construction. Every episode consists of exactly one step: the
    agent submits an action vector and receives the weighted sum
    ``sum(best_list * action)`` as its reward. The observation is always the
    weight vector itself.
    """

    def __init__(self):
        super().__init__()
        self.num_elements = 8
        # Stored as float32 so the array matches the dtype declared on
        # observation_space below (np.random.rand yields float64).
        self.best_list = np.random.rand(self.num_elements).astype(np.float32)
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=(self.num_elements,), dtype=np.float32
        )
        self.action_space = gym.spaces.Box(
            low=0, high=1, shape=(self.num_elements,), dtype=np.float32
        )

    def step(self, action):
        """Score ``action`` and end the episode.

        Args:
            action: Array-like broadcastable against ``best_list``.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``observation``
            is a copy of ``best_list``, ``reward`` is the float weighted sum,
            ``done`` is always ``True`` and ``info`` is an empty dict.
        """
        # Simulate the game
        # In a real application, replace this with your game logic and calculate the winning rate
        action = np.asarray(action, dtype=np.float32)
        winning_rate = float(np.sum(self.best_list * action))
        reward = winning_rate  # Reward is the winning rate
        done = True  # For simplicity, we'll consider a single-step environment
        # Return a copy so callers cannot mutate the environment's weights.
        return self.best_list.copy(), reward, done, {}

    def reset(self):
        """Start a new episode and return a copy of the fixed observation."""
        return self.best_list.copy()

In [4]:

# Create a DummyVecEnv wrapping a single CustomEnvironment instance
env = DummyVecEnv([lambda: CustomEnvironment()])

# Define the PPO agent
model = PPO("MlpPolicy", env, verbose=1)

# Train the agent to find the best combination of elements
model.learn(total_timesteps=10000)

# Extract the learned policy.
# Query the model on the observation the environment actually emits (not a
# random sample from the observation space), and take the deterministic
# action rather than a stochastic draw from the policy distribution.
obs = env.reset()
# predict() returns (action, recurrent_state); only the action is of interest.
best_policy, _ = model.predict(obs, deterministic=True)

# The vectorized env adds a leading batch axis; index 0 is our single env.
print("Best Combination of Elements (Policy):", best_policy[0])



Using cpu device
-----------------------------
| time/              |      |
|    fps             | 2799 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 1752       |
|    iterations           | 2          |
|    time_elapsed         | 2          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.08966977 |
|    clip_fraction        | 0.61       |
|    clip_range           | 0.2        |
|    entropy_loss         | -11.5      |
|    explained_variance   | -1.19e-07  |
|    learning_rate        | 0.0003     |
|    loss                 | 0.0384     |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.131     |
|    std                  | 1.02       |
|    value_loss           | 0.345      |
-----------------------------------