In [1]:
from stable_baselines3.ppo import CnnPolicy
from stable_baselines3 import PPO
from pettingzoo.butterfly import pistonball_v6
import supersuit as ss
import matplotlib.pyplot as plt

env = pistonball_v6.parallel_env(n_pistons=20,
                                 time_penalty=-0.1,
                                 continuous=True,
                                 random_drop=True,
                                 random_rotate=True,
                                 ball_mass=0.75,
                                 ball_friction=0.3,
                                 ball_elasticity=1.5,
                                 max_cycles=125
                                )

env = ss.color_reduction_v0(env, mode= 'B')
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)
env = ss.pettingzoo_env_to_vec_env_v1(env)
env = ss.concat_vec_envs_v1(env, 8, num_cpus=4, base_class='stable_baselines3')

done = 0
records1 = []
env.reset()

for i in range(100):
    env.reset()
    done =0
    while not done:
        totalreward = 0
        for agent in env.agent_iter():
            obs, reward, done, info = env.last()
            totalreward += reward
            act = env.action_space(agent).sample()
            env.step(act)
    records1.append(totalreward)
    print('totalreward:',totalreward,'\n')
plt.plot(records1)
plt.show()

model = PPO(CnnPolicy,
            env,
            verbose=3,
            gamma=0.95,
            n_steps=256,
            ent_coef=0.0905168,
            learning_rate=0.00062211,
            vf_coef=0.042202,
            max_grad_norm=0.9,
            gae_lambda=0.99,
            n_epochs=5,
            clip_range=0.3,
            batch_size=256
           )
model.learn(total_timesteps=2000000)
model.save('pp')



env = pistonball_v6.env()
env = ss.color_reduction_v0(env, mode='B')
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)

model = PPO.load('pp')
done = 0
records2 = []
env.reset()
for i in range(100):
    env.reset()
    done =0
    while not done:
        totalreward = 0
        for agent in env.agent_iter():
            #env.render()
            obs, reward, done, info = env.last()
            totalreward += reward
            act = model.predict(obs, deterministic=True)[0] if not done else None
            env.step(act)
    records2.append(totalreward)
    print('totalreward:',totalreward,'\n')
plt.plot(records2)
plt.show()


    
    






  from .autonotebook import tqdm as notebook_tqdm
  from distutils.version import LooseVersion


AttributeError: agent_iter

In [2]:
import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy

from pettingzoo.butterfly import pistonball_v6

env = pistonball_v6.parallel_env(
    n_pistons=20,
    time_penalty=-0.1,
    continuous=True,
    random_drop=True,
    random_rotate=True,
    ball_mass=0.75,
    ball_friction=0.3,
    ball_elasticity=1.5,
    max_cycles=125,
)
env = ss.color_reduction_v0(env, mode="B")
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)
env = ss.pettingzoo_env_to_vec_env_v1(env)
env = ss.concat_vec_envs_v1(env, 8, num_cpus=4, base_class="stable_baselines3")
model = PPO(
    CnnPolicy,
    env,
    verbose=3,
    gamma=0.95,
    n_steps=256,
    ent_coef=0.0905168,
    learning_rate=0.00062211,
    vf_coef=0.042202,
    max_grad_norm=0.9,
    gae_lambda=0.99,
    n_epochs=5,
    clip_range=0.3,
    batch_size=256,
)
model.learn(total_timesteps=2000000)
model.save("policy")

# Rendering

env = pistonball_v6.env()
env = ss.color_reduction_v0(env, mode="B")
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)

model = PPO.load("policy")

env.reset()
for agent in env.agent_iter():
    obs, reward, done, info = env.last()
    act = model.predict(obs, deterministic=True)[0] if not done else None
    env.step(act)
    env.render()

Using cpu device
Wrapping the env in a VecTransposeImage.
------------------------------
| time/              |       |
|    fps             | 803   |
|    iterations      | 1     |
|    time_elapsed    | 50    |
|    total_timesteps | 40960 |
------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 360         |
|    iterations           | 2           |
|    time_elapsed         | 227         |
|    total_timesteps      | 81920       |
| train/                  |             |
|    approx_kl            | 0.012186225 |
|    clip_fraction        | 0.0395      |
|    clip_range           | 0.3         |
|    entropy_loss         | -1.51       |
|    explained_variance   | 0.0131      |
|    learning_rate        | 0.000622    |
|    loss                 | 0.32        |
|    n_updates            | 5           |
|    policy_gradient_loss | 0.00294     |
|    std                  | 1.12        |
|    value_

-----------------------------------------
| time/                   |             |
|    fps                  | 220         |
|    iterations           | 12          |
|    time_elapsed         | 2233        |
|    total_timesteps      | 491520      |
| train/                  |             |
|    approx_kl            | 0.008593191 |
|    clip_fraction        | 0.0351      |
|    clip_range           | 0.3         |
|    entropy_loss         | -2.38       |
|    explained_variance   | -0.0266     |
|    learning_rate        | 0.000622    |
|    loss                 | 1.51        |
|    n_updates            | 55          |
|    policy_gradient_loss | 0.00142     |
|    std                  | 2.67        |
|    value_loss           | 40.1        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 216         |
|    iterations           | 13          |
|    time_elapsed         | 2454  

-----------------------------------------
| time/                   |             |
|    fps                  | 210         |
|    iterations           | 23          |
|    time_elapsed         | 4470        |
|    total_timesteps      | 942080      |
| train/                  |             |
|    approx_kl            | 0.009169035 |
|    clip_fraction        | 0.0356      |
|    clip_range           | 0.3         |
|    entropy_loss         | -3.27       |
|    explained_variance   | 0.0101      |
|    learning_rate        | 0.000622    |
|    loss                 | 3.08        |
|    n_updates            | 110         |
|    policy_gradient_loss | 0.00226     |
|    std                  | 6.44        |
|    value_loss           | 76.6        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 211         |
|    iterations           | 24          |
|    time_elapsed         | 4655  

-----------------------------------------
| time/                   |             |
|    fps                  | 213         |
|    iterations           | 34          |
|    time_elapsed         | 6512        |
|    total_timesteps      | 1392640     |
| train/                  |             |
|    approx_kl            | 0.009268234 |
|    clip_fraction        | 0.0356      |
|    clip_range           | 0.3         |
|    entropy_loss         | -4.15       |
|    explained_variance   | -0.00937    |
|    learning_rate        | 0.000622    |
|    loss                 | 1.78        |
|    n_updates            | 165         |
|    policy_gradient_loss | 0.0017      |
|    std                  | 15.6        |
|    value_loss           | 49.6        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 214         |
|    iterations           | 35          |
|    time_elapsed         | 6697  

-----------------------------------------
| time/                   |             |
|    fps                  | 213         |
|    iterations           | 45          |
|    time_elapsed         | 8642        |
|    total_timesteps      | 1843200     |
| train/                  |             |
|    approx_kl            | 0.008710417 |
|    clip_fraction        | 0.0362      |
|    clip_range           | 0.3         |
|    entropy_loss         | -5.03       |
|    explained_variance   | 0.0141      |
|    learning_rate        | 0.000622    |
|    loss                 | 1.96        |
|    n_updates            | 220         |
|    policy_gradient_loss | 0.00197     |
|    std                  | 37.4        |
|    value_loss           | 54.3        |
-----------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 213          |
|    iterations           | 46           |
|    time_elapsed         | 88

  deprecation(


In [None]:
import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy

from pettingzoo.butterfly import pistonball_v6
env = pistonball_v6.parallel_env(
    n_pistons=20,
    time_penalty=-0.1,
    continuous=True,
    random_drop=True,
    random_rotate=True,
    ball_mass=0.75,
    ball_friction=0.3,
    ball_elasticity=1.5,
    max_cycles=125,
)
env = ss.color_reduction_v0(env, mode="B")
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)
env = ss.pettingzoo_env_to_vec_env_v1(env)
env = ss.concat_vec_envs_v1(env, 8, num_cpus=4, base_class="stable_baselines3")

done = 0

records2 = []
env.reset()
model = PPO.load("policy")
env = pistonball_v6.env()
env = ss.color_reduction_v0(env, mode='B')
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)

for i in range(100):
    env.reset()
    total = 0
    for agent in env.agent_iter():
        obs, reward, done, info = env.last()
        act = model.predict(obs, deterministic=True)[0] if not done else None
        env.step(act)
        total += reward
    records2.append(total)
    print('After training,this is the',i,'th episode,with the total reward:',total)
        
done = 0
records1 = []
env.reset()
for i in range(100):
    env.reset()
    total = 0
    for agent in env.agent_iter():
        obs, reward, done, info = env.last()
        act = env.action_space(agent).sample()
        env.step(act)
        total += reward
    records1.append(total)
    print('Before training,this is the',i,'th episode,with the total reward:',total)
    

After training,this is the 0 th episode,with the total reward: 1964.0000000000027
After training,this is the 1 th episode,with the total reward: 1943.9999999999907
After training,this is the 2 th episode,with the total reward: 1959.9999999999957
After training,this is the 3 th episode,with the total reward: 1966.000000000001
After training,this is the 4 th episode,with the total reward: 1963.9999999999945
After training,this is the 5 th episode,with the total reward: 1958.0000000000034
After training,this is the 6 th episode,with the total reward: 1960.0000000000064
After training,this is the 7 th episode,with the total reward: 1955.999999999999
After training,this is the 8 th episode,with the total reward: 1969.9999999999895
After training,this is the 9 th episode,with the total reward: 1962.0000000000118
After training,this is the 10 th episode,with the total reward: 1944.0000000000007
After training,this is the 11 th episode,with the total reward: 1949.9999999999948
After training,t

Before training,this is the 0 th episode,with the total reward: -295.51920341394066
Before training,this is the 1 th episode,with the total reward: -33.40365682137587
Before training,this is the 2 th episode,with the total reward: -364.7058823529562
Before training,this is the 3 th episode,with the total reward: 144.44444444444596
Before training,this is the 4 th episode,with the total reward: 283.5195530726197
Before training,this is the 5 th episode,with the total reward: 85.329341317364
Before training,this is the 6 th episode,with the total reward: 548.825256975005
Before training,this is the 7 th episode,with the total reward: -252.78551532032748
Before training,this is the 8 th episode,with the total reward: -230.36465638148292
Before training,this is the 9 th episode,with the total reward: -182.84671532845476
Before training,this is the 10 th episode,with the total reward: 522.4550898203399
Before training,this is the 11 th episode,with the total reward: -124.99999999999507
Befo