In [1]:
import gymnasium as gym
from stable_baselines3 import A2C, DQN, PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.logger import configure
import os
import numpy as np
import pandas as pd
from classes import CustomFlappyBirdEnv_std

In [2]:
# Import and register env
gym.envs.registration.register(
    id='CustomFlappyBird-v0',
    entry_point='__main__:CustomFlappyBirdEnv_std',
    max_episode_steps=10000000,
)

# Environment erstellen und mit VecMonitor wrappen
env = make_vec_env("CustomFlappyBird-v0", n_envs=4, env_kwargs={'render_mode': 'rgb_array', 'use_lidar': False})

In [3]:
# Enable log
log_dir = "./logs/"
os.makedirs(log_dir, exist_ok=True)

In [4]:
#A2C

A2C_MLP_std_2Mio = os.path.join(log_dir, "A2C_MLP_std_2Mio")

# Configure the logger to save data to a specific folder
new_logger = configure(A2C_MLP_std_2Mio, ["stdout", "csv"])

# PPO Modell definieren
A2C = A2C(
    "MlpPolicy",
    env,
    learning_rate=7e-4,
    n_steps=5,
    gamma=0.99,
    gae_lambda=1.0,
    ent_coef=0.01,
    vf_coef=0.5,
    max_grad_norm=0.5,
    use_rms_prop=True,
    verbose=1,
    device='cuda'
)

# Attach the new logger to the model
A2C.set_logger(new_logger)

# Modell trainieren mit Callback
A2C.learn(total_timesteps=2000000)

# Modell speichern
A2C.save("models/A2C_MLP_std_2Mio")

Logging to ./logs/A2C_MLP_std_2Mio
Using cuda device
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 50       |
|    ep_rew_mean        | -9.21    |
| time/                 |          |
|    fps                | 788      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -0.12    |
|    explained_variance | -1.04    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0499  |
|    value_loss         | 4.81     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 50       |
|    ep_rew_mean        | -9.25    |
| time/                 |          |
|    fps                | 881      |
|    iterations         | 200      |
|    time_elapsed       | 4        |
|    total_timesteps  



In [5]:
#DQN
DQN_MLP_std_2Mio = os.path.join(log_dir, "DQN_MLP_std_2Mio")

# Configure the logger to save data to a specific folder
new_logger = configure(DQN_MLP_std_2Mio, ["stdout", "csv"])

# PPO Modell definieren
DQN = DQN(
    "MlpPolicy",  # Policy type
    env,
    learning_rate=1e-4,
    buffer_size=50000,
    learning_starts=1000,
    batch_size=32,
    tau=1.0,
    gamma=0.99,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=1000,
    verbose=1,
    device='cuda'
)

# Attach the new logger to the model
DQN.set_logger(new_logger)

# Modell trainieren mit Callback
DQN.learn(total_timesteps=2000000)

# Modell speichern
DQN.save("models/DQN_MLP_std_2Mio")

Logging to ./logs/DQN_MLP_std_2Mio
Using cuda device
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50       |
|    ep_rew_mean      | -8.1     |
|    exploration_rate | 0.999    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 22163    |
|    time_elapsed     | 0        |
|    total_timesteps  | 200      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50       |
|    ep_rew_mean      | -7.88    |
|    exploration_rate | 0.998    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 22222    |
|    time_elapsed     | 0        |
|    total_timesteps  | 400      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 50       |
|    ep_rew_mean      | -7.8     |
|    exploration_rate | 0.997    |
| 

In [6]:
#baseline
def heuristic(obs):
    bird_y = obs[9]
    threshold = 0.4
    return 1 if bird_y > threshold else 0

# Variables for tracking progress
total_timesteps = 2000000
current_total_timesteps = 0
ep_rew_sum = 0
episode_rewards = []
episode_lengths = []

# Reset the environment
obs, _ = env.reset()

while current_total_timesteps < total_timesteps:
    action = heuristic(obs)
    obs, reward, terminated, _, _ = env.step(action)
    ep_rew_sum += reward
    current_total_timesteps += 1

    if terminated:
        episode_rewards.append(ep_rew_sum)
        episode_lengths.append(current_total_timesteps)
        obs, _ = env.reset()
        ep_rew_sum = 0

        # Log progress every 100 episodes
        if len(episode_rewards) % 100 == 0:
            mean_reward = np.mean(episode_rewards[-100:])
            print(f"Episode: {len(episode_rewards)}, Total Timesteps: {current_total_timesteps}, Mean Reward (last 100 episodes): {mean_reward}")

# Close the environment
env.close()

# Calculate cumulative timesteps and mean rewards
cumulative_timesteps = np.cumsum(episode_lengths)
mean_rewards = [np.mean(episode_rewards[:i+1]) for i in range(len(episode_rewards))]

# Prepare the results in a DataFrame
results_df = pd.DataFrame({
    'total_timesteps': cumulative_timesteps,
    'ep_rew_mean': mean_rewards
})

# Log the results to a file
log_dir = "./logs/baseline_std_2Mio"
os.makedirs(log_dir, exist_ok=True)
baseline_log_file = os.path.join(log_dir, "progress.csv")
results_df.to_csv(baseline_log_file, index=False)

print(f"Results saved to {baseline_log_file}")

ValueError: too many values to unpack (expected 2)