In [5]:
# Import necessary libraries
import gym
import pickle
import random
from stable_baselines3 import DQN, A2C, PPO
from stable_baselines3.common.evaluation import evaluate_policy
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import RIGHT_ONLY
from stable_baselines3.common import atari_wrappers
import tensorflow as tf

with tf.device('/gpu:0'):
    # NOTE(review): stable_baselines3 is PyTorch-based, so this TensorFlow
    # device scope presumably has no influence on where the agent runs —
    # confirm before relying on it for GPU placement.

    # --- Experiment configuration ---------------------------------------
    # Which game is played and which algorithm plays it.
    environmentID = "SuperMarioBros2-v1"
    learningAlg = "PPO"  # one of 'DQN', 'A2C', 'PPO'

    # True: train a new policy and save it. False: load a saved policy.
    trainMode = True

    # Training runs draw a seed from [0, 1000]; load/evaluate runs use 42.
    if trainMode:
        seed = random.randint(0, 1000)
    else:
        seed = 42

    # Hyper-parameters handed to the agent constructor.
    learning_rate = 8.3e-4
    gamma = 0.995

    # Training and evaluation budgets.
    num_training_steps = 10_000
    num_test_episodes = 10

    # Forwarded as `render=` when the policy is evaluated.
    policy_rendering = True

    # Define a function to create the learning environment
    def make_env(gym_id, seed):
        """Build the wrapped, seeded Super Mario Bros environment.

        The base environment is created with ``gym_super_mario_bros.make`` and
        then wrapped, innermost first, in ``JoypadSpace`` (restricted to the
        ``RIGHT_ONLY`` action set), ``MaxAndSkipEnv`` (argument 4),
        ``NoopResetEnv`` (``noop_max=30``) and ``ClipRewardEnv``.

        Args:
            gym_id: Environment id passed to ``gym_super_mario_bros.make``.
            seed: Seed applied to the environment and to both of its spaces.

        Returns:
            The fully wrapped environment.
        """
        base_env = gym_super_mario_bros.make(gym_id)
        wrapped_env = atari_wrappers.ClipRewardEnv(
            atari_wrappers.NoopResetEnv(
                atari_wrappers.MaxAndSkipEnv(JoypadSpace(base_env, RIGHT_ONLY), 4),
                noop_max=30,
            )
        )

        # Seed the environment first, then its action and observation spaces.
        wrapped_env.seed(seed)
        for space in (wrapped_env.action_space, wrapped_env.observation_space):
            space.seed(seed)
        return wrapped_env

    # Create the learning environment
    environment = make_env(environmentID, seed)

    # Map each supported algorithm name to its class and to the extra
    # constructor arguments that only that algorithm receives.
    algorithm_table = {
        "DQN": (DQN, {"buffer_size": 50000, "exploration_fraction": 0.9}),
        "A2C": (A2C, {}),
        "PPO": (PPO, {}),
    }
    if learningAlg not in algorithm_table:
        raise ValueError(f"UNKNOWN learningAlg={learningAlg}")
    algorithm_class, algorithm_kwargs = algorithm_table[learningAlg]

    # Train the agent or load a pre-trained model
    if trainMode:
        # A fresh agent is only built when training; loading restores one
        # from disk, so constructing a throwaway model there is wasted work.
        model = algorithm_class(
            "CnnPolicy",
            environment,
            seed=seed,
            learning_rate=learning_rate,
            gamma=gamma,
            verbose=1,
            **algorithm_kwargs,
        )
        model.learn(total_timesteps=num_training_steps)
        policyFileName = f"{learningAlg}-{environmentID}-seed{seed}.policy.pkl"
        print(f"Saving policy {policyFileName}")
        model.save(policyFileName)
    else:
        policyFileName = input("Enter the name of the policy file to load: ")
        print("Loading policy...")
        # `load` is a classmethod that returns a NEW model. Without `env=` the
        # returned model has no environment attached, so `model.get_env()`
        # below would be None and the evaluation would fail.
        # NOTE(review): only load policy files from sources you trust; saved
        # archives may contain pickled objects.
        model = algorithm_class.load(policyFileName, env=environment)

    # Evaluate and visualize the agent's performance
    # NOTE(review): the evaluation runs 5x `num_test_episodes` episodes;
    # confirm that the multiplier is intended.
    print("Evaluating policy...")
    mean_reward, std_reward = evaluate_policy(
        model,
        model.get_env(),
        n_eval_episodes=num_test_episodes * 5,
        render=policy_rendering,
    )
    print(f"EVALUATION: mean_reward={mean_reward} std_reward={std_reward}")

ModuleNotFoundError: No module named 'stable_baselines3'

In [1]:
pip install stable-baselines3[extra]==1.6.0

Collecting stable-baselines3==1.6.0 (from stable-baselines3[extra]==1.6.0)
  Using cached stable_baselines3-1.6.0-py3-none-any.whl.metadata (4.0 kB)
Collecting gym==0.21 (from stable-baselines3==1.6.0->stable-baselines3[extra]==1.6.0)
  Using cached gym-0.21.0.tar.gz (1.5 MB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[1 lines of output][0m
  [31m   [0m error in gym setup command: 'extras_require' must be a dictionary whose values are strings or lists of strings containing valid project/version requirement specifiers.
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[?25h[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m 