In [1]:
import gymnasium as gym

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import (notebook_login,)

In [2]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

## 1. Model Definition

In [3]:
# Instantiate the LunarLander-v3 environment and reset it to obtain the first observation.
env = gym.make("LunarLander-v3")
observation, info = env.reset()

# Report the observation space: its shape plus one randomly drawn sample.
obs_space = env.observation_space
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space Shape ", obs_space.shape)
print("Sample observation ", obs_space.sample())

# Report the action space: the value of its `n` attribute plus one sampled action.
act_space = env.action_space
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", act_space.n)
print("Action Space Sample", act_space.sample())

_____OBSERVATION SPACE_____ 

Observation Space Shape  (8,)
Sample observation  [-0.40257037  1.1994597   0.01725279  1.6703686  -4.1764774   4.785459
  0.8142918   0.36941528]

 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 3


  from pkg_resources import resource_stream, resource_exists


In [None]:
# Take 20 steps with randomly sampled actions to see how the environment responds.
for _ in range(20):
    action = env.action_space.sample()
    print("Action taken: ", action)

    observation, reward, terminated, truncated, info = env.step(action)

    # Keep stepping while the episode is still running.
    if not (terminated or truncated):
        continue

    # The episode ended (terminated or truncated): start a new one.
    print("Environment is reset")
    observation, info = env.reset()

# Release the environment once the random rollout is finished.
env.close()

## 2. Model Training

In [None]:
# Name under which the trained agent is written to disk.
model_name = "ppo-LunarLander-v3"

# Fresh environment instance for training.
env = gym.make("LunarLander-v3")

# PPO agent using the "MlpPolicy" policy; verbose=1 enables training log output.
model = PPO(policy="MlpPolicy", env=env, verbose=1)

# Train for one million timesteps, then persist the agent.
model.learn(total_timesteps=1_000_000)
model.save(model_name)

## 3. Model Evaluation

In [5]:
# Evaluate the current `model` on a separate, Monitor-wrapped environment.
eval_env = Monitor(env=gym.make("LunarLander-v3"))

# Run 10 evaluation episodes with deterministic action selection and
# collect the mean and standard deviation of the episode rewards.
mean_reward, std_reward = evaluate_policy(
    model=model,
    env=eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

mean_reward=249.21 +/- 21.825505942447712


In [24]:
import time

# Load the agent saved under this name and open a windowed ("human") environment.
model = PPO.load("ppo-LunarLander-v3")
env = gym.make("LunarLander-v3", render_mode="human")

try:
    obs, _ = env.reset()
    print("게임 시작! (이제 안 멈출 겁니다)")

    # Play episodes back-to-back until the kernel is interrupted.
    while True:
        time.sleep(0.015)  # pause 15 ms between steps

        action, _ = model.predict(obs, deterministic=True)
        # With render_mode="human" Gymnasium draws the frame inside step(),
        # so no explicit env.render() call is needed here.
        # `done` receives the third element of step()'s 5-tuple (the "terminated" flag).
        obs, reward, done, truncated, info = env.step(action)

        if done or truncated:
            obs, _ = env.reset()
            print("한 판 종료! 다시 시작")

except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop playback.
    print("종료합니다.")
finally:
    # Always release the render window, even if an unexpected error ends the loop.
    env.close()

게임 시작! (이제 안 멈출 겁니다)
종료합니다.
