In [1]:
from stable_baselines3 import PPO
import  gymnasium as gym
from collections import defaultdict
import numpy as np

In [2]:
env = gym.make("CartPole-v1")

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100_000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.7     |
|    ep_rew_mean     | 21.7     |
| time/              |          |
|    fps             | 6472     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 24.6        |
|    ep_rew_mean          | 24.6        |
| time/                   |             |
|    fps                  | 4136        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009581953 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x168df3850>

In [18]:
from stable_baselines3.common.vec_env import DummyVecEnv


def make_render_env():
    return gym.make("CartPole-v1", render_mode="human")

env = DummyVecEnv([make_render_env])

# Testowanie modelu
episodes = 5

for episode in range(episodes):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        env.render() 
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        total_reward += reward
    print(f"Episode {episode + 1} finished with reward: {total_reward}")

env.close()

Episode 1 finished with reward: [500.]
Episode 2 finished with reward: [500.]
Episode 3 finished with reward: [500.]
Episode 4 finished with reward: [500.]
Episode 5 finished with reward: [500.]


In [2]:
class QLearningAgent:

    def __init__(
            self,
            env: gym.Env,
            learning_rate: float,
            initial_epsilon: float,
            epsilon_decay: float,
            final_epsilon: float,
            discount_factor: float = 0.9
    ):

        self.env = env
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.training_error = []

    def get_action(self, obs: tuple[int, int, bool]) -> int:

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return int(np.argmax(self.q_values[obs]))

    def update(
            self,
            obs: tuple[int, int, bool],
            action: int,
            reward: float,
            terminated: bool,
            next_obs: tuple[int, int, bool],
    ):
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = (
                reward + self.discount_factor * future_q_value - self.q_values[obs][action]
        )

        self.q_values[obs][action] = (
                self.q_values[obs][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)
        
    def get_action_test(self, obs: tuple[int, int, bool]) -> int:
        return int(np.argmax(self.q_values[obs]))
        

In [3]:
from tqdm import tqdm


def learn(
        learning_rate: float,
        n_episodes: int,
        start_epsilon: float,
        epsilon_decay: float, 
        final_epsilon: float
        ):
    
    env = gym.make("CliffWalking-v0")
    
    agent = QLearningAgent(
        env=env,
        learning_rate=learning_rate,
        initial_epsilon=start_epsilon,
        epsilon_decay=epsilon_decay,
        final_epsilon=final_epsilon,
    )
    
    for episode in tqdm(range(n_episodes)):
        obs, info = env.reset()
        done = False
    
        while not done:
            action = agent.get_action(obs)
            next_obs, reward, terminated, truncated, info = env.step(action)
    
            agent.update(obs, action, reward, terminated, next_obs)
    
            done = terminated or truncated
            obs = next_obs

        agent.decay_epsilon()
    
    return agent

In [5]:
learning_rate = 0.01
n_episodes = 50_000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)
final_epsilon = 0.1

In [6]:
agentQLearning = learn(learning_rate, n_episodes, start_epsilon, epsilon_decay, final_epsilon)

100%|██████████| 50000/50000 [01:23<00:00, 595.53it/s] 


In [7]:
episodes = 5
env = gym.make("CartPole-v1", render_mode="human")

for episode in range(episodes):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        
        action = agentQLearning.get_action_test(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)
        
        done = terminated or truncated
        obs = next_obs
        total_reward += reward
        
    print(f"Episode {episode + 1} finished with reward: {total_reward}")

TypeError: unhashable type: 'numpy.ndarray'