In [4]:
import gymnasium as gym

### Inplementacja Q Learn z polityka Epsilon Greedy

In [5]:
import random
import numpy as np
from tqdm.auto import tqdm

class Q_Learn:
    def __init__(self, epsilon, gamma, lr, env):
        self.epsilon = epsilon
        self.initial_epsilon = epsilon  
        self.gamma = gamma
        self.lr = lr
        self.env = env
        self.q_table = self._init_qtable(env)  # Użyj _ zamiast __ dla metod wewnętrznych

    def _init_qtable(self, env):
        """Inicjalizuje Q-table o kształcie (num_states, num_actions)."""
        if isinstance(env.observation_space, gym.spaces.Discrete):
            num_states = env.observation_space.n
        else:
            raise ValueError("Tylko środowiska z dyskretnymi stanami są obsługiwane!")
        num_actions = env.action_space.n
        return np.zeros((num_states, num_actions))

    def _greedy_policy(self, state):
        """Zwraca akcję o najwyższej wartości Q dla danego stanu."""
        return np.argmax(self.q_table[state])

    def epsilon_greedy_policy(self, state):
        """ε-zachłanna polityka wyboru akcji."""
        if random.random() > self.epsilon:
            return self._greedy_policy(state)
        else:
            return self.env.action_space.sample()

    def reduce_epsilon(self, epoch, total_epochs):
        """Liniowa redukcja epsilon w trakcie treningu."""
        self.epsilon = max(self.initial_epsilon * (1 - epoch / total_epochs), 0.01)

    def train(self, epochs, max_steps):
        for epoch in tqdm(range(epochs)):
            state, info = self.env.reset()
            terminated, truncated = False, False

            for _ in range(max_steps):
                action = self.epsilon_greedy_policy(state)
                new_state, reward, terminated, truncated, info = self.env.step(action)\

                best_next_action = np.argmax(self.q_table[new_state])
                td_target = reward + self.gamma * self.q_table[new_state][best_next_action]
                self.q_table[state][action] += self.lr * (td_target - self.q_table[state][action])

                if terminated or truncated:
                    break
                state = new_state

            self.reduce_epsilon(epoch, epochs)

        print("Trening zakończony!")

    def evaluate(self, eval_epochs,  max_steps, seed):
        episode_rewards = []
        for epoch in range(eval_epochs):
            if seed:
                state, info = self.env.reset(seed = random.randint(0, seed))
            else:
                state, info = self.env.reset()

            step = 0
            truncated = False
            terminated = False
            total_rewards_ep = 0

            for step in range(max_steps):
                action = self._greedy_policy(state)
                new_state, reward, terminated, truncated, info = self.env.step(action)\

                total_rewards_ep += reward

                if terminated or truncated:
                    break

                state = new_state
            episode_rewards.append(total_rewards_ep)
        mean_reward = np.mean(episode_rewards)
        std_reward = np.std(reward)

        return mean_reward, std_reward

In [6]:
import cv2
import time

def record_video(env, Qtable, fps=10):
    """
    Wyświetl wideo z działania agenta opartego na Qtable.
    :param env: środowisko Gym z render_mode='rgb_array'
    :param Qtable: wytrenowana tablica Q
    :param fps: liczba klatek na sekundę (domyślnie 1)
    """
    state, info = env.reset(seed=random.randint(0, 500))
    terminated = False
    truncated = False

    frames = []

    while not (terminated or truncated):
        frame = env.render()  # render_mode='rgb_array' zwraca klatkę jako obraz
        frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # OpenCV używa BGR
        frames.append(frame_bgr)

        action = np.argmax(Qtable[state])
        state, reward, terminated, truncated, info = env.step(action)

    env.close()

    return frames

### Frozeem lake from openAI gym. With not slippery  `is_slippery=False`

In [7]:
env = gym.make('FrozenLake-v1', map_name = "8x8", is_slippery = False)

In [8]:
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space", env.observation_space)
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

Observation Space Discrete(64)
Sample observation 9


In [9]:
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample())


 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 0


In [10]:
agent = Q_Learn(epsilon=0.9, gamma=0.99, lr=0.1, env=env)

In [15]:
agent.train(epochs=30000, max_steps=500)

100%|██████████| 30000/30000 [00:14<00:00, 2076.42it/s]

Trening zakończony!





In [16]:
agent.q_table

array([[8.68745813e-01, 8.77521023e-01, 8.77521023e-01, 8.68745813e-01],
       [8.68745813e-01, 8.86384872e-01, 8.86384872e-01, 8.77521023e-01],
       [8.77521023e-01, 8.95338254e-01, 8.95338254e-01, 8.86384872e-01],
       [8.86384872e-01, 9.04382075e-01, 9.04382075e-01, 8.95338254e-01],
       [8.95338254e-01, 9.13517247e-01, 9.13517247e-01, 9.04382075e-01],
       [9.04382075e-01, 9.22744694e-01, 9.22744694e-01, 9.13517247e-01],
       [9.13517247e-01, 9.32065348e-01, 9.32065348e-01, 9.22744694e-01],
       [9.22744692e-01, 9.41480149e-01, 9.32065343e-01, 9.32065347e-01],
       [8.77521023e-01, 8.86384872e-01, 8.86384872e-01, 8.68745813e-01],
       [8.77521023e-01, 8.95338254e-01, 8.95338254e-01, 8.77521023e-01],
       [8.86384872e-01, 9.04382075e-01, 9.04382075e-01, 8.86384872e-01],
       [8.95338254e-01, 0.00000000e+00, 9.13517247e-01, 8.95338254e-01],
       [9.04382075e-01, 9.22744694e-01, 9.22744694e-01, 9.04382075e-01],
       [9.13517247e-01, 9.32065348e-01, 9.32065348e

In [17]:
mean_reward, std_reward = agent.evaluate(50, 50, 500)
print(mean_reward, std_reward)

1.0 0.0


In [18]:
env = gym.make('FrozenLake-v1', map_name = "8x8", is_slippery = False, render_mode='rgb_array')

frames = record_video(env, agent.q_table)

np.array(frames).shape

(14, 512, 512, 3)

In [19]:
for frame in frames:
    cv2.imshow('test', frame)

    cv2.waitKey(200)

cv2.destroyAllWindows()

### Frozeem lake from openAI gym. With not slippery  `is_slippery=True`

In [21]:
env = gym.make('FrozenLake-v1', map_name= "8x8", is_slippery = True)

agent = Q_Learn(epsilon=0.9, gamma=0.99, lr=0.1, env=env)

agent.train(20000, 500)

100%|██████████| 20000/20000 [00:24<00:00, 802.49it/s]

Trening zakończony!





In [22]:
agent.q_table

array([[4.00001152e-01, 4.00646722e-01, 3.95993457e-01, 4.26094682e-01],
       [3.99210123e-01, 4.10604310e-01, 4.36328355e-01, 4.10725078e-01],
       [4.16124600e-01, 4.21579396e-01, 4.58196527e-01, 4.02619568e-01],
       [4.30704004e-01, 4.46455963e-01, 4.78255254e-01, 4.54171247e-01],
       [4.56789227e-01, 4.61904534e-01, 5.14709827e-01, 4.63656118e-01],
       [4.79484726e-01, 4.84523847e-01, 5.39128711e-01, 4.80454258e-01],
       [5.09631490e-01, 5.14543223e-01, 5.58123195e-01, 5.14512794e-01],
       [5.60134323e-01, 5.26046686e-01, 5.15476719e-01, 5.12007451e-01],
       [3.84445396e-01, 3.73565564e-01, 3.80853674e-01, 4.21509975e-01],
       [3.89566472e-01, 3.76348158e-01, 3.92395860e-01, 4.27503308e-01],
       [4.06621645e-01, 3.89645435e-01, 3.95543956e-01, 4.47898597e-01],
       [2.87915853e-01, 2.99384195e-01, 3.43337865e-01, 4.74089867e-01],
       [4.24478506e-01, 4.47657474e-01, 4.53362913e-01, 5.08859885e-01],
       [4.64492917e-01, 4.72850655e-01, 4.67547152e

In [25]:
agent.evaluate(50, 100, 500)

(np.float64(0.66), np.float64(0.0))

In [26]:
env = gym.make('FrozenLake-v1', map_name = "8x8", is_slippery = False, render_mode='rgb_array')

frames = record_video(env, agent.q_table)

np.array(frames).shape

(100, 512, 512, 3)

In [17]:
for frame in frames:
    cv2.imshow('test', frame)

    cv2.waitKey(100)

cv2.destroyAllWindows()

### Taxi-v3

In [27]:
env = gym.make('Taxi-v3')

agent = Q_Learn(epsilon=0.9, gamma=0.99, lr=0.1, env=env)

agent.train(20000, 100)

100%|██████████| 20000/20000 [00:23<00:00, 861.43it/s] 

Trening zakończony!





In [28]:
agent.q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 7.44059051,  8.525849  ,  7.44059051,  8.525849  ,  9.6220697 ,
        -0.474151  ],
       [11.84784175, 12.97761793, 11.84784175, 12.97761793, 14.11880599,
         3.97761793],
       ...,
       [14.07539045, 15.2715212 , 14.06566413, 12.92864593,  5.0887527 ,
         5.09508088],
       [ 9.6194383 , 10.7227343 ,  9.62018644, 10.72936333,  0.6169259 ,
         0.61966805],
       [17.61199987, 16.43587583, 17.6119993 , 18.8       ,  8.61199815,
         8.61199966]], shape=(500, 6))

In [29]:
agent.evaluate(50, 100, 500)

(np.float64(7.3), np.float64(0.0))

In [30]:
env = gym.make('Taxi-v3', render_mode='rgb_array')
frames = record_video(env, agent.q_table)

np.array(frames).shape

(12, 350, 550, 3)

In [31]:
for frame in frames:
    cv2.imshow('test', frame)

    cv2.waitKey(1000)

cv2.destroyAllWindows()