In [1]:
import numpy as np
import gymnasium as gym
import imageio
from IPython.display import HTML
    

def Training(env, n_train_episodes, lr, max_steps, gamma, min_epsilon, max_epsilon, decay):
    """Learn a tabular Q-function with epsilon-greedy Q-learning.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` and ``step(action)``.
        n_train_episodes: Number of training episodes to run.
        lr: Learning rate applied to every temporal-difference update.
        max_steps: Upper bound on the number of steps taken per episode.
        gamma: Discount factor applied to the best next-state value.
        min_epsilon: Value the exploration rate decays toward.
        max_epsilon: Exploration rate used for the first episode.
        decay: Exponential decay rate of the exploration rate per episode.

    Returns:
        The learned Q-table, an array of shape (number of states, number of actions).
    """
    # Step 1: initialise the table with zeros.
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions))

    epsilon_span = max_epsilon - min_epsilon

    # Step 2: learning.
    for episode in range(n_train_episodes):
        state, _ = env.reset()

        # The exploration rate shrinks exponentially with the episode index.
        epsilon = min_epsilon + epsilon_span * np.exp(-decay * episode)

        for _ in range(max_steps):
            draw = np.random.random(1)[0]
            if draw > epsilon:
                # Exploitation: take the best known action for this state.
                action = np.argmax(Q[state, :])
            else:
                # Exploration: take a random action.
                action = env.action_space.sample()

            next_state, reward, terminated, truncated, _ = env.step(action)

            # Bellman equation: move the estimate toward the bootstrapped target.
            td_target = reward + gamma * np.max(Q[next_state, :])
            Q[state, action] += lr * (td_target - Q[state, action])

            state = next_state

            if terminated or truncated:
                break

    return Q

def Igranje(env, Q, file, max_steps=None):
    """Play one greedy episode with a Q-table and save the rendered frames.

    The action with the highest Q-value is taken in every state. The result of
    ``env.render()`` is collected after the reset and after every step, and the
    collected frames are written to ``file`` with ``imageio.mimwrite`` at one
    frame per second.

    The episode ends when the environment reports ``terminated`` or
    ``truncated``. Checking ``truncated`` as well matters: a greedy policy that
    never reaches a terminal state would otherwise loop forever.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and ``render()``.
            Presumably created with ``render_mode="rgb_array"`` so that
            ``render()`` returns image frames; verify against the caller.
        Q: Q-table indexed as ``Q[observation, action]``.
        file: Output path handed to ``imageio.mimwrite``.
        max_steps: Optional cap on the number of steps played. ``None`` (the
            default) relies solely on the environment's own
            ``terminated``/``truncated`` signals.
    """
    frames = []

    observation, info = env.reset()
    frames.append(env.render())

    terminated = False
    truncated = False
    steps_taken = 0

    while not (terminated or truncated):
        # Safety net for environments that never signal the end of an episode.
        if max_steps is not None and steps_taken >= max_steps:
            break

        action = np.argmax(Q[observation, :])
        observation, reward, terminated, truncated, info = env.step(action)
        frames.append(env.render())
        steps_taken += 1

    imageio.mimwrite(file, frames, fps=1)

def Vrednotenje(env, Q, n_eval_episodes, max_steps):
    """Evaluate the greedy policy of a Q-table over several episodes.

    Every episode follows the action with the highest Q-value in each state and
    stops after ``max_steps`` steps or as soon as the environment reports
    ``terminated`` or ``truncated``.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        Q: Q-table indexed as ``Q[observation, action]``.
        n_eval_episodes: Number of evaluation episodes.
        max_steps: Upper bound on the number of steps per episode.

    Returns:
        A tuple ``(mean, std)`` with the mean and the standard deviation of the
        total reward collected per episode.
    """
    rewards = np.zeros(n_eval_episodes)

    for episode in range(n_eval_episodes):
        observation, info = env.reset()
        episode_reward = 0

        for _ in range(max_steps):
            action = np.argmax(Q[observation, :])
            observation, reward, terminated, truncated, info = env.step(action)
            episode_reward += reward

            if terminated or truncated:
                break

        # Write into the preallocated slot instead of rebuilding the whole
        # array each episode.
        rewards[episode] = episode_reward

    return np.mean(rewards), np.std(rewards)


In [2]:
# FrozenLake 4x4, non-slippery: create the environment, learn the Q-table,
# record one greedy episode as a GIF and print the evaluation statistics.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=False,
    render_mode="rgb_array",
)
tabela = Training(
    env,
    n_train_episodes=10000,
    lr=0.7,
    max_steps=100,
    gamma=0.95,
    min_epsilon=0.05,
    max_epsilon=1.0,
    decay=0.0005,
)
Igranje(env, tabela, file="FrozenLake_4x4_not_slippery.gif")
print(
    "FrozenLake-v1,4x4,not_slippery - mean, std dev:",
    Vrednotenje(env, tabela, n_eval_episodes=100, max_steps=6),
)
env.close()

FrozenLake-v1,4x4,not_slippery - mean, std dev: (1.0, 0.0)


In [3]:
# FrozenLake 4x4, slippery: same pipeline as the non-slippery run, with a
# longer evaluation horizon of 20 steps per episode.
env = gym.make(
    "FrozenLake-v1",
    map_name="4x4",
    is_slippery=True,
    render_mode="rgb_array",
)
tabela = Training(
    env,
    n_train_episodes=10000,
    lr=0.7,
    max_steps=100,
    gamma=0.95,
    min_epsilon=0.05,
    max_epsilon=1.0,
    decay=0.0005,
)
Igranje(env, tabela, file="FrozenLake_4x4_slippery.gif")
print(
    "FrozenLake-v1,4x4,slippery - mean, std dev:",
    Vrednotenje(env, tabela, n_eval_episodes=100, max_steps=20),
)
env.close()

FrozenLake-v1,4x4,slippery - mean, std dev: (0.16, 0.36660605559646714)


In [77]:
# FrozenLake 8x8, non-slippery: larger map, trained for 250000 episodes with
# up to 400 steps each, then recorded and evaluated over 1000 episodes.
env = gym.make(
    "FrozenLake-v1",
    map_name="8x8",
    is_slippery=False,
    render_mode="rgb_array",
)
tabela = Training(
    env,
    n_train_episodes=250000,
    lr=0.8,
    max_steps=400,
    gamma=0.9,
    min_epsilon=0.001,
    max_epsilon=1.0,
    decay=0.0005,
)
Igranje(env, tabela, file="FrozenLake_8x8_not_slippery.gif")
print(
    "FrozenLake-v1,8x8,not_slippery - mean, std dev:",
    Vrednotenje(env, tabela, n_eval_episodes=1000, max_steps=400),
)
env.close()

KeyboardInterrupt: 

In [86]:
# FrozenLake 8x8, slippery: train the Q-table and record one greedy episode.
# Evaluation with Vrednotenje is currently disabled for this configuration.
env = gym.make(
    "FrozenLake-v1",
    map_name="8x8",
    is_slippery=True,
    render_mode="rgb_array",
)
tabela = Training(
    env,
    n_train_episodes=250000,
    lr=0.8,
    max_steps=400,
    gamma=0.9,
    min_epsilon=0.001,
    max_epsilon=1.0,
    decay=0.0005,
)
Igranje(env, tabela, file="FrozenLake_8x8_slippery.gif")
env.close()

KeyboardInterrupt: 

In [4]:
# Taxi-v3: create the environment, learn the Q-table, record one greedy
# episode as a GIF and print the evaluation statistics.
env = gym.make('Taxi-v3', render_mode="rgb_array")
tabela = Training(
    env,
    n_train_episodes=10000,
    lr=0.7,
    max_steps=100,
    gamma=0.95,
    min_epsilon=0.05,
    max_epsilon=1.0,
    decay=0.0005,
)
Igranje(env, tabela, file="Taxi-v3.gif")
print(
    "Taxi-v3 - mean, std dev:",
    Vrednotenje(env, tabela, n_eval_episodes=100, max_steps=100),
)
env.close()

Taxi-v3 - mean, std dev: (7.58, 2.661503334583671)


In [5]:
# CliffWalking-v0: create the environment, learn the Q-table, record one
# greedy episode as a GIF and print the evaluation statistics.
env = gym.make('CliffWalking-v0', render_mode="rgb_array")
tabela = Training(
    env,
    n_train_episodes=10000,
    lr=0.7,
    max_steps=100,
    gamma=0.95,
    min_epsilon=0.05,
    max_epsilon=1.0,
    decay=0.0005,
)
Igranje(env, tabela, file="CliffWalking-v0.gif")
print(
    "CliffWalking-v0 - mean, std dev:",
    Vrednotenje(env, tabela, n_eval_episodes=100, max_steps=100),
)
env.close()

CliffWalking-v0 - mean, std dev: (-13.0, 0.0)


In [8]:
# Show the recorded Taxi-v3 episode inline via an HTML image tag.
HTML(data='<img src="./Taxi-v3.gif">')