## Imports

In [1]:
import numpy as np
import gymnasium as gym
from IPython.display import HTML
from base64 import b64encode
import imageio

## Utils

In [2]:
def record_video(env, policy, out_directory, fps=1, random_action=False, max_steps=100):
    images = []  
    done = False
    truncated = False
    state, info = env.reset()
    img = env.render()
    images.append(img)
    total_reward = 0
    i = 0
    while not done and not truncated:
        i += 1
        if i > max_steps:
            break
        action = np.random.randint(4) if random_action else policy[state]
        state, reward, done, truncated, info = env.step(action)
        total_reward += reward
        img = env.render()
        images.append(img)
        if not random_action:
            print(f"action: {action}, state: {state}, reward: {reward}, done: {done}, truncated: {truncated}, info: {info}")
    imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)
    return total_reward

In [3]:
def show_video(video_path, video_width=500):
    video_file = open(video_path, "r+b").read()
    video_url = f"data:video/mp4;base64,{b64encode(video_file).decode()}"
    return HTML(f"""<video width={video_width} controls><source src="{video_url}"></video>""")

## Random Walk

In [4]:
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode='rgb_array')
total_reward = record_video(env, None, 'frozenlake_random.mp4', fps=3, random_action=True)
print(f"total reward: {total_reward}")
show_video('frozenlake_random.mp4', video_width=500)

error: XDG_RUNTIME_DIR not set in the environment.


total reward: 0.0


## Define FrozenLake MDP

In [5]:
class FrozenLakeMDP:
    def __init__(self, is_slippery):
        self.is_slippery = is_slippery
        self.terminal_states = np.zeros(16, dtype=int)
        self.terminal_states[[5, 7, 11, 12, 15]] = 1
        self.reward_fn = np.zeros(16, dtype=int)
        self.reward_fn[15] = 1

    def is_terminal(self, state):
        return self.terminal_states[state]
    
    def get_reward_function(self):
        return self.reward_fn

    def next_state_det(self, state, action):
        if action == 0:    # LEFT
            next_state = state - 1 if state % 4 != 0 else state
        elif action == 1:  # DOWN
            next_state = state + 4 if state // 4 != 3 else state
        elif action == 2:  # RIGHT
            next_state = state + 1 if state % 4 != 3 else state
        elif action == 3:  # UP
            next_state = state - 4 if state // 4 != 0 else state
        else:         # WRONG ACTION
            next_state = state
        return next_state
    
    def trans_prob(self, state, action):
        prob = np.zeros((16,), dtype=float)
        if not self.is_slippery:
            prob[self.next_state_det(state, action)] = 1.0
        else:
            prob[self.next_state_det(state, action)] += 1/3
            prob[self.next_state_det(state, (action+1)%4)] += 1/3
            prob[self.next_state_det(state, (action-1)%4)] += 1/3
        return prob

    def next_state_reward(self, state, action):
        next_state_probs = self.trans_prob(state, action)
        next_state = np.random.choice(16, p=next_state_probs)
        reward = self.reward_fn[next_state]
        return next_state, reward

In [6]:
dynamics = FrozenLakeMDP(is_slippery=False)

In [7]:
# reward function of the environent
dynamics.get_reward_function()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [8]:
# evaluating if a given state is a terminal state (= hole or goal)
print(dynamics.is_terminal(0), dynamics.is_terminal(7), dynamics.is_terminal(15))

0 1 1


In [9]:
# if we take action `a` in state `s`,
# what is the probability of landing in each state?
dynamics.trans_prob(14, 2)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [10]:
# if we take action `a` in state `s`, what do we get?
# this is done through sampling the transition probability
next_state, reward = dynamics.next_state_reward(11, 1)
print(next_state, reward)

15 1


## Iterative Policy Evaluation

In [11]:
def policy_evaluation(dynamics, policy, gamma=0.9, num_iter=10):
    """
    evaluates policy based on Iterative Policy Evaluation.
 
    Args:
        dynamics (FrozenLakeMDP): dynamics of the environment.
        policy (numpy.array): policy we want to evaluate.
        gamma (float): discount factor.
        num_iter (int): number of iterations for the loop.
 
    Returns:
        numpy.array: state value function.
    """

    # TODO: implement Iterative Policy Evaluation algorithm

    s_value_function = np.zeros(16, dtype=float)
    rewards = dynamics.get_reward_function()
    for _ in range(num_iter):
        for state in range(16):
            if dynamics.is_terminal(state) and state != 15:
                continue
            probs = dynamics.trans_prob(state, policy[state])
            value = 0
            for i in range(16):
                # print(next_state, reward)
                value += probs[i]*(rewards[i] + gamma*s_value_function[i])
            s_value_function[state] = value
        # for state in range(16):
        #     print(f"({state}:{s_value_function[state]})", end=",")
        # print()
    return s_value_function

In [510]:
dynamics = FrozenLakeMDP(is_slippery=False)

# 1. go-right policy
policy = 2 * np.ones(16, dtype=int)

# 2. shortest-path policy
policy = np.array([1, 2, 1, 0, 1, -1, 1, -1, 2, 1, 1, -1, -1, 2, 2, -1])

s_value_function = policy_evaluation(dynamics, policy)

(0:0.0),(1:0.0),(2:0.0),(3:0.0),(4:0.0),(5:0.0),(6:0.0),(7:0.0),(8:0.0),(9:0.0),(10:0.0),(11:0.0),(12:0.0),(13:0.0),(14:1.0),(15:1.0),
(0:0.0),(1:0.0),(2:0.0),(3:0.0),(4:0.0),(5:0.0),(6:0.0),(7:0.0),(8:0.0),(9:0.0),(10:0.9),(11:0.0),(12:0.0),(13:0.9),(14:1.9),(15:1.9),
(0:0.0),(1:0.0),(2:0.0),(3:0.0),(4:0.0),(5:0.0),(6:0.81),(7:0.0),(8:0.0),(9:0.81),(10:1.71),(11:0.0),(12:0.0),(13:1.71),(14:2.71),(15:2.71),
(0:0.0),(1:0.0),(2:0.7290000000000001),(3:0.6561000000000001),(4:0.0),(5:0.0),(6:1.539),(7:0.0),(8:0.7290000000000001),(9:1.539),(10:2.439),(11:0.0),(12:0.0),(13:2.439),(14:3.439),(15:3.439),
(0:0.0),(1:0.6561000000000001),(2:1.3851),(3:1.24659),(4:0.6561000000000001),(5:0.0),(6:2.1951),(7:0.0),(8:1.3851),(9:2.1951),(10:3.0951),(11:0.0),(12:0.0),(13:3.0951),(14:4.0951),(15:4.0951),
(0:0.5904900000000002),(1:1.24659),(2:1.9755900000000002),(3:1.7780310000000001),(4:1.24659),(5:0.0),(6:2.78559),(7:0.0),(8:1.9755900000000002),(9:2.78559),(10:3.6855900000000004),(11:0.0),(12:0.0),(13:3.

In [511]:
dynamics = FrozenLakeMDP(is_slippery=True)

# 1. go-right policy
policy = 2 * np.ones(16, dtype=int)

# 2. shortest-path policy
policy = np.array([1, 2, 1, 0, 1, -1, 1, -1, 2, 1, 1, -1, -1, 2, 2, -1])

s_value_function_slippery = policy_evaluation(dynamics, policy)

(0:0.0),(1:0.0),(2:0.0),(3:0.0),(4:0.0),(5:0.0),(6:0.0),(7:0.0),(8:0.0),(9:0.0),(10:0.0),(11:0.0),(12:0.0),(13:0.0),(14:0.3333333333333333),(15:0.7666666666666666),
(0:0.0),(1:0.0),(2:0.0),(3:0.0),(4:0.0),(5:0.0),(6:0.0),(7:0.0),(8:0.0),(9:0.0),(10:0.09999999999999999),(11:0.0),(12:0.0),(13:0.09999999999999999),(14:0.6933333333333332),(15:1.3346666666666664),
(0:0.0),(1:0.0),(2:0.0),(3:0.0),(4:0.0),(5:0.0),(6:0.03),(7:0.0),(8:0.0),(9:0.06),(10:0.22599999999999995),(11:0.0),(12:0.0),(13:0.25599999999999995),(14:1.0095333333333332),(15:1.7703266666666666),
(0:0.0),(1:0.0),(2:0.009),(3:0.0026999999999999997),(4:0.0),(5:0.0),(6:0.06779999999999999),(7:0.0),(8:0.018),(9:0.14999999999999997),(10:0.3478599999999999),(11:0.0),(12:0.0),(13:0.42465999999999987),(14:1.2716493333333332),(15:2.1103574666666667),
(0:0.0),(1:0.0026999999999999997),(2:0.021959999999999997),(3:0.007397999999999998),(4:0.005399999999999999),(5:0.0),(6:0.10435799999999996),(7:0.0),(8:0.046619999999999995),(9:0.2457419999

In [814]:
# TODO: print and analyze the state value function
print("ارزش حالات تابع ارزش در حالت is_slippery=False به شکل زیر است.")
for state in range(len(s_value_function)):
    print(f"(state={state}, value(s)={s_value_function[state]})")
print("همانطور که میبینیم ارزش‌ها در هر ایتریشن آپدیت میشوند و با توجه به قطعی بودن نتایج عمل عامل، بالاترین ارزش‌ها بعد از 10 بار ایتریشن به دست می‌آید.")
print("همچنین چون در استیت‌های سوراخ، next_state=state است و ریوارد آن هم صفر است، ارزش‌ آنها آپدیت نمیشود و همواره صفر باقی میماند که این به دلیل این است که بازی در همان جا تمام میشود.")
print("در استیت 15 هم مجددا next_state=state میشود اما چون ریوارد آن یک است در هر مرحله آپدیت میشود.")
print("هرچه به خانه شانزدهم(اندیس 15) نزدیک می‌شویم، ارزش استیت‌های غیر سوراخ بیشتر میشود.")
print("\n")

print("ارزش حالات تابع ارزش در حالت is_slippery=True به شکل زیر است.")
for state in range(len(s_value_function_slippery)):
    print(f"(state = {state}, value(s) = {s_value_function[state]})")
print("تفاوت این حالت با حالت قبلی، قطعی نبودن حرکت‌های بعدی است که در نتیجه آن نتایج تصمیمات ما به نوعی نویز دارد و با احتمال خاصی رخ می‌دهد و ارزش هر یک از استیت‌ها نسبت به حالت قبلی کمتر است.")
print("در واقع در این حالت نمیتوانیم سیاست را به صورت بهینه اجرا کنیم.")

ارزش حالات تابع ارزش در حالت is_slippery=False به شکل زیر است.
(state=0, value(s)=2.4181155990000005)
(state=1, value(s)=3.074215599000001)
(state=2, value(s)=3.803215599000001)
(state=3, value(s)=3.422894039100001)
(state=4, value(s)=3.074215599000001)
(state=5, value(s)=0.0)
(state=6, value(s)=4.613215599)
(state=7, value(s)=0.0)
(state=8, value(s)=3.803215599000001)
(state=9, value(s)=4.613215599)
(state=10, value(s)=5.5132155990000005)
(state=11, value(s)=0.0)
(state=12, value(s)=0.0)
(state=13, value(s)=5.5132155990000005)
(state=14, value(s)=6.5132155990000005)
(state=15, value(s)=6.5132155990000005)
همانطور که میبینیم ارزش‌ها در هر ایتریشن آپدیت میشوند و با توجه به قطعی بودن نتایج عمل عامل، بالاترین ارزش‌ها بعد از 10 بار ایتریشن به دست می‌آید.
همچنین چون در استیت‌های سوراخ، next_state=state است و ریوارد آن هم صفر است، ارزش‌ آنها آپدیت نمیشود و همواره صفر باقی میماند که این به دلیل این است که بازی در همان جا تمام میشود.
در استیت 15 هم مجددا next_state=state میشود اما چون ریوارد آ

## Policy Iteration

In [12]:
def greedy_policy_improvement(dynamics, s_value_function, gamma=0.9):
    """
    obtains a policy in a greedy manner based on current state value function.
 
    Args:
        dynamics (FrozenLakeMDP): dynamics of the environment.
        s_value_function (numpy.array): state value function.
        gamma (float): discount factor.
 
    Returns:
        numpy.array: the greedy policy.
    """

    # TODO: implement Greedy Policy Improvement algorithm
    policy = np.random.randint(0, 4, size=16)
    rewards = dynamics.get_reward_function()
    for state in range(len(s_value_function)):
        for action in range(4):
            # print(f"")
            value = 0
            probs = dynamics.trans_prob(state, action)
            for i in range(16):
                value += probs[i]*(rewards[i] + gamma*s_value_function[i])
            if policy[state] != action and value > s_value_function[state]:
                policy[state] = action
                s_value_function[state] = value
                
    print(s_value_function.astype(int))
    
    return policy
    # policy = np.random.randint(0, 4, size=16)

    # rewards = dynamics.get_reward_function()


    # for i in range(16):
    #   best_action = 0
    #   best_v = 0
    #   for j in range(4):
    #     v = 0
    #     probs = dynamics.trans_prob(i, j)
    #     for k in range(16):
    #       v += probs[k] * (rewards[k] + gamma * s_value_function[k])
    #     if v>best_v:
    #       best_v=v
    #       best_action = j
    #   policy[i]=best_action


    # return policy

In [13]:
def policy_iteration(dynamics, gamma=0.9, outer_iter=100, inner_iter=100):
    """
    optimizes a policy based on Policy Iteration
 
    Args:
        dynamics (FrozenLakeMDP): dynamics of the environment.
        gamma (float): discount factor.
        outer_iter (int): number of iterations for the Policy Iteration loop.
        inner_iter (int): number of iterations for the Policy Evaluation loop.
 
    Returns:
        numpy.array: the optimized policy.
    """

    # TODO: implement Policy Iteration algorithm

    # 1.Initialization
    s_value_function = np.zeros(16, dtype=float)
    initial_policy = np.random.randint(0, 4, size=16)

    # 2.Policy Evaluation
    s_value_function = policy_evaluation(dynamics, initial_policy, gamma=gamma, num_iter=inner_iter)
    
    # 3.Policy Improvement
    policy = initial_policy
    for _ in range(outer_iter):
        new_policy = greedy_policy_improvement(dynamics, s_value_function, gamma=gamma)
        if (new_policy == policy).all():
            break
        else:
            s_value_function = policy_evaluation(dynamics, new_policy, gamma=gamma, num_iter=inner_iter)
            policy = new_policy
    print(f"policy:{policy}")
    
    return policy
    # policy = np.random.randint(0, 4, size=16)

    # for i in range(outer_iter):
    #   s_eval = policy_evaluation(dynamics=dynamics, policy=policy, gamma=gamma, num_iter=inner_iter)
    #   new_policy = greedy_policy_improvement(dynamics, s_eval, gamma=gamma)
    #   flag = True
    #   for j in range(16):
    #     if policy[j] != new_policy[j]:
    #       flag = False
    #       break
    #   if flag:
    #     return policy
    #   policy = new_policy

    # return policy

In [14]:
# TODO: test and analyze the algorithm

dynamics = FrozenLakeMDP(is_slippery=False)
policy = policy_iteration(dynamics)

[0 0 0 0 0 0 0 0 0 0 0 9 0 0 0 9]
[0 0 0 0 0 0 0 0 0 0 8 9 0 8 9 9]
[0 0 0 0 0 7 8 0 0 8 8 9 8 8 9 9]
[0 0 7 0 6 7 8 7 7 8 8 9 8 8 9 9]
[5 6 7 6 6 7 8 7 7 8 8 9 8 8 9 9]
[5 6 7 6 6 7 8 7 7 8 8 9 8 8 9 9]
[5 6 7 6 6 7 8 7 7 8 8 9 8 8 9 9]
[5 6 7 6 6 7 8 7 7 8 8 9 8 8 9 9]
[5 6 7 6 6 7 8 7 7 8 8 9 6 8 9 9]
[5 6 7 0 6 7 8 0 7 8 8 9 6 8 9 9]
[5 6 7 6 6 7 8 7 7 8 8 9 8 8 9 9]
[5 6 7 6 6 7 8 7 7 8 8 9 6 8 9 9]
[5 6 7 6 6 7 8 7 7 8 8 9 8 8 9 9]
[5 6 7 6 6 7 8 7 7 8 8 9 8 8 9 9]
policy:[1 2 1 0 1 1 1 0 2 1 1 1 2 2 2 2]


In [15]:
# TODO: test the policy on the environment

env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode='rgb_array')
total_reward = record_video(env, policy, 'frozenlake_policy_iteration_is_not_slippery.mp4', fps=5, random_action=False)
print(f"total reward: {total_reward}")
show_video('frozenlake_policy_iteration_is_not_slippery.mp4', video_width=500)

action: 1, state: 4, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 1, state: 8, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 2, state: 9, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 1, state: 13, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 2, state: 14, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 2, state: 15, reward: 1.0, done: True, truncated: False, info: {'prob': 1.0}
total reward: 1.0


در این حالت میبینیم که با توجه به قطعی بودن نتیجه حرکات، سیاست بهینه به صورت دقیق انجام میشود و هرگز truncate اتفاق نمی‌افتد.
در واقع با هر بار اجرای الگوریتم policy iteration سیاست به دست آمده یکسان خواهد بود و نتیجه اجرای سیاست همیشه یکسان خواهد بود.

In [16]:
# TODO: test and analyze the algorithm

dynamics_is_slippery = FrozenLakeMDP(is_slippery=True)
policy_is_slippery = policy_iteration(dynamics_is_slippery)

[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2]
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2]
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2]
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2]
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2]
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2]
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2]
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2]
policy:[0 3 0 3 0 1 0 3 3 1 0 1 3 2 1 2]


In [44]:
# TODO: test the policy on the environment
# 0 3 0 3 0 1 0 3 3 1 0 1 3 2 1 2
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True, render_mode='rgb_array')
total_reward = record_video(env, policy_is_slippery, 'frozenlake_policy_iteration_is_slippery.mp4', fps=5, random_action=False)
print(f"total reward: {total_reward}")
show_video('frozenlake_policy_iteration_is_slippery.mp4', video_width=500)

action: 0, state: 0, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 0, state: 4, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 0, state: 4, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 0, state: 0, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 0, state: 0, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 0, state: 4, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 0, state: 0, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 0, state: 4, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 0, state: 0, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 0, state: 0, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}


در حالت غیرقطعی، تصمینی وجود ندارد که عامل به مقصد برسد اما میتوان بهترین سیایت را به دست آورد. در هر بار اجرای سیاست بهینه actionهای رخ داده متفاوت هستند و ممکن است عامل به هدف برسد یا در سوراخ بیفتد اما اگر عامل طبق این سیاست عمل کند احتمال رسیدن عامل به هدف بیشتر از سایر سیاست ها است.

یا برای True = slippery_is آیا اصال سیاستی میتونه وجود داشته باشه که بتونه به صورت
۱۰۰ درصدی عامل روب همقص دبرسونه
خیر با توجه به توضیحات بالا هیچ سیاستی نمیتواند به صورت 100 درصدی تضمین دهد عامل به مقصد برسد.؟

## Q-Learning

In [45]:
class QAgent:  # The Q-Learning RL agent

    def __init__(self, num_states, num_actions, epsilon, alpha, gamma=0.9, eps_end=0.01, eps_decay=3e-6):

        self.num_states = num_states    # number of possible states
        self.num_actions = num_actions  # number of possible actions
        self.gamma = gamma              # discount factor
        self.epsilon = epsilon          # initial exploration probability
        self.alpha = alpha              # step size
        self.eps_decay = eps_decay      # linear decay rate of epsilon
        self.eps_end = eps_end          # minimum value for epsilon
        self.q_table = np.zeros((num_states, num_actions), dtype=float)

    def choose_action(self, state):
        """
        chooses an action in an epsilon-greedy manner.
    
        Args:
            state (int): current state of the agent.
    
        Returns:
            int: the chosen action
        """
        
        # TODO: implement epsilon-greedy action selection
        
        ep = self.epsilon
        random_num = np.random.random()
        if random_num > ep:
            action = 0
            best_q_value = 0
            for a in range(self.num_actions):
                if self.q_table[state, a] > best_q_value:
                    best_q_value = self.q_table[state, a]
                    action = a
        else:
            action = np.random.randint(0, self.num_actions)
        
        return action

    def learn(self, state, action, reward, next_state):
        """
        updates the q-table based on a single interaction with the environment.
    
        Args:
            state (int): state of the agent.
            action (int): action chosen by the agent.
            reward (int): reward obtained by the agent.
            next_state (int): next state of the agent.
        """

        # TODO: implement Q-table update
        best_q_prime = 0
        for a_prime in range(self.num_actions):
            if self.q_table[next_state, a_prime] > best_q_prime:
                best_q_prime = self.q_table[next_state, a_prime]
        sample = reward + self.gamma*best_q_prime
        self.q_table[state, action] = (1 - self.alpha)*self.q_table[state, action] + self.alpha*sample
        # epsilon decay
        self.epsilon = self.epsilon - self.eps_decay if self.epsilon > self.eps_end else self.eps_end

In [46]:
def train(env, agent, n_episodes=100000):
    """
        trains an agent through interactions with the environemnt using Q-learning.
    
        Args:
            env (gym.Env): the gym environment.
            agent (QAgent): the Q-learning agent.
            n_episodes (int): number of training episodes.
    """

    for i in range(n_episodes):

        # TODO: implement the training loop for Q-learning
        env.reset()
        done = False
        truncated = False
        state = 0
        while not done and not truncated:
          action = agent.choose_action(state)
          next_state, reward, done, truncated, info = env.step(action)
          agent.learn(state, action, reward, next_state)
          state = next_state
        ...

In [47]:
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)
agent = QAgent(num_states=16, num_actions=4, epsilon=1.0, alpha=1e-3)

In [55]:
train(env, agent)

In [56]:
# TODO: obtain the policy by a simple argmax on agent's Q-table
policy = np.argmax(agent.q_table, axis=1)
print(policy)

[2 2 1 0 1 0 1 0 2 2 1 0 0 2 2 0]


In [58]:
# TODO: test the policy on the environment

env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode='rgb_array')
total_reward = record_video(env, policy, 'frozenlake_random.mp4', fps=5, random_action=False)
print(f"total reward: {total_reward}")
show_video('frozenlake_random.mp4', video_width=500)

action: 2, state: 1, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 2, state: 2, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 1, state: 6, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 1, state: 10, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 1, state: 14, reward: 0.0, done: False, truncated: False, info: {'prob': 1.0}
action: 2, state: 15, reward: 1.0, done: True, truncated: False, info: {'prob': 1.0}
total reward: 1.0


In [59]:
env_is_slippery = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True)
agent_is_slippery = QAgent(num_states=16, num_actions=4, epsilon=1.0, alpha=1e-3)

In [60]:
train(env_is_slippery, agent_is_slippery)

In [61]:
policy_is_slippery = np.argmax(agent_is_slippery.q_table, axis=1)
print(policy_is_slippery)

[2 2 2 3 1 0 0 0 1 1 1 0 0 1 3 0]


In [72]:
env_is_slippery = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True, render_mode='rgb_array')
total_reward = record_video(env_is_slippery, policy_is_slippery, 'frozenlake_random.mp4', fps=5, random_action=False)
print(f"total reward: {total_reward}")
show_video('frozenlake_random.mp4', video_width=500)

action: 2, state: 0, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 2, state: 4, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 1, state: 4, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 1, state: 4, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 1, state: 8, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 1, state: 8, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 1, state: 9, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 1, state: 10, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 1, state: 14, reward: 0.0, done: False, truncated: False, info: {'prob': 0.3333333333333333}
action: 3, state: 13, reward: 0.0, done: False, truncated: False, info: {'prob': 0.333333333333333