In [1]:
import gymnasium as gym
import numpy as np
import time

In [2]:
# FrozenLake with slipping disabled, created in 'human' render mode.
env = gym.make(
    "FrozenLake-v1",
    render_mode='human',
    is_slippery=False,
)

In [3]:
# Sizes of the discrete observation (state) and action spaces of `env`.
n_states, n_actions = env.observation_space.n, env.action_space.n

In [4]:
gamma = 0.99   # discount factor applied to successor-state values
theta = 1e-6   # value iteration stops once the largest per-sweep change drops below this

In [5]:
def argmax(env, V, pi, action, s, gamma):
    """Store the greedy action for state ``s`` with respect to ``V``.

    For every action ``a`` the action value

        q(s, a) = sum_{s'} P(s' | s, a) * (r + gamma * V[s'])

    is computed from the environment's transition table, and the action with
    the largest value (first one on ties, as with ``np.argmax``) is written
    into ``action`` and ``pi``.

    Parameters
    ----------
    env : environment object
        Must provide ``action_space.n`` and ``unwrapped.P`` where
        ``P[s][a]`` is a sequence of transitions whose first three fields
        are ``(probability, next_state, reward)``.
    V : indexable of float
        Current state-value estimates, indexed by state.
    pi : 2-D array-like, indexed ``pi[state][action]``
        Policy table; row ``s`` is overwritten in place with a one-hot row.
    action : indexable
        ``action[s]`` is set in place to the greedy action index.
    s : int
        State to process.
    gamma : float
        Discount factor.

    Returns
    -------
    The same ``pi`` object that was passed in.
    """
    # Take the action count from the environment itself instead of a notebook
    # global, and read the table from the unwrapped env so the lookup does not
    # depend on wrappers forwarding attribute access.
    n_actions = env.action_space.n
    transitions = env.unwrapped.P[s]

    q_values = np.zeros(n_actions)
    for a in range(n_actions):
        q = 0
        for prob, next_state, reward, *_ in transitions[a]:
            q += prob * (reward + gamma * V[int(next_state)])
        q_values[a] = q

    best = int(np.argmax(q_values))
    action[s] = best

    # Clear any stale entries first so the row is a valid one-hot distribution.
    for a in range(n_actions):
        pi[s][a] = 0
    pi[s][best] = 1

    return pi

In [6]:
def bellman_optimality_update(env, V, s, gamma):
    """Apply one Bellman optimality backup to ``V[s]`` in place.

    Sets

        V[s] = max_a sum_{s'} P(s' | s, a) * (r + gamma * V[s'])

    using the environment's transition table and the current contents of
    ``V`` (so states updated earlier in the same sweep are already used).

    Parameters
    ----------
    env : environment object
        Must provide ``action_space.n`` and ``unwrapped.P`` where
        ``P[s][a]`` is a sequence of transitions whose first three fields
        are ``(probability, next_state, reward)``.
    V : numpy array of float
        State-value estimates; entry ``s`` is overwritten.
    s : int
        State to update.
    gamma : float
        Discount factor.

    Returns
    -------
    The new value of ``V[s]``.
    """
    # Use the environment's own action count rather than a notebook global,
    # and read the table from the unwrapped env so the lookup does not depend
    # on wrappers forwarding attribute access.
    n_actions = env.action_space.n
    transitions = env.unwrapped.P[s]

    q_values = np.zeros(n_actions)
    for a in range(n_actions):
        q = 0
        for prob, next_state, reward, *_ in transitions[a]:
            q += prob * (reward + gamma * V[int(next_state)])
        q_values[a] = q

    # The value under the greedy one-hot policy is simply the largest action
    # value, so no temporary policy table or second pass is needed.
    V[s] = q_values.max()
    return V[s]

In [7]:
def value_iteration(env, gamma, theta):
    """Run value iteration on ``env`` and extract the greedy policy.

    Repeatedly sweeps over all states, applying
    ``bellman_optimality_update`` to each one in place, until the largest
    absolute change in any state's value during a sweep is below ``theta``.
    The greedy policy is then read off with ``argmax``.

    Parameters
    ----------
    env : environment object
        Must provide ``observation_space.n`` and ``action_space.n`` and be
        accepted by ``bellman_optimality_update`` and ``argmax``.
    gamma : float
        Discount factor.
    theta : float
        Convergence threshold on the per-sweep maximum value change.

    Returns
    -------
    V : numpy array, shape (n_states,)
        Converged state values.
    pi : numpy array, shape (n_states, n_actions)
        Policy table filled in by ``argmax`` (1 at the greedy action).
    action : numpy array of float, shape (n_states,)
        Greedy action index for each state.
    """
    # Size everything from the environment that was passed in, not from
    # notebook-level globals that may belong to a different environment.
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    V = np.zeros(n_states)  # arbitrary initial values; zeros here
    while True:
        delta = 0
        for s in range(n_states):
            previous = V[s]
            bellman_optimality_update(env, V, s, gamma)  # updates V[s] in place
            delta = max(delta, abs(previous - V[s]))     # largest change this sweep
        if delta < theta:
            break  # change is negligible: treat V as converged

    pi = np.zeros((n_states, n_actions))
    action = np.zeros(n_states)
    for s in range(n_states):
        pi = argmax(env, V, pi, action, s, gamma)  # greedy action w.r.t. converged V

    return V, pi, action

In [8]:
# Solve for the state values and the greedy policy of `env`.
V, pi, action = value_iteration(env, gamma=gamma, theta=theta)

In [9]:
# State values returned by value_iteration, one entry per state.
V

array([0.95099005, 0.96059601, 0.970299  , 0.96059601, 0.96059601,
       0.        , 0.9801    , 0.        , 0.970299  , 0.9801    ,
       0.99      , 0.        , 0.        , 0.99      , 1.        ,
       0.        ])

In [10]:
# Policy table from value_iteration: one row per state, one column per action,
# with a 1 in the column of the greedy action.
pi

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [11]:
# Greedy action index for each state (still stored as floats at this point).
action

array([1., 2., 1., 0., 1., 0., 1., 0., 2., 1., 1., 0., 0., 2., 2., 0.])

In [12]:

# Turn the per-state action indices into a plain list of Python ints.
action = list(map(int, action))
print(action)

[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]


In [13]:
#a= np.reshape(action,(4,4))
#print(a)                          # discrete action to take in given state

In [14]:
# Roll out the greedy policy stored in `action` for a single episode.
numberOfIterations = 100   # upper bound on the number of steps taken
stepDelaySeconds = 2       # pause after each step so the rendering can be followed

try:
    state, info = env.reset()
    env.render()
    print('Initial state of the system')

    for i in range(numberOfIterations):
        # Policy lookup; int() keeps this working even if `action` still holds floats.
        greedyAction = int(action[state])
        state, reward, terminated, truncated, info = env.step(greedyAction)
        env.render()
        print('Iteration: {} and action {}'.format(i+1, greedyAction))
        time.sleep(stepDelaySeconds)
        # Stop on either kind of episode end, not only on `terminated`.
        if terminated or truncated:
            break
finally:
    # Always release the environment, even if the loop raised or was interrupted.
    env.close()

Initial state of the system
Iteration: 1 and action 1
Iteration: 2 and action 1
Iteration: 3 and action 2
Iteration: 4 and action 1
Iteration: 5 and action 2
Iteration: 6 and action 2
