In [1]:
import gym
import numpy as np

In [None]:
#import policy and value iteration

In [None]:
action_mappings = {
    0: '\u2191', # UP
    1: '\u2192', # RIGHT
    2: '\u2193', # DOWN
    3: '\u2190', # LEFT
}2

In [2]:
def play_episodes(environment, n_episodes, policy):
    wins=0
    total_reward=0
    
    for episode in range(n_episodes):
        state=environment.reset()
        terminated=False
        
        while not terminated:
            #Choose best action according to the policy
            action=np.argmax(policy[state])
            
            #Perform the action in the environment
            next_state, rewards, terminated, info=environment.step(action)
            
            #Add the reward to the total reward count
            total_reward+=rewards
            
            #Update the current state
            state=next_state
            
            #Check if episode is terminated and the reward is achieved....Note that reward is 1 at the goal
            if terminated and rewards==1.0:
                wins+=1
                
    avg_reward=total_reward/n_episodes
    
    return wins, total_reward, avg_reward

__Value Iteration__

In [None]:
#Just for fun if you want to check how the environment works, in the below equation 1st value is current state and 2nd value 
#is the action taken. Then environment gives the state_probablity, next_state, reward, terminated
environment.P[50][1]

In [7]:
def value_iteration(environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9):
    
    #Initialise the state value function
    state_values=np.zeros(environment.nS)
    
    #Initialise a counter
    evaluation_iterations=0
    
    for i in range(int(max_iterations)):
        evaluation_iterations+=1
        
        #Initialise delta for early stopping
        delta=0
        
        for state in range(environment.nS):
            
            q_values=cal_q_values(environment, state, state_values)
            
            #We will get four values for this state. We wil pick the max out of it
            best_action_value= np.max(q_values)
            
            delta= max(delta, abs(state_values[state]-best_action_value))
            
            #Update the state value function
            state_values[state]=best_action_value
            
        #Early Stopping
        if(delta<theta):
            print('Value iteration converged on {}th iteration.'.format(evaluation_iterations))
            print(state_values)
            break
            
    #Now we need to find the optimal policy using the State Value function
    policy=np.zeros((environment.nS, environment.nA))
    
    for state in range(environment.nS):
        q_values=cal_q_values(environment, state, state_values)
        
        #Get the best action
        best_action=np.argmax(q_values)
        
        #Update the policy
        policy[state]=np.eye(environment.nA)[best_action]
        
    print('Final policy is:')
    print(policy)
    return policy, state_values
        
        
            
    

In [8]:
def cal_q_values(environment, state, state_value_fn, discount_factor=1.0):
    #Here we will be calculating the q-values for given state with every action
    
    #initialise the q-values 
    q_values= np.zeros(environment.nA)
    
    #Note: Q-Value= Reward + DiscountFactor*EnvironmentProbablity*ValueFn
    
    for action in range(environment.nA):
        
        #For each action, environment can take us to any state based on transition model
        for state_probablity, next_state, reward, terminated in environment.P[state][action]:
            
            #Now calculate q-value
            q_values[action]+= reward+(state_probablity*discount_factor*state_value_fn[next_state])
                                       
    return q_values
    

__Lets the create the optimum policy__

In [9]:
#Create the environment
environment = gym.make('FrozenLake8x8-v0')
print('Environment created.')

Environment created.


In [10]:
policy, state_value_fn= value_iteration(environment)

Value iteration converged on 968th iteration.
[2.99999999 2.99999999 2.99999999 2.99999999 2.99999999 2.99999999
 2.99999999 2.99999999 2.99999999 2.99999999 2.99999999 2.99999999
 2.99999999 2.99999999 2.99999999 2.99999999 2.99999998 2.93460489
 2.77929154 0.         2.56985302 2.83869488 2.94623162 2.99999999
 2.99999997 2.80381469 2.40326974 1.42471131 1.8708642  0.
 2.83403282 3.         2.99999996 2.47683921 1.62670298 0.
 1.61802826 1.8335677  2.55586684 3.         2.99999996 0.
 0.         0.50412238 1.14965288 1.326808   0.         3.
 2.99999995 0.         0.58402039 0.36271426 0.         0.99720343
 0.         3.         2.99999995 2.19467343 1.38934691 0.
 0.83240114 1.66480229 2.33240114 0.        ]
Final policy is:
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0

In [None]:
policy

In [None]:
state_value_fn

In [11]:
wins, total_reward, average_reward= play_episodes(environment,1000, policy)

In [12]:
wins

886

In [13]:
total_reward

886.0

In [None]:
average_reward

In [None]:
policy

In [None]:
%debug