In [8]:
import gym
import numpy as np
# create the FrozenLake environment
env = gym.make('FrozenLake-v0')
# the observation space of this environment is discrete, so the number of states is finite
states = env.observation_space.n # `.n` is the number of states of a discrete observation space
states

16

FrozenLake-v0
The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. Additionally, the movement direction of the agent is uncertain and only partially depends on the chosen direction. The agent is rewarded for finding a walkable path to a goal tile.

Winter is here. You and your friends were tossing around a frisbee at the park when you made a wild throw that left the frisbee out in the middle of the lake. The water is mostly frozen, but there are a few holes where the ice has melted. If you step into one of those holes, you'll fall into the freezing water. At this time, there's an international frisbee shortage, so it's absolutely imperative that you navigate across the lake and retrieve the disc. However, the ice is slippery, so you won't always move in the direction you intend.

The surface is described using a grid like the following:
```
SFFF       (S: starting point, safe)
FHFH       (F: frozen surface, safe)
FFFH       (H: hole, fall to your doom)
HFFG       (G: goal, where the frisbee is located)
```

The actions you can take are up, down, right and left.
The episode ends when you reach the goal or fall in a hole. You receive a reward of 1 if you reach the goal, and zero otherwise.

In [9]:
# check the number of actions the agent can choose from
actions = env.action_space.n
actions

4

In [3]:
# initialize the value table with zeros: one row per state, shape (states, 1)
value_table = np.zeros((states,1))
value_table

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [4]:
def value_iterations(env, n_iterations, gamma=1.0, threshold=1e-30):
    """
    Estimate the value of every state of a discrete environment by value iteration.

    Each sweep applies the Bellman optimality backup to every state, reading
    only the values of the previous sweep (synchronous update):

        V(s) = max_a  sum_{s'} p(s' | s, a) * (reward + gamma * V(s'))

    The table is created inside the function and always starts from zeros, so
    the result neither depends on nor modifies any notebook-level variable.

    Parameters
    ----------
    env : gym environment
        Must expose ``observation_space.n``, ``action_space.n`` and, on the
        unwrapped environment, a transition model ``P`` where
        ``P[state][action]`` is a list of
        ``(probability, next_state, reward, _)`` tuples (the fourth element
        is not used here).
    n_iterations : int
        Maximum number of sweeps over all states.
    gamma : float
        Discount factor applied to the value of the next state.
    threshold : float
        Sweeping stops early once the summed absolute change of all state
        values within one sweep is at most this number.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_states, 1)`` with the estimated value of each state.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    # `unwrapped` reaches the underlying environment no matter how many
    # wrappers gym.make() put around it.
    transitions = env.unwrapped.P

    values = np.zeros(n_states)
    for _ in range(n_iterations):
        # Snapshot of the previous sweep: every backup below reads from it,
        # so the order in which states are visited does not matter.
        previous = values.copy()
        for state in range(n_states):
            values[state] = max(
                sum(
                    prob * (reward + gamma * previous[next_state])
                    for prob, next_state, reward, _ in transitions[state][action]
                )
                for action in range(n_actions)
            )

        # Converged: the whole sweep changed the table by no more than `threshold`.
        if np.sum(np.fabs(previous - values)) <= threshold:
            break

    # Column vector, the shape callers of this function have always received.
    return values.reshape(-1, 1)

In [13]:
def extract_policy(value_table, gamma=1.0, environment=None):
    """
    Derive the greedy policy (best action per state) from a table of state values.

    For every state the expected return of each action is computed as

        Q(s, a) = sum_{s'} p(s' | s, a) * (reward + gamma * V(s'))

    and the action with the largest Q-value is selected (ties go to the
    lowest action index, as ``np.argmax`` does).

    Parameters
    ----------
    value_table : array-like
        State values, either flat ``(n_states,)`` or a column ``(n_states, 1)``.
    gamma : float
        Discount factor applied to the value of the next state.
    environment : gym environment, optional
        Environment to read the transition model from. It must expose
        ``observation_space.n``, ``action_space.n`` and ``unwrapped.P`` where
        ``P[state][action]`` is a list of
        ``(probability, next_state, reward, _)`` tuples. When omitted, the
        notebook-level ``env`` variable is used.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_states``; entry ``s`` is the index of the
        chosen action in state ``s``.
    """
    # Fall back to the notebook's global `env` so existing calls keep working.
    mdp = env if environment is None else environment
    n_states = mdp.observation_space.n
    n_actions = mdp.action_space.n
    transitions = mdp.unwrapped.P

    # Flatten so each lookup yields a scalar; accumulating a 1-element array
    # into a scalar slot is deprecated in recent NumPy releases.
    values = np.asarray(value_table, dtype=float).reshape(-1)

    policy = np.zeros(n_states)
    for state in range(n_states):
        q_values = np.zeros(n_actions)
        for action in range(n_actions):
            for prob, next_state, reward, _ in transitions[state][action]:
                q_values[action] += prob * (reward + gamma * values[next_state])
        policy[state] = np.argmax(q_values)
    return policy

In [6]:
# run value iteration for at most 10000 iterations, with the default gamma and threshold
value_table = value_iterations(env,10000)
value_table

array([[0.82352941],
       [0.82352941],
       [0.82352941],
       [0.82352941],
       [0.82352941],
       [0.        ],
       [0.52941176],
       [0.        ],
       [0.82352941],
       [0.82352941],
       [0.76470588],
       [0.        ],
       [0.        ],
       [0.88235294],
       [0.94117647],
       [0.        ]])

In [7]:
# pick the best action for every state based on the computed value table
policy = extract_policy(value_table)
print(policy)

[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]
