## FrozenLake with Dynamic Programming

In [1]:
import gym
import numpy as np

In [2]:
# Build the default FrozenLake environment and draw the grid before each of ten random moves
env = gym.make('FrozenLake-v0')
env.reset()
frames_left = 10
while frames_left > 0:
    frames_left -= 1
    env.render()
    state, reward, done, info = env.step(env.action_space.sample())
    if not done:
        continue
    # the episode ended on this move, so start a new one before the next frame
    print("Episode is done, resetting the environment")
    env.reset()
env.close()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
[41mF[0mFFH
HFFG


### Custom Environments

In [3]:
# create deterministic version of Frozen Lake
from gym.envs.registration import register
try:
    register(
        id='FrozenLakeNotSlippery-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False}
    )
except gym.error.Error:
    # Older gym releases refuse to register the same id twice, which made this
    # cell fail when it was re-run in a live kernel. The id is already
    # registered with these exact arguments in that case, so it is safe to go on.
    pass

In [4]:
# Ten random moves in the non-slippery variant registered above, rendering before every move
env = gym.make('FrozenLakeNotSlippery-v0')
env.reset()

for frame in range(10):
    env.render()
    random_action = env.action_space.sample()
    state, reward, done, info = env.step(random_action)
    if done:
        print("Episode is done, resetting the environment")
        env.reset()

env.close()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
Episode is done, resetting the environment

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
Episode is done, resetting the environment

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG


## Perfect Model of the Environment

In [5]:
# What does perfect knowledge of the model mean?

env = gym.make('FrozenLakeNotSlippery-v0')

# We know every reward, transition probability, next state and done flag of the model,
# so there is no need for trial and error: state values and policies can be computed
# directly from the transition table shown below.
env.P

# Layout of env.P:
# {state: {action: [(transition_probability, next_state, reward, done), ...], ...}, ...}

{0: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 4, 0.0, False)],
  2: [(1.0, 1, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 1: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 5, 0.0, True)],
  2: [(1.0, 2, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)],
  1: [(1.0, 6, 0.0, False)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 2, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)],
  1: [(1.0, 7, 0.0, True)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 3, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, False)],
  1: [(1.0, 8, 0.0, False)],
  2: [(1.0, 5, 0.0, True)],
  3: [(1.0, 0, 0.0, False)]},
 5: {0: [(1.0, 5, 0, True)],
  1: [(1.0, 5, 0, True)],
  2: [(1.0, 5, 0, True)],
  3: [(1.0, 5, 0, True)]},
 6: {0: [(1.0, 5, 0.0, True)],
  1: [(1.0, 10, 0.0, False)],
  2: [(1.0, 7, 0.0, True)],
  3: [(1.0, 2, 0.0, False)]},
 7: {0: [(1.0, 7, 0, True)],
  1: [(1.0, 7, 0, True)],
  2: [(1.0, 7, 0, True)],
  3: [(1.0, 7, 0, True)]},
 8: {0: [(1.0, 8, 0.0, False)],
  1: [(1.0, 12, 0.0, True)],
  2: [(

In [6]:
# Same transition table for the default 'FrozenLake-v0' environment; in the output
# below each action lists several possible outcomes instead of a single one.
env = gym.make('FrozenLake-v0')
env.P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

## Policy Evaluation

In [7]:
def policy_evaluation(states, policy, env_model, discount, theta=0.0001):
    """Iterative policy evaluation on a tabular model.

    Sweeps over all states, replacing each state value by its expected one-step
    return under ``policy``, until the largest change in a sweep is <= ``theta``.

    Args:
        states: 1-D array of current state-value estimates; updated IN PLACE.
        policy: 2-D array indexed as ``policy[s, a]`` giving the probability of
            taking action ``a`` in state ``s``.
        env_model: mapping ``env_model[s][a]`` -> list of
            ``(transition_prob, next_state, reward, done)`` tuples, with states
            and actions numbered from 0.
        discount: discount factor applied to the value of the next state.
        theta: convergence threshold on the largest per-sweep value change.

    Returns:
        The same ``states`` array that was passed in, holding the new estimates.
    """
    # Sizes are read from the model that was passed in, not from a notebook-global
    # ``env``, so the function always agrees with its own arguments.
    state_len = len(env_model)
    delta = theta * 2
    while delta > theta:
        delta = 0
        for s in range(state_len):
            new_s = 0.
            for a in range(len(env_model[s])):
                for transition_prob, next_state, reward, done in env_model[s][a]:
                    # A transition flagged ``done`` ends the episode, so nothing is
                    # bootstrapped from the next state (same rule as the other
                    # dynamic-programming routines in this notebook).
                    target = reward if done else reward + discount*states[next_state]
                    new_s += policy[s,a]*transition_prob*target
            delta = max(delta, np.abs(states[s] - new_s))
            states[s] = new_s

    return states

In [8]:
env = gym.make('FrozenLake-v0')
state_size = env.nS
action_size = env.nA

#initialize random policy for evaluation
policy_array = np.ones((state_size, action_size))/action_size

# Evaluate the same random policy under two discount factors.
for gamma in (0.99, 0.7):
    # policy_evaluation writes into the array it receives, so every run gets its
    # own zero-initialized values instead of starting from the previous result.
    state_value_array = policy_evaluation(np.zeros(state_size), policy_array, env.P, gamma)

    print("Reshaped State Value Estimates with gamma of {}:".format(gamma))
    print(np.round(state_value_array.reshape(4,4),3))
    print("")

Reshaped State Value Estimates with gamma of 0.99:
[[0.012 0.01  0.019 0.009]
 [0.015 0.    0.039 0.   ]
 [0.033 0.084 0.138 0.   ]
 [0.    0.17  0.434 0.   ]]

Reshaped State Value Estimates with gamma of 0.7:
[[0.001 0.001 0.003 0.001]
 [0.001 0.    0.012 0.   ]
 [0.006 0.026 0.065 0.   ]
 [0.    0.076 0.333 0.   ]]



## Policy Improvement and Policy Iteration

In [21]:
def policy_evaluation(states, policy, env_model, discount, theta=0.0001):
    """Evaluate ``policy`` on a tabular model by repeated in-place sweeps.

    Each sweep replaces every state value by the expected one-step return under
    ``policy``; sweeping stops once the largest change in a sweep is <= ``theta``.
    Transitions flagged ``done`` contribute only their reward.

    Args:
        states: 1-D array of state-value estimates; modified IN PLACE.
        policy: 2-D array, ``policy[s, a]`` is the probability of action ``a``
            in state ``s``.
        env_model: mapping ``env_model[s][a]`` -> list of
            ``(transition_prob, next_state, reward, done)`` tuples, with states
            and actions numbered from 0.
        discount: discount factor for the value of non-terminal next states.
        theta: convergence threshold on the largest per-sweep value change.

    Returns:
        The ``states`` array that was passed in, now holding the new estimates.
    """
    # Take the sizes from ``env_model`` itself; the notebook-global ``env`` may
    # belong to a different environment than the model being evaluated.
    state_len = len(env_model)
    delta = theta * 2
    while delta > theta:
        delta = 0
        for s in range(state_len):
            new_s = 0.
            for a in range(len(env_model[s])):
                for transition_prob, next_state, reward, done in env_model[s][a]:
                    if done:
                        # episode ends here: no value is bootstrapped from next_state
                        new_s += policy[s,a]*transition_prob*reward
                    else:
                        new_s += policy[s,a]*transition_prob*(reward + discount*states[next_state])

            delta = max(delta, np.abs(states[s] - new_s))
            states[s] = new_s

    return states

def policy_improvement(states, policy, env_model, discount):
    """Make ``policy`` greedy with respect to the state values in ``states``.

    For every state the expected one-step return of each action is computed from
    ``env_model`` and the row ``policy[s]`` is overwritten with a one-hot vector
    selecting the best action (first one on ties, as chosen by ``np.argmax``).

    Args:
        states: 1-D array of state-value estimates (read only here).
        policy: 2-D array indexed ``policy[s, a]``; modified IN PLACE.
        env_model: mapping ``env_model[s][a]`` -> list of
            ``(transition_prob, next_state, reward, done)`` tuples, with states
            and actions numbered from 0.
        discount: discount factor for the value of non-terminal next states.

    Returns:
        ``(policy_stable, states, policy)`` where ``policy_stable`` is True when
        no state's argmax action changed.
    """
    policy_stable = True
    # Sizes are read from the model argument, not from a notebook-global ``env``.
    for s in range(len(env_model)):
        old_action = np.argmax(policy[s])
        action_len = len(env_model[s])
        action_values = np.zeros(action_len)
        for a in range(action_len):
            for transition_prob, next_state, reward, done in env_model[s][a]:
                if done:
                    # terminal transition: only the immediate reward counts
                    action_values[a] += transition_prob*reward
                else:
                    action_values[a] += transition_prob*(reward + discount*states[next_state])
        best_action = np.argmax(action_values)
        policy[s] = 0.
        policy[s, best_action] = 1.

        if old_action != best_action:
            policy_stable = False

    return policy_stable, states, policy
        
def policy_iteration(env_model, discount, theta=0.0001):
    """Policy iteration: alternate evaluation and greedy improvement until stable.

    Args:
        env_model: mapping ``env_model[s][a]`` -> list of
            ``(transition_prob, next_state, reward, done)`` tuples, with states
            and actions numbered from 0.
        discount: discount factor passed to evaluation and improvement.
        theta: convergence threshold passed to ``policy_evaluation``.

    Returns:
        ``(states, policy)``: the state-value array and the policy array of
        shape (number of states, number of actions) from the final iteration.
    """
    # Array sizes come from the model that was passed in rather than from the
    # notebook-global ``env``, so they always match ``env_model``.
    n_states = len(env_model)
    n_actions = len(env_model[0]) if n_states else 0
    # create random policy
    policy = np.ones((n_states, n_actions))/n_actions
    # initialize states
    states = np.zeros(n_states)
    policy_stable = False
    while not policy_stable:
        states = policy_evaluation(states, policy, env_model, discount, theta)
        policy_stable, states, policy = policy_improvement(states, policy, env_model, discount)

    return states, policy

In [22]:
env = gym.make('FrozenLakeNotSlippery-v0')

# Run policy iteration for each discount factor; the policy table is only
# printed for the discount factors listed in gammas_with_policy.
gammas_with_policy = (0.5,)
for gamma in (0.99, 0.5):
    state_value_array, policy_array = policy_iteration(env.P, gamma)

    value_grid = np.round(state_value_array.reshape(4,4),3)
    print("Reshaped State Value Estimates with gamma of {}:".format(gamma))
    print(value_grid)
    print("")

    if gamma in gammas_with_policy:
        print("Reshaped Policy Array:")
        print(np.round(policy_array.reshape(16,4),3))
        print("")

Reshaped State Value Estimates with gamma of 0.99:
[[0.951 0.961 0.97  0.961]
 [0.961 0.    0.98  0.   ]
 [0.97  0.98  0.99  0.   ]
 [0.    0.99  1.    0.   ]]

Reshaped State Value Estimates with gamma of 0.5:
[[0.031 0.062 0.125 0.062]
 [0.062 0.    0.25  0.   ]
 [0.125 0.25  0.5   0.   ]
 [0.    0.5   1.    0.   ]]

Reshaped Policy Array:
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]



In [23]:
env = gym.make('FrozenLake-v0')

# (discount factor, print the policy table?) for each policy-iteration run
runs = ((0.99, False), (0.5, True), (1, True))

for gamma, show_policy in runs:
    state_value_array, policy_array = policy_iteration(env.P, gamma)

    print("Reshaped State Value Estimates with gamma of {}:".format(gamma))
    print(np.round(state_value_array.reshape(4,4),3))
    print("")

    if not show_policy:
        continue
    print("Reshaped Policy Array:")
    print(np.round(policy_array.reshape(16,4),3))
    print("")

Reshaped State Value Estimates with gamma of 0.99:
[[0.541 0.497 0.469 0.455]
 [0.557 0.    0.358 0.   ]
 [0.591 0.642 0.615 0.   ]
 [0.    0.741 0.863 0.   ]]

Reshaped State Value Estimates with gamma of 0.5:
[[0.    0.001 0.003 0.001]
 [0.001 0.    0.013 0.   ]
 [0.006 0.029 0.077 0.   ]
 [0.    0.089 0.418 0.   ]]

Reshaped Policy Array:
[[0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]

Reshaped State Value Estimates with gamma of 1:
[[0.822 0.821 0.821 0.821]
 [0.822 0.    0.528 0.   ]
 [0.822 0.823 0.764 0.   ]
 [0.    0.882 0.941 0.   ]]

Reshaped Policy Array:
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 

### Taxi Environment

In [24]:
# create Taxi environment and show a few frames
env = gym.make('Taxi-v2')
env.reset()
for i in range(10):
    env.render()
    state, reward, done, info = env.step(env.action_space.sample())
    if done:
        print("Episode is done, resetting the environment")
        env.reset()
env.close()

+---------+
|[34;1mR[0m:[43m [0m| : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+

+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Dropoff)
+---------+
|[42mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R:[42m_[0m| : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R:[42m_[0m| : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| :[42m_[0m: : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :G|
| :[42m_[0m: : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R:[42m_[0m| : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[

In [42]:
env = gym.make('Taxi-v2')

#discount factor gamma
gamma = 0.95

state_value_array, policy_array = policy_iteration(env.P, gamma)

# Roll out the greedy policy for a fixed number of episodes and record the
# total reward and the number of steps of each one.
episodes = 100
episode_reward_list, episode_len_list = [], []

for i in range(episodes):
    state = env.reset()
    episode_reward = 0
    episode_length = 0
    done = False
    while not done:
        # policy rows are one-hot, so argmax recovers the chosen action
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
    episode_reward_list.append(episode_reward)
    episode_len_list.append(episode_length)

print("Average Reward: {} Average Length: {}".format(np.mean(episode_reward_list), np.mean(episode_len_list)))

Average Reward: 8.45 Average Length: 12.55


## Value Iteration

In [14]:
def _action_values(env_model, s, states, discount):
    """Expected one-step return of every action in state ``s`` under ``states``."""
    values = np.zeros(len(env_model[s]))
    for a in range(len(env_model[s])):
        for transition_prob, next_state, reward, done in env_model[s][a]:
            if done:
                # terminal transition: nothing is bootstrapped from next_state
                values[a] += transition_prob*reward
            else:
                values[a] += transition_prob*(reward + discount*states[next_state])
    return values


def value_iteration(env_model, discount, theta=0.0001):
    """Value iteration followed by greedy policy extraction.

    Args:
        env_model: mapping ``env_model[s][a]`` -> list of
            ``(transition_prob, next_state, reward, done)`` tuples, with states
            and actions numbered from 0.
        discount: discount factor for the value of non-terminal next states.
        theta: sweeping stops once the largest per-sweep value change is
            <= ``theta``.

    Returns:
        ``(states, policy)``: ``states`` is the 1-D array of state values and
        ``policy`` is a one-hot array of shape (number of states, number of
        actions) selecting the best action per state (first one on ties).
    """
    # Sizes are taken from the model argument, not from a notebook-global ``env``.
    state_len = len(env_model)
    action_len = len(env_model[0]) if state_len else 0

    delta = theta * 2
    states = np.zeros((state_len))
    while delta > theta:
        delta = 0
        for s in range(state_len):
            v_max = np.max(_action_values(env_model, s, states, discount))
            delta = max(delta, np.abs(states[s] - v_max))
            states[s] = v_max

    # Extract the greedy policy with the SAME backup used in the sweep above.
    # Previously this step bootstrapped through ``done`` transitions, which could
    # pick a different action than the one the values were computed for.
    policy = np.zeros((state_len, action_len))
    for s in range(state_len):
        # take max action every time (ie probability of max action = 1.0, prob of other actions stays 0)
        policy[s, np.argmax(_action_values(env_model, s, states, discount))] = 1.

    return states, policy
            

In [15]:
env = gym.make('FrozenLakeNotSlippery-v0')

# (discount factor, print the policy table?) for each value-iteration run
settings = [(0.99, False), (0.5, True), (1., False)]

for gamma, with_policy in settings:
    state_value_array, policy_array = value_iteration(env.P, gamma)

    header = "Reshaped State Value Estimates with gamma of {}:".format(gamma)
    print(header)
    print(np.round(state_value_array.reshape(4,4),3))
    print("")

    if with_policy:
        print("Reshaped Policy Array:")
        print(np.round(policy_array.reshape(16,4),3))
        print("")

Reshaped State Value Estimates with gamma of 0.99:
[[0.951 0.961 0.97  0.961]
 [0.961 0.    0.98  0.   ]
 [0.97  0.98  0.99  0.   ]
 [0.    0.99  1.    0.   ]]

Reshaped State Value Estimates with gamma of 0.5:
[[0.031 0.062 0.125 0.062]
 [0.062 0.    0.25  0.   ]
 [0.125 0.25  0.5   0.   ]
 [0.    0.5   1.    0.   ]]

Reshaped Policy Array:
[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]

Reshaped State Value Estimates with gamma of 1.0:
[[1. 1. 1. 1.]
 [1. 0. 1. 0.]
 [1. 1. 1. 0.]
 [0. 1. 1. 0.]]



In [16]:
env = gym.make('FrozenLake-v0')

# Run value iteration on the default FrozenLake for three discount factors and
# print the value grid and the policy table for each.
for gamma in (0.99, 0.5, 1.):
    state_value_array, policy_array = value_iteration(env.P, gamma)

    print("Reshaped State Value Estimates with gamma of {}:".format(gamma))
    print(np.round(state_value_array.reshape(4,4),3))
    print("")

    # The first header used to contain an unformatted "{}" placeholder that was
    # printed literally; all three runs now share the same header.
    print("Reshaped Policy Array:")
    print(np.round(policy_array.reshape(16,4),3))
    print("")

Reshaped State Value Estimates with gamma of 0.99:
[[0.541 0.497 0.469 0.455]
 [0.557 0.    0.358 0.   ]
 [0.591 0.642 0.615 0.   ]
 [0.    0.741 0.863 0.   ]]

Reshaped Policy Array {}:
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]

Reshaped State Value Estimates with gamma of 0.5:
[[0.    0.001 0.003 0.001]
 [0.001 0.    0.013 0.   ]
 [0.006 0.029 0.077 0.   ]
 [0.    0.089 0.418 0.   ]]

Reshaped Policy Array:
[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]

Reshaped State Value Estimates with gamma of 1.0:
[[0.822 0.821 0.821 0.821]
 [0.822 0.    0.528 0.   ]
 [0.822 0.823 0.764 0.   ]
 [0.    0.882 0

In [17]:
env = gym.make('Taxi-v2')

#discount factor gamma
gamma = 0.9

state_value_array, policy_array = value_iteration(env.P, gamma)

# Play the greedy policy for a fixed number of episodes, collecting the return
# and the step count of every episode.
episodes = 100
episode_reward_list = []
episode_len_list = []

for i in range(episodes):
    state = env.reset()
    episode_reward, episode_length = 0, 0
    while True:
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward = episode_reward + reward
        episode_length = episode_length + 1
        if not done:
            continue
        # episode finished: store its totals and move on to the next one
        episode_reward_list.append(episode_reward)
        episode_len_list.append(episode_length)
        break

average_reward = np.mean(episode_reward_list)
average_length = np.mean(episode_len_list)
print("Average Reward: {} Average Length: {}".format(average_reward, average_length))

Average Reward: 8.57 Average Length: 12.43
