In [None]:
from frozen_lake_env import FrozenLakeEnv
import numpy as np
import gym
import time

np.set_printoptions(precision=3)

In [19]:
# Glossary
# P: nested dictionary
# 	From gym.core.Environment
# 	For each pair of states in [1, nS] and actions in [1, nA], P[state][action] is a
# 	tuple of the form (probability, nextstate, reward, terminal) where
# 		- probability: float
# 			the probability of transitioning from "state" to "nextstate" with "action"
# 		- nextstate: int
# 			denotes the state we transition to (in range [0, nS - 1])
# 		- reward: int
# 			either 0 or 1, the reward for transitioning from "state" to
# 			"nextstate" with "action"
# 		- terminal: bool
# 		  True when "nextstate" is a terminal state (hole or goal), False otherwise
# nS: int
# 	number of states in the environment
# nA: int
# 	number of actions in the environment
# gamma: float
# 	Discount factor. Number in range [0, 1)
# Returns: index of action

def policy_update(P, policy, nS, nA, gamma=0.9):
    pass

def policy_evaluation(P, nS, nA, policy, gamma=0.9, tol=1e-3):
    pass

def policy_iteration(P, nS, nA, gamma=0.9, tol=1e-3):
    pass

def value_iteration(P, nS, nA, gamma=0.9, tol=1e-3):
    """
    Learn value function and policy by using value iteration method for a given
    gamma and environment.

    Parameters:
    ----------
    p = {a1: [(p1, s'1, r1, terminal) ...]}
    P, nS, nA, gamma:
        defined at beginning of file
    tol: float
        Terminate value iteration when
            max |value_function(s) - prev_value_function(s)| < tol
    Returns:
    ----------
    value_function: np.ndarray[nS]
    policy: np.ndarray[nS]
    """
    value_function = np.zeros(nS)
    policy = np.zeros(nS, dtype=int)
    should_continue = True
    i = 0
    with open("value_iteration_history.txt", "w") as f:
        while True:
            if not should_continue:
                break
            should_continue = False
            for s in range(nS):
                transitions = P[s]
                for action, action_ps in transitions.items():
                    v = 0
                    for action_p in action_ps:
                        p, s_prime, r, terminal = action_p
                        # NOTE: we are getting expectation of total rewards
                        v += p*(r + gamma * value_function[s_prime])
                    next_value = max(value_function[s], v)
                    # Pick the action that leads to the highest valued state.
                    if next_value == v:
                        policy[s] = action
                    if not should_continue and abs(next_value - value_function[s]) > tol:
                        should_continue = True
                    value_function[s] = next_value
            #TODO Remember to remove
            print(f'{value_function}')
                
            f.write(f"======================{i}======================\n")
            f.write("value function:\n")
            f.write(str(value_function)+"\n")
            f.write("policy:")
            f.write(str(policy) + "\n")
            i += 1
        # for every s, find all its ps. and probabilities and next state
    

    return value_function, policy

def render_single(env, policy, max_steps=100):
    """
    env: gym.core.Environment - Environment to play on. Must have nS, nA, and P as attributes.
    Policy: np.array of shape [env.nS]. The action to take at a given state
    """ 
    episode_reward = 0
    ob = env.reset()
    for t in range(max_steps):
        env.render("frozen_lake_output.tmp")
        a = policy[ob]
        ob, rew, done, _ = env.step(a)
        episode_reward += rew
        if done:
            break
        time.sleep(0.25)
    env.render("frozen_lake_output.tmp")
    if not done:
        print("The agent didn't reach a terminal state in {} steps.".format(max_steps))
    else:
        print("Episode reward: %f" % episode_reward)


envs = [
    FrozenLakeEnv(map_name="4x4", is_slippery=False), 
    # FrozenLakeEnv(map_name="8x8", is_slippery=False), 
    # FrozenLakeEnv(map_name="4x4", is_slippery=True)
    ]
for env in envs:
    # print("\n" + "-"*25 + "\nBeginning Policy Iteration\n" + "-"*25)
    # V_pi, p_pi = policy_iteration(env.P, env.observation_space.n, env.action_space.n, gamma=0.9, tol=1e-3)
    # render_single(env, p_pi, 100)

    print("\n" + "-"*25 + "\nBeginning Value Iteration\n" + "-"*25)
    V_vi, p_vi = value_iteration(env.P, env.nS, env.nA, gamma=0.9, tol=1e-3)
    render_single(env, p_vi, 100)

Rico: current state 14, new state: 15

-------------------------
Beginning Value Iteration
-------------------------
Rico: should continue
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
Rico: should continue
[0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.9 0.  0.  0.9 1.  0. ]
Rico: should continue
[0.   0.   0.   0.   0.   0.   0.81 0.   0.   0.81 0.9  0.   0.   0.9
 1.   0.  ]
Rico: should continue
[0.    0.    0.729 0.656 0.    0.    0.81  0.    0.729 0.81  0.9   0.
 0.    0.9   1.    0.   ]
Rico: should continue
[0.    0.656 0.729 0.656 0.656 0.    0.81  0.    0.729 0.81  0.9   0.
 0.    0.9   1.    0.   ]
Rico: should continue
[0.59  0.656 0.729 0.656 0.656 0.    0.81  0.    0.729 0.81  0.9   0.
 0.    0.9   1.    0.   ]
[0.59  0.656 0.729 0.656 0.656 0.    0.81  0.    0.729 0.81  0.9   0.
 0.    0.9   1.    0.   ]
Episode reward: 1.000000


## Value Iteration
Intro: Terminal State is 15, immediate state is 14. 
    - If there's no revisited states, and reward from goal state to goal state is 0,
    then value iteration will simply polulate from end pose back. Because r + gamma * V(s') = r at the second last state, and for rest of the states, r + gamma * V, with constant V for all other states.
    - What about revisited states?

```
======================0======================
value function:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
policy:[3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3]
======================1======================
value function:
[0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.9 0.  0.  0.9 1.  0. ]
policy:[3 3 3 3 3 3 3 3 3 3 1 3 3 2 2 3]
======================2======================
value function:
[0.   0.   0.   0.   0.   0.   0.81 0.   0.   0.81 0.9  0.   0.   0.9
 1.   0.  ]
policy:[3 3 3 3 3 3 1 3 3 2 1 3 3 2 2 3]
======================3======================
value function:
[0.    0.    0.729 0.656 0.    0.    0.81  0.    0.729 0.81  0.9   0.
 0.    0.9   1.    0.   ]
policy:[3 3 1 0 3 3 1 3 2 2 1 3 3 2 2 3]
======================4======================
value function:
[0.    0.656 0.729 0.656 0.656 0.    0.81  0.    0.729 0.81  0.9   0.
 0.    0.9   1.    0.   ]
policy:[3 2 1 0 1 3 1 3 2 2 1 3 3 2 2 3]
======================5======================
value function:
[0.59  0.656 0.729 0.656 0.656 0.    0.81  0.    0.729 0.81  0.9   0.
 0.    0.9   1.    0.   ]
policy:[2 2 1 0 1 3 1 3 2 2 1 3 3 2 2 3]
======================6======================
value function:
[0.59  0.656 0.729 0.656 0.656 0.    0.81  0.    0.729 0.81  0.9   0.
 0.    0.9   1.    0.   ]
policy:[2 2 1 0 1 3 1 3 2 2 1 3 3 2 2 3]
```