# Value Iteration
 
Sungchul Lee  

<div align="center"><img src="img/Value.jpg" width="40%" height="10%"></div>

http://seanheritage.com/wp-content/uploads/2017/03/Value.jpg

# Value iteration for $v_*$

- Initialize $v_*(s)=0$ for all $s$.

- Repeat until the values stop changing (convergence).

    For every $s$ (synchronously or asynchronously), update $v_*$ by applying the Bellman optimality equation as an update rule:
\begin{eqnarray*}
v_*(s)&=&\max_{a}\left({\cal R}_s^a+\gamma\sum_{s'}{\cal P}^a_{ss'}v_*(s')\right)
\end{eqnarray*}

- Find optimal policy $\pi_*$ by solving
$$
\pi_*(s)=\mbox{argmax}_{a}q_*(s,a)
$$
    where
$$
q_*(s,a)={\cal R}_s^a+\gamma\sum_{s'}{\cal P}^a_{ss'}v_*(s')
$$

# Value iteration for $q_*$

- Initialize $q_*(s,a)=0$ for all $s$ and $a$.

- Repeat until the values stop changing (convergence).

    For every $s$ and $a$ (synchronously or asynchronously), update $q_*$ by applying the Bellman optimality equation as an update rule:
\begin{eqnarray*}
q_*(s,a)&=&{\cal R}_s^a+\gamma\sum_{s'}{\cal P}^a_{ss'}\left(\max_{a'}q_*(s',a')\right)
\end{eqnarray*}

- Find optimal policy $\pi_*$ by solving
$$
\pi_*(s)=\mbox{argmax}_{a}q_*(s,a)
$$

# Value iteration for $v_*$ in Andrew Ng's lecture 16

<div align="center"><img src="img/cs188_mdp_optimal_policies.png" width="50%" height="10%"></div>

https://raw.githubusercontent.com/mebusy/notes/master/imgs/cs188_mdp_optimal_policies.png

<div align="center"><img src="img/Screenshot+2017-7.png" width="100%" height="10%"></div>

In [5]:
# Value iteration for $v_*$ in Andrew Ng's lecture 16
#
# Solves an 11-state, 4-action MDP by in-place (asynchronous) value
# iteration on V, then derives Q and the greedy policy from the result.

# import libraries
import numpy as np

# state
states = [0,1,2,3,4,5,6,7,8,9,10]
N_STATES = len(states)

# action
actions = [0,1,2,3] # left, right, up, down
N_ACTIONS = len(actions)

# terminal states: their values are pinned below and never backed up
# (their rows in P are self-loops with probability 1)
TERMINAL_STATES = (3, 6)
NON_TERMINAL_STATES = [s for s in states if s not in TERMINAL_STATES]

# transition probabilities: P[s, a, s1] is the probability of landing in
# state s1 after taking action a in state s.
# Zero-initialised (not np.empty) so a forgotten row would show up as an
# all-zero row in the check below instead of holding uninitialised memory.
P = np.zeros((N_STATES, N_ACTIONS, N_STATES))

#                0   1   2   3   4   5   6   7   8   9  10
P[ 0, 0, :] = [ .9,  0,  0,  0, .1,  0,  0,  0,  0,  0,  0]
P[ 0, 1, :] = [ .1, .8,  0,  0, .1,  0,  0,  0,  0,  0,  0]
P[ 0, 2, :] = [ .9, .1,  0,  0,  0,  0,  0,  0,  0,  0,  0]
P[ 0, 3, :] = [ .1, .1,  0,  0, .8,  0,  0,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 1, 0, :] = [ .8, .2,  0,  0,  0,  0,  0,  0,  0,  0,  0]
P[ 1, 1, :] = [  0, .2, .8,  0,  0,  0,  0,  0,  0,  0,  0]
P[ 1, 2, :] = [ .1, .8, .1,  0,  0,  0,  0,  0,  0,  0,  0]
P[ 1, 3, :] = [ .1, .8, .1,  0,  0,  0,  0,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 2, 0, :] = [  0, .8, .1,  0,  0, .1,  0,  0,  0,  0,  0]
P[ 2, 1, :] = [  0,  0, .1, .8,  0, .1,  0,  0,  0,  0,  0]
P[ 2, 2, :] = [  0, .1, .8, .1,  0,  0,  0,  0,  0,  0,  0]
P[ 2, 3, :] = [  0, .1,  0, .1,  0, .8,  0,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 3, 0, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0]
P[ 3, 1, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0]
P[ 3, 2, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0]
P[ 3, 3, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 4, 0, :] = [ .1,  0,  0,  0, .8,  0,  0, .1,  0,  0,  0]
P[ 4, 1, :] = [ .1,  0,  0,  0, .8,  0,  0, .1,  0,  0,  0]
P[ 4, 2, :] = [ .8,  0,  0,  0, .2,  0,  0,  0,  0,  0,  0]
P[ 4, 3, :] = [  0,  0,  0,  0, .2,  0,  0, .8,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 5, 0, :] = [  0,  0, .1,  0,  0, .8,  0,  0,  0, .1,  0]
P[ 5, 1, :] = [  0,  0, .1,  0,  0,  0, .8,  0,  0, .1,  0]
P[ 5, 2, :] = [  0,  0, .8,  0,  0, .1, .1,  0,  0,  0,  0]
P[ 5, 3, :] = [  0,  0,  0,  0,  0, .1, .1,  0,  0, .8,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 6, 0, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0]
P[ 6, 1, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0]
P[ 6, 2, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0]
P[ 6, 3, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 7, 0, :] = [  0,  0,  0,  0, .1,  0,  0, .9,  0,  0,  0]
P[ 7, 1, :] = [  0,  0,  0,  0, .1,  0,  0, .1, .8,  0,  0]
P[ 7, 2, :] = [  0,  0,  0,  0, .8,  0,  0, .1, .1,  0,  0]
P[ 7, 3, :] = [  0,  0,  0,  0,  0,  0,  0, .9, .1,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 8, 0, :] = [  0,  0,  0,  0,  0,  0,  0, .8, .2,  0,  0]
P[ 8, 1, :] = [  0,  0,  0,  0,  0,  0,  0,  0, .2, .8,  0]
P[ 8, 2, :] = [  0,  0,  0,  0,  0,  0,  0, .1, .8, .1,  0]
P[ 8, 3, :] = [  0,  0,  0,  0,  0,  0,  0, .1, .8, .1,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 9, 0, :] = [  0,  0,  0,  0,  0, .1,  0,  0, .8, .1,  0]
P[ 9, 1, :] = [  0,  0,  0,  0,  0, .1,  0,  0,  0, .1, .8]
P[ 9, 2, :] = [  0,  0,  0,  0,  0, .8,  0,  0, .1,  0, .1]
P[ 9, 3, :] = [  0,  0,  0,  0,  0,  0,  0,  0, .1, .8, .1]

#                0   1   2   3   4   5   6   7   8   9  10
P[10, 0, :] = [  0,  0,  0,  0,  0,  0, .1,  0,  0, .8, .1]
P[10, 1, :] = [  0,  0,  0,  0,  0,  0, .1,  0,  0,  0, .9]
P[10, 2, :] = [  0,  0,  0,  0,  0,  0, .8,  0,  0, .1, .1]
P[10, 3, :] = [  0,  0,  0,  0,  0,  0,  0,  0,  0, .1, .9]

# every (state, action) row must be a probability distribution; catches
# typos in the table above before they silently skew the values
if not np.allclose(P.sum(axis=2), 1.):
    raise ValueError("each row P[s, a, :] must sum to 1")

# rewards: -0.02 per step everywhere, +1 / -1 in the two terminal states
R = -0.02 * np.ones((N_STATES, N_ACTIONS))
R[3,:] = 1.
R[6,:] = -1.

# discount factor
gamma = 0.99

# value function: zero everywhere except the pinned terminal values
V = np.zeros(N_STATES)
V[3] = 1.
V[6] = -1.

# value iteration for V
MAX_SWEEPS = 100   # upper bound on the number of sweeps over the states
TOL = 1e-12        # stop early once no state value moves by more than this
for _ in range(MAX_SWEEPS):
    delta = 0.
    for s in NON_TERMINAL_STATES:
        # Bellman optimality backup: P[s].dot(V) is the expected next-state
        # value for each action.  V is updated in place, so later states in
        # the same sweep already see the new value.
        v_new = np.max(R[s] + gamma * P[s].dot(V))
        delta = max(delta, abs(v_new - V[s]))
        V[s] = v_new
    if delta < TOL:
        break

print(V)
print()

# Q function: terminal rows are pinned to the terminal values
Q = np.zeros((N_STATES, N_ACTIONS))
Q[3,:] = 1.
Q[6,:] = -1.

# compute Q from the converged V with one backup per non-terminal state
for s in NON_TERMINAL_STATES:
    Q[s] = R[s] + gamma * P[s].dot(V)

print(Q)
print()

# greedy policy; np.argmax returns the first maximiser, so the entries for
# the terminal states (whose Q rows are constant) are 0 and carry no meaning
optimal_policy = np.argmax(Q, axis=1)
print(optimal_policy)

[ 0.85530117  0.89580324  0.93236641  1.          0.81969892  0.68749634
 -1.          0.78026128  0.74559468  0.70873821  0.49092193]

[[ 0.82322354  0.85530117  0.83075787  0.80256088]
 [ 0.83476757  0.89580324  0.86645526  0.86645526]
 [ 0.84984258  0.93236641  0.90611872  0.71218162]
 [ 1.          1.          1.          1.        ]
 [ 0.79112222  0.79112222  0.81969892  0.76026732]
 [ 0.68696646 -0.64953064  0.68749634  0.5103828 ]
 [-1.         -1.         -1.         -1.        ]
 [ 0.75636299  0.72890705  0.78026128  0.74902668]
 [ 0.74559468  0.68894841  0.71792194  0.71792194]
 [ 0.70873821  0.50703739  0.64691224  0.66373581]
 [ 0.49092193  0.31841144 -0.69323365  0.48757652]]

[1 1 1 0 2 2 0 2 0 0 0]


# Value iteration for $q_*$ in Andrew Ng's lecture 16

<div align="center"><img src="img/cs188_mdp_optimal_policies.png" width="50%" height="10%"></div>

https://raw.githubusercontent.com/mebusy/notes/master/imgs/cs188_mdp_optimal_policies.png

<div align="center"><img src="img/Screenshot+2017-3.png" width="100%" height="10%"></div>

In [6]:
# Value iteration for $q_*$ in Andrew Ng's lecture 16
#
# Solves the same 11-state, 4-action MDP by in-place (asynchronous) value
# iteration directly on Q, then reads off the greedy policy.

# import libraries
import numpy as np

# state
states = [0,1,2,3,4,5,6,7,8,9,10]
N_STATES = len(states)

# action
actions = [0,1,2,3] # left, right, up, down
N_ACTIONS = len(actions)

# terminal states: their Q rows are pinned below and never backed up
# (their rows in P are self-loops with probability 1)
TERMINAL_STATES = (3, 6)
NON_TERMINAL_STATES = [s for s in states if s not in TERMINAL_STATES]

# transition probabilities: P[s, a, s1] is the probability of landing in
# state s1 after taking action a in state s.
# Zero-initialised (not np.empty) so a forgotten row would show up as an
# all-zero row in the check below instead of holding uninitialised memory.
P = np.zeros((N_STATES, N_ACTIONS, N_STATES))

#                0   1   2   3   4   5   6   7   8   9  10
P[ 0, 0, :] = [ .9,  0,  0,  0, .1,  0,  0,  0,  0,  0,  0]
P[ 0, 1, :] = [ .1, .8,  0,  0, .1,  0,  0,  0,  0,  0,  0]
P[ 0, 2, :] = [ .9, .1,  0,  0,  0,  0,  0,  0,  0,  0,  0]
P[ 0, 3, :] = [ .1, .1,  0,  0, .8,  0,  0,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 1, 0, :] = [ .8, .2,  0,  0,  0,  0,  0,  0,  0,  0,  0]
P[ 1, 1, :] = [  0, .2, .8,  0,  0,  0,  0,  0,  0,  0,  0]
P[ 1, 2, :] = [ .1, .8, .1,  0,  0,  0,  0,  0,  0,  0,  0]
P[ 1, 3, :] = [ .1, .8, .1,  0,  0,  0,  0,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 2, 0, :] = [  0, .8, .1,  0,  0, .1,  0,  0,  0,  0,  0]
P[ 2, 1, :] = [  0,  0, .1, .8,  0, .1,  0,  0,  0,  0,  0]
P[ 2, 2, :] = [  0, .1, .8, .1,  0,  0,  0,  0,  0,  0,  0]
P[ 2, 3, :] = [  0, .1,  0, .1,  0, .8,  0,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 3, 0, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0]
P[ 3, 1, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0]
P[ 3, 2, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0]
P[ 3, 3, :] = [  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 4, 0, :] = [ .1,  0,  0,  0, .8,  0,  0, .1,  0,  0,  0]
P[ 4, 1, :] = [ .1,  0,  0,  0, .8,  0,  0, .1,  0,  0,  0]
P[ 4, 2, :] = [ .8,  0,  0,  0, .2,  0,  0,  0,  0,  0,  0]
P[ 4, 3, :] = [  0,  0,  0,  0, .2,  0,  0, .8,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 5, 0, :] = [  0,  0, .1,  0,  0, .8,  0,  0,  0, .1,  0]
P[ 5, 1, :] = [  0,  0, .1,  0,  0,  0, .8,  0,  0, .1,  0]
P[ 5, 2, :] = [  0,  0, .8,  0,  0, .1, .1,  0,  0,  0,  0]
P[ 5, 3, :] = [  0,  0,  0,  0,  0, .1, .1,  0,  0, .8,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 6, 0, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0]
P[ 6, 1, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0]
P[ 6, 2, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0]
P[ 6, 3, :] = [  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 7, 0, :] = [  0,  0,  0,  0, .1,  0,  0, .9,  0,  0,  0]
P[ 7, 1, :] = [  0,  0,  0,  0, .1,  0,  0, .1, .8,  0,  0]
P[ 7, 2, :] = [  0,  0,  0,  0, .8,  0,  0, .1, .1,  0,  0]
P[ 7, 3, :] = [  0,  0,  0,  0,  0,  0,  0, .9, .1,  0,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 8, 0, :] = [  0,  0,  0,  0,  0,  0,  0, .8, .2,  0,  0]
P[ 8, 1, :] = [  0,  0,  0,  0,  0,  0,  0,  0, .2, .8,  0]
P[ 8, 2, :] = [  0,  0,  0,  0,  0,  0,  0, .1, .8, .1,  0]
P[ 8, 3, :] = [  0,  0,  0,  0,  0,  0,  0, .1, .8, .1,  0]

#                0   1   2   3   4   5   6   7   8   9  10
P[ 9, 0, :] = [  0,  0,  0,  0,  0, .1,  0,  0, .8, .1,  0]
P[ 9, 1, :] = [  0,  0,  0,  0,  0, .1,  0,  0,  0, .1, .8]
P[ 9, 2, :] = [  0,  0,  0,  0,  0, .8,  0,  0, .1,  0, .1]
P[ 9, 3, :] = [  0,  0,  0,  0,  0,  0,  0,  0, .1, .8, .1]

#                0   1   2   3   4   5   6   7   8   9  10
P[10, 0, :] = [  0,  0,  0,  0,  0,  0, .1,  0,  0, .8, .1]
P[10, 1, :] = [  0,  0,  0,  0,  0,  0, .1,  0,  0,  0, .9]
P[10, 2, :] = [  0,  0,  0,  0,  0,  0, .8,  0,  0, .1, .1]
P[10, 3, :] = [  0,  0,  0,  0,  0,  0,  0,  0,  0, .1, .9]

# every (state, action) row must be a probability distribution; catches
# typos in the table above before they silently skew the values
if not np.allclose(P.sum(axis=2), 1.):
    raise ValueError("each row P[s, a, :] must sum to 1")

# rewards: -0.02 per step everywhere, +1 / -1 in the two terminal states
R = -0.02 * np.ones((N_STATES, N_ACTIONS))
R[3,:] = 1.
R[6,:] = -1.

# discount factor
gamma = 0.99

# Q function: zero everywhere except the pinned terminal rows
Q = np.zeros((N_STATES, N_ACTIONS))
Q[3,:] = 1.
Q[6,:] = -1.

# value iteration for Q
MAX_SWEEPS = 100   # upper bound on the number of sweeps over (s, a) pairs
TOL = 1e-12        # stop early once no entry of Q moves by more than this
for _ in range(MAX_SWEEPS):
    delta = 0.
    for s in NON_TERMINAL_STATES:
        for a in actions:
            # Bellman optimality backup for one (s, a) pair.  Q.max(axis=1)
            # (the best action value in every next state) is recomputed on
            # each update so that entries already changed in this sweep are
            # used immediately, i.e. the update stays in place.
            q_new = R[s,a] + gamma * P[s,a].dot(Q.max(axis=1))
            delta = max(delta, abs(q_new - Q[s,a]))
            Q[s,a] = q_new
    if delta < TOL:
        break

print(Q)
print()

# greedy policy; np.argmax returns the first maximiser, so the entries for
# the terminal states (whose Q rows are constant) are 0 and carry no meaning
optimal_policy = np.argmax(Q, axis=1)
print(optimal_policy)

[[ 0.82322354  0.85530117  0.83075787  0.80256088]
 [ 0.83476757  0.89580324  0.86645526  0.86645526]
 [ 0.84984258  0.93236641  0.90611872  0.71218162]
 [ 1.          1.          1.          1.        ]
 [ 0.79112222  0.79112222  0.81969892  0.76026732]
 [ 0.68696646 -0.64953064  0.68749634  0.5103828 ]
 [-1.         -1.         -1.         -1.        ]
 [ 0.75636299  0.72890705  0.78026128  0.74902668]
 [ 0.74559468  0.68894841  0.71792194  0.71792194]
 [ 0.70873821  0.50703739  0.64691224  0.66373581]
 [ 0.49092193  0.31841144 -0.69323365  0.48757652]]

[1 1 1 0 2 2 0 2 0 0 0]


##### One-hot encoding

In [24]:
import numpy as np

# Row k of the 4x4 identity matrix is the one-hot vector for class k, so
# indexing the identity with a label (or a list of labels) one-hot encodes it.
identity = np.eye(4)
labels = [0,0,3]
encoded = identity[labels]

# the full identity matrix, a single code, a batch of codes, and the same
# batch cast to int32 and to float32, each separated by a blank line
demos = [
    identity,
    identity[2],
    encoded,
    encoded.astype(np.int32),
    encoded.astype(np.float32),
]
for position, array in enumerate(demos):
    if position:
        print()
    print(array)

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]

[0. 0. 1. 0.]

[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]

[[1 0 0 0]
 [1 0 0 0]
 [0 0 0 1]]

[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]
