In [1]:
import numpy as np
from scipy import linalg

In [2]:
# Number of states in the MDP; both matrices below are n_states x n_states.
n_states = 6
# P_pi[s, s'] holds the probability of moving from s to s' once the policy is
# folded into the dynamics. Starts as all zeros and is filled in afterwards.
P_pi = np.zeros(shape=(n_states, n_states))
# R[s, s'] holds the reward attached to the transition s -> s' (same shape and dtype as P_pi).
R = np.zeros(P_pi.shape, dtype=P_pi.dtype)

In [3]:
# Fill the transition probabilities one source state (row) at a time.
# Every non-absorbing state splits its probability evenly between two successors.
P_pi[0, [1, 3]] = 0.5
P_pi[1, [2, 5]] = 0.5
P_pi[2, [4, 5]] = 0.5
P_pi[3, [0, 3]] = 0.5
P_pi[4, [0, 5]] = 0.5
# State 5 only transitions to itself.
P_pi[5, 5] = 1

In [4]:
# Show P_pi to confirm the transition probabilities assigned in the previous cell.
P_pi

array([[0. , 0.5, 0. , 0.5, 0. , 0. ],
       [0. , 0. , 0.5, 0. , 0. , 0.5],
       [0. , 0. , 0. , 0. , 0.5, 0.5],
       [0.5, 0. , 0. , 0.5, 0. , 0. ],
       [0.5, 0. , 0. , 0. , 0. , 0.5],
       [0. , 0. , 0. , 0. , 0. , 1. ]])

In [5]:
# Rewards per transition, written row by row: R[s, [s1, s2]] = [r(s -> s1), r(s -> s2)].
# Row 5 is left at zero.
R[0, [1, 3]] = [-2, -1]
R[1, [2, 5]] = [-2, 0]
R[2, [4, 5]] = [15, 10]
R[3, [0, 3]] = [-3, -1]
R[4, [0, 5]] = [-10, 10]

In [6]:
# Show R to confirm the per-transition rewards assigned in the previous cell.
R

array([[  0.,  -2.,   0.,  -1.,   0.,   0.],
       [  0.,   0.,  -2.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,  15.,  10.],
       [ -3.,   0.,   0.,  -1.,   0.,   0.],
       [-10.,   0.,   0.,   0.,   0.,  10.],
       [  0.,   0.,   0.,   0.,   0.,   0.]])

In [7]:
# Sanity-check P_pi: every row must be a probability distribution over next states.
# Row sums are compared with a tolerance instead of `== 1`, because floating-point
# rounding can leave a perfectly valid row a few ulps away from exactly 1.0.
assert (P_pi >= 0).all(), "P_pi contains a negative transition probability"
assert np.allclose(P_pi.sum(axis=1), 1.0), "every row of P_pi must sum to 1"

In [8]:
# Expected one-step reward of each state: weight every transition reward by its
# probability and add up over the next-state axis. keepdims=True keeps the
# result as an (n_states, 1) column vector.
R_expected = (P_pi * R).sum(axis=1, keepdims=True)

In [9]:
# Show the expected immediate reward of each state (column vector, one row per state).
R_expected

array([[-1.5],
       [-1. ],
       [12.5],
       [-2. ],
       [ 0. ],
       [ 0. ]])

In [15]:
# Bellman expectation equation in matrix form: (I - gamma * P_pi) V = R_expected.
# Build the left-hand matrix A and the right-hand side B of that linear system.
gamma = 0.9
A = np.identity(n_states) - gamma * P_pi
B = R_expected

In [16]:
# State values under the policy: solve the linear system A V = B directly with scipy.
V = linalg.solve(a=A, b=B)

In [17]:
# Show the state values returned by the linear solve (computed with gamma = 0.9).
V

array([[-1.78587056],
       [ 4.46226255],
       [12.13836121],
       [-5.09753046],
       [-0.80364175],
       [ 0.        ]])

In [18]:
# Repeat the solve with a discount factor of 0 (only the immediate reward counts).
# Dedicated names are used on purpose: reusing `gamma`, `A` and `B` would overwrite
# the gamma = 0.9 system, and re-running the cell that solves for V afterwards
# would then silently recompute V with the wrong discount factor.
gamma_zero = 0.
A_gamma_zero = np.eye(n_states, n_states) - gamma_zero * P_pi
# With gamma = 0 the system matrix is the identity, so the solution equals R_expected.
V_gamma_zero = linalg.solve(A_gamma_zero, R_expected)
V_gamma_zero

array([[-1.5],
       [-1. ],
       [12.5],
       [-2. ],
       [ 0. ],
       [ 0. ]])

In [19]:
# Immediate reward of every (state, action) pair as a column vector.
# States 0-4 each contribute two consecutive rows; the action order within a
# pair differs from state to state, as listed in the comments.
R_sa = np.zeros((2 * (n_states - 1), 1))
R_sa[0:2, 0] = [-2, -1]    # state 0: study, social
R_sa[2:4, 0] = [-2, 0]     # state 1: study, sleep
R_sa[4:6, 0] = [10, 15]    # state 2: sleep, beer
R_sa[6:8, 0] = [-1, -3]    # state 3 (social): social, study
R_sa[8:10, 0] = [10, -10]  # state 4 (pub): sleep, study

In [20]:
# Confirm R_sa is a column vector with (n_states - 1) * 2 rows.
R_sa.shape

(10, 1)

In [21]:
# Transition matrix indexed by (state, action) row and next-state column.
# Each listed action leads to exactly one next state, so every row gets a single 1.
# Rows are written in pairs: P[[row_a, row_b], [next_a, next_b]] = 1.
P = np.zeros((2 * (n_states - 1), n_states))
P[[0, 1], [1, 3]] = 1  # state 0: study -> state 1, social -> state 3
P[[2, 3], [2, 5]] = 1  # state 1: study -> state 2, sleep -> state 5 (bed)
P[[4, 5], [5, 4]] = 1  # state 2: sleep -> state 5 (bed), beer -> state 4 (pub)
P[[6, 7], [3, 0]] = 1  # state 3: social -> state 3 (social), study -> state 0 (class 1)
P[[8, 9], [5, 0]] = 1  # state 4: sleep -> state 5 (bed), study -> state 0 (class 1)

In [22]:
# Action values under the policy: immediate reward plus the discounted value of
# the state each (state, action) row leads to. `*` and `@` share precedence and
# associate left to right, so the scaling of P happens before the matrix product.
gamma = 0.9
Q_sa_pi = R_sa + np.matmul(gamma * P, V)

In [23]:
# Show the action values, one row per (state, action) pair in the same row order as R_sa and P.
Q_sa_pi

array([[  2.01603629],
       [ -5.58777741],
       [  8.92452509],
       [  0.        ],
       [ 10.        ],
       [ 14.27672242],
       [ -5.58777741],
       [ -4.60728351],
       [ 10.        ],
       [-11.60728351]])

In [24]:
# Fold the stacked column of Q-values into a table with one row per state and
# one column per action; the row count is inferred from n_actions.
n_actions = 2
Q_sa_pi2 = Q_sa_pi.reshape(-1, n_actions)
Q_sa_pi2

array([[  2.01603629,  -5.58777741],
       [  8.92452509,   0.        ],
       [ 10.        ,  14.27672242],
       [ -5.58777741,  -4.60728351],
       [ 10.        , -11.60728351]])

In [25]:
# Greedy choice per state: column index of the larger Q-value, kept as a column vector.
# The index refers to the per-state action order used when R_sa was filled in,
# so the same index can denote different actions in different states.
best_actions = Q_sa_pi2.argmax(axis=-1).reshape(-1, 1)
best_actions

array([[0],
       [0],
       [1],
       [1],
       [0]])

In [26]:
# With gamma = 0 the discounted term gamma * P @ V vanishes, so Q(s, a) is just
# the immediate reward. Take a copy: a plain assignment would alias R_sa, and any
# later in-place edit of this Q table (or of a reshaped view of it) would
# silently corrupt the reward vector.
Q_sa_pi_gamma_zero = R_sa.copy()
Q_sa_pi_gamma_zero

array([[ -2.],
       [ -1.],
       [ -2.],
       [  0.],
       [ 10.],
       [ 15.],
       [ -1.],
       [ -3.],
       [ 10.],
       [-10.]])

In [27]:
# Same table layout as for gamma = 0.9: one row per state, one column per action.
n_actions = 2
Q_sa_pi_gamma_zero2 = Q_sa_pi_gamma_zero.reshape(-1, n_actions)
Q_sa_pi_gamma_zero2

array([[ -2.,  -1.],
       [ -2.,   0.],
       [ 10.,  15.],
       [ -1.,  -3.],
       [ 10., -10.]])

In [28]:
# Greedy action per state when only the immediate reward matters (gamma = 0):
# column index of the larger entry in each row, kept as a column vector.
best_actions_gamma_zero = Q_sa_pi_gamma_zero2.argmax(axis=-1).reshape(-1, 1)
best_actions_gamma_zero

array([[1],
       [1],
       [1],
       [0],
       [0]])