In [188]:
import numpy as np
import matplotlib.pyplot as plt
import math

In [187]:
class possion:
    """Precomputed table of Poisson probabilities P(N = n | lam).

    The class name keeps its original spelling because other cells refer to it.

    The table holds every event count ``0 .. n-1`` for every integer rate
    ``0 .. lam-1``; ``get`` is a plain array lookup into that table.
    """

    def __init__(self, n, lam):
        """Build the ``n`` x ``lam`` probability table.

        Args:
            n: number of event counts to tabulate (counts ``0 .. n-1``).
            lam: number of integer rates to tabulate (rates ``0 .. lam-1``).
        """
        self.cache = np.full([n, lam], 0.0)
        # Both ranges start at 0: skipping row 0 would leave P(N = 0) stored
        # as 0.0 instead of exp(-lam), and skipping column 0 would leave
        # P(N = 0 | lam = 0) as 0.0 instead of 1.0.
        for i in range(n):
            for j in range(lam):
                self.cache[i][j] = self.calc(i, j)

    def get(self, n, lam):
        """Return the cached probability of exactly ``n`` events at rate ``lam``.

        Both arguments index the table directly, so they must lie inside the
        ranges given to the constructor (an index past the end raises
        ``IndexError``; negative indices wrap around like any array index).
        """
        return self.cache[n][lam]

    def calc(self, n, lam):
        """Compute ``lam**n * exp(-lam) / n!`` without touching the cache."""
        if n == 0:
            return np.exp(-lam)
        # gamma(n + 1) equals n! for non-negative integer n.
        factorial = math.gamma(n + 1)
        return (lam ** n) * np.exp(-lam) / factorial


In [186]:
def policy_eval(v_pi, pi_s, params, pss, transitions):
    """Run one in-place sweep of policy evaluation over every (A, B) state.

    For each state the cars prescribed by the policy are moved (a positive
    entry of ``pi_s`` moves cars from A to B), then the expected value of
    rental income plus the discounted value of the next state is summed over
    ``transitions``, and the move cost is added.

    Args:
        v_pi: 2-D grid of state values indexed ``[cars_at_A][cars_at_B]``.
            It is updated in place, so later states in the sweep already see
            the new values of earlier ones.
        pi_s: grid of the same shape holding the number of cars to move.
        params: dict with the Poisson rates ``"rent_A"``, ``"ret_A"``,
            ``"rent_B"``, ``"ret_B"``. Optional keys: ``"gamma"`` (discount,
            default 1.0), ``"r_rent"`` (reward per rented car, default 10)
            and ``"r_move"`` (reward per moved car, normally negative,
            default 0.0).
        pss: object whose ``get(count, rate)`` returns a probability.
        transitions: iterable of
            ``(returned_A, rented_A, returned_B, rented_B)`` tuples.

    Returns:
        The same ``v_pi`` object that was passed in.
    """
    rows = len(v_pi)
    cols = len(v_pi[0]) if rows else 0
    # The most cars a location can hold is fixed by the value grid itself.
    cap_A = rows - 1
    cap_B = cols - 1

    gamma = params.get("gamma", 1.0)
    r_rent = params.get("r_rent", 10)
    r_move = params.get("r_move", 0.0)

    for A in range(rows):
        for B in range(cols):
            moved = pi_s[A][B]
            # Cars leaving A arrive at B.
            s_dash_A = A - moved
            s_dash_B = B + moved

            # The move cost is paid regardless of what demand turns out to be.
            value = r_move * abs(moved) if moved else 0.0

            for returned_A, rented_A, returned_B, rented_B in transitions:
                # Outcomes that rent more cars than are on the lot are
                # dropped rather than clipped to the available stock.
                if rented_A > s_dash_A or rented_B > s_dash_B:
                    continue

                A_new = min(cap_A, s_dash_A - rented_A + returned_A)
                B_new = min(cap_B, s_dash_B - rented_B + returned_B)

                probability = (pss.get(returned_A, params["ret_A"]) *
                               pss.get(rented_A, params["rent_A"]) *
                               pss.get(returned_B, params["ret_B"]) *
                               pss.get(rented_B, params["rent_B"]))
                value += probability * (r_rent * (rented_A + rented_B) +
                                        gamma * v_pi[A_new][B_new])

            v_pi[A][B] = value

    return v_pi

def get_action(A, B, v_pi, move_max=5, move_cost=2):
    """Pick the transfer that maximises ``v_pi`` at the resulting state minus the move cost.

    A positive transfer moves cars from A to B. Transfers that would take
    either location outside the value grid are ignored. Ties are resolved in
    favour of the largest transfer, because candidates are scanned from
    ``-move_max`` upwards and a later equal score replaces the earlier one.

    Args:
        A: cars currently at location A.
        B: cars currently at location B.
        v_pi: 2-D grid of state values indexed ``[cars_at_A][cars_at_B]``.
        move_max: largest number of cars that may be moved in either direction.
        move_cost: cost charged per car moved.

    Returns:
        The chosen transfer, or -10 if no candidate could be scored (for
        example when ``(A, B)`` is far outside the grid or all values are NaN).
    """
    cap_A = len(v_pi) - 1
    cap_B = len(v_pi[0]) - 1

    # Start from -inf so that arbitrarily low (but valid) values still win
    # over the "nothing found" sentinel.
    best_score = float("-inf")
    action = -10

    for transfer in range(-move_max, move_max + 1):
        A_new = A - transfer
        B_new = B + transfer
        if not (0 <= A_new <= cap_A and 0 <= B_new <= cap_B):
            continue
        score = v_pi[A_new][B_new] - move_cost * abs(transfer)
        if score >= best_score:
            best_score = score
            action = transfer

    return action

In [189]:
params = {
    "rent_A" : 3,
    "ret_A" : 3,
    "rent_B" : 4,
    "ret_B" : 2,
    "gamma" : 0.9,
    "cars_max" : 21,
    "move_max" : 5,
    "r_rent" : 10.0,
    "r_move" : -2.0
}

## fixed seed so that Restart & Run All reproduces the same starting values
SEED = 0
rng = np.random.default_rng(SEED)

## random initial values and arbitrary initial policy
grid_shape = (params["cars_max"], params["cars_max"])
v_pi = rng.normal(size=grid_shape)
## old_v_pi is only referenced by the commented-out convergence loop further down
old_v_pi = rng.normal(size=grid_shape)

## actions are in [-5, -4, ... , 4, 5]; positive numbers indicate moving cars from A to B
pi_s = np.zeros(grid_shape, dtype='int32')

print(v_pi.astype(int))
print(pi_s)

## second argument 5 -> the table covers the integer rates 0..4 used in params
pss = possion(params["cars_max"], 5)


n = params["move_max"] - 1 ## 4 possible counts (1..4) per random variable
m = 4                      ## returned_A, rented_A, returned_B, rented_B

## Every combination of the m counts, each ranging over 1..n, one row per
## combination with the first column varying slowest.
## NOTE(review): counts of 0 and counts above n are not enumerated, so the
## probabilities used in policy_eval do not sum to 1 -- confirm this
## truncation is intended.
transitions = np.ascontiguousarray(np.indices((n,) * m).reshape(m, -1).T + 1)

eval_sweeps = 1  ## policy-evaluation sweeps per improvement step

for pas in range(10):
    ## policy evaluation
    for _ in range(eval_sweeps):
        v_pi = policy_eval(v_pi, pi_s, params, pss, transitions)

    ## policy improvement
    for A in range(params["cars_max"]):
        for B in range(params["cars_max"]):
            pi_s[A][B] = get_action(A, B, v_pi)

print(v_pi.astype(int))
print(pi_s)
            



# policy_iterations = 0
# epsilon = 1e-12
# policy_converged = False

# while not policy_converged:
#     ## bug fix mentioned in 4.4
#     if np.all(old_v_pi == v_pi): break

#     policy_iterations += 1
#     old_pi = np.copy(pi_s)


#     ## value iteration
#     value_converged = False
#     while not value_converged:
#         old_v_pi = np.copy(v_pi)
#         for s in range(1, 15):  
#             v_pi = policy_eval(s, v_pi, pi_s)

#         if (np.all(abs(old_v_pi - v_pi) <= epsilon)):
#             value_converged = True

#         ## print value functions
#         # print("Values:")
#         # print(v_pi.reshape([4, 4]))


#     ## policy iteration
#     for s in range(1, 15):
#         pi_s[s] = np.zeros([4])
#         greedy_action = np.argmax(neighbor_values(s, v_pi))
#         pi_s[s][greedy_action] = 1.0

#     ## convergence
#     if np.all(abs(old_pi - pi_s) <= epsilon):
#         policy_converged = True

#     ## print value and policy
#     print("\n\npolicy iteration:", policy_iterations)
#     print("Values:")
#     print(v_pi.reshape([4, 4]))
#     print("Greedy policy")
#     print(get_policy(pi_s))

# print("\n\nFinal Iteration")
# print("Values:")
# print(v_pi.reshape([4, 4]))
# print("Greedy policy")
# print(get_policy(pi_s))
# print("\n\nTotal policy iterations taken for convergence:", policy_iterations)

[[-2  0  0  1  0  1 -1  0 -1  0  0 -2  0  0  1 -1  0  0  0  0  0]
 [ 0  0  1  0 -1  0  0  0  0  1  0  0  0  0  0  1  0  0 -1  0  1]
 [ 0  1  1 -2  0 -1 -1  0 -1 -1  0  0  1 -1 -1 -1  0  0  0  0  0]
 [ 0 -1 -1  0  0 -2  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 1  0  2  0  0  0 -1  0 -1  0 -1  1 -1  1  0  1  0  0 -1 -1  0]
 [ 1  0 -1  1  0  1  0  0  1  2  0  0 -1  0  1 -1  0  0  0 -1  1]
 [ 0 -1 -1  0  0  1  0  0  0  0  0  0  0  0  0  0 -1  1  0 -1  0]
 [ 0  0  0  0  0  0 -1  0 -1  0  2  0  0  0  0  0  0  0  0  0 -1]
 [ 0  1 -1  0  1  0  1  0  0  0  1  0 -1  0  0  0  1  0  0  0  0]
 [ 0  0  0 -1  0  0  0 -1  0  0  0 -1  0  0  0 -1  0  0  1  0  0]
 [ 0  0  0  0  0 -1  0  0  1  0 -2  0  0 -1  0  0  0  0  0 -1  0]
 [ 0  0  0  0  0  0  0  1  0  0  0  1  0  0 -1  0 -1  1  0  0  0]
 [ 1  0  0  0  1  0  0  0  0  0  0  0  0  0  0 -2  1 -1 -1  0 -1]
 [ 1  0  1  2  0  0  0  2 -1  0  0  0  0  0  2 -1  1  0  0  1  0]
 [ 1  0 -1  0  0  0  0 -1  0  0 -1  0 -1  0  0  0  0  0  0  1  0]
 [ 1  0  0