## Test evaluation of MDP.py

This notebook tests the `MDP` class from `MDP.py` on a small, hand-coded example MDP (2 actions, 4 states), running each of its solution methods in turn and displaying the results.

In [1]:
import numpy as np
import MDP

In [2]:
# Transition function: |A| x |S| x |S'| array
# (one 4x4 matrix per action; each innermost row sums to 1)
T = np.array([
    # action 0
    [[0.5, 0.5, 0, 0],
     [0, 1, 0, 0],
     [0.5, 0.5, 0, 0],
     [0, 1, 0, 0]],
    # action 1
    [[1, 0, 0, 0],
     [0.5, 0, 0, 0.5],
     [0.5, 0, 0.5, 0],
     [0, 0, 0.5, 0.5]],
])
# Reward function: |A| x |S| array
R = np.array([
    [0, 0, 10, 10],
    [0, 0, 10, 10],
])
# Discount factor: scalar in [0,1)
discount = 0.9
# Build the MDP object that every later cell operates on
mdp = MDP.MDP(T, R, discount)

In [3]:
# Value iteration, started from an all-zero value vector (one entry per state)
V, nIterations, epsilon = mdp.valueIteration(initialV=np.zeros(mdp.nStates))
(V, nIterations, epsilon)

(array([31.54265964, 38.56157171, 43.98173159, 54.15915409]),
 64,
 0.009432147662192136)

In [4]:
# Derive a policy from V, the value vector produced by the value-iteration cell
policy = mdp.extractPolicy(V)
policy

array([0, 1, 1, 1], dtype=int64)

In [5]:
# Evaluate a fixed policy given as one action index per state.
# NOTE: this rebinds V, replacing the value-iteration result from the earlier cell.
V = mdp.evaluatePolicy(np.array([1, 0, 1, 0]))
V

array([1.05415115e-15, 0.00000000e+00, 1.81818182e+01, 1.00000000e+01])

In [6]:
# Policy iteration; the argument is presumably the initial policy
# (action 0 in every state) -- confirm against MDP.py
policy, V, iterId = mdp.policyIteration(np.array([0, 0, 0, 0]))
(policy, V, iterId)

(array([0, 1, 1, 1], dtype=int64),
 array([31.58510431, 38.60401638, 44.02417625, 54.20159875]),
 1)

In [7]:
# Partial policy evaluation.
# Presumably: first argument = policy to evaluate, second = initial value
# estimate -- confirm against the signature in MDP.py
V, iterId, epsilon = mdp.evaluatePolicyPartially(
    np.array([1, 0, 1, 0]),
    np.array([0, 10, 0, 13]),
)
(V, iterId, epsilon)

(array([ 0.        ,  0.06362685, 18.18181818, 10.06362685]),
 47,
 0.00999799560440985)

In [8]:
# Modified policy iteration.
# Presumably: first argument = initial policy, second = initial value
# estimate -- confirm against the signature in MDP.py
policy, V, iterId, tolerance = mdp.modifiedPolicyIteration(
    np.array([1, 0, 1, 0]),
    np.array([0, 10, 0, 13]),
)
(policy, V, iterId, tolerance)

(array([0, 1, 1, 1], dtype=int64),
 array([31.54705966, 38.56597173, 43.98613161, 54.1635541 ]),
 9,
 0.008454366138067826)