# In [28]:
import numpy as np

class MDP:
    '''Finite Markov decision process solved by dynamic programming.

    Attributes:
    nStates -- number of states |S| (read from P.shape[0])
    nActions -- number of actions |A| (read from P.shape[1])
    P -- Transition probability matrix: |S| x |A| x |S'|
    R -- Reward function: |S| x |A|
    discount -- discount factor: scalar
    '''

    def __init__(self,P,R,discount):
        '''
        P -- Transition probability matrix: |S| x |A| x |S'|
        R -- Reward function: |S| x |A|
        discount -- discount factor: scalar in [0,1)
        '''

        self.nStates = P.shape[0]
        self.nActions = P.shape[1]
        self.P = P
        self.R = R
        self.discount = discount

    def _qValues(self,V):
        '''One-step look-ahead for every state/action pair.

        Q[s,a] = R[s,a] + discount * sum_s' P[s,a,s'] V[s']

        Inputs:
        V -- Value function: array of |S| entries

        Outputs:
        Q -- Action values: |S| x |A|
        '''
        # np.dot of an N-D array with a 1-D array contracts the last axis
        # of P (the successor state s'), which leaves an |S| x |A| array.
        return self.R + self.discount * np.dot(self.P, V)

    def valueIteration(self,initialV,nIterations=np.inf,tolerance=0.01):
        '''Run synchronous value iteration.

        V[s] <-- max_a R[s,a] + discount * sum_s' P[s,a,s'] V[s']

        Iteration stops as soon as the largest change between two successive
        value functions drops below `tolerance`, or after `nIterations`
        sweeps, whichever comes first. The value function is printed after
        every sweep.

        Inputs:
        initialV -- Initial value function: array of |S| entries (not modified)
        nIterations -- limit on the # of iterations: scalar (default: infinity)
        tolerance -- threshold on ||V^n-V^n+1||_inf: scalar (default: 0.01)

        Outputs:
        V -- Value function: array of |S| entries
        '''
        # Work on a float copy: the caller's array must not be overwritten,
        # and an integer input would otherwise truncate fractional values.
        V = np.array(initialV, dtype=float)

        iteration = 0
        while iteration < nIterations:
            # Every state is backed up from the previous sweep's values.
            newV = self._qValues(V).max(axis=1)
            delta = np.max(np.abs(newV - V))
            V = newV
            iteration += 1
            print('V = ',V)
            if delta < tolerance:
                break

        return V

    def extractPolicy(self,V):
        '''Extract the greedy (deterministic) policy for a value function.

        pi[s] <-- argmax_a R[s,a] + discount * sum_s' P[s,a,s'] V[s']

        Ties are resolved in favour of the lowest action index (np.argmax).

        Inputs:
        V -- Value function: array of |S| entries

        Output:
        policy -- Policy: array of |S| entries (action indices stored as
                  floats, matching the array type this method has always
                  returned)
        '''
        return np.argmax(self._qValues(V), axis=1).astype(float)

# main
# Define the DP model.
# Transition probability matrix: |S| x |A| x |S'|
# (one row of successor-state probabilities per state/action pair)
P = np.array([
    [[0, 1, 0, 0], [0, 0, 1, 0]],
    [[0, 1, 0, 0], [0, 0, 0, 1]],
    [[0, 0, 0, 1], [0, 0, 1, 0]],
    [[0, 0, 0, 1], [0, 0, 0, 1]],
])

# Reward function: |S| x |A|
R = np.array([
    [-1, 1],
    [-2, 3],
    [-1, -2],
    [0, 0],
])

# Discount factor: scalar in [0,1]
discount = 1

# Create the MDP object.
mdp = MDP(P=P, R=R, discount=discount)

# Try out value iteration, starting from an all-zero value function.
V = mdp.valueIteration(np.zeros(mdp.nStates))
print('V = ', V)

# Derive the greedy policy for the resulting value function.
policy = mdp.extractPolicy(V=V)
print('policy =', policy)

# Output:
# V =  [ 1.  3. -1.  0.]
# V =  [ 2.  3. -1.  0.]
# V =  [ 2.  3. -1.  0.]
# V =  [ 2.  3. -1.  0.]
# policy = [0. 1. 0. 0.]
