In [1]:
import pandas as pd
import numpy as np

In [42]:
# s1 -> [a1, a2], s2 -> [a1, a2] ...
grid_reward = [[0, 1], [-1, 1], [5, -100], [0, 0]]
grid_transition = [[2, 1], [0, 3], [3, 0], [None, None]]
actions = [0, 1]
gamma = 0.8

In [82]:
def r(state, action, next_state=None):
    return env[state][action]

In [44]:
def P(state, action):
    return grid_transition[state][action]

P(0, 0)

2

In [252]:
def action(state):
    return actions[0]


def policy_1(state):
    return [1, 0]


def policy_2(state):
    return [0.5, 0.5]


state = 0
for action, proba in actions, policy_1(state):
    print(action, proba)

0 1
1 0


In [253]:
def v(state):
    # 方策によっては、終了しないので一定のステップで終了する
    global count
    if count > 200: return 0
    count += 1
    total_reward = 0
    probas = policy_1(state)
    for action, proba in zip(actions, probas):
        next_state = P(state, action)
        if next_state is None: return 0
        reward = r(state, action, next_state)
        total_reward += proba * (reward + gamma * v(next_state))
    
    return total_reward


for i in range(4):
    count = 0
    print(v(i))

4.0
2.2
5.0
0


In [257]:
class ValueIter():
    def __init__(self, policy):
        self.policy = policy
        self.reset()

    def reset(self):
        self.count = 0

    def v(self, state):
        total_reward = 0
        probas = self.policy(state)
        self.count += 1
        for action, proba in zip(actions, probas):
            next_state = P(state, action)
            if next_state is None: return 0
            
            reward = r(state, action, next_state)
            if self.count > 1000:
                break
                
            total_reward += proba * (reward + gamma * self.v(next_state))

        return total_reward


value_iter = ValueIter(policy_1)
for state in range(4):
    print(value_iter.v(state))
    value_iter.reset()

value_iter = ValueIter(policy_2)
for state in range(4):
    print(value_iter.v(state))
    value_iter.reset()

4.0
2.2
5.0
0
-22.61904761904762
-9.54761904761905
-56.54761904761905
0


行動価値関数でも見てみます

In [414]:
def q(state, action):
    total_reward = 0
    probas = policy_1(state)
    next_state = P(state, action)
    reward = r(state, action)
    if next_state is None: return 0

    probas = policy_2(next_state)
    next_action = np.random.choice(actions, p=probas)
    return reward + gamma * q(next_state, next_action)


for i in range(4):
    for j in range(2):
        print('state:{}, action:{}, 行動価値:{}'.format(i, j, q(i, j)))

state:0, action:0, 行動価値:4.0
state:0, action:1, 行動価値:-49.3616
state:1, action:0, 行動価値:1.2080000000000002
state:1, action:1, 行動価値:1.0
state:2, action:0, 行動価値:5.0
state:2, action:1, 行動価値:-161.952
state:3, action:0, 行動価値:0
state:3, action:1, 行動価値:0


In [465]:
class Q():
    def __init__(self, policy):
        self.policy = policy
        self.reset()

    def reset(self):
        self.count = 0

    def q(self, state, action):
        total_reward = 0
        probas = policy_1(state)
        next_state = P(state, action)
        reward = r(state, action)
        if next_state is None: return 0

        probas = policy_2(next_state)
        next_action = np.random.choice(actions, p=probas)
        return reward + gamma * q(next_state, next_action)

q_func = Q(policy_1)
results = np.zeros([4, 2]).tolist()
for _ in range(1000):
    for i in range(4):
        for j in range(2):
            q_value = q_func.q(i, j)
            results[i][j] += q_value

for i, v in enumerate(results):
    for j, vv in enumerate(v):
        print('state:{}, action:{}, 行動価値:{}'.format(i, j, vv / 1000))

state:0, action:0, 行動価値:-49.050359673711476
state:0, action:1, 行動価値:-8.43374434171098
state:1, action:0, 行動価値:-22.1541424094687
state:1, action:1, 行動価値:1.0
state:2, action:0, 行動価値:5.0
state:2, action:1, 行動価値:-122.05829381477696
state:3, action:0, 行動価値:0.0
state:3, action:1, 行動価値:0.0
