In [30]:
import numpy as np
import tensorflow as tf

In [31]:
# State and action index spaces of the MDP
states = list(range(3))   # three states: 0, 1, 2
actions = list(range(2))  # two actions: 0, 1


In [32]:
# Transition model, indexed as P[state][action][next_state].
# Each innermost row is a probability distribution over next states 0, 1, 2.
P = np.asarray(
    [
        # Leaving state 0
        [
            [0.7, 0.3, 0.0],  # action 0
            [0.0, 0.2, 0.8],  # action 1
        ],
        # Leaving state 1
        [
            [0.0, 0.6, 0.4],  # action 0
            [0.0, 0.0, 1.0],  # action 1
        ],
        # Leaving state 2 (both actions stay in state 2)
        [
            [0.0, 0.0, 1.0],  # action 0
            [0.0, 0.0, 1.0],  # action 1
        ],
    ],
    dtype=np.float32,
)


In [33]:
# Immediate reward table, indexed as R[state][action]
R = np.asarray(
    [
        [5, 10],  # state 0: action 0 pays 5, action 1 pays 10
        [0, 0],   # state 1: no reward for either action
        [0, 0],   # state 2: no reward for either action
    ],
    dtype=np.float32,
)


In [34]:
# Discount factor applied to the value of successor states
gamma = 0.9

# Per-state value estimates, one float32 entry per state, all starting at zero
value_function = tf.Variable(
    initial_value=np.zeros(shape=(len(states),), dtype=np.float32),
    dtype=tf.float32,
)

# Value iteration algorithm
def value_iteration(theta=1e-6, max_iterations=None):
    """Run in-place value iteration on the notebook's MDP until it converges.

    Every sweep visits the states in order and overwrites ``value_function[s]``
    with the best one-step lookahead over the actions::

        max_a  sum_s'  P[s][a][s'] * (R[s][a] + gamma * V[s'])

    The new value is written immediately, so states visited later in the same
    sweep already see the updated estimates of earlier states.

    Reads the globals ``states``, ``actions``, ``P``, ``R`` and ``gamma`` and
    mutates the global ``value_function`` variable in place.

    Args:
        theta: Convergence threshold. Sweeping stops once the largest
            per-state change within a sweep is below ``theta``. Must be
            strictly positive, otherwise the stop condition can never hold.
        max_iterations: Optional upper bound on the number of sweeps.
            ``None`` (default) sweeps until the threshold is met. The value
            estimates are float32, so a ``theta`` below their resolution may
            never be reached; pass a bound to guard against that.

    Returns:
        None. The result is left in ``value_function``.

    Raises:
        ValueError: If ``theta`` is not strictly positive (NaN included) or
            ``max_iterations`` is given and smaller than 1.
        RuntimeError: If ``max_iterations`` sweeps complete without the change
            dropping below ``theta``. ``value_function`` keeps the estimates
            from the last sweep.
    """
    # `not theta > 0` also rejects NaN, which would make `delta < theta` always False.
    if not theta > 0:
        raise ValueError(f"theta must be strictly positive, got {theta!r}")
    if max_iterations is not None and max_iterations < 1:
        raise ValueError(f"max_iterations must be at least 1, got {max_iterations!r}")

    sweeps = 0
    while True:
        delta = 0
        for s in range(len(states)):
            # Snapshot once per state: nothing is assigned while the actions
            # are being evaluated, so every action sees the same estimates.
            current = value_function.numpy()
            v = current[s]  # Value before this update
            new_value = tf.reduce_max(
                [tf.reduce_sum(P[s][a] * (R[s][a] + gamma * current)) for a in range(len(actions))]
            )
            value_function[s].assign(new_value)
            delta = max(delta, abs(v - new_value.numpy()))  # Maximum change in value
        sweeps += 1
        if delta < theta:
            break
        if max_iterations is not None and sweeps >= max_iterations:
            raise RuntimeError(
                f"value iteration did not converge within {max_iterations} sweeps "
                f"(last max change {delta}, theta {theta})"
            )


In [35]:

# Solve the MDP; the result is written into value_function in place
value_iteration()

# Pull the converged estimates out of the TensorFlow variable and show them
optimal_values = value_function.numpy()
print("Optimal Value Function:", optimal_values)


Optimal Value Function: [13.513511  0.        0.      ]
