Markov Decision Process

In [13]:
import numpy as np

# Solver helpers. They take the MDP as arguments (instead of reading the
# module-level tables) so they can be reused for other MDPs.


def compute_action_values(probs, reward_table, gamma, values):
    """Return the one-step look-ahead action values Q[s, a].

    Q[s, a] = sum over s' of
        probs[s, a, s'] * (reward_table[s, a] + gamma * values[s'])

    Args:
        probs: Array-like indexed as [state][action][next_state].
        reward_table: Array-like indexed as [state][action].
        gamma: Discount factor applied to the next-state value.
        values: Array-like with the current value estimate per state.

    Returns:
        A float array of shape (num_states, num_actions).
    """
    probs = np.asarray(probs, dtype=float)
    reward_table = np.asarray(reward_table, dtype=float)
    values = np.asarray(values, dtype=float)
    # Broadcast the reward over the next-state axis:
    # (S, A, 1) + (S',) -> (S, A, S').
    backed_up = reward_table[:, :, np.newaxis] + gamma * values
    return (probs * backed_up).sum(axis=2)


def value_iteration(probs, reward_table, gamma, iterations):
    """Run a fixed number of synchronous value-iteration sweeps.

    The value function starts at zero for every state. Each sweep replaces
    it with the maximum action value per state. The maximum is taken over
    the action values themselves (not seeded with 0), so MDPs whose best
    achievable return is negative are handled correctly.

    Args:
        probs: Array-like indexed as [state][action][next_state].
        reward_table: Array-like indexed as [state][action].
        gamma: Discount factor.
        iterations: Number of sweeps to perform; 0 returns the zero vector.

    Returns:
        A float array with one value per state.
    """
    values = np.zeros(np.asarray(probs).shape[0])
    for _ in range(iterations):
        values = compute_action_values(probs, reward_table, gamma, values).max(axis=1)
    return values


def greedy_policy(probs, reward_table, gamma, values):
    """Return, per state, the action with the highest one-step look-ahead value.

    Ties are resolved in favour of the lowest action index.
    """
    return compute_action_values(probs, reward_table, gamma, values).argmax(axis=1).astype(int)


# Define the MDP parameters
num_states = 3
num_actions = 2
discount_factor = 0.9

# Define the MDP transition probabilities
# transition_probs[state][action][next_state]
transition_probs = np.array([
    [[0.7, 0.3, 0.0], [0.0, 1.0, 0.0]],  # From state 0
    [[0.0, 0.8, 0.2], [0.0, 0.0, 1.0]],  # From state 1
    [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]],  # From state 2 (both actions stay in state 2)
])

# Define the rewards
# rewards[state][action]
rewards = np.array([
    [10, 0],  # From state 0
    [0, 1],   # From state 1
    [0, 0],   # From state 2
])

# Print the transition probabilities
print("Transition Probabilities:")
for s, a in np.ndindex(num_states, num_actions):
    print(f"State {s}, Action {a}:")
    for next_state in range(num_states):
        transition_prob = transition_probs[s][a][next_state]
        print(f"  -> State {next_state} with Probability {transition_prob}")
    print()

# Print the rewards
print("Rewards:")
for s, a in np.ndindex(num_states, num_actions):
    print(f"State {s}, Action {a} -> Reward {rewards[s][a]}")

# Perform Value Iteration (starting from an all-zero value function)
num_iterations = 100
value_function = value_iteration(
    transition_probs, rewards, discount_factor, num_iterations
)

# Calculate the optimal policy: act greedily with respect to the final values
policy = greedy_policy(transition_probs, rewards, discount_factor, value_function)

# Display the results for all states
for s in range(num_states):
    print(f"State {s}:")
    print("Value Function:", value_function[s])
    print("Optimal Policy (Action):", policy[s])
    print()

# Print the results
print("Optimal Value Function:", value_function)
print("Optimal Policy:", policy)


Transition Probabilities:
State 0, Action 0:
  -> State 0 with Probability 0.7
  -> State 1 with Probability 0.3
  -> State 2 with Probability 0.0

State 0, Action 1:
  -> State 0 with Probability 0.0
  -> State 1 with Probability 1.0
  -> State 2 with Probability 0.0

State 1, Action 0:
  -> State 0 with Probability 0.0
  -> State 1 with Probability 0.8
  -> State 2 with Probability 0.2

State 1, Action 1:
  -> State 0 with Probability 0.0
  -> State 1 with Probability 0.0
  -> State 2 with Probability 1.0

State 2, Action 0:
  -> State 0 with Probability 0.0
  -> State 1 with Probability 0.0
  -> State 2 with Probability 1.0

State 2, Action 1:
  -> State 0 with Probability 0.0
  -> State 1 with Probability 0.0
  -> State 2 with Probability 1.0

Rewards:
State 0, Action 0 -> Reward 10
State 0, Action 1 -> Reward 0
State 1, Action 0 -> Reward 0
State 1, Action 1 -> Reward 1
State 2, Action 0 -> Reward 0
State 2, Action 1 -> Reward 0
State 0:
Value Function: 27.756756756756747
Optimal 