In [3]:
import numpy as np
import random

### MDP Algorithms ###

def value_iteration(P, R, gamma=0.9, epsilon=1e-6):
    """Solve a finite MDP with in-place value iteration.

    Args:
        P: Transition array indexed as ``P[a, s, s1]`` (weight of moving from
            state ``s`` to state ``s1`` under action ``a``). Anything
            ``np.asarray`` accepts, e.g. nested lists.
        R: Reward array indexed as ``R[a, s]``.
        gamma: Discount factor applied to the value of the successor state.
        epsilon: A sweep over all states stops the iteration once the largest
            per-state change of ``V`` in that sweep is below this threshold.

    Returns:
        A ``(policy, V)`` tuple. ``policy[s]`` is the index of the greedy
        action in state ``s`` with respect to the final ``V`` (lowest index
        wins ties), and ``V[s]`` is the estimated value of state ``s``.

    Note:
        The loop exits only when the threshold is met, so ``gamma`` must be
        chosen such that the iteration converges (typically ``gamma < 1``).
    """
    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)
    n_actions, n_states = R.shape[:2]
    V = np.zeros(n_states)

    while True:
        delta = 0.0
        for s in range(n_states):
            previous = V[s]
            # Bellman optimality backup for state s, all actions at once:
            # sum_s1 P[a, s, s1] * (R[a, s] + gamma * V[s1]).
            action_values = np.sum(P[:, s, :] * (R[:, s, None] + gamma * V), axis=1)
            # Written back immediately, so later states in this sweep already
            # see the refreshed value.
            V[s] = action_values.max()
            delta = max(delta, abs(previous - V[s]))
        if delta < epsilon:
            break

    # Greedy policy w.r.t. the converged values; Q has shape (actions, states).
    Q = np.sum(P * (R[:, :, None] + gamma * V), axis=2)
    policy = np.argmax(Q, axis=0)
    return policy, V

def policy_iteration(P, R, gamma=0.9, epsilon=1e-6):
    """Solve a finite MDP by alternating policy evaluation and improvement.

    Args:
        P: Transition array indexed as ``P[a, s, s1]`` (weight of moving from
            state ``s`` to state ``s1`` under action ``a``). Anything
            ``np.asarray`` accepts, e.g. nested lists.
        R: Reward array indexed as ``R[a, s]``.
        gamma: Discount factor applied to the value of the successor state.
        epsilon: Policy evaluation stops once the largest per-state change of
            ``V`` in a sweep is below this threshold.

    Returns:
        A ``(policy, V)`` tuple. ``policy[s]`` is the action index chosen in
        state ``s`` and ``V[s]`` is the value of state ``s`` under that policy
        as estimated by the last evaluation phase.
    """
    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)
    n_actions, n_states = R.shape[:2]
    policy = np.zeros(n_states, dtype=int)  # start from "always action 0"
    V = np.zeros(n_states)

    while True:
        # --- Policy evaluation: in-place sweeps under the current policy. ---
        # V is carried over from the previous round as a warm start.
        while True:
            delta = 0.0
            for s in range(n_states):
                a = policy[s]
                previous = V[s]
                # sum_s1 P[a, s, s1] * (R[a, s] + gamma * V[s1])
                V[s] = np.dot(P[a, s], R[a, s] + gamma * V)
                delta = max(delta, abs(previous - V[s]))
            if delta < epsilon:
                break

        # --- Policy improvement: act greedily w.r.t. the evaluated V. ---
        # Q has shape (actions, states); argmax keeps the lowest action index
        # on ties.
        Q = np.sum(P * (R[:, :, None] + gamma * V), axis=2)
        greedy = np.argmax(Q, axis=0)
        policy_stable = np.array_equal(greedy, policy)
        policy[:] = greedy
        if policy_stable:
            break
    return policy, V

def q_learning(P, R, gamma=0.9, alpha=0.1, epsilon=0.1, episodes=1000, max_steps=100):
    """Estimate action values with tabular epsilon-greedy Q-learning.

    The environment is simulated from ``P`` and ``R``: after each action the
    successor state is *sampled* from ``P[action, state]`` using the ``random``
    module, so the learner experiences the same stochastic dynamics that the
    transition array describes.

    Args:
        P: Transition array indexed as ``P[a, s, s1]``; ``P[a, s]`` is used as
            the sampling weights for the successor of ``s`` under ``a``.
        R: Reward array indexed as ``R[a, s]``.
        gamma: Discount factor.
        alpha: Learning rate applied to the temporal-difference error.
        epsilon: Probability of taking a uniformly random action instead of
            the currently greedy one.
        episodes: Number of episodes to run; each starts in a uniformly random
            state.
        max_steps: Upper bound on the number of transitions per episode. An
            episode also ends early on the first self-transition
            (``next_state == state``); the cap guarantees termination when no
            self-transition ever occurs.

    Returns:
        A ``(policy, Q)`` tuple. ``Q`` has shape ``(n_states, n_actions)`` and
        ``policy[s]`` is ``argmax`` of ``Q[s]``.
    """
    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)
    n_actions, n_states = R.shape[:2]
    states = range(n_states)
    actions = range(n_actions)
    Q = np.zeros((n_states, n_actions))

    for _ in range(episodes):
        state = random.choice(states)
        for _step in range(max_steps):
            # Epsilon-greedy behaviour policy.
            if random.uniform(0, 1) < epsilon:
                action = random.choice(actions)
            else:
                action = int(np.argmax(Q[state]))

            # Draw the successor from the transition distribution rather than
            # always taking the most likely one.
            weights = P[action, state].tolist()
            next_state = random.choices(states, weights=weights)[0]
            reward = R[action, state]

            # Off-policy TD(0) update towards the greedy value of next_state.
            td_target = reward + gamma * Q[next_state].max()
            Q[state, action] += alpha * (td_target - Q[state, action])

            if next_state == state:
                break
            state = next_state

    policy = np.argmax(Q, axis=1)
    return policy, Q

### Utility Functions ###

def validate_transition_matrix(P):
    """Check that ``P`` holds valid transition distributions.

    Args:
        P: Transition array indexed as ``P[a, s, s1]``; anything
            ``np.asarray`` accepts.

    Raises:
        AssertionError: If any entry is negative, or if the entries along the
            last (successor-state) axis do not sum to 1 within
            ``np.allclose`` tolerance.
    """
    P = np.asarray(P, dtype=float)
    # Raised explicitly instead of via ``assert`` so the check is not removed
    # when Python runs with -O; the exception type stays the same.
    if (P < 0).any():
        raise AssertionError("Transition probabilities must be non-negative.")
    if not np.allclose(P.sum(axis=2), 1):
        raise AssertionError("Transition probabilities must sum to 1.")

def validate_reward_matrix(R, P):
    """Check that the reward array lines up with the transition array.

    Args:
        R: Reward array; its shape must equal the first two dimensions of
            ``P``.
        P: Transition array.

    Raises:
        AssertionError: If ``np.shape(R)`` differs from ``np.shape(P)[:2]``.
    """
    # ``np.shape`` also handles nested lists. The error is raised explicitly
    # instead of via ``assert`` so the check survives ``python -O``.
    if np.shape(R) != np.shape(P)[:2]:
        raise AssertionError("Reward matrix dimensions must match the transition matrix.")

def generate_random_mdp(n_states, n_actions):
    """Draw a random MDP from NumPy's global random state.

    Args:
        n_states: Number of states.
        n_actions: Number of actions.

    Returns:
        A ``(P, R)`` tuple. ``P`` has shape
        ``(n_actions, n_states, n_states)`` and each ``P[a, s]`` is one
        Dirichlet draw with all concentration parameters equal to 1. ``R`` has
        shape ``(n_actions, n_states)`` and is filled by ``np.random.rand``.
    """
    transitions = np.zeros((n_actions, n_states, n_states))
    flat_concentration = np.ones(n_states)
    # Visit (action, state) pairs in row-major order: one draw per pair.
    for action, state in np.ndindex(n_actions, n_states):
        transitions[action, state] = np.random.dirichlet(flat_concentration)
    rewards = np.random.rand(n_actions, n_states)
    return transitions, rewards

### Example Usage ###

# Build a small random MDP (3 states, 2 actions) to exercise the solvers.
n_states, n_actions = 3, 2
P, R = generate_random_mdp(n_states=n_states, n_actions=n_actions)

# Check the generated arrays before handing them to the solvers.
validate_transition_matrix(P=P)
validate_reward_matrix(R=R, P=P)

# Planner 1: value iteration.
policy_vi, V_vi = value_iteration(P=P, R=R)
print("Optimal Policy (Value Iteration):", policy_vi)
print("Value Function (Value Iteration):", V_vi)

# Planner 2: policy iteration.
policy_pi, V_pi = policy_iteration(P=P, R=R)
print("Optimal Policy (Policy Iteration):", policy_pi)
print("Value Function (Policy Iteration):", V_pi)

# Learner: tabular Q-learning on the same MDP.
policy_ql, Q_ql = q_learning(P=P, R=R)
print("Optimal Policy (Q-Learning):", policy_ql)
print("Q-Table (Q-Learning):", Q_ql)


Optimal Policy (Value Iteration): [1 0 0]
Value Function (Value Iteration): [7.36098808 7.78839586 7.74216696]
Optimal Policy (Policy Iteration): [1 0 0]
Value Function (Policy Iteration): [7.36098834 7.7883961  7.7421672 ]
Optimal Policy (Q-Learning): [0 0 0]
Q-Table (Q-Learning): [[6.60306568 6.49908673]
 [6.87500159 6.53743838]
 [6.90107315 5.20307937]]
