<a href="https://colab.research.google.com/github/Rakesh33333/Pai-lab/blob/main/Mdp7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

def value_iteration(states, actions, transition_probabilities, rewards, gamma, theta=1e-6):
    """
    Perform value iteration for a given MDP.

    Values are updated in place during each sweep, so a state visited later
    in a sweep already sees the new values of states visited earlier.
    Sweeps repeat until the largest change in any state's value during a
    sweep is below ``theta``; there is no cap on the number of sweeps.

    Args:
    - states: List of all states.
    - actions: List of all actions.
    - transition_probabilities: A dict where keys are (s, a) pairs and values are lists of (probability, next_state) pairs.
      A missing (s, a) key means action ``a`` is not available in state ``s``.
    - rewards: A dict where keys are (s, a, s') tuples and values are the reward for that transition.
      Missing entries are treated as a reward of 0.
    - gamma: Discount factor.
    - theta: A small threshold for determining convergence.

    Returns:
    - V: Final value function (dict mapping state -> value). States with no
      available action keep their initial value of 0.
    - policy: Final policy (dict mapping state -> greedy action). States with
      no available action map to None.
    """
    def action_value(s, a):
        # Expected one-step return of taking `a` in `s` under the current V.
        return sum(prob * (rewards.get((s, a, s_prime), 0) + gamma * V[s_prime])
                   for prob, s_prime in transition_probabilities[(s, a)])

    # Only consider actions that actually have transitions defined for a
    # state, so terminal states and state-dependent action sets do not raise
    # KeyError. The order of `actions` is preserved for tie-breaking.
    available = {
        s: [a for a in actions if (s, a) in transition_probabilities]
        for s in states
    }

    # Initialize value function
    V = {s: 0 for s in states}

    while True:
        delta = 0
        for s in states:
            if not available[s]:
                # Nothing to maximize over; the value stays at 0.
                continue
            previous = V[s]
            # Compute the maximum value function over all available actions
            V[s] = max(action_value(s, a) for a in available[s])
            delta = max(delta, abs(previous - V[s]))
        # Check for convergence
        if delta < theta:
            break

    # Derive policy
    policy = {}
    for s in states:
        if available[s]:
            # Select the action that maximizes the expected value; `s=s`
            # binds the current state to the key function explicitly.
            policy[s] = max(available[s], key=lambda a, s=s: action_value(s, a))
        else:
            policy[s] = None

    return V, policy

# Example usage: a three-state MDP in which every action deterministically
# moves to one of the other two states and every transition pays reward 1.
states = list(range(3))
actions = ['a', 'b']

# (state, action) -> list of (probability, next_state) outcomes.
transition_probabilities = {
    (0, 'a'): [(1.0, 1)],
    (0, 'b'): [(1.0, 2)],
    (1, 'a'): [(1.0, 0)],
    (1, 'b'): [(1.0, 2)],
    (2, 'a'): [(1.0, 0)],
    (2, 'b'): [(1.0, 1)],
}

# (state, action, next_state) -> reward; one unit for every defined transition.
rewards = {
    (state, action, next_state): 1
    for (state, action), outcomes in transition_probabilities.items()
    for _, next_state in outcomes
}

gamma = 0.9

# Solve the MDP and report the resulting values and greedy policy.
V, policy = value_iteration(
    states,
    actions,
    transition_probabilities,
    rewards,
    gamma,
)
print("Optimal Value Function:", V)
print("Optimal Policy:", policy)

Optimal Value Function: {0: 9.999997681730786, 1: 9.999997913557708, 2: 9.999998122201937}
Optimal Policy: {0: 'b', 1: 'b', 2: 'b'}
