<a href="https://colab.research.google.com/github/SanjayS2348553/Reinforcement-Learning/blob/main/2348553_SANJAY_S_RL_LAB_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Implement Markov Decision Process (MDP) Simulation and Value Iteration

In [None]:
import numpy as np

class MDP:
    """
    Finite Markov Decision Process with value iteration and a policy rollout.

    States and actions may be arbitrary hashable labels. They are addressed by
    their position in ``states`` / ``actions``, so the arrays returned by
    :meth:`value_iteration` are aligned with those two lists.
    """

    def __init__(self, states, actions, transition_prob, rewards, gamma=0.9):
        """
        Initialize the MDP.

        :param states: List of states.
        :param actions: List of actions.
        :param transition_prob: Nested mapping where T[s][a] describes the
            distribution over next states. T[s][a] is either a sequence of
            probabilities aligned with ``states`` or a dict keyed by next
            state (next states missing from the dict have probability 0).
        :param rewards: A dict where R[s][a] gives the reward for taking action a in state s.
        :param gamma: Discount factor (0 <= gamma <= 1).
        """
        self.states = states
        self.actions = actions
        self.transition_prob = transition_prob
        self.rewards = rewards
        self.gamma = gamma

    def _next_state_probs(self, state, action):
        """Return T[state][action] as probabilities aligned with ``self.states``."""
        row = self.transition_prob[state][action]
        if isinstance(row, dict):
            return [row.get(s_next, 0.0) for s_next in self.states]
        return row

    def value_iteration(self, epsilon=1e-6, max_iterations=None):
        """
        Perform Value Iteration to compute optimal state-value function and policy.

        Values are updated in place during each sweep, so later states in a
        sweep already see the new values of earlier states.

        :param epsilon: Convergence threshold for value iteration.
        :param max_iterations: Optional cap on the number of sweeps. ``None``
            (the default) iterates until the threshold is met, which may never
            happen when gamma is 1 and rewards keep accumulating.
        :return: Tuple ``(V, policy)``. ``V[i]`` is the value of ``states[i]``
            and ``policy[i]`` is the index into ``actions`` of the best action
            in ``states[i]``.
        """
        n_states = len(self.states)
        V = np.zeros(n_states)  # Initialize value function to zero
        policy = np.zeros(n_states, dtype=int)  # Initialize arbitrary policy

        sweeps = 0
        while max_iterations is None or sweeps < max_iterations:
            sweeps += 1
            delta = 0  # Track maximum value change across states
            for i, s in enumerate(self.states):
                # Expected discounted return of every action from state s
                action_values = []
                for a in self.actions:
                    reward = self.rewards[s][a]
                    probs = self._next_state_probs(s, a)
                    action_values.append(
                        sum(
                            p * (reward + self.gamma * v_next)
                            for p, v_next in zip(probs, V)
                        )
                    )

                # argmax picks the first best action; its value is the maximum
                best = int(np.argmax(action_values))
                best_action_value = action_values[best]
                delta = max(delta, abs(best_action_value - V[i]))
                V[i] = best_action_value
                policy[i] = best

            if delta < epsilon:  # Check convergence
                break

        return V, policy

    def simulate(self, start_state, policy, steps=10):
        """
        Simulate the MDP given a policy, printing each state and action.

        :param start_state: Initial state for the simulation.
        :param policy: Policy as returned by :meth:`value_iteration`:
            ``policy[i]`` is the index into ``actions`` to take in ``states[i]``.
        :param steps: Number of steps to simulate.
        """
        index_of = {s: i for i, s in enumerate(self.states)}
        state = start_state
        for _ in range(steps):
            print(f"State: {state}")
            # The policy stores action positions, so translate to the label
            action = self.actions[policy[index_of[state]]]
            print(f"  Action: {action}")

            # Sample the position of the next state, then map it to its label
            next_state_probs = self._next_state_probs(state, action)
            next_index = np.random.choice(len(self.states), p=next_state_probs)
            state = self.states[next_index]

        print(f"Final State: {state}")

# --- Example MDP: four states, two actions --------------------------------
# States are labelled 0..3 (they could stand for the cells of a 2x2 grid).
states = [0, 1, 2, 3]
# Action 0 is "Move Left", action 1 is "Move Right".
actions = [0, 1]

# Rewards R[s][a]: action 1 pays 1 in states 0-2; everything else pays 0.
rewards = {
    0: {0: 0, 1: 1},
    1: {0: 0, 1: 1},
    2: {0: 0, 1: 1},
    3: {0: 0, 1: 0},
}

# Transition probabilities T[s][a]: one row per (state, action), with the
# entry at position s' giving the probability of landing in state s'.
# NOTE(review): state 3 earns no reward but is not absorbing - action 0 moves
# it back to state 2, from which the reward for action 1 can be collected again.
transition_prob = {
    0: {
        0: [1.0, 0.0, 0.0, 0.0],
        1: [0.0, 1.0, 0.0, 0.0],
    },
    1: {
        0: [1.0, 0.0, 0.0, 0.0],
        1: [0.0, 0.0, 1.0, 0.0],
    },
    2: {
        0: [0.0, 0.0, 1.0, 0.0],
        1: [0.0, 0.0, 0.0, 1.0],
    },
    3: {
        0: [0.0, 0.0, 1.0, 0.0],
        1: [0.0, 0.0, 0.0, 1.0],
    },
}

# Build the MDP and solve it with value iteration.
mdp = MDP(
    states=states,
    actions=actions,
    transition_prob=transition_prob,
    rewards=rewards,
    gamma=0.9,
)
solution = mdp.value_iteration()
V, policy = solution

print("Optimal Values (V*):", V)
print("Optimal Policy (pi*):", policy)

# Roll the solved policy forward for five steps, starting in state 0.
print("\nMDP Simulation:")
mdp.simulate(0, policy, 5)


Optimal Values (V*): [6.16315401 5.73683861 5.26315475 4.73683927]
Optimal Policy (pi*): [1 1 1 0]

MDP Simulation:
State: 0
  Action: 1
State: 1
  Action: 1
State: 2
  Action: 1
State: 3
  Action: 0
State: 2
  Action: 1
Final State: 3
