# **EXPERIMENT 4: Policy Iteration on a Deterministic MDP**
## **Name**: Rishikesh Vadodaria
## **Roll no**: C114

In [8]:
import numpy as np


class PolicyIterationMDP:
    """Policy iteration for a deterministic MDP with per-state rewards.

    The model is tabular: ``transitions[s][a]`` is the single successor of
    taking action ``a`` in state ``s``, and ``rewards[s]`` is the reward
    collected in ``s`` regardless of the action taken.  The value backup used
    throughout is ``rewards[s] + gamma * V[next_state]``.

    Terminal states are never backed up, so their value stays at 0 and their
    entries in ``rewards`` / ``transitions`` are never read.

    Note: ``policy_evaluation`` only stops once the largest value change in a
    sweep falls below ``theta``.  With ``gamma >= 1`` and a policy that cycles
    through states with non-zero reward, that may never happen.

    Args:
        states: Iterable of hashable state labels.
        actions: Sequence of hashable action labels, shared by every state.
        transitions: Mapping ``state -> {action -> next_state}``.
        rewards: Mapping ``state -> reward``.
        gamma: Discount factor.
        theta: Convergence threshold for policy evaluation.
        terminal_states: State label(s) treated as terminal.  Accepts a single
            string or any iterable of labels.  Defaults to ``("S8",)``.
        seed: Optional seed for the random initial policy.  When ``None`` the
            draw comes from NumPy's global random state.

    Attributes:
        V: Mapping ``state -> current value estimate``.
        policy: Mapping ``non-terminal state -> chosen action``.
    """

    def __init__(self, states, actions, transitions, rewards, gamma=0.9, theta=1e-6,
                 terminal_states=("S8",), seed=None):
        # A bare string would otherwise be split into characters by frozenset().
        if isinstance(terminal_states, str):
            terminal_states = (terminal_states,)

        self.states = states
        self.actions = actions
        self.transitions = transitions
        self.rewards = rewards
        self.gamma = gamma
        self.theta = theta
        self.terminal_states = frozenset(terminal_states)
        self.V = {s: 0.0 for s in states}  # Initialize values to zero
        self.policy = self._random_policy(seed)  # Random policy

    def _is_terminal(self, state):
        """Return True when ``state`` is one of the configured terminal states."""
        return state in self.terminal_states

    def _random_policy(self, seed):
        """Pick a uniformly random action for every non-terminal state.

        Actions are selected by index so the original action objects (strings,
        ints, tuples, ...) are stored unchanged instead of being coerced into
        NumPy scalars.

        Raises:
            ValueError: If there are non-terminal states but no actions.
        """
        choices = list(self.actions)
        live_states = [s for s in self.states if not self._is_terminal(s)]
        if live_states and not choices:
            raise ValueError("actions must not be empty when non-terminal states exist")
        # seed=None keeps using the global NumPy RNG, so np.random.seed() still applies.
        rng = np.random if seed is None else np.random.RandomState(seed)
        return {s: choices[int(rng.randint(len(choices)))] for s in live_states}

    def _action_value(self, state, action):
        """One-step backup: reward of ``state`` plus discounted successor value."""
        next_state = self.transitions[state][action]
        return self.rewards[state] + self.gamma * self.V[next_state]

    def policy_evaluation(self):
        """Evaluates the current policy.

        Sweeps the non-terminal states, updating ``self.V`` in place, until the
        largest change within a sweep is below ``self.theta``.
        """
        while True:
            delta = 0
            for s in self.states:
                if self._is_terminal(s):  # Terminal state: value stays fixed
                    continue
                v = self.V[s]
                self.V[s] = self._action_value(s, self.policy[s])
                delta = max(delta, abs(v - self.V[s]))
            if delta < self.theta:
                break

    def policy_improvement(self):
        """Improves the policy based on the current value function.

        Each non-terminal state is switched to the action with the highest
        one-step backup; ties go to the action listed first in ``self.actions``.

        Returns:
            bool: True if no state's action changed, False otherwise.
        """
        policy_stable = True
        for s in self.states:
            if self._is_terminal(s):
                continue
            old_action = self.policy[s]
            action_values = {a: self._action_value(s, a) for a in self.actions}
            self.policy[s] = max(action_values, key=action_values.get)  # Best action
            if old_action != self.policy[s]:
                policy_stable = False
        return policy_stable

    def run(self):
        """Runs the full Policy Iteration algorithm.

        Alternates evaluation and improvement until an improvement step leaves
        the policy unchanged.
        """
        while True:
            self.policy_evaluation()
            if self.policy_improvement():
                break

    def display_results(self):
        """Displays the final value function and optimal policy"""
        print("\nOptimal Value Function:")
        for s in self.states:
            print(f"V({s}) = {self.V[s]:.2f}")

        print("\nOptimal Policy:")
        for s in self.states:
            if not self._is_terminal(s):
                print(f"π({s}) = {self.policy[s]}")

In [6]:
# Problem definition: eight states S1..S8, two moves available in each one
states = [f"S{i}" for i in range(1, 9)]
actions = ["left", "right"]

# Deterministic successor of every (state, action) pair
transitions = {
    "S1": dict(left="S1", right="S3"),
    "S2": dict(left="S1", right="S5"),
    "S3": dict(left="S1", right="S6"),
    "S4": dict(left="S2", right="S7"),
    "S5": dict(left="S2", right="S8"),
    "S6": dict(left="S3", right="S8"),
    "S7": dict(left="S4", right="S5"),
    "S8": dict(left="S8", right="S8"),  # Terminal state: both moves loop back
}

# Reward attached to each state
rewards = {
    "S1": 0,
    "S2": 2,
    "S3": 1,
    "S4": -1,
    "S5": 3,
    "S6": -3,
    "S7": -7,
    "S8": 5,
}

In [7]:
# Run Policy Iteration
# The solver starts from a random policy drawn with NumPy's global RNG; seed it
# so every Restart & Run All goes through the same sequence of iterations.
np.random.seed(0)
mdp = PolicyIterationMDP(states, actions, transitions, rewards)
mdp.run()
mdp.display_results()


Optimal Value Function:
V(S1) = 4.74
V(S2) = 24.74
V(S3) = 5.26
V(S4) = 21.26
V(S5) = 25.26
V(S6) = 1.74
V(S7) = 15.74
V(S8) = 0.00

Optimal Policy:
π(S1) = right
π(S2) = right
π(S3) = left
π(S4) = left
π(S5) = left
π(S6) = left
π(S7) = right
