<a href="https://colab.research.google.com/github/Tdas-christ/Reinforcement_Learning/blob/main/2348569_RL_Lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np

In [11]:
def policy_improvement(env, V, gamma=0.9):
    """
    Improve a policy for an MDP.
    Parameters:
    - env (object): An MDP environment with methods `transition_prob` and `reward`.
    - V (dict): The state-value function.
    - gamma (float): Discount factor.
    """
    states = env.states
    actions = env.actions
    policy = {}

    print("Improving policy...")
    for s in states:
        max_value = float('-inf')
        best_action = None
        for a in actions:
            value = sum(env.transition_prob(s, a, s_) * (env.reward(s, a, s_) + gamma * V[s_]) for s_ in states)
            if value > max_value:
                max_value = value
                best_action = a
        policy[s] = best_action

    print("Policy improvement complete.")
    return policy

In [12]:
def policy_evaluation(env, policy, gamma=0.9, theta=1e-8):
    """
    Evaluate a given policy for an MDP.
    Parameters:
    - env (object): An MDP environment with methods `transition_prob` and `reward`.
    - policy (dict): A mapping of states to actions.
    - gamma (float): Discount factor.
    - theta (float): Threshold for convergence.
    """
    states = env.states
    num_states = len(states)
    V = {s: 0 for s in states}  # Initialize state-value function

    print("Evaluating policy...")
    while True:
        delta = 0
        for s in states:
            v = V[s]
            V[s] = sum(env.transition_prob(s, policy[s], s_) * (env.reward(s, policy[s], s_) + gamma * V[s_]) for s_ in states)
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

    print("Policy evaluation complete.")
    return V

In [14]:
# Example usage
class GridWorldEnv:
    def __init__(self):
        self.states = [(x, y) for x in range(3) for y in range(3)]
        self.actions = ['up', 'down', 'left', 'right']

    def transition_prob(self, state, action, next_state):
        # Implement transition probabilities
        return 0.8 if self._next_state(state, action) == next_state else 0.05

    def reward(self, state, action, next_state):
        # Implement rewards
        return 1 if next_state == (2, 2) else 0

    def _next_state(self, state, action):
        x, y = state
        if action == 'up':
            return (x, min(y + 1, 2))
        elif action == 'down':
            return (x, max(y - 1, 0))
        elif action == 'left':
            return (max(x - 1, 0), y)
        elif action == 'right':
            return (min(x + 1, 2), y)

    env = GridWorldEnv()
    initial_policy = {s: 'up' for s in env.states}

    # Evaluate the initial policy
    print("Evaluating initial policy...")
    V = policy_evaluation(env, initial_policy)
    print("Initial state-value function:")
    for state, value in V.items():
        print(f"State: {state}, Value: {value:.2f}")

    # Improve the policy
    print("Improving policy...")
    improved_policy = policy_improvement(env, V)
    print("Improved policy:")
    for state, action in improved_policy.items():
        print(f"State: {state}, Action: {action}")

Evaluating initial policy...
Evaluating policy...
Policy evaluation complete.
Initial state-value function:
State: (0, 0), Value: inf
State: (0, 1), Value: inf
State: (0, 2), Value: inf
State: (1, 0), Value: inf
State: (1, 1), Value: inf
State: (1, 2), Value: inf
State: (2, 0), Value: inf
State: (2, 1), Value: inf
State: (2, 2), Value: inf
Improving policy...
Improving policy...
Policy improvement complete.
Improved policy:
State: (0, 0), Action: up
State: (0, 1), Action: up
State: (0, 2), Action: up
State: (1, 0), Action: up
State: (1, 1), Action: up
State: (1, 2), Action: up
State: (2, 0), Action: up
State: (2, 1), Action: up
State: (2, 2), Action: up
