<a href="https://colab.research.google.com/github/Vyshnavijulapelly/Reinforcement-Learning/blob/main/RL_Lab_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import gymnasium as gym
import numpy as np

# Deterministic FrozenLake (is_slippery=False). The base environment is taken
# straight away so that the transition table ``env.P`` can be read.
env = gym.make("FrozenLake-v1", is_slippery=False).unwrapped

# Sizes of the discrete observation and action spaces.
n_states, n_actions = env.observation_space.n, env.action_space.n

# Discount factor and the convergence threshold compared against the largest
# per-sweep value change.
gamma, theta = 0.99, 1e-8


def value_iteration(env, *, discount=None, tolerance=None):
    """Compute a greedy policy and state values for ``env`` by value iteration.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n`` and a
            transition table ``P`` where ``P[s][a]`` is a list of
            ``(probability, next_state, reward, done)`` tuples.
        discount: Discount factor. When omitted, the module-level ``gamma``
            is used.
        tolerance: Sweeps stop once the largest value change in a sweep is
            below this. When omitted, the module-level ``theta`` is used.

    Returns:
        ``(policy, V)``: an int array with the greedy action per state and a
        float array with the converged state values.
    """
    # Explicit arguments win; otherwise keep using the notebook-level settings.
    discount = gamma if discount is None else discount
    tolerance = theta if tolerance is None else tolerance

    # Size the tables from the environment that was actually passed in rather
    # than from globals computed for some other environment.
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    V = np.zeros(num_states)

    def action_values(s):
        """Return the one-step lookahead value of every action in state ``s``."""
        return [
            sum(
                # A transition flagged ``done`` ends the episode, so nothing
                # beyond its immediate reward may be bootstrapped.
                prob * (reward + (0.0 if done else discount * V[next_state]))
                for prob, next_state, reward, done in env.P[s][a]
            )
            for a in range(num_actions)
        ]

    # In-place sweeps: each state immediately sees values updated earlier in
    # the same sweep.
    while True:
        delta = 0
        for s in range(num_states):
            best = max(action_values(s))
            delta = max(delta, abs(V[s] - best))
            V[s] = best
        if delta < tolerance:
            break

    # Derive the greedy policy; ties resolve to the lowest action index.
    policy = np.zeros(num_states, dtype=int)
    for s in range(num_states):
        policy[s] = np.argmax(action_values(s))
    return policy, V


def policy_iteration(env, *, discount=None, tolerance=None):
    """Compute a greedy policy and its state values for ``env`` by policy iteration.

    Starts from the policy that always picks action 0 and alternates
    iterative policy evaluation with greedy improvement until an improvement
    pass changes no action.

    Args:
        env: Object exposing ``observation_space.n``, ``action_space.n`` and a
            transition table ``P`` where ``P[s][a]`` is a list of
            ``(probability, next_state, reward, done)`` tuples.
        discount: Discount factor. When omitted, the module-level ``gamma``
            is used.
        tolerance: Evaluation sweeps stop once the largest value change in a
            sweep is below this. When omitted, the module-level ``theta`` is
            used.

    Returns:
        ``(policy, V)``: an int array with the chosen action per state and a
        float array with that policy's state values.
    """
    # Explicit arguments win; otherwise keep using the notebook-level settings.
    discount = gamma if discount is None else discount
    tolerance = theta if tolerance is None else tolerance

    # Size the tables from the environment that was actually passed in rather
    # than from globals computed for some other environment.
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    policy = np.zeros(num_states, dtype=int)
    V = np.zeros(num_states)

    def q_value(s, a):
        """Return the one-step lookahead value of taking ``a`` in ``s``."""
        return sum(
            # A transition flagged ``done`` ends the episode, so nothing
            # beyond its immediate reward may be bootstrapped.
            prob * (reward + (0.0 if done else discount * V[next_state]))
            for prob, next_state, reward, done in env.P[s][a]
        )

    while True:
        # Policy evaluation: in-place sweeps under the current policy.
        while True:
            delta = 0
            for s in range(num_states):
                v = q_value(s, policy[s])
                delta = max(delta, abs(V[s] - v))
                V[s] = v
            if delta < tolerance:
                break

        # Policy improvement: act greedily w.r.t. the evaluated values; ties
        # resolve to the lowest action index.
        policy_stable = True
        for s in range(num_states):
            old_action = policy[s]
            new_action = np.argmax([q_value(s, a) for a in range(num_actions)])
            policy[s] = new_action
            if old_action != new_action:
                policy_stable = False

        if policy_stable:
            break

    return policy, V


def run_policy(env, policy, render=False, max_steps=1000):
    """Roll out one episode under ``policy`` and return the summed reward.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Indexable mapping from state to action.
        render: When true, ``env.render()`` is called before every step.
        max_steps: Upper bound on the number of steps taken. An unwrapped
            environment carries no time-limit wrapper, so without this cap a
            policy that never reaches a terminal state would loop forever.
            Pass ``None`` to disable the cap.

    Returns:
        The sum of the rewards collected until the episode terminated, was
        truncated, or ``max_steps`` steps had been taken.
    """
    total_rewards = 0
    state, _info = env.reset()
    steps_taken = 0
    while max_steps is None or steps_taken < max_steps:
        if render:
            env.render()
        action = policy[state]
        state, reward, terminated, truncated, _info = env.step(action)
        total_rewards += reward
        steps_taken += 1
        if terminated or truncated:
            break
    return total_rewards


# Value iteration: plan once, then roll the resulting policy out 100 times
# and report the mean episode reward.
vi_policy, vi_V = value_iteration(env)
vi_rewards = [run_policy(env, vi_policy) for _ in range(100)]
print(
    "Value Iteration - Average Reward over 100 episodes:",
    np.mean(vi_rewards),
)

# Policy iteration: same procedure with the other planner.
pi_policy, pi_V = policy_iteration(env)
pi_rewards = [run_policy(env, pi_policy) for _ in range(100)]
print(
    "Policy Iteration - Average Reward over 100 episodes:",
    np.mean(pi_rewards),
)

Value Iteration - Average Reward over 100 episodes: 1.0
Policy Iteration - Average Reward over 100 episodes: 1.0


In [9]:
import numpy as np
# Custom 5-state MDP
states = list(range(5))
actions = [0, 1]  # e.g., 0=left, 1=right
gamma, theta = 0.9, 1e-6
# MDP transition model: P[state][action] = list of (probability, next_state, reward, done).
# Every move is deterministic, so the table below lists only
# (next_state, reward, done) and each entry is expanded to a single
# probability-1.0 outcome.
P = {
    state: {
        action: [(1.0, next_state, reward, done)]
        for action, (next_state, reward, done) in moves.items()
    }
    for state, moves in {
        0: {0: (1, 0, False), 1: (2, 0, False)},
        1: {0: (3, 1, True), 1: (4, 0, False)},
        2: {0: (4, 0, False), 1: (3, 1, True)},
        3: {0: (3, 0, True), 1: (3, 0, True)},
        4: {0: (0, 0, False), 1: (1, 0, False)},
    }.items()
}
def value_iteration(P, *, discount=None, tolerance=None):
    """Solve the tabular MDP ``P`` by value iteration.

    Args:
        P: Transition model where ``P[s][a]`` is a list of
            ``(probability, next_state, reward, done)`` tuples. States and
            actions must be the integers ``0..n-1`` because they index the
            value and policy arrays directly.
        discount: Discount factor. When omitted, the module-level ``gamma``
            is used.
        tolerance: Sweeps stop once the largest value change in a sweep is
            below this. When omitted, the module-level ``theta`` is used.

    Returns:
        ``(policy, V)``: an int array with the greedy action per state and a
        float array with the converged state values.
    """
    # Explicit arguments win; otherwise keep using the notebook-level settings.
    discount = gamma if discount is None else discount
    tolerance = theta if tolerance is None else tolerance

    # Take the state and action sets from the model that was passed in, not
    # from globals describing some other model.
    num_states = len(P)
    V = np.zeros(num_states)

    def action_values(s):
        """Return the one-step lookahead value of every action in state ``s``."""
        return [
            sum(
                # A transition flagged ``done`` ends the episode, so nothing
                # beyond its immediate reward may be bootstrapped.
                prob * (reward + (0.0 if done else discount * V[next_state]))
                for prob, next_state, reward, done in P[s][a]
            )
            for a in range(len(P[s]))
        ]

    # In-place sweeps: each state immediately sees values updated earlier in
    # the same sweep.
    while True:
        delta = 0
        for s in range(num_states):
            best = max(action_values(s))
            delta = max(delta, abs(V[s] - best))
            V[s] = best
        if delta < tolerance:
            break

    # Greedy policy w.r.t. the converged values; ties resolve to the lowest
    # action index.
    policy = np.zeros(num_states, dtype=int)
    for s in range(num_states):
        policy[s] = np.argmax(action_values(s))
    return policy, V
def policy_iteration(P, *, discount=None, tolerance=None):
    """Solve the tabular MDP ``P`` by policy iteration.

    Starts from the policy that always picks action 0 and alternates
    iterative policy evaluation with greedy improvement until an improvement
    pass changes no action.

    Args:
        P: Transition model where ``P[s][a]`` is a list of
            ``(probability, next_state, reward, done)`` tuples. States and
            actions must be the integers ``0..n-1`` because they index the
            value and policy arrays directly.
        discount: Discount factor. When omitted, the module-level ``gamma``
            is used.
        tolerance: Evaluation sweeps stop once the largest value change in a
            sweep is below this. When omitted, the module-level ``theta`` is
            used.

    Returns:
        ``(policy, V)``: an int array with the chosen action per state and a
        float array with that policy's state values.
    """
    # Explicit arguments win; otherwise keep using the notebook-level settings.
    discount = gamma if discount is None else discount
    tolerance = theta if tolerance is None else tolerance

    # Take the state and action sets from the model that was passed in, not
    # from globals describing some other model.
    num_states = len(P)
    policy = np.zeros(num_states, dtype=int)
    V = np.zeros(num_states)

    def q_value(s, a):
        """Return the one-step lookahead value of taking ``a`` in ``s``."""
        return sum(
            # A transition flagged ``done`` ends the episode, so nothing
            # beyond its immediate reward may be bootstrapped.
            prob * (reward + (0.0 if done else discount * V[next_state]))
            for prob, next_state, reward, done in P[s][a]
        )

    while True:
        # Policy evaluation: in-place sweeps under the current policy.
        while True:
            delta = 0
            for s in range(num_states):
                v = q_value(s, policy[s])
                delta = max(delta, abs(V[s] - v))
                V[s] = v
            if delta < tolerance:
                break
        # Policy improvement: act greedily w.r.t. the evaluated values; ties
        # resolve to the lowest action index.
        policy_stable = True
        for s in range(num_states):
            old_action = policy[s]
            best_action = np.argmax([q_value(s, a) for a in range(len(P[s]))])
            if old_action != best_action:
                policy_stable = False
            policy[s] = best_action
        if policy_stable:
            break
    return policy, V
# Solve the custom MDP with both planners (policy iteration first).
pi_policy, pi_value = policy_iteration(P)
vi_policy, vi_value = value_iteration(P)

# Print each planner's policy and state values under its own heading; the
# second heading carries the blank separator line.
for _heading, _policy, _values in (
    ("Policy Iteration:", pi_policy, pi_value),
    ("\nValue Iteration:", vi_policy, vi_value),
):
    print(_heading)
    print("Policy:", _policy)
    print("Values:", _values)

Policy Iteration:
Policy: [0 0 1 0 1]
Values: [0.9 1.  1.  0.  0.9]

Value Iteration:
Policy: [0 0 1 0 1]
Values: [0.9 1.  1.  0.  0.9]
