<a href="https://colab.research.google.com/github/SanjayS2348553/Reinforcement-Learning/blob/main/2348553_SANJAY_S_RL_LAB_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random
from collections import defaultdict

# Define the grid environment
class GridWorldEnv:
    def __init__(self, grid_size=(4, 4), terminal_states={(0, 0), (3, 3)}):
        self.grid_size = grid_size
        self.terminal_states = terminal_states
        self.state = (3, 0)  # Starting state
        self.actions = ['up', 'down', 'left', 'right']
        self.rewards = {s: 1.0 for s in terminal_states}
        self.rewards.update({s: -1.0 for s in [(0, 3), (3, 1)]})  # Add negative rewards

    def reset(self):
        self.state = (3, 0)  # Reset to starting state
        return self.state

    def step(self, action):
        x, y = self.state
        if action == 'up':
            x = max(x - 1, 0)
        elif action == 'down':
            x = min(x + 1, self.grid_size[0] - 1)
        elif action == 'left':
            y = max(y - 1, 0)
        elif action == 'right':
            y = min(y + 1, self.grid_size[1] - 1)

        self.state = (x, y)
        reward = self.rewards.get(self.state, 0.0)
        done = self.state in self.terminal_states
        return self.state, reward, done

    def sample_action(self):
        return random.choice(self.actions)

    def get_action_space(self):
        return self.actions

# Create the environment
env = GridWorldEnv()


In [2]:
def mc_prediction(env, policy, episodes, gamma=0.9):
    """Monte Carlo Prediction to estimate V(s) for a given policy."""
    V = defaultdict(float)
    returns = defaultdict(list)

    for ep in range(episodes):
        # Generate an episode
        state = env.reset()
        episode = []
        done = False

        while not done:
            action = policy(state)
            next_state, reward, done = env.step(action)
            episode.append((state, reward))
            state = next_state

        # Compute returns and update V
        G = 0
        visited_states = set()
        for t in reversed(range(len(episode))):
            state, reward = episode[t]
            G = reward + gamma * G
            if state not in visited_states:  # First-visit MC
                returns[state].append(G)
                V[state] = np.mean(returns[state])
                visited_states.add(state)

    return V


In [3]:
def mc_control(env, episodes, gamma=0.9, epsilon=0.1):
    """Monte Carlo Control with Exploring Starts to find optimal policy."""
    Q = defaultdict(lambda: defaultdict(float))
    returns = defaultdict(lambda: defaultdict(list))
    actions = env.get_action_space()

    for ep in range(episodes):
        # Generate an episode with exploring starts
        state = env.reset()
        action = random.choice(actions)
        episode = []
        done = False

        while not done:
            next_state, reward, done = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            if not done:
                action = random.choice(actions)

        # Compute returns and update Q
        G = 0
        visited_state_action_pairs = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = reward + gamma * G
            if (state, action) not in visited_state_action_pairs:  # First-visit MC
                returns[state][action].append(G)
                Q[state][action] = np.mean(returns[state][action])
                visited_state_action_pairs.add((state, action))

    # Derive policy from Q
    policy = {}
    for state in Q:
        policy[state] = max(Q[state], key=Q[state].get)  # Greedy policy

    return Q, policy


In [4]:
def random_policy(state):
    """Random policy: chooses actions uniformly at random."""
    return env.sample_action()


In [5]:
# Test Monte Carlo Prediction
V = mc_prediction(env, random_policy, episodes=1000)
print("State-Value Function V(s):")
for state, value in V.items():
    print(f"State {state}: {value:.2f}")

# Test Monte Carlo Control
Q, policy = mc_control(env, episodes=1000)
print("\nOptimal Action-Value Function Q(s, a):")
for state, actions in Q.items():
    for action, value in actions.items():
        print(f"State {state}, Action {action}: {value:.2f}")

print("\nDerived Optimal Policy:")
for state, action in policy.items():
    print(f"State {state}: {action}")


State-Value Function V(s):
State (1, 0): 0.28
State (2, 0): -0.17
State (3, 0): -0.55
State (2, 3): 0.19
State (2, 2): -0.07
State (2, 1): -0.12
State (3, 2): 0.23
State (3, 1): 0.36
State (1, 1): -0.03
State (1, 2): -0.14
State (0, 1): 0.23
State (0, 2): -0.10
State (0, 3): 0.23
State (1, 3): -0.13

Optimal Action-Value Function Q(s, a):
State (1, 0), Action up: 1.00
State (1, 0), Action right: -0.25
State (1, 0), Action left: 0.07
State (1, 0), Action down: -0.46
State (2, 0), Action up: 0.22
State (2, 0), Action left: -0.37
State (2, 0), Action down: -0.79
State (2, 0), Action right: -0.41
State (2, 1), Action left: -0.25
State (2, 1), Action down: -1.46
State (2, 1), Action right: -0.22
State (2, 1), Action up: -0.11
State (3, 1), Action up: -0.26
State (3, 1), Action right: 0.11
State (3, 1), Action left: -0.43
State (3, 1), Action down: -1.22
State (2, 2), Action left: -0.44
State (2, 2), Action right: -0.05
State (2, 2), Action down: -0.24
State (2, 2), Action up: -0.40
State (1