<a href="https://colab.research.google.com/github/SJhawar1010/Reinforcement-Learning/blob/main/Satyam_554_RL_Lab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Model-Free Prediction & Control With Monte Carlo

### Initialisation

In [1]:
import numpy as np
grid = np.array([
    [0, 0, 0, 1],
    [0, 3, 0, 2],
    [0, 0, 0, 0]
])
rewards = {0: -0.04, 1: 1, 2: -1, 3: 0}
actions = ['up', 'down', 'left', 'right']
action_effects = {
    'up': (-1, 0),
    'down': (1, 0),
    'left': (0, -1),
    'right': (0, 1)
}
discount_factor = 0.9
epsilon = 0.1

### Check for terminal state

In [2]:
def is_terminal(state):
    i, j = state
    return grid[i, j] in (1, 2)

### Next State

In [3]:
def next(state, action):
    i, j = state
    di, dj = action_effects[action]
    next_i, next_j = i + di, j + dj

    if 0 <= next_i < grid.shape[0] and 0 <= next_j < grid.shape[1]:
        return next_i, next_j
    return state

### ε-greedy policy

In [4]:
import random
def epsilon_greedy_policy(Q, state):
    if random.random() < epsilon:
        return random.choice(actions)
    else:
        return max(actions, key=lambda a: Q[state, a])

### Finding optimal Policy using using MC control

In [5]:
from collections import defaultdict
def monte_carlo_control(num_episodes=5000):
    Q = defaultdict(float)
    returns = defaultdict(list)
    policy = {}

    for episode in range(num_episodes):
        state = (2, 0)
        episode = []

        while not is_terminal(state):
            action = epsilon_greedy_policy(Q, state)
            next_state = next(state, action)
            reward = rewards[grid[state]]
            episode.append((state, action, reward))
            state = next_state

        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = reward + discount_factor * G
            if (state, action) not in visited:
                visited.add((state, action))
                returns[(state, action)].append(G)
                Q[state, action] = np.mean(returns[(state, action)])

    for state_action in Q.keys():
        state, action = state_action
        if state not in policy:
            policy[state] = max(actions, key=lambda a: Q[state, a])

    return Q, policy


In [6]:
Q, policy = monte_carlo_control()

print("Learned Q-values:")
for key, value in Q.items():
    print(f"State {key[0]} Action {key[1]}: {value:.2f}")

Learned Q-values:
State (2, 0) Action up: -0.11
State (2, 0) Action down: -0.15
State (2, 0) Action left: -0.15
State (2, 0) Action right: -0.12
State (1, 0) Action up: -0.15
State (1, 0) Action down: -0.15
State (1, 0) Action left: -0.12
State (1, 0) Action right: -0.08
State (0, 0) Action up: -0.20
State (0, 0) Action down: -0.20
State (0, 0) Action left: -0.21
State (0, 0) Action right: -0.12
State (0, 1) Action up: -0.16
State (0, 1) Action down: -0.08
State (0, 1) Action left: -0.14
State (0, 1) Action right: -0.13
State (0, 2) Action up: -0.08
State (0, 2) Action down: -0.08
State (0, 2) Action left: -0.25
State (0, 2) Action right: -0.04
State (2, 1) Action up: -0.08
State (2, 1) Action down: -0.13
State (2, 1) Action left: -0.16
State (2, 1) Action right: -0.16
State (1, 1) Action up: -0.07
State (1, 1) Action down: -0.08
State (1, 1) Action left: -0.07
State (1, 1) Action right: -0.04
State (2, 2) Action up: -0.15
State (2, 2) Action down: -0.17
State (2, 2) Action left: -0.11

In [7]:
print("\nDerived Policy:")
for state, action in policy.items():
    print(f"State {state}: {action}")


Derived Policy:
State (2, 0): up
State (1, 0): right
State (0, 0): right
State (0, 1): down
State (0, 2): right
State (2, 1): up
State (1, 1): right
State (2, 2): left
State (1, 2): right
State (2, 3): up


## END