In [None]:
import numpy as np

In [None]:
class GridWorld:
    """Deterministic square grid world with a single rewarding goal cell.

    The agent starts in the top-left corner ``(0, 0)``; the terminal cell is
    the bottom-right corner ``(size - 1, size - 1)``. Positions are
    ``(row, col)`` tuples.

    Actions: 0 = Up, 1 = Down, 2 = Left, 3 = Right. A move that would leave
    the grid keeps the agent on the border cell; any other action value
    leaves the position unchanged.
    """

    def __init__(self, size=4):
        self.size = size
        # Map of the world: all zeros except a 1 marking the terminal cell.
        board = np.zeros((size, size))
        board[size - 1, size - 1] = 1
        self.grid = board
        self.current_position = (0, 0)

    def reset(self):
        """Put the agent back on the start cell and return that position."""
        self.current_position = (0, 0)
        return self.current_position

    def step(self, action):
        """Apply one move and report the outcome.

        Returns:
            ``(position, reward, done, info)`` where ``reward`` is 1 and
            ``done`` is True exactly when the agent stands on the terminal
            cell after the move (otherwise 0 / False); ``info`` is always
            None. The method does not check whether the episode had already
            ended before the call.
        """
        row, col = self.current_position
        last = self.size - 1  # largest valid row/column index

        if action == 0:  # Up
            row = max(row - 1, 0)
        elif action == 1:  # Down
            row = min(row + 1, last)
        elif action == 2:  # Left
            col = max(col - 1, 0)
        elif action == 3:  # Right
            col = min(col + 1, last)

        self.current_position = (row, col)
        done = self.current_position == (last, last)  # Reached terminal state
        reward = 1 if done else 0
        return self.current_position, reward, done, None

In [None]:
class MonteCarloAgent:
    """Tabular first-visit Monte Carlo control agent.

    ``Q[s, a]`` is the running average of the discounted returns observed
    after the first occurrence of ``(s, a)`` in each episode. States and
    actions are integer indices into the tables.
    """

    def __init__(self, num_states, num_actions, gamma=0.9):
        self.num_states = num_states
        self.num_actions = num_actions
        self.gamma = gamma  # discount factor
        self.Q = np.zeros((num_states, num_actions))
        self.returns_sum = np.zeros((num_states, num_actions))
        # Starts at zero: update_q_value increments the count *before* it
        # divides, so a division by zero cannot happen. Starting at one would
        # make every estimate sum / (n + 1) instead of the true mean sum / n.
        self.returns_count = np.zeros((num_states, num_actions))

    def update_q_value(self, episode):
        """Update Q from one finished episode (first-visit Monte Carlo).

        Args:
            episode: sequence of ``(state, action, reward)`` tuples in the
                order they occurred. An empty episode is a no-op.
        """
        # Time index of the first occurrence of every (state, action) pair,
        # found in one forward pass instead of rescanning the episode prefix
        # at every step.
        first_visit = {}
        for t, step in enumerate(episode):
            first_visit.setdefault((step[0], step[1]), t)

        G = 0  # discounted return following time step t
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = self.gamma * G + reward
            if first_visit[(state, action)] == t:  # First visit
                self.returns_sum[state, action] += G
                self.returns_count[state, action] += 1
                self.Q[state, action] = self.returns_sum[state, action] / self.returns_count[state, action]

    def get_action(self, state, epsilon):
        """Epsilon-greedy action selection.

        With probability ``epsilon`` return a uniformly random action,
        otherwise the action with the largest Q-value for ``state``
        (``np.argmax`` resolves ties in favour of the lowest index).
        """
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.Q[state])


In [None]:
class TDAgent:
    """Tabular one-step TD control agent using the Q-learning update.

    The update target bootstraps from the greedy (maximum) Q-value of the
    next state. States and actions are integer indices into ``Q``.
    """

    def __init__(self, num_states, num_actions, alpha=0.1, gamma=0.9):
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha  # learning rate (step size)
        self.gamma = gamma  # discount factor
        self.Q = np.zeros((num_states, num_actions))

    def update_q_value(self, state, action, reward, next_state, done=False):
        """Move ``Q[state, action]`` towards the one-step TD target.

        Args:
            state: index of the state the action was taken in.
            action: index of the action taken.
            reward: reward received for the transition.
            next_state: index of the resulting state.
            done: pass True when the transition ended the episode. The
                target is then just ``reward``; no value is bootstrapped
                from ``next_state``. Defaults to False, which keeps the
                previous behaviour of always bootstrapping.
        """
        old_value = self.Q[state, action]
        # Nothing follows a terminal state, so its future value is zero
        # regardless of what the table currently holds for it.
        next_max = 0.0 if done else np.max(self.Q[next_state])
        new_value = old_value + self.alpha * (reward + self.gamma * next_max - old_value)
        self.Q[state, action] = new_value

    def get_action(self, state, epsilon):
        """Epsilon-greedy action selection.

        With probability ``epsilon`` return a uniformly random action,
        otherwise the action with the largest Q-value for ``state``
        (``np.argmax`` resolves ties in favour of the lowest index).
        """
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.Q[state])

In [None]:
# Training the agents
SEED = 0  # fixed so that "Restart Kernel -> Run All" reproduces the same runs
np.random.seed(SEED)  # the agents' get_action draws from NumPy's global RNG

num_episodes = 100
epsilon = 0.1  # exploration probability used while training
# NOTE(review): not referenced by the training loops visible in this notebook,
# which run each episode until the terminal state is reached -- confirm intent.
max_steps_per_episode = 100  # Maximum number of steps per episode

env = GridWorld()
num_states = env.size ** 2  # one state per grid cell
num_actions = 4

In [None]:
# One tabular agent per learning method, both sized for the same environment.
agent_mc = MonteCarloAgent(num_states=num_states, num_actions=num_actions)
agent_td = TDAgent(num_states=num_states, num_actions=num_actions)

In [None]:
# Monte Carlo training: play each episode to the end, then learn from it.
for episode in range(num_episodes):
    state = env.reset()
    done = False
    episode_mc = []  # (flat_state, action, reward) for every step, in order

    while not done:
        # Flatten (row, col) into the row-major index used by the Q-table.
        flat_state = state[0] * env.size + state[1]
        action = agent_mc.get_action(flat_state, epsilon)
        state, reward, done, _ = env.step(action)
        episode_mc.append((flat_state, action, reward))

    # Returns are only known once the episode is over, so update afterwards.
    agent_mc.update_q_value(episode_mc)


In [None]:
# Temporal-Difference training: update the Q-table after every single step.
for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        # Flatten (row, col) into the row-major index used by the Q-table.
        flat_state = state[0] * env.size + state[1]
        action = agent_td.get_action(flat_state, epsilon)
        next_state, reward, done, _ = env.step(action)
        flat_next_state = next_state[0] * env.size + next_state[1]
        agent_td.update_q_value(flat_state, action, reward, flat_next_state)
        state = next_state

In [None]:
import matplotlib.pyplot as plt

def evaluate_agent(agent, env, num_episodes=100, max_steps=1000):
    """Average undiscounted episode reward of the agent's greedy policy.

    Args:
        agent: object exposing ``get_action(state_index, epsilon)``.
        env: environment exposing ``size``, ``reset()`` and ``step(action)``;
            positions are ``(row, col)`` tuples.
        num_episodes: number of evaluation episodes to run.
        max_steps: upper bound on the steps of a single episode. A greedy
            policy can cycle between states and never reach the terminal
            state; the cap guarantees the evaluation still finishes. A
            truncated episode contributes the reward collected so far.

    Returns:
        Mean of the per-episode reward sums.
    """
    total_rewards = []
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        episode_reward = 0
        steps = 0
        while not done and steps < max_steps:
            action = agent.get_action(state[0] * env.size + state[1], epsilon=0)  # Greedy action
            state, reward, done, _ = env.step(action)
            episode_reward += reward
            steps += 1
        total_rewards.append(episode_reward)
    return np.mean(total_rewards)

# Evaluate agents: greedy roll-outs, Monte Carlo agent first, then TD.
avg_reward_mc, avg_reward_td = (
    evaluate_agent(trained_agent, env) for trained_agent in (agent_mc, agent_td)
)

print("Average reward for Monte Carlo agent:", avg_reward_mc)
print("Average reward for Temporal-Difference agent:", avg_reward_td)

Average reward for Monte Carlo agent: 1.0
Average reward for Temporal-Difference agent: 1.0


In [None]:
# Comparison of Policies
# Each call prints the heading and the full Q-table on separate lines.
print("Monte Carlo agent's Q-values:", agent_mc.Q, sep="\n")
print("Temporal-Difference agent's Q-values:", agent_td.Q, sep="\n")

Monte Carlo agent's Q-values:
[[2.03763046e-001 4.59886300e-001 1.44790205e-001 3.31575917e-070]
 [2.76313265e-070 4.03026290e-066 3.33543634e-002 3.12467625e-001]
 [1.31220000e-001 1.78131150e-001 1.96830000e-001 6.28192385e-001]
 [1.82250000e-001 7.28543077e-001 1.31220000e-001 2.19915000e-001]
 [1.77147000e-001 2.31563620e-001 2.64083650e-001 5.26953846e-001]
 [2.26171442e-001 5.97146851e-001 1.84489824e-001 3.71682954e-001]
 [5.61867169e-001 2.03467634e-001 1.10414839e-001 2.02500000e-001]
 [1.82250000e-001 8.40029268e-001 9.88131292e-324 4.86000000e-001]
 [3.14626473e-001 0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [3.39314015e-001 2.07465054e-001 2.98771348e-001 6.95641703e-001]
 [4.93331521e-001 6.00000000e-001 3.64500000e-001 8.77935000e-001]
 [4.99500000e-001 9.89898990e-001 9.88131292e-324 6.00000000e-001]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [2.30516726e-001 0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+00

In [None]:
# Comparison of State Values
# A state's value is taken as its best action value (row-wise max of Q).
print("Monte Carlo agent's learned state values:", agent_mc.Q.max(axis=1), sep="\n")
print("Temporal-Difference agent's learned state values:", agent_td.Q.max(axis=1), sep="\n")


Monte Carlo agent's learned state values:
[0.4598863  0.31246763 0.62819238 0.72854308 0.52695385 0.59714685
 0.56186717 0.84002927 0.31462647 0.6956417  0.877935   0.98989899
 0.         0.23051673 0.66666667 0.        ]
Temporal-Difference agent's learned state values:
[0.56475104 0.64436817 0.72467231 0.80862527 0.0783065  0.03812255
 0.33945722 0.89975277 0.         0.         0.01103519 0.99997344
 0.         0.         0.         0.        ]
