In [3]:
import numpy as np

# Define a simple environment with deterministic transitions
# For simplicity, let's assume there are 5 states moving from one state to the next gives a reward of 1, with state 4 being terminal
class SimpleEnvironment:
	def __init__(self, num_states=5):
		self.num_states = num_states

	def step(self, state):
		reward = 0
		terminal = False

		if state < self.num_states - 1:
			next_state = state + 1
			reward = 1
		else:
			next_state = state
			terminal = True

		return next_state, reward, terminal

	def reset(self):
		return 0 # Start from state 0

# Define a random policy for the sake of demonstration
def random_policy(state, num_actions=5):
	return np.random.choice(num_actions)

# Monte Carlo Policy Evaluation function
def monte_carlo_policy_evaluation(policy, env, num_episodes, gamma=1.0):
	value_table = np.zeros(env.num_states)
	returns = {state: [] for state in range(env.num_states)}

	for _ in range(num_episodes):
		state = env.reset()
		episode = []
		# Generate an episode
		while True:
			action = policy(state)
			next_state, reward, terminal = env.step(action)
			episode.append((state, reward))
			if terminal:
				break
			state = next_state

		# Calculate the return and update the value table
		G = 0
		for state, reward in reversed(episode):
			G = gamma * G + reward
			returns[state].append(G)
			value_table[state] = np.mean(returns[state])

	return value_table

# Define the number of episodes for MC evaluation
num_episodes = 1000

# Create a simple environment instance
env = SimpleEnvironment(num_states=5)

# Evaluate the policy
v = monte_carlo_policy_evaluation(random_policy, env, num_episodes)

print("The value table is:")
print(v)

The value table is:
[3.778      4.24315789 3.93414387 3.68756757 3.98471616]


In [4]:
import numpy as np

# Define the grid world environment
class GridWorld:
    def __init__(self):
        self.grid_size = (3, 3)
        self.num_actions = 4  # Up, Down, Left, Right
        self.start_state = (0, 0)
        self.goal_state = (2, 2)
        self.actions = [(0, 1), (0, -1), (-1, 0), (1, 0)]  # Right, Left, Up, Down

    def step(self, state, action):
        next_state = (state[0] + action[0], state[1] + action[1])
        if next_state[0] < 0 or next_state[0] >= self.grid_size[0] or \
                next_state[1] < 0 or next_state[1] >= self.grid_size[1]:
            # Out of bounds, stay in the same state
            return state, -1  # Reward of -1 for hitting the wall
        return next_state, 0 if next_state == self.goal_state else -1  # 0 reward at goal, -1 otherwise

# Define a random policy for the grid world
def random_policy(state):
    return np.random.choice(4)  # Randomly choose an action (0 to 3)

# Monte Carlo simulation
def monte_carlo_simulation(env, policy, num_episodes):
    # Initialize Q-values and state visit counts
    Q = np.zeros((env.grid_size[0], env.grid_size[1], env.num_actions))
    N = np.zeros((env.grid_size[0], env.grid_size[1], env.num_actions))

    for _ in range(num_episodes):
        state = env.start_state
        episode = []
        while state != env.goal_state:
            action = policy(state)
            next_state, reward = env.step(state, env.actions[action])
            episode.append((state, action, reward))
            state = next_state

        G = 0  # Return
        for t in reversed(range(len(episode))):
            state, action, reward = episode[t]
            G = reward + G
            N[state[0], state[1], action] += 1
            Q[state[0], state[1], action] += (1 / N[state[0], state[1], action]) * (G - Q[state[0], state[1], action])

    return Q

# Example usage
env = GridWorld()
Q_values = monte_carlo_simulation(env, random_policy, num_episodes=1000)
print("Q-values:")
print(Q_values)

Q-values:
[[[-25.358      -28.46938776 -26.93565332 -25.92005242]
  [-21.80674847 -26.82553606 -23.70049261 -22.7681592 ]
  [-23.17527174 -24.6846473  -21.12378303 -15.65703022]]

 [[-21.43409316 -25.56862745 -29.087      -22.51946721]
  [-16.33247423 -25.36096257 -25.28244275 -16.42076503]
  [-15.76908752 -21.10037879 -22.47826087   0.        ]]

 [[-15.44385027 -20.9527027  -25.56118143 -22.11509716]
  [  0.         -21.97722567 -21.856      -16.68306011]
  [  0.           0.           0.           0.        ]]]
