# In [22]:
import numpy as np

class Riverswim:
    """
    Tabular RiverSwim chain MDP bundled with a fixed stochastic policy.

    Sourced from Absalon and rewritten by bkx591

    States are the integers ``0 .. states_count - 1``. Action 0 ('left')
    moves one state down the chain with probability 1 (and stays put in
    state 0). Action 1 ('right') either advances, stays, or slips back
    according to the probabilities filled in by ``__init__``.

    Attributes:
        states_count: number of states in the chain.
        actions_count: size of the action axis of ``P`` and ``R``; only
            actions 0 and 1 receive non-zero dynamics.
        P: array of shape (states, actions, states) with transition
            probabilities ``P[s, a, s']``.
        R: array of shape (states, actions) with immediate rewards.
        gamma: discount factor.
        s: current state used by ``reset`` / ``step``.
    """

    def __init__(self, states_count, actions_count=2, gamma=0.95):
        """
        Build the transition tensor and reward table.

        Raises:
            ValueError: if ``states_count`` or ``actions_count`` is below 2;
                the chain needs a distinct first and last state, and the
                dynamics are written for actions 0 and 1.
        """
        # Fail early with a clear message instead of an IndexError raised
        # halfway through filling P.
        if states_count < 2:
            raise ValueError(
                f"states_count must be at least 2, got {states_count}"
            )
        if actions_count < 2:
            raise ValueError(
                f"actions_count must be at least 2, got {actions_count}"
            )

        self.states_count = states_count
        self.actions_count = actions_count
        self.P = np.zeros((states_count, actions_count, states_count))
        self.R = np.zeros((states_count, actions_count))
        self.gamma = gamma  # Discount factor

        last = states_count - 1
        for s in range(states_count):
            if s == 0:
                # First state: 'left' keeps the agent in place and pays a
                # small reward; 'right' advances with probability 0.4.
                self.P[s, 0, s] = 1
                self.P[s, 1, s] = 0.6
                self.P[s, 1, s + 1] = 0.4
                self.R[s, 0] = 0.05
            elif s == last:
                # Last state: 'right' pays reward 1 and holds the state
                # with probability 0.6, otherwise slips back one state.
                self.P[s, 0, s - 1] = 1
                self.P[s, 1, s] = 0.6
                self.P[s, 1, s - 1] = 0.4
                self.R[s, 1] = 1
            else:
                # Interior states: 'right' advances (0.4), stays (0.55)
                # or slips back (0.05); no reward is given here.
                self.P[s, 0, s - 1] = 1
                self.P[s, 1, s] = 0.55
                self.P[s, 1, s + 1] = 0.4
                self.P[s, 1, s - 1] = 0.05

        self.s = 0

    def reset(self, s=0):
        """Set the current state to ``s`` and return it."""
        self.s = s
        return self.s

    def step(self, action):
        """
        Sample one transition from the current state.

        Returns:
            A ``(new_state, reward)`` pair, where the reward is
            ``R[previous_state, action]``.
        """
        # An integer first argument samples from range(states_count).
        new_s = np.random.choice(self.states_count, p=self.P[self.s, action])
        reward = self.R[self.s, action]
        self.s = new_s
        return new_s, reward

    def policy(self, s):
        """
        Return the action probabilities ``[p(left), p(right)]`` in state ``s``.

        The threshold is hard-coded: states with index below 3 pick either
        action with equal probability, all other states always go right.
        """
        if s < 3:  # For states s1, s2, s3
            return np.array([0.5, 0.5])  # Equal probabilities for 'left' and 'right'
        # For states s4, s5 (and any later state)
        return np.array([0.0, 1.0])  # Always 'right'

    def policy_matrix(self):
        """
        Return the state-to-state transition matrix induced by ``policy``.

        Row ``s`` is the policy-weighted mix of ``P[s, 0]`` and ``P[s, 1]``.
        """
        policy_mat = np.zeros((self.states_count, self.states_count))
        for s in range(self.states_count):
            p_left, p_right = self.policy(s)
            policy_mat[s] = p_left * self.P[s, 0] + p_right * self.P[s, 1]
        return policy_mat

    def reward_vector(self):
        """
        Return the expected immediate reward of each state under ``policy``.

        Entry ``s`` is the policy-weighted mix of ``R[s, 0]`` and ``R[s, 1]``.
        """
        reward_vec = np.zeros(self.states_count)
        for s in range(self.states_count):
            p_left, p_right = self.policy(s)
            reward_vec[s] = p_left * self.R[s, 0] + p_right * self.R[s, 1]
        return reward_vec

def monte_carlo_policy_evaluation(env, n, T):
    """
    Estimate the value of ``env.policy`` by averaging sampled returns.

    For every start state, ``n`` trajectories of ``T`` steps each are
    simulated; the discounted rewards of each trajectory are summed and
    the mean over the ``n`` trajectories becomes that state's estimate.

    Args:
        env: environment exposing ``states_count``, ``actions_count``,
            ``gamma``, ``reset(s)``, ``policy(s)`` and ``step(action)``.
        n: number of trajectories simulated per start state.
        T: number of steps in each trajectory.

    Returns:
        Array of length ``env.states_count`` holding the estimates.
    """

    def rollout(start):
        # Simulate one T-step trajectory and accumulate its discounted return.
        current = env.reset(start)
        total = 0
        for step_idx in range(T):
            probs = env.policy(current)
            chosen = np.random.choice(env.actions_count, p=probs)
            current, r = env.step(chosen)
            total += r * (env.gamma ** step_idx)
        return total

    n_states = env.states_count
    estimates = np.zeros(n_states)
    for start in range(n_states):
        returns = np.zeros(n)  # one discounted return per trajectory
        for episode in range(n):
            returns[episode] = rollout(start)
        estimates[start] = np.mean(returns)
    return estimates

def compute_exact_v_pi(env):
    """
    Compute the value function of ``env.policy`` in closed form.

    Solves the linear system ``(I - gamma * P_pi) V = r_pi``, where
    ``P_pi`` comes from ``env.policy_matrix()`` and ``r_pi`` from
    ``env.reward_vector()``.

    Args:
        env: environment exposing ``states_count``, ``gamma``,
            ``policy_matrix()`` and ``reward_vector()``.

    Returns:
        Array of length ``env.states_count`` with the state values.

    Raises:
        numpy.linalg.LinAlgError: if ``I - gamma * P_pi`` is singular.
    """
    P_pi = env.policy_matrix()
    r_pi = env.reward_vector()
    identity = np.eye(env.states_count)
    # Solve the system directly rather than forming the inverse and
    # multiplying: less work and better conditioned numerically.
    return np.linalg.solve(identity - env.gamma * P_pi, r_pi)

# Build a five-state RiverSwim instance (default action count and discount)
env = Riverswim(states_count=5)

# Sampling-based estimate: 100 trajectories of 200 steps per start state
V_pi_estimate = monte_carlo_policy_evaluation(env, 100, 200)
print("Estimated V^pi via Monte Carlo: ", V_pi_estimate)

# Closed-form values from the linear system, for comparison
V_pi_exact = compute_exact_v_pi(env)
print("Exact V^pi: ", V_pi_exact)

# Estimated V^pi via Monte Carlo:  [1.65185151 1.93620515 3.14803208 7.30577418 8.8869764 ]
# Exact V^pi:  [1.60365251 1.89408739 3.15492297 7.29485928 8.77220123]
