Method 1

In [1]:
import numpy as np

# Seed NumPy's legacy global RNG so every sampled episode is reproducible;
# MDP.step draws its transitions from this generator via np.random.choice.
np.random.seed(42)

class MDP():
    """Five-state, two-action Markov decision process sampled with NumPy's global RNG.

    Attributes:
        A: Action indices (0 and 1).
        S: State indices (0 through 4). A transition into state 4 ends an episode.
        P: ``P[a][s]`` is the probability distribution over next states when
            action ``a`` is taken in state ``s``.
        R: ``R[a][s]`` is the reward collected when action ``a`` is taken in
            state ``s``.
    """

    def __init__(self):
        """Populate the action/state lists and the per-action dynamics tables."""
        self.A = [0, 1]
        self.S = [0, 1, 2, 3, 4]

        # Dynamics and rewards for action 0.
        transitions_a0 = np.array([
            [0.5, 0.15, 0.15, 0, 0.20],
            [0, 0.5, 0.0, 0.25, 0.25],
            [0, 0, 0.15, 0.05, 0.8],
            [0, 0, 0, 0, 1],
            [0, 0, 0, 0, 1],
        ])
        rewards_a0 = np.array([0, 0, 0, 10, 0])

        # Dynamics and rewards for action 1.
        transitions_a1 = np.array([
            [0.5, 0.25, 0.15, 0, 0.10],
            [0, 0.5, 0.0, 0.35, 0.15],
            [0, 0, 0.20, 0.05, 0.75],
            [0, 0, 0, 0, 1],
            [0, 0, 0, 0, 1],
        ])
        rewards_a1 = np.array([-0.1, -0.1, -0.1, 10, 0])

        # Index both tables by action so callers can write P[a][s] / R[a][s].
        self.P = [transitions_a0, transitions_a1]
        self.R = [rewards_a0, rewards_a1]

    def step(self, s, a):
        """Sample one transition from state ``s`` under action ``a``.

        Returns:
            A ``(next_state, reward, done)`` tuple. ``reward`` depends only on
            the state being left and the action; ``done`` is True exactly when
            the sampled next state is 4.
        """
        next_state = np.random.choice(len(self.S), p=self.P[a][s])
        reward = self.R[a][s]
        return next_state, reward, bool(next_state == 4)

    def simulate(self, s, a, π):
        """Roll out one episode starting in state ``s`` with first action ``a``.

        The supplied ``a`` is used for the first step only; every later action
        is looked up as ``π[state]``.

        Returns:
            A list of ``(state, action, reward)`` tuples, one per step taken.
            The state that ends the episode is not itself recorded.
        """
        history = []
        action = a
        finished = False
        while not finished:
            next_state, reward, finished = self.step(s, action)
            history.append((s, action, reward))
            s = next_state
            if not finished:
                # Only consult the policy when another step will be taken.
                action = π[s]
        return history

def monte_carlo_policy_evaluation(mdp, policy, num_episodes=10000, gamma=0.9960):
    """Estimate a policy's state values with first-visit Monte Carlo.

    Every episode starts in state 0, takes ``policy[0]`` as its first action,
    and is generated by ``mdp.simulate``. For each state, the discounted return
    following its first occurrence in the episode is folded into a running
    mean.

    Args:
        mdp: Simulator exposing ``simulate(s, a, policy)`` that returns a list
            of ``(state, action, reward)`` tuples. If it also has an ``S``
            attribute, that is used as the set of states to report; otherwise
            states 0-4 are assumed.
        policy: Sequence mapping a state index to the action taken there.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied per step.

    Returns:
        dict mapping each state to its estimated value. States that never
        appear in any episode keep the initial estimate of 0.0.
    """
    states = getattr(mdp, "S", range(5))
    V = {s: 0.0 for s in states}
    N = {s: 0 for s in states}
    start_state = 0

    for _ in range(num_episodes):
        history = mdp.simulate(start_state, policy[start_state], policy)

        # One forward pass records where each state first occurs, so the
        # backward pass below does not have to rescan the episode prefix.
        first_index = {}
        for t, (s_t, _, _) in enumerate(history):
            first_index.setdefault(s_t, t)

        # Walk the episode backwards so the return accumulates incrementally.
        G = 0.0
        for t in range(len(history) - 1, -1, -1):
            s_t, _, r_t = history[t]
            G = gamma * G + r_t

            if first_index[s_t] == t:
                N[s_t] += 1
                # Incremental mean: V <- V + (G - V) / N.
                V[s_t] += (G - V[s_t]) / N[s_t]

    return V

# Build the simulator shared by both evaluations
mdp = MDP()

# Fixed policies, indexed by state (one entry for each of states 0-3;
# reaching state 4 ends an episode, so the policy is never consulted there)
always_invest = [1] * 4  # Action 1 (invest) everywhere
never_invest = [0] * 4   # Action 0 (don't invest) everywhere

# Run both evaluations up front, always-invest first, so the RNG stream is
# consumed in a fixed order
print("Evaluating policies...")
V_always_invest = monte_carlo_policy_evaluation(mdp, always_invest)
V_never_invest = monte_carlo_policy_evaluation(mdp, never_invest)

# Report each value table under its heading
for heading, V in (
    ("\nValue function for Always Invest policy:", V_always_invest),
    ("\nValue function for Never Invest policy:", V_never_invest),
):
    print(heading)
    for state, value in V.items():
        print(f"State {state}: ${value:.3f}M")

Evaluating policies...

Value function for Always Invest policy:
State 0: $3.345M
State 1: $6.729M
State 2: $0.520M
State 3: $10.000M
State 4: $0.000M

Value function for Never Invest policy:
State 0: $1.666M
State 1: $5.055M
State 2: $0.596M
State 3: $10.000M
State 4: $0.000M


Method 2

In [2]:
import numpy as np

class MDP():
    """Investment MDP over development phases, sampled with NumPy's global RNG.

    States 0-4 are phases of development and actions are 0 (don't invest) or
    1 (invest). ``P[a][s]`` holds the next-state distribution and ``R[a][s]``
    the reward for taking action ``a`` in state ``s``. An episode ends as soon
    as a transition lands in state 4.
    """

    def __init__(self):
        """Set up the state/action spaces and the transition and reward tables."""
        self.A = [0, 1]  # Actions: 0 = don't invest, 1 = invest
        self.S = [0, 1, 2, 3, 4]  # States: phases of development

        # Dynamics when not investing: transition matrix, then rewards
        no_invest_P = np.array([[0.5, .15, .15, 0, .20],
                                [0, .5, .0, .25, .25],
                                [0, 0, .15, .05, .8],
                                [0, 0, 0, 0, 1],
                                [0, 0, 0, 0, 1]])
        no_invest_R = np.array([0, 0, 0, 10, 0])

        # Dynamics when investing: transition matrix, then rewards
        # (-0.1 is the $100,000 investment cost)
        invest_P = np.array([[0.5, .25, .15, 0, .10],
                             [0, .5, .0, .35, .15],
                             [0, 0, .20, .05, .75],
                             [0, 0, 0, 0, 1],
                             [0, 0, 0, 0, 1]])
        invest_R = np.array([-0.1, -0.1, -0.1, 10, 0])

        # Position in each list is the action index
        self.P = [no_invest_P, invest_P]
        self.R = [no_invest_R, invest_R]

    def step(self, s, a):
        """Sample the successor of state ``s`` under action ``a``.

        Returns ``(s_prime, reward, done)``, where ``reward`` is ``R[a][s]``
        and ``done`` is True when ``s_prime`` is state 4.
        """
        successor = np.random.choice(len(self.S), p=self.P[a][s])
        payoff = self.R[a][s]
        episode_over = True if successor == 4 else False
        return successor, payoff, episode_over

    def simulate(self, s, a, π):
        """Generate one episode from state ``s``.

        The first action is the given ``a``; afterwards actions come from
        ``π`` indexed by the current state. Returns the list of
        ``(state, action, reward)`` steps taken before the episode ended.
        """
        trajectory = []
        state, action = s, a
        while True:
            successor, payoff, episode_over = self.step(state, action)
            trajectory.append((state, action, payoff))
            if episode_over:
                return trajectory
            # Continue from the sampled state with the policy's action for it
            state = successor
            action = π[state]

def monte_carlo_policy_evaluation(mdp, policy, num_episodes=10000, gamma=0.9960):
    """First-visit Monte Carlo estimate of the value function of ``policy``.

    Args:
        mdp: Simulator with a ``simulate(s, a, policy)`` method returning a
            list of ``(state, action, reward)`` steps. Its ``S`` attribute, if
            present, defines which states appear in the result; without it the
            states 0-4 are used.
        policy: Sequence giving the action for each state index.
        num_episodes: How many episodes to sample, each starting from phase 0.
        gamma: Per-step discount factor.

    Returns:
        dict of state -> mean discounted return observed after the first visit
        to that state in each episode (0.0 for states never visited).
    """
    # Initialize value function and visit counts
    states = getattr(mdp, "S", range(5))
    V = {s: 0.0 for s in states}
    N = {s: 0 for s in states}

    # Every episode starts from phase 0
    start_state = 0

    for _ in range(num_episodes):
        # Generate episode, taking the policy's action for the start state first
        history = mdp.simulate(start_state, policy[start_state], policy)

        # Backward pass: discounted return from each time step onward
        returns = [0.0] * len(history)
        G = 0.0
        for t in reversed(range(len(history))):
            G = gamma * G + history[t][2]
            returns[t] = G

        # Forward pass: update each state once, at its first visit. A set of
        # already-seen states replaces rescanning the episode prefix per step.
        seen = set()
        for (s_t, _, _), G_t in zip(history, returns):
            if s_t in seen:
                continue
            seen.add(s_t)
            N[s_t] += 1
            # Incremental mean of the first-visit returns
            V[s_t] += (G_t - V[s_t]) / N[s_t]

    return V

def main():
    """Evaluate the always-invest and never-invest policies and print both value tables."""
    # Seed first so the sampled episodes are reproducible
    np.random.seed(42)

    simulator = MDP()

    # Each entry pairs the heading to print with the policy to evaluate
    # (one action per state 0-3); order fixes how the RNG stream is consumed
    runs = [
        ("\nValue function for Always Invest policy:", [1, 1, 1, 1]),
        ("\nValue function for Never Invest policy:", [0, 0, 0, 0]),
    ]

    # Finish every evaluation before printing any table
    print("Evaluating policies...")
    results = [
        (heading, monte_carlo_policy_evaluation(simulator, policy))
        for heading, policy in runs
    ]

    for heading, V in results:
        print(heading)
        for state, value in V.items():
            print(f"State {state}: ${value:.3f}M")

# Run the evaluation only when this code executes as the top-level module.
# NOTE(review): a notebook cell normally satisfies this check, and the output
# recorded below the cell suggests main() did run here.
if __name__ == "__main__":
    main()

Evaluating policies...

Value function for Always Invest policy:
State 0: $3.345M
State 1: $6.729M
State 2: $0.520M
State 3: $10.000M
State 4: $0.000M

Value function for Never Invest policy:
State 0: $1.666M
State 1: $5.055M
State 2: $0.596M
State 3: $10.000M
State 4: $0.000M
