<a href="https://colab.research.google.com/github/Web-Jose/CSCI-167/blob/main/Block_Discount_World.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Imports
import numpy as np

# Define the states and actions
states = ['a', 'b', 'c', 'd', 'e']
actions = ['Left', 'Right', 'Exit']

# Define rewards for each state-action pair
rewards = {
    'a': {'Exit': 10, 'Left': 0, 'Right': 0},
    'b': {'Left': 0, 'Right': 0},
    'c': {'Left': 0, 'Right': 0},
    'd': {'Left': 0, 'Right': 0},
    'e': {'Exit': 1, 'Left': 0, 'Right': 0}
}

# Define transitions for deterministic case
transitions = {
    'a': {'Left': 'a', 'Right': 'b', 'Exit': None},
    'b': {'Left': 'a', 'Right': 'c'},
    'c': {'Left': 'b', 'Right': 'd'},
    'd': {'Left': 'c', 'Right': 'e'},
    'e': {'Left': 'd', 'Right': 'e', 'Exit': None}
}

In [10]:
def value_iteration(states, actions, rewards, transitions, gamma, theta=1e-6):
    V = {state: 0 for state in states}  # Initialize value function
    policy = {state: None for state in states}  # Initialize policy

    while True:
        delta = 0
        for state in states:
            if state in rewards and 'Exit' in rewards[state]:
                best_action_value = rewards[state]['Exit']
                best_action = 'Exit'
            else:
                action_values = []
                for action in actions:
                    if action in transitions[state]:
                        next_state = transitions[state][action]
                        reward = rewards[state][action]
                        action_value = reward
                        if next_state:
                            action_value += gamma * V[next_state]
                        action_values.append((action_value, action))
                best_action_value, best_action = max(action_values)

            delta = max(delta, np.abs(V[state] - best_action_value))
            V[state] = best_action_value
            policy[state] = best_action

        if delta < theta:
            break

    return V, policy

In [11]:
# (1) Calculate Optimum Policy for gamma=1
gamma_1 = 1.0
V_1, policy_1 = value_iteration(states, actions, rewards, transitions, gamma_1)
print("Optimum Policy for gamma=1:")
print(policy_1)

Optimum Policy for gamma=1:
{'a': 'Exit', 'b': 'Right', 'c': 'Right', 'd': 'Left', 'e': 'Exit'}


In [12]:
# (2) Calculate Optimum Policy for gamma=0.1
gamma_01 = 0.1
V_01, policy_01 = value_iteration(states, actions, rewards, transitions, gamma_01)
print("Optimum Policy for gamma=0.1:")
print(policy_01)

Optimum Policy for gamma=0.1:
{'a': 'Exit', 'b': 'Left', 'c': 'Left', 'd': 'Right', 'e': 'Exit'}


In [13]:
# (3) Calculate the value of the sequence of rewards from each of the states under the optimum policy
print("Value Function for gamma=1:")
print(V_1)

print("Value Function for gamma=0.1:")
print(V_01)

Value Function for gamma=1:
{'a': 10, 'b': 10.0, 'c': 10.0, 'd': 10.0, 'e': 1}
Value Function for gamma=0.1:
{'a': 10, 'b': 1.0, 'c': 0.1, 'd': 0.1, 'e': 1}


In [14]:
# (4) For which gamma, are West and East equally good when in state d?

def find_gamma_for_equal_value(states, actions, rewards, transitions, state, action1, action2, theta=1e-6):
    low, high = 0, 1
    while high - low > theta:
        gamma = (low + high) / 2
        V, _ = value_iteration(states, actions, rewards, transitions, gamma, theta)
        value1 = rewards[state][action1] + (gamma * V[transitions[state][action1]])
        value2 = rewards[state][action2] + (gamma * V[transitions[state][action2]])
        if np.abs(value1 - value2) < theta:
            return gamma
        elif value1 > value2:
            low = gamma
        else:
            high = gamma
    return (low + high) / 2

In [15]:
gamma_equal = find_gamma_for_equal_value(states, actions, rewards, transitions, 'd', 'Left', 'Right')
print(f"Gamma for which West and East are equally good in state d: {gamma_equal}")

Gamma for which West and East are equally good in state d: 0.9999995231628418
