In [None]:
def value_function(states, actions, rewards, transition_function, discount_factor=0.9, theta=0.0001, policy=None):
    """
    Evaluate a policy with iterative, in-place Bellman expectation backups.

    Args:
        states: Iterable of all states in the environment.
        actions: Container indexed by state that yields the actions available
            in that state. A state with no entry is treated as terminal: it
            has no actions and its value stays 0.
        rewards: Rewards for each state-action pair. Not read by this
            function (the rewards used in the backup come from
            ``transition_function``); kept so existing callers keep working.
        transition_function: Callable ``(state, action)`` returning an
            iterable of ``(prob, next_state, reward, done)`` tuples. A
            ``None`` result is treated as "no transitions".
        discount_factor: Discount factor for future rewards.
        theta: Stop once the largest value change in a sweep is below this.
        policy: Optional mapping ``state -> {action: probability}`` to
            evaluate. When omitted, the equiprobable random policy over the
            actions available in each state is evaluated.

    Returns:
        Value function: A dictionary mapping state -> value.
    """
    states = list(states)  # iterated many times below; also accepts generators
    V = {s: 0 for s in states}

    while True:
        delta = 0
        # For each state, perform a full backup
        for s in states:
            try:
                available = actions[s]
            except (KeyError, IndexError):
                # No actions registered for this state: nothing to back up.
                available = ()

            v = 0
            for a in available:
                # Weight each action by the probability the policy selects it.
                # Without this weight the backup sums over all actions and is
                # no longer a contraction, so the sweep may never converge.
                if policy is None:
                    action_prob = 1.0 / len(available)
                else:
                    action_prob = policy[s][a]

                outcomes = transition_function(s, a) or ()
                for prob, next_state, reward, _done in outcomes:
                    v += action_prob * prob * (reward + discount_factor * V[next_state])

            # How much our value function changed (across any states)
            delta = max(delta, abs(v - V[s]))
            V[s] = v
        # Stop evaluating once our value function change is below a threshold
        if delta < theta:
            break

    return V
# Toy MDP definition: the state labels, the actions listed for each state
# (only s1-s3 have any), and a reward keyed by (state, action) pair.
states = ['s1', 's2', 's3', 's4', 's5', 's6', 'Home']
actions = {label: ['a1', 'a2'] for label in ('s1', 's2', 's3')}
rewards = {
    ('s1', 'a1'): 1,
    ('s1', 'a2'): 2,
    ('s2', 'a1'): 2,
    ('s2', 'a2'): 1,
    ('s3', 'a1'): 2,
    ('s3', 'a2'): 1,
}

def transition_function(s, a):
    """
    Return the possible outcomes of taking action ``a`` in state ``s``.

    Each outcome is a ``(probability, next_state, reward, done)`` tuple.
    Only the two actions of ``'s1'`` are modelled so far; any other
    state-action pair yields an empty list (instead of falling off the end
    and returning ``None``) so callers can always iterate over the result.
    """
    if s == 's1':
        if a == 'a1':
            return [(0.5, 's2', 1, False), (0.5, 's3', 2, False)]
        if a == 'a2':
            return [(1, 's2', 2, False)]
    # TODO: continue for all state-action pairs
    return []

# Hyperparameters for the evaluation sweep.
discount_factor = 0.9  # weight applied to future rewards
theta = 0.0001         # convergence threshold on the per-sweep value change

V = value_function(
    states,
    actions,
    rewards,
    transition_function,
    discount_factor=discount_factor,
    theta=theta,
)


In [3]:
# Seven-state chain. `states` must be an ordered list (not a set) because
# position i in it labels row i and column i of T.
states = ['s1', 's2', 's3', 's4', 's5', 's6', 's7']
initial_state = 's1'
current_state = initial_state
# One reward per state, in the same order as `states`; 's7' gets 0.
reward = [2, 5, -2, -1, 5, 10, 0]
# T[i][j] is the weight of moving from states[i] to states[j].
# The last row is all zeros: 's7' has no outgoing transitions.
T = [
    [0, 0.5, 0.5, 0, 0, 0, 0],
    [0, 0, 0, 0.6, 0.4, 0, 0],
    [0.3, 0, 0.7, 0, 0, 0, 0],
    [0, 0.2, 0, 0.8, 0, 0, 0],
    [0, 0, 0, 0.3, 0, 0.7, 0],
    [0, 0, 0, 0, 0.1, 0, 0.9],
    [0, 0, 0, 0, 0, 0, 0],
]



NameError: name 'value_function' is not defined

In [11]:
# Problem data for the seven-state chain.
states = [f"s{i}" for i in range(1, 8)]   # 's1' .. 's7'
initial_state = 's1'
reward = [2, 5, -2, -1, 5, 10, 0]          # reward[i] belongs to states[i]

# Transition matrix: row = current state, column = next state.
T = [
    [0,   0.5, 0.5, 0,   0,   0,   0],     # s1
    [0,   0,   0,   0.6, 0.4, 0,   0],     # s2
    [0.3, 0,   0.7, 0,   0,   0,   0],     # s3
    [0,   0.2, 0,   0.8, 0,   0,   0],     # s4
    [0,   0,   0,   0.3, 0,   0.7, 0],     # s5
    [0,   0,   0,   0,   0.1, 0,   0.9],   # s6
    [0,   0,   0,   0,   0,   0,   0],     # s7 (no outgoing transitions)
]

def state_value_function(states, T, reward, gamma=0.5, theta=0.0001, terminal_states=('s7',)):
    """
    Iteratively compute a value for every state of a Markov chain.

    Each sweep updates the states in order, in place, using
    ``V[s] = sum over s_next of T[s][s_next] * (reward[s_next] + gamma * V[s_next])``
    where terminal next-states are left out of the sum.

    Args:
        states: Sequence of state labels; position i labels row/column i of T.
        T: Square matrix (list of rows); ``T[i][j]`` weights the move from
            ``states[i]`` to ``states[j]``.
        reward: Sequence with ``reward[j]`` collected when ``states[j]`` is
            the next state.
        gamma: Discount applied to the next state's value.
        theta: Stop once the largest change in a sweep is below this.
        terminal_states: Labels whose value is pinned to 0 and which are
            skipped as next-states. Defaults to ``('s7',)``; a single label
            may be given as a plain string.

    Returns:
        Dictionary mapping state label -> value.
    """
    if isinstance(terminal_states, str):
        terminal = {terminal_states}
    else:
        terminal = set(terminal_states)

    # Resolve each label's position once (first occurrence, like list.index)
    # instead of rescanning `states` inside the inner loop.
    index_of = {}
    for position, label in enumerate(states):
        index_of.setdefault(label, position)

    V = {s: 0 for s in states}

    while True:
        delta = 0
        for s in states:
            previous = V[s]
            if s in terminal:
                V[s] = 0
            else:
                row = T[index_of[s]]
                # Calculate the sum of future expected rewards
                future_reward_sum = 0
                for s_next in states:
                    if s_next in terminal:
                        continue
                    j = index_of[s_next]
                    future_reward_sum += row[j] * (reward[j] + gamma * V[s_next])
                # Update the value of the current state
                V[s] = future_reward_sum
            # Update the maximum change in value
            delta = max(delta, abs(previous - V[s]))
        # Check for convergence
        if delta < theta:
            break
    return V

# Solve the chain with the default gamma/theta and report one line per state.
state_values = state_value_function(states=states, T=T, reward=reward)
for state, value in state_values.items():
    print(f"State: {state}, Value: {value}")


State: s1, Value: 2.0822294536319355
State: s2, Value: 3.079265150757922
State: s3, Value: -0.750268786699871
State: s4, Value: 0.8464984593656143
State: s5, Value: 7.12669168575214
State: s6, Value: 0.856334584287607
State: s7, Value: 0


In [22]:
# Chain definition: labels, start state, per-state rewards and transitions.
states = ['s' + str(n) for n in range(1, 8)]
initial_state = 's1'
reward = [2, 5, -2, -1, 5, 10, 0]  # Modified to include the reward for 's7'

# Row i holds the transition weights out of states[i]; column j targets states[j].
T = [
    [0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.6, 0.4, 0.0, 0.0],
    [0.3, 0.0, 0.7, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.2, 0.0, 0.8, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.3, 0.0, 0.7, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.9],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
]

def state_value_function(states, T, reward, gamma=0.5, theta=0.0001, terminal_states=('s7',), verbose=True):
    """
    Iteratively compute a value for every state, optionally tracing each step.

    Each sweep updates the states in order, in place, using
    ``V[s] = sum over s_next of T[s][s_next] * (reward[s_next] + gamma * V[s_next])``
    where terminal next-states are left out of the sum.

    Args:
        states: Sequence of state labels; position i labels row/column i of T.
        T: Square matrix (list of rows); ``T[i][j]`` weights the move from
            ``states[i]`` to ``states[j]``.
        reward: Sequence with ``reward[j]`` collected when ``states[j]`` is
            the next state.
        gamma: Discount applied to the next state's value.
        theta: Stop once the largest change in a sweep is below this.
        terminal_states: Labels whose value is pinned to 0 and which are
            skipped as next-states. Defaults to ``('s7',)``; a single label
            may be given as a plain string.
        verbose: When true (the default), print the initial values, every
            partial sum, each state's previous value and change, and the
            final values. Pass ``False`` to compute silently.

    Returns:
        Dictionary mapping state label -> value.
    """
    if isinstance(terminal_states, str):
        terminal = {terminal_states}
    else:
        terminal = set(terminal_states)

    # Resolve each label's position once (first occurrence, like list.index)
    # instead of rescanning `states` inside the inner loop.
    index_of = {}
    for position, label in enumerate(states):
        index_of.setdefault(label, position)

    V = {s: 0 for s in states}

    if verbose:
        print("Initial State Values:")
        print(V)

    while True:
        delta = 0
        for s in states:
            v = V[s]
            if s in terminal:
                V[s] = 0
            else:
                row = T[index_of[s]]
                future_reward_sum = 0
                for s_next in states:
                    if s_next in terminal:
                        continue
                    j = index_of[s_next]
                    future_reward_sum += row[j] * (reward[j] + gamma * V[s_next])
                    if verbose:
                        print(f"State: {s}, Next State: {s_next}, Future Reward Sum: {future_reward_sum}")
                V[s] = future_reward_sum
                if verbose:
                    # Value the state held before this update.
                    print(v)
            if verbose:
                # Signed change applied to this state in the current sweep.
                print(v - V[s])
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

    if verbose:
        print("\nFinal State Values:")
        print(V)
    return V  # Return the state values

# state_value_function always returns the value dictionary (it never returns
# None), so the results can be printed without a fallback branch.
state_values = state_value_function(states, T, reward)
for state, value in state_values.items():
    print(f"State: {state}, Value: {value}")


Initial State Values:
{'s1': 0, 's2': 0, 's3': 0, 's4': 0, 's5': 0, 's6': 0, 's7': 0}
State: s1, Next State: s1, Future Reward Sum: 0.0
State: s1, Next State: s2, Future Reward Sum: 2.5
State: s1, Next State: s3, Future Reward Sum: 1.5
State: s1, Next State: s4, Future Reward Sum: 1.5
State: s1, Next State: s5, Future Reward Sum: 1.5
State: s1, Next State: s6, Future Reward Sum: 1.5
0
-1.5
State: s2, Next State: s1, Future Reward Sum: 0.0
State: s2, Next State: s2, Future Reward Sum: 0.0
State: s2, Next State: s3, Future Reward Sum: 0.0
State: s2, Next State: s4, Future Reward Sum: -0.6
State: s2, Next State: s5, Future Reward Sum: 1.4
State: s2, Next State: s6, Future Reward Sum: 1.4
0
-1.4
State: s3, Next State: s1, Future Reward Sum: 0.825
State: s3, Next State: s2, Future Reward Sum: 0.825
State: s3, Next State: s3, Future Reward Sum: -0.575
State: s3, Next State: s4, Future Reward Sum: -0.575
State: s3, Next State: s5, Future Reward Sum: -0.575
State: s3, Next State: s6, Future Re