In [5]:
# Bellman equation implementation for the Gridworld problem

import numpy as np

# Define the grid-world environment: 0 marks a free cell, -1 marks a wall/obstacle
grid_world = np.array([
    [0, 0, 0, 0],
    [0, -1, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0]
])

# Define the rewards for each cell (only cell (1, 3) carries a non-zero reward)
rewards = np.array([
    [0, 0, 0, 0],
    [0, 0, 0, 10],
    [0, 0, 0, 0],
    [0, 0, 0, 0]
])

# Define the discount factor
discount_factor = 0.9

# Define the number of sweeps of the Bellman update
num_iterations = 100

# Policy labels, listed in the same order _neighbor_values() returns its values
_MOVE_LABELS = ("U", "D", "L", "R")


def _neighbor_values(values, i, j):
    """Return the values of the cells reached from (i, j) by moving up, down, left, right.

    A move that would leave the grid is clamped to the border, so the agent
    effectively stays where it is. Walls get no special treatment here: their
    value is never updated and therefore stays at its initial 0.

    Args:
        values: 2-D array holding the current value estimate of every cell.
        i: Row index of the cell.
        j: Column index of the cell.

    Returns:
        Tuple ``(up, down, left, right)`` of neighbouring values.
    """
    last_row = values.shape[0] - 1
    last_col = values.shape[1] - 1
    return (
        values[max(i - 1, 0), j],
        values[min(i + 1, last_row), j],
        values[i, max(j - 1, 0)],
        values[i, min(j + 1, last_col)],
    )


# Initialize the value function array
value_function = np.zeros_like(grid_world, dtype=np.float32)

# Perform value iteration. Each sweep reads the previous estimate and writes
# into a copy, so every update within a sweep sees the same old values.
# No cell is treated as terminal, so the reward cell keeps collecting its reward.
for _ in range(num_iterations):
    updated_value_function = np.copy(value_function)
    for i in range(grid_world.shape[0]):
        for j in range(grid_world.shape[1]):
            if grid_world[i, j] == -1:  # Skip walls or obstacles
                continue
            max_value = max(_neighbor_values(value_function, i, j))
            updated_value_function[i, j] = rewards[i, j] + discount_factor * max_value
    value_function = updated_value_function

# Find the optimal policy.
# `np.str` was only an alias of the builtin `str`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
optimal_policy = np.zeros_like(grid_world, dtype=str)
for i in range(grid_world.shape[0]):
    for j in range(grid_world.shape[1]):
        if grid_world[i, j] == -1:
            optimal_policy[i, j] = "W"  # Mark walls or obstacles
            continue
        candidates = _neighbor_values(value_function, i, j)
        # index() returns the first match, so ties resolve in U, D, L, R order
        optimal_policy[i, j] = _MOVE_LABELS[candidates.index(max(candidates))]

# Print the results
print("Optimal Value Function:")
print(value_function)
print("\nOptimal Policy:")
print(optimal_policy)


Optimal Value Function:
[[65.607346 72.89734  80.997345 89.997345]
 [59.046345  0.       89.997345 99.997345]
 [65.607346 72.89734  80.997345 89.997345]
 [59.046345 65.607346 72.89734  80.997345]]

Optimal Policy:
[['R' 'R' 'D' 'D']
 ['U' 'W' 'R' 'R']
 ['R' 'R' 'U' 'U']
 ['U' 'U' 'U' 'U']]


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  optimal_policy = np.zeros_like(grid_world, dtype=np.str)


In [2]:
# Bellman equation for fitness coach AI
import numpy as np

# Problem setup
max_energy = 10  # Highest energy level the model tracks
num_states = 1 + max_energy  # One state per energy level, 0 through max_energy
num_actions = 3  # Performance training, endurance training, rest
discount_factor = 0.9  # Weight applied to the value of the next state
num_iterations = 100  # How many value-iteration sweeps to run

# Value iteration starts from an all-zero estimate, one entry per state
V = np.zeros(num_states, dtype=np.float64)

# Transition and reward model used by the Bellman backup
def bellman_equation(state, action):
    """Return the successor state and immediate reward for taking `action` in `state`.

    Despite its name, this function does not evaluate the Bellman equation
    itself; it is the deterministic one-step model that the value-iteration
    code combines with the discount factor.

    Args:
        state: Current energy level, expected to lie in ``0..max_energy``.
        action: ``0`` for performance training, ``1`` for endurance training;
            any other value is treated as rest.

    Returns:
        Tuple ``(next_state, reward)``.
    """
    if action == 0:  # Performance training (jogging or swimming)
        # Increase energy by 2, capped at max_energy. The earlier `state - 2`
        # contradicted both this intent and the upper-bound clamp, and yielded
        # negative states that wrap around when used as an array index.
        next_state = min(state + 2, max_energy)
        reward = 1  # Positive reward for performance training
    elif action == 1:  # Endurance training (weight lifting)
        next_state = max(state - 1, 0)  # Decrease energy by 1, never below 0
        reward = 2  # Higher reward for endurance training
    else:  # Rest
        next_state = state  # Energy level remains the same
        reward = 0  # No additional reward for resting

    return next_state, reward

# Value iteration: every sweep rebuilds V from the previous sweep's estimate.
# The whole list is evaluated before V is rebound, so all states in a sweep
# read the same old values.
for _ in range(num_iterations):
    V = np.array([
        max(
            (
                reward + discount_factor * V[next_state]
                for next_state, reward in (
                    bellman_equation(state, action) for action in range(num_actions)
                )
            ),
            default=float('-inf'),
        )
        for state in range(num_states)
    ])

# Optimal policy extraction: for each state keep the first action whose
# one-step lookahead value is the largest.
policy = np.zeros(num_states, dtype=int)
for state in range(num_states):
    lookahead = []
    for action in range(num_actions):
        next_state, reward = bellman_equation(state, action)
        lookahead.append(reward + discount_factor * V[next_state])
    # list.index returns the earliest match, mirroring a strict ">" comparison
    policy[state] = lookahead.index(max(lookahead))

# Print the optimal policy
actions = ['Performance Training', 'Endurance Training', 'Rest']
for state, action in enumerate(policy.tolist()):
    print(f'Energy: {state}, Action: {actions[action]}')

Energy: 0, Action: Endurance Training
Energy: 1, Action: Endurance Training
Energy: 2, Action: Endurance Training
Energy: 3, Action: Endurance Training
Energy: 4, Action: Endurance Training
Energy: 5, Action: Endurance Training
Energy: 6, Action: Endurance Training
Energy: 7, Action: Endurance Training
Energy: 8, Action: Endurance Training
Energy: 9, Action: Endurance Training
Energy: 10, Action: Endurance Training
