# Production and Maintenance Planning (MDP)

In [2]:
import numpy as np
import matplotlib.pyplot as plt

rows, cols = 3, 4  # Dimensions of the grid (3 rows, 4 columns)
num_states = rows * cols  # Total number of states

# Convert 2D grid position to 1D state index
def state_index(row, col):
    return row * cols + col

# Convert 1D state index back to 2D grid position (row, col)
def state_position(index):
    return divmod(index, cols)

# Action space
actions = ['continue to manufacture', 'shut down and do maintenance']
action_index = {'continue to manufacture': 0, 'shut down and do maintenance': 1}


# Transition probabilities
transition_probabilities = {
    0: {
        0: [(0, 0.5), (1, 0.4), (2, 0.1)],
        1: [(0, 1.0)]
    },
    1: {
        0: [(1, 0.5), (2, 0.4), (3, 0.1)],
        1: [(0, 0.9), (1, 0.1)]
    },
    2: {
        0: [(2, 0.5), (3, 0.5)],
        1: [(0, 0.6), (1, 0.3), (2, 0.1)]
    },
    3: {
        0: [(3, 1.0)],
        1: [(0, 0.3), (1, 0.3), (2, 0.3), (3, 0.1)]
    },
    4: {
        0: [(1, 0.25), (2, 0.25), (3, 0.25), (4, 0.25)],
        1: [(4, 1.0)]
    },
    5: {
        0: [(2, 0.6), (3, 0.2), (5, 0.2)],
        1: [(4, 0.9), (5, 0.1)]
    },
    6: {
        0: [(3, 0.6), (6, 0.2), (7, 0.2)],
        1: [(4, 0.6), (5, 0.3), (6, 0.1)]
    },
    7: {
        0: [(3, 0.3), (7, 0.7)],
        1: [(4, 0.3), (5, 0.3), (6, 0.3), (7, 0.1)]
    },
    8: {
        0: [(3, 0.25), (5, 0.25), (6, 0.25), (8, 0.25)],
        1: [(8, 1.0)]
    },
    9: {
        0: [(3, 0.25), (6, 0.25), (7, 0.25), (9, 0.25)],
        1: [(8, 0.9), (9, 0.1)]
    },
    10: {
        0: [(7, 0.4), (10, 0.3), (11, 0.3)],
        1: [(8, 0.6), (9, 0.3), (10, 0.1)]
    },
    11: {
        0: [(11, 1.0)],
        1: [(8, 0.3), (9, 0.3), (10, 0.3), (9, 0.1)]
    }
}

# Rewards for each action and state
reward_action = {
    0: {
        0: [-40, -45, -50],
        1: [-40, -45, -50],
        2: [-40, -45],
        3: [-40],
        4: [-80, -85, -90, -90],
        5: [-80, -85, -90],
        6: [-90, -90, -110],
        7: [-90, -90],
        8: [-110, -80, -110, -110],
        9: [-110, -80, -110, -110],
        10: [-110, -110, -140],
        11: [-110]
    },
    1: {
        0: [-40],
        1: [-70, -40],
        2: [-70, -70, -40],
        3: [-90, -70, -70, -40],
        4: [-80],
        5: [-110, -80],
        6: [-110, -110, -80],
        7: [-130, -110, -110, -80],
        8: [-110],
        9: [-140, -110],
        10: [-140, -140, -110],
        11: [-160, -140, -140, -110]
    }
}

# Value Iteration 

In [9]:
# Parameters
gamma = 0.7  # Discount factor
theta = 1e-4  # Convergence threshold

# Initialize state values
state_values = np.zeros(num_states)  # V(s) for all states
policy = np.zeros(num_states, dtype=int)  # Policy π(s)
# Value iteration with a maximum iteration limit
def value_iteration(transition_probabilities, reward_action, gamma, theta, max_iterations=1000):
    global state_values, policy
    iteration = 0  # Initialize iteration counter

    while iteration < max_iterations:
        delta = 0
        new_state_values = state_values.copy()
        
        # Iterate over all states
        for state in range(num_states):
            action_values = []
            
            # Iterate over all actions
            for action in range(len(actions)):
                value = 0
                
                # Sum over possible next states
                for next_state, prob in transition_probabilities[state][action]:
                    reward = reward_action[action].get(state, [0])[0]  # Default reward if not defined
                    value += prob * (reward + gamma * state_values[next_state])
                
                action_values.append(value)
            
            # Update value of the state
            new_state_values[state] = max(action_values)
            delta = max(delta, abs(new_state_values[state] - state_values[state]))
        
        state_values = new_state_values
        iteration += 1  # Increment the iteration counter
        
        # Check for convergence
        if delta < theta:
            print(f"Converged in {iteration} iterations.")
            break

    else:
        print(f"Stopped after reaching the maximum of {max_iterations} iterations.")
    
    # Extract policy
    for state in range(num_states):
        action_values = []
        for action in range(len(actions)):
            value = 0
            for next_state, prob in transition_probabilities[state][action]:
                reward = reward_action[action].get(state, [0])[0]
                value += prob * (reward + gamma * state_values[next_state])
            action_values.append(value)
        policy[state] = np.argmax(action_values)

    return state_values, policy

# Run value iteration with a maximum iteration limit
state_values, policy = value_iteration(
    transition_probabilities, reward_action, gamma, theta, max_iterations=500
)

# Print results
print("Optimal State Values:")
print(state_values.reshape(rows, cols))
print("\nOptimal Policy (Value Iteration):")
print(policy.reshape(rows, cols))


Converged in 38 iterations.
Optimal State Values:
[[-133.33316009 -133.33316009 -133.33316009 -133.33316009]
 [-181.81800857 -179.84478799 -207.43256729 -231.37237577]
 [-243.7658039  -254.69589828 -313.86297506 -348.41684312]]

Optimal Policy (Value Iteration):
[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 1]]


# Policy Iteration

In [10]:
import numpy as np
import matplotlib.pyplot as plt

# Policy iteration with iteration limit
def policy_iteration(transition_probabilities, reward_action, gamma, theta, max_iterations=1000):
    global state_values, policy
    is_policy_stable = False
    iteration = 0  # Track the number of iterations
    
    while not is_policy_stable and iteration < max_iterations:
        # Policy evaluation
        while True:
            delta = 0
            new_state_values = state_values.copy()
            
            for state in range(num_states):
                action = policy[state]
                value = 0
                for next_state, prob in transition_probabilities[state][action]:
                    reward = reward_action[action].get(state, [0])[0]
                    value += prob * (reward + gamma * state_values[next_state])
                new_state_values[state] = value
                delta = max(delta, abs(new_state_values[state] - state_values[state]))
            
            state_values = new_state_values
            if delta < theta:
                break
        
        # Policy improvement
        is_policy_stable = True
        for state in range(num_states):
            old_action = policy[state]
            action_values = []
            
            for action in range(len(actions)):
                value = 0
                for next_state, prob in transition_probabilities[state][action]:
                    reward = reward_action[action].get(state, [0])[0]
                    value += prob * (reward + gamma * state_values[next_state])
                action_values.append(value)
            
            best_action = np.argmax(action_values)  # Find the action with the minimum value
            policy[state] = best_action
            
            if old_action != best_action:
                is_policy_stable = False
        
        iteration += 1  # Increment iteration counter
    
    return state_values, policy, iteration

# Print results
print("Optimal State Values:")
print(state_values.reshape(rows, cols))
print("\nOptimal Policy (Policy Iteration):")
print(policy.reshape(rows, cols))

# Run policy iteration with a limit on iterations
state_values = np.zeros(num_states)  # Reset state values
policy = np.zeros(num_states, dtype=int)  # Reset policy

state_values, policy, iterations_used = policy_iteration(transition_probabilities, reward_action, gamma, theta, max_iterations=500)

# Print the number of iterations used
print(f"Policy iteration completed in {iterations_used} iterations.")


Optimal State Values:
[[-133.33316009 -133.33316009 -133.33316009 -133.33316009]
 [-181.81800857 -179.84478799 -207.43256729 -231.37237577]
 [-243.7658039  -254.69589828 -313.86297506 -348.41684312]]

Optimal Policy (Policy Iteration):
[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 1]]
Policy iteration completed in 3 iterations.
