In [5]:
import numpy as np
import gym

In [6]:
class FooddTruck(gym.Env):
    """
    Inventory model of a food truck over one week.

    A state is a ``(day, inventory)`` tuple, an action is the number of
    burger patties bought, and the demand of a day is drawn from a fixed
    discrete distribution.
    """

    def __init__(self):
        """
        Set up the demand model, the cost parameters and the action/state lists.
        """
        # Possible demand levels and, position by position, their probabilities.
        self.demand_values = [100, 200, 300, 400]
        self.demand_probabilities = [0.3, 0.4, 0.2, 0.1]
        # Upper bound on the inventory available at the start of a day.
        self.capacity = 400
        self.days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Weekend']
        # Price paid per unit bought and amount earned per unit sold.
        self.unit_cost = 4
        self.net_revenue = 7
        self.action_space = [0, 100, 200, 300, 400]
        # Monday only appears with an empty inventory; every later day is
        # paired with each possible leftover inventory.
        self.state_space = [("Mon", 0)]
        for later_day in self.days[1:]:
            for leftover in [0, 100, 200, 300]:
                self.state_space.append((later_day, leftover))

    def get_next_state_reward(self, state, action, demand):
        """
        Compute the outcome of one day for a given state, action and demand.

        Parameters
        ----------
        state : tuple
            The state is defined through the day and inventory: state = (day, inventory)
        action : int
            Number of burger patties bought to refill the inventory
        demand : int
            Number of burgers requested

        Returns
        -------
        dict
            Outcome of the day with the keys 'next_day', 'starting_inventory',
            'cost', 'sales', 'revenue', 'next_inventory' and 'reward'.

        Raises
        ------
        ValueError
            If the day of ``state`` is not one of ``self.days``.
        IndexError
            If the day of ``state`` is the last entry of ``self.days``, because
            there is no following day.
        """
        day, inventory = state
        following_day = self.days[self.days.index(day) + 1]
        # Purchases beyond the capacity are not stored, but they are still paid for.
        stocked = min(self.capacity, inventory + action)
        purchase_cost = self.unit_cost * action
        # Sales are limited by whatever is in stock.
        sold = min(stocked, demand)
        income = self.net_revenue * sold
        return {
            'next_day': following_day,
            'starting_inventory': stocked,
            'cost': purchase_cost,
            'sales': sold,
            'revenue': income,
            'next_inventory': stocked - sold,
            'reward': income - purchase_cost,
        }

    def get_transition_probability(self, state, action):
        """
        Get the probability of every (next state, reward) outcome for the
        given state and action.

        Each demand level is simulated once. Demand levels that lead to the
        same next state and reward have their probabilities added together.

        Parameters
        ----------
        state : tuple
            The state is defined through the day and inventory: state = (day, inventory)
        action : int
            Number of burger patties bought to refill the inventory

        Returns
        -------
        dict
            Maps ``(next_state, reward)`` to the probability of that outcome,
            where ``next_state`` is a ``(day, inventory)`` tuple.
        """
        outcome_probabilities = {}
        for index, demand in enumerate(self.demand_values):
            outcome = self.get_next_state_reward(state, action, demand)
            key = ((outcome['next_day'], outcome['next_inventory']), outcome['reward'])
            likelihood = self.demand_probabilities[index]
            outcome_probabilities[key] = outcome_probabilities.get(key, 0) + likelihood
        return outcome_probabilities

    def is_terminal(self, state):
        """
        Tell whether ``state`` ends the episode, i.e. whether its day is 'Weekend'.

        Parameters
        ----------
        state : tuple
            The state is defined through the day and inventory: state = (day, inventory)

        Returns
        -------
        bool
            True if the day of the state is 'Weekend', False otherwise.
        """
        day, _ = state
        return day == 'Weekend'

# Policy evaluation

In [7]:
def base_policy(states):
    """
    Build the baseline policy that maps every state to an action distribution.

    With an inventory of 300 or more nothing is bought. Otherwise the
    inventory is topped up to 400 or to 300 with equal probability.

    Parameters
    ----------
    states : list
        List of environment states, each a (day, inventory) tuple

    Returns
    -------
    dict
        The resulting policy. Keys are states and each value is a dictionary
        mapping an action to the probability of choosing it.
    """
    def _action_distribution(state):
        # Only the inventory part of the state decides the action.
        _, stock = state
        if stock >= 300:
            return {0: 1.0}
        return {400 - stock: 0.5, 300 - stock: 0.5}

    return {state: _action_distribution(state) for state in states}

In [8]:
def expected_update(env, value, state, action_probability_pairs, gamma):
    """
    Compute the expected value of a state under a stochastic policy.

    This is the Bellman expectation equation used as an update rule: for
    every action the policy may take, the environment supplies the
    probability of each (next state, reward) outcome, and the terms
    ``reward + gamma * value[next_state]`` are summed, weighted by the action
    probability and the outcome probability.

    Parameters
    ----------
    env : object
        Environment offering ``get_transition_probability(state, action)``,
        which returns a dict keyed by ``(next_state, reward)`` with the
        probability of that outcome as value
    value : dict
        Current value estimate of every state that can be reached
    state : tuple
        The state whose value is updated
    action_probability_pairs : dict
        Maps each action of the policy in ``state`` to its probability
    gamma : float
        Discount factor applied to the value of the next state

    Returns
    -------
    float
        The updated value estimate of ``state`` (the integer 0 if
        ``action_probability_pairs`` is empty)
    """
    total = 0
    for action, action_probability in action_probability_pairs.items():
        outcomes = env.get_transition_probability(state, action)
        for (next_state, reward), outcome_probability in outcomes.items():
            discounted_return = reward + gamma * value[next_state]
            total += action_probability * outcome_probability * discounted_return
    return total

In [9]:
def policy_evaluation(env, policy, max_iter=100,
                      value=None, eps=0.1, gamma=1):
    """
    Policy evaluation executes expected updates for all states until the state value converges or
    it reaches the maximum number of iterations.

    The estimates are updated in place during a sweep, so a state updated
    later in the same sweep already sees the new values of earlier states.
    At least one sweep is always performed.

    Parameters
    ----------
    env : class object
        Simulation environment providing ``state_space``, ``is_terminal(state)``
        and ``get_transition_probability(state, action)``
    policy : dict
        A policy that maps states to actions. Keys are states while the values are itself dictionaries composed
        of action probability pairs.
    max_iter : int, optional
        Maximum number of iterations if algorithm does not converge earlier, by default 100
    value : dict, optional
        Initial estimate of the expected discounted cumulative reward of every state when following
        the policy. A non-empty dict is updated in place and returned. If it is None or empty, all
        states of ``env.state_space`` start at 0. By default None
    eps : float, optional
        Acceptance criteria for the convergence of the algorithm: the evaluation stops once the largest
        change of a state value within one sweep is below this threshold, by default 0.1
    gamma : float, optional
        Discount factor, by default 1

    Returns
    -------
    dict
        Expected discounted cumulative reward of every state after the last sweep
    """
    if not value:
        value = {state: 0 for state in env.state_space}
    k = 0
    while True:
        max_delta = 0
        for state in value:
            # Terminal states keep their value; no action is taken from them.
            if env.is_terminal(state):
                continue
            value_old = value[state]
            value[state] = expected_update(env, value, state, policy[state], gamma)
            max_delta = max(max_delta, abs(value[state] - value_old))
        k += 1
        if max_delta < eps:
            print(f"Converged in {k} iterations.")
            break
        # ">=" instead of "==": a cap below 1 (or a non-integer cap) would
        # otherwise never be matched and a non-converging run would not stop.
        if k >= max_iter:
            print(f"Terminating after {k} iterations.")
            break
    return value

In [10]:
# Create the environment instance that the following cells work with.
foodTruck = FooddTruck()

In [11]:
# Spot check of a single day: Monday with an empty inventory, 300 patties bought, demand of 300.
foodTruck.get_next_state_reward(state=('Mon', 0), action=300, demand=300)

{'next_day': 'Tue',
 'starting_inventory': 300,
 'cost': 1200,
 'sales': 300,
 'revenue': 2100,
 'next_inventory': 0,
 'reward': 900}

In [12]:
# Outcome distribution for Tuesday with 200 in stock and 100 more bought.
# Demands of 300 and 400 both sell out the stock, so their probabilities are merged into one entry.
foodTruck.get_transition_probability(('Tue', 200), 100)

{(('Wed', 200), 300): 0.3,
 (('Wed', 100), 1000): 0.4,
 (('Wed', 0), 1700): 0.30000000000000004}

In [13]:
# Build the base policy for every state of the environment.
policy = base_policy(foodTruck.state_space)

In [14]:
# Evaluate the base policy with the default settings (max_iter=100, eps=0.1, gamma=1).
# The value of the Monday start state is reported as the expected weekly profit.
value = policy_evaluation(env=foodTruck, policy=policy)
print(f"Expected weekly profit: {value['Mon', 0]}")

Converged in 6 iterations.
Expected weekly profit: 2510.3852187500006


In [15]:
# Show the estimated value of every state under the base policy.
value

{('Mon', 0): 2510.3852187500006,
 ('Tue', 0): 1897.4318750000002,
 ('Tue', 100): 2297.431875,
 ('Tue', 200): 2697.431875,
 ('Tue', 300): 3083.7875000000004,
 ('Wed', 0): 1283.7875000000001,
 ('Wed', 100): 1683.7875000000004,
 ('Wed', 200): 2083.7875,
 ('Wed', 300): 2474.75,
 ('Thu', 0): 674.75,
 ('Thu', 100): 1074.75,
 ('Thu', 200): 1474.75,
 ('Thu', 300): 1835.0,
 ('Fri', 0): 35.00000000000003,
 ('Fri', 100): 435.0,
 ('Fri', 200): 835.0,
 ('Fri', 300): 1400.0,
 ('Weekend', 0): 0,
 ('Weekend', 100): 0,
 ('Weekend', 200): 0,
 ('Weekend', 300): 0}