In [1]:
import numpy as np
import gym

In [24]:
class FooddTruck(gym.Env):
    """Tabular model of a food truck's daily restocking problem.

    A state is a ``(day, inventory)`` tuple and an action is the number of
    units bought before the day's demand is realised. The reward for a day is
    the revenue from the units sold minus the cost of the units bought.
    """

    def __init__(self):
        """Set up the demand distribution, costs, actions and state list."""
        # demand_values[i] is drawn with probability demand_probabilities[i].
        self.demand_values = [100, 200, 300, 400]
        self.demand_probabilities = [0.3, 0.4, 0.2, 0.1]
        self.capacity = 400
        self.days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Weekend']
        self.unit_cost = 4
        self.net_revenue = 7
        self.action_space = [0, 100, 200, 300, 400]
        # The only Monday state has zero inventory; every later day is paired
        # with each of the inventory levels 0, 100, 200 and 300.
        later_states = []
        for day in self.days[1:]:
            for inventory in [0, 100, 200, 300]:
                later_states.append((day, inventory))
        self.state_space = [("Mon", 0)] + later_states

    def get_next_state_reward(self, state, action, demand):
        """Simulate one day for a known demand and itemise the outcome.

        Args:
            state: ``(day, inventory)`` tuple for the current day.
            action: number of units purchased at the start of the day.
            demand: number of units customers ask for during the day.

        Returns:
            dict with the keys ``next_day``, ``starting_inventory``, ``cost``,
            ``sales``, ``revenue``, ``next_inventory`` and ``reward``.

        Raises:
            ValueError: if ``day`` is not one of ``self.days``.
            IndexError: if ``day`` is the last entry of ``self.days`` (there
                is no following day to move to).
        """
        day, inventory = state
        following_day = self.days[self.days.index(day) + 1]
        # Stock on hand is clipped at capacity, but the purchase cost below is
        # still charged on the full requested action.
        stock = min(self.capacity, inventory + action)
        purchase_cost = self.unit_cost * action
        units_sold = min(stock, demand)
        income = self.net_revenue * units_sold
        return {
            'next_day': following_day,
            'starting_inventory': stock,
            'cost': purchase_cost,
            'sales': units_sold,
            'revenue': income,
            'next_inventory': stock - units_sold,
            'reward': income - purchase_cost,
        }

    def get_transition_probability(self, state, action):
        """Return the distribution over ``(next_state, reward)`` outcomes.

        Every possible demand value is simulated; demands that lead to the
        same ``(next_state, reward)`` pair have their probabilities summed.

        Returns:
            dict mapping ``(next_state, reward)`` to its probability, where
            ``next_state`` is itself a ``(day, inventory)`` tuple.
        """
        outcome_probability = {}
        for position in range(len(self.demand_values)):
            outcome = self.get_next_state_reward(
                state, action, self.demand_values[position])
            key = ((outcome['next_day'], outcome['next_inventory']),
                   outcome['reward'])
            likelihood = self.demand_probabilities[position]
            if key in outcome_probability:
                outcome_probability[key] += likelihood
            else:
                outcome_probability[key] = likelihood
        return outcome_probability

    def is_terminal(self, state):
        """Return True when the state's day is ``'Weekend'``."""
        day, _inventory = state
        return day == 'Weekend'

# Policy evaluation

In [25]:
def base_policy(states):
    """Build a fixed stochastic ordering policy for the given states.

    Args:
        states: iterable of ``(day, inventory)`` tuples.

    Returns:
        dict mapping each state to a ``{action: probability}`` dict. With an
        inventory of 300 or more nothing is ordered; otherwise the policy
        orders up to 400 or up to 300 units, each with probability 0.5.
    """
    def _action_distribution(inventory):
        # Action distribution depends on the inventory only, not on the day.
        if inventory >= 300:
            return {0: 1.0}
        return {400 - inventory: 0.5, 300 - inventory: 0.5}

    policy = {}
    for state in states:
        _day, inventory = state
        policy[state] = _action_distribution(inventory)
    return policy

In [26]:
def expected_update(env, value, state, action_probability_pairs, gamma):
    """Compute the one-step expected backup of ``state`` under a policy.

    Args:
        env: object exposing ``get_transition_probability(state, action)``,
            which returns a dict keyed by ``(next_state, reward)``.
        value: dict mapping states to their current value estimates.
        state: the state being backed up.
        action_probability_pairs: ``{action: probability}`` for ``state``.
        gamma: discount factor applied to the next state's value.

    Returns:
        Sum over actions and outcomes of
        ``pi(a|s) * p(s', r | s, a) * (r + gamma * value[s'])``.
    """
    total = 0
    for action, action_weight in action_probability_pairs.items():
        outcomes = env.get_transition_probability(state, action)
        for (next_state, reward), outcome_weight in outcomes.items():
            backed_up = reward + gamma * value[next_state]
            total += action_weight * outcome_weight * backed_up
    return total

In [34]:
def policy_evaluation(env, policy, max_iter=100,
                      value=None, eps=0.1, gamma=1):
    """Iteratively estimate the state values of ``policy`` on ``env``.

    Each sweep replaces the value of every non-terminal state with its
    expected one-step backup. Updates are applied in place, so states visited
    later in a sweep already see the values refreshed earlier in that sweep.

    Args:
        env: environment exposing ``state_space``, ``is_terminal(state)`` and
            whatever ``expected_update`` needs.
        policy: dict mapping each non-terminal state to ``{action: prob}``.
        max_iter: maximum number of sweeps. Values below 1 perform no sweep.
        value: optional initial estimates. ``None`` or an empty dict starts
            from zero for every state in ``env.state_space``. A non-empty
            dict is updated in place and returned.
        eps: stop once the largest change in a sweep is below this threshold.
        gamma: discount factor.

    Returns:
        dict mapping states to their estimated values.
    """
    if not value:
        value = {state: 0 for state in env.state_space}
    k = 0
    converged = False
    # Bounding the loop by `k < max_iter` (instead of testing `k == max_iter`
    # after the sweep) guarantees termination for zero, negative or
    # non-integer budgets, which the equality test would never hit.
    while k < max_iter:
        max_delta = 0
        for state in value:
            if env.is_terminal(state):
                # Terminal states keep whatever value they started with.
                continue
            value_old = value[state]
            value[state] = expected_update(env, value, state, policy[state], gamma)
            max_delta = max(max_delta, abs(value[state] - value_old))
        k += 1
        if max_delta < eps:
            converged = True
            break
    if converged:
        print(f"Converged in {k} iterations.")
    else:
        print(f"Terminating after {k} iterations.")
    return value

In [35]:
# Instantiate the food-truck environment used by the cells below.
foodTruck = FooddTruck()

In [36]:
# Sanity check: start Monday with no stock, buy 300 units, face a demand of 300.
foodTruck.get_next_state_reward(state=('Mon', 0), action=300, demand=300)

{'next_day': 'Tue',
 'starting_inventory': 300,
 'cost': 1200,
 'sales': 300,
 'revenue': 2100,
 'next_inventory': 0,
 'reward': 900}

In [37]:
# Distribution over (next_state, reward) when buying 100 units on Tuesday with
# 200 already in stock. Demands of 300 and 400 both sell out the 300 units on
# hand, so their probabilities (0.2 + 0.1) merge into a single outcome.
foodTruck.get_transition_probability(('Tue', 200), 100)

{(('Wed', 200), 300): 0.3,
 (('Wed', 100), 1000): 0.4,
 (('Wed', 0), 1700): 0.30000000000000004}

In [38]:
# Build the fixed base policy for every state of the environment.
policy = base_policy(foodTruck.state_space)

In [44]:
# Evaluate the base policy with the default settings; the value of the initial
# state ('Mon', 0) is reported as the expected weekly profit.
value = policy_evaluation(env=foodTruck, policy=policy)
print(f"Expected weekly profit: {value['Mon', 0]}")

Converged in 6 iterations.
Expected weekly profit: 2510.3852187500006
