Imports

In [1]:
import itertools
import mdptoolbox.example
import numpy as np

Initialize variables

In [2]:
# Problem size: number of distinct item types and number of storage fields.
items = 3
factory_size = 4

# Relative frequency of each item type. Adjust these to make some items
# more likely to be stored or restored than others.
whites = 1/3
reds = 1/3
blues = 1/3
# One probability per task, in task order: the three "store" tasks first,
# then the three "restore" tasks. Each item's share is split evenly between both.
trans_probas = [share / 2 for share in (whites, reds, blues)] * 2

# Reward used for (state, action) pairs for which no valid transition exists.
not_possible_reward = -10

# Action encoding:
#   0 .. factory_size-1               -> store on field x
#   factory_size .. 2*factory_size-1  -> restore from field x - factory_size
actions = range(2 * factory_size)

# Task encoding:
#   0 .. items-1        -> store item x
#   items .. 2*items-1  -> restore item x - items
tasks = range(2 * items)

# Every possible layout of the factory: one value in 0..items per field.
field_combs = list(itertools.product(range(items + 1), repeat=factory_size))
# States are (task, fields) pairs, ordered task-major:
# (0, (0,0,0,0)) up to (5, (3,3,3,3)).
states = list(itertools.product(tasks, field_combs))


Before we can create and fill the transition probability matrix and the reward matrix,
we have to create some helper functions.

`is_transition_possible` checks whether we can transition from state `s` to state `s_prime`
by performing action `action`.

In [3]:
def is_transition_possible(action, s, s_prime):
    """Tell whether performing ``action`` in state ``s`` can lead to ``s_prime``.

    ``s`` and ``s_prime`` are indices into the global ``states`` list, whose
    entries are ``(task, fields)`` pairs. Only the field layout of ``s_prime``
    is inspected; the task it carries is ignored.

    * Store action (``action < factory_size``): the targeted field must be
      empty (0) in ``s`` and hold the task's item in ``s_prime``.
    * Restore action (``action >= factory_size``): the targeted field
      (``action - factory_size``) must hold the task's item in ``s`` and be
      empty in ``s_prime``.

    In both cases exactly one field may differ between the two layouts.

    Returns:
        ``True`` if the transition is valid, ``False`` otherwise.
    """
    fields_before = states[s][1]
    fields_after = states[s_prime][1]
    # Items are numbered from 1 inside a layout; 0 marks an empty field.
    item = states[s][0] + 1

    if action < factory_size:
        field = action
        occupied, empty = fields_after, fields_before
    else:
        field = action - factory_size
        # Restore tasks are offset by `items` in the task encoding.
        item -= items
        occupied, empty = fields_before, fields_after

    changed_fields = np.nonzero(np.subtract(occupied, empty))[0]
    if len(changed_fields) != 1:
        return False
    if empty[field] != 0:
        return False
    if occupied[field] != item:
        return False
    # A restore action paired with the store task `items - 1` yields item == 0,
    # which would match two empty fields; the targeted field itself must change.
    if occupied[field] == empty[field]:
        return False
    return True

We can now create the transition and reward matrices.

In [4]:
# Transition tensor P[action, s, s'] and reward matrix R[s, action].
# Every reward starts out as the penalty for an impossible action and is
# overwritten below for the (state, action) pairs that have a valid transition.
transitions = np.zeros(shape=(len(actions), len(states), len(states)))
rewards = np.full((len(states), len(actions)), not_possible_reward)

# `states` is ordered task-major (product of tasks x field_combs), so a state
# index splits into a task index (// len(field_combs)) and a field-layout
# index (% len(field_combs)).
state_indices = np.arange(len(states))
task_of_state = state_indices // len(field_combs)
layout_of_state = state_indices % len(field_combs)
# Probability of arriving in each state, determined solely by the task it carries.
next_task_proba = np.asarray(trans_probas)[task_of_state]

# NOTE: this is a brute-force scan over all (action, s, s') triples.
for action in range(len(transitions)):
    for s in range(len(transitions[action])):
        reward_found = False
        for s_prime in range(len(transitions[action][s])):
            if is_transition_possible(action, s, s_prime):
                # s_prime is an integer index, so the task it carries has to be
                # looked up via `states` (equivalently: next_task_proba[s_prime]).
                transitions[action][s][s_prime] = trans_probas[states[s_prime][0]]
                reward_found = True
        if reward_found:
            # NOTE(review): restore actions are encoded as field + factory_size,
            # so restoring from a field costs factory_size more than storing on
            # it -- confirm this is intended rather than -(action % factory_size).
            rewards[s][action] = -action
        if np.sum(transitions[action][s]) == 0:
            # Impossible action: the field layout stays as it is and only the
            # next task is drawn, each with its own probability, so the row
            # still sums to 1.
            transitions[action][s] = np.where(
                layout_of_state == layout_of_state[s], next_task_proba, 0)

We can now train an MDP model

In [5]:
# Solve the MDP with value iteration. The third positional argument (1) is
# passed straight to ValueIteration -- presumably the discount factor; confirm
# against the mdptoolbox documentation.
mdp = mdptoolbox.mdp.ValueIteration(transitions, rewards, 1, epsilon=0.0001)
mdp.setVerbose()
mdp.run()

# Keep the resulting policy around for the evaluation further down.
policy = mdp.policy

print(f"trained for {mdp.iter!s} iterations")
print(f"time:  {mdp.time!s}")
print("mdp.policy")
print(policy)
print("mdp.V")
print(mdp.V)

  Iteration		V-variation
    1		  10.0
    2		  6.0
    3		  4.222222222222222
    4		  3.467592592592597
    5		  3.0277777777777786
    6		  2.662294238683124
    7		  2.3539094650205783
    8		  2.0495148891175106
    9		  1.7757749390336883
    10		  1.5259190195854373
    11		  1.3094833878696122
    12		  1.120619537221387
    13		  0.9599177783846216
    14		  0.8243910399255867
    15		  0.7096770735520295
    16		  0.6122757419754805
    17		  0.5290951820058609
    18		  0.4576328397961049
    19		  0.3960631975939748
    20		  0.34381220428436166
    21		  0.29951657542164867
    22		  0.26162694202967884
    23		  0.22909169580135824
    24		  0.20103152998930796
    25		  0.1767499961882777
    26		  0.155659777173355
    27		  0.13727961757379603
    28		  0.12121916691356205
    29		  0.10715533969820967
    30		  0.09481826794754511
    31		  0.08398003613137917
    32		  0.07444753615615696
    33		  0.06609271032220931
    34		  0.05874217241387214
    35		  0.0522618

Before we can evaluate the policy, we need to generate some data

In [7]:
# Sample the stream of incoming tasks from the very distribution the MDP was
# built with (`trans_probas`), instead of repeating the probabilities by hand.
# A fixed seed keeps the evaluation reproducible across kernel restarts.
rng = np.random.default_rng(seed=0)
data = rng.choice(len(trans_probas), p=trans_probas, size=1000)

In the next step, we evaluate the returned policy

In [9]:
# Map every (task, fields) state to its index once, instead of scanning the
# whole `states` list for each incoming task.
state_index = {state: idx for idx, state in enumerate(states)}

# Start with an empty factory: one empty (0) slot per field.
factory = (0,) * factory_size
reward = 0
for task in data:
    curr_state = state_index[(int(task), factory)]
    action = policy[curr_state]
    reward += rewards[curr_state][action]
    # All successors with non-zero probability share the same field layout
    # (they only differ in the next task), so the first one is enough. Testing
    # for "non-zero" avoids comparing floats against the current task's
    # probability, which fails for zero or mismatched probabilities.
    next_state = np.flatnonzero(transitions[action][curr_state])[0]
    factory = states[next_state][1]
avg_reward = reward/len(data)
print(avg_reward)

-5.732
