Imports

In [56]:
import itertools
import mdptoolbox.example
import numpy as np

init variables

In [57]:
items = 3
factory_size = 4
#we can also set the probability for each item to be stored or unstored
whites = 0.8
reds = 0.15
blues = 0.05
trans_probas = [whites/2, reds/2, blues/2, whites/2, reds/2, blues/2]

not_possible_reward = -10


# encoding: x = between(0,actions-1): store on field x; x = between(actions, actions*2-1): restore on field x - actions
actions = range(factory_size * 2)

# encoding: x = between(0,items-1): store item x; x = between(items, items*2-1): restore item x - items
tasks = range(items * 2)
trans_proba = 1 / len(tasks)

field_combs = list(itertools.product(range(items + 1), repeat=factory_size))
# encoding: ((0,0,0,0),0) until ((3,3,3,3),5)
states = list(itertools.product(range(items * 2), field_combs, repeat=1))


before we can create and fill the transition probability matrix and the reward matrix,
we have to create some helper functions

is_transition_possible checks, wether we can transition from state s to state s_prime
by performing action action

In [58]:
def is_transition_possible(action, s, s_prime):
        task = states[s][0] + 1
        if action < factory_size:
            stored_fields = states[s_prime][1]
            unstored_fields = states[s][1]
        else:
            action -= factory_size
            task -= items
            stored_fields = states[s][1]
            unstored_fields = states[s_prime][1]
        diff = np.subtract(stored_fields, unstored_fields)
        if len(np.nonzero(diff)[0]) != 1 or unstored_fields[action] != 0 or stored_fields[action] != task or \
                stored_fields[action] == unstored_fields[action]:
            return False
        return True

we can now create the transition and reward matrices

In [59]:
transitions = np.zeros(shape=(len(actions), len(states), len(states)))
rewards = np.full((len(states), len(actions)), not_possible_reward)
for action in range(len(transitions)):
    for s in range(len((transitions[action]))):
        reward_found = False
        for s_prime in range(len(transitions[action][s])):
            if is_transition_possible(action, s, s_prime):
                transitions[action][s][s_prime] = trans_probas[states[s_prime][0]]
                reward_found = True
        if reward_found:
            rewards[s][action] = -action
        if np.sum(transitions[action][s]) == 0:
            base_state = s % len(field_combs)
            #TODO: Fix
            transitions[action][s] = np.where(
                np.asarray(range(len(states))) % len(field_combs) == base_state, np.average(trans_probas), 0)

We can now train our mdp model

In [60]:
mdp = mdptoolbox.mdp.ValueIteration(transitions, rewards, 1, epsilon=0.0001)
mdp.setVerbose()
mdp.run()
print("trained for " + str(mdp.iter) + " iterations")
print("time:  " + str(mdp.time))
print("mdp.policy")
policy = mdp.policy
print(mdp.policy)
print("mdp.V")
print(mdp.V)

  Iteration		V-variation
    1		  10.0
    2		  7.225
    3		  5.351041666666669
    4		  4.248682291666668
    5		  3.41838935185185
    6		  3.017184918981485
    7		  2.809627946767293
    8		  2.4601544598866276
    9		  2.2314838215857193
    10		  1.9574756149992325
    11		  1.760999682393333
    12		  1.556056335522591
    13		  1.4052620216329643
    14		  1.2599544462692052
    15		  1.140242547645343
    16		  1.0285626922747184
    17		  0.9326230247167047
    18		  0.8443940528982239
    19		  0.7666334710722111
    20		  0.6955497727390849
    21		  0.6319735694009552
    22		  0.5739802492871462
    23		  0.5220222294143895
    24		  0.4754017197063263
    25		  0.43262784738756466
    26		  0.39389803081496666
    27		  0.35901723304209554
    28		  0.32711681042684404
    29		  0.2982237447307057
    30		  0.271798300805159
    31		  0.24762968317310197
    32		  0.22566092984044417
    33		  0.20559653643655906
    34		  0.1872748904127377
    35		  0.1705598550541651

Before we can evaluate the policy, we need to generate some data

In [61]:
data = np.random.choice(range(6), p=[whites / 2, reds / 2, blues / 2, whites / 2, reds / 2, blues / 2],size=1000)

In the next step, we evaluate the returned policy

In [63]:
factory = (0, 0, 0, 0)
reward = 0
for task in data:
    x = np.array(list(map(lambda x: x==(task, factory), states)))
    curr_state = np.where(x)[0][0]
    action = policy[curr_state]
    reward += rewards[curr_state][action]
    factory = states[np.where(transitions[action][curr_state]== trans_proba)[0][0]][1]
avg_reward = reward/len(data)
print(avg_reward)

IndexError: index 0 is out of bounds for axis 0 with size 0

But How good is our result? We need to compare it to be able to classify it
Let's evaluate the same data with a Greedy algorithm

In [None]:
factory = (0, 0, 0, 0)
reward = 0
for task in data:
    if task < 3: #store item task
        needed = 0
    else:
        needed = task - 2
    available_space = np.where(factory == needed)
    if len(available_space) >= 1:
        if task < 3:
            factory[available_space[0]] = task + 1
        else:
            factory[available_space[0]] = 0
        reward -= available_space[0]
    else:
        reward += not_possible_reward
avg_reward = reward/len(data)
print(avg_reward)