In [1]:
import matplotlib

matplotlib.use("TkAgg")
import gym
import gridworld
from gym import wrappers, logger
import numpy as np
import copy
from scipy.sparse import dok_matrix, lil_matrix

In [65]:
class ValueIterationAgent(object):
    """Agent implementing Value Iteration. Naive implementation with dictionary structure.

    The environment must expose ``action_space``, ``getMDP()`` and
    ``state2str(observation)``. ``getMDP()`` returns ``(statedic, mdp)`` where
    ``statedic`` maps a state string to an integer id and ``mdp[state][action]``
    is an iterable of ``(proba, new_state, reward, done)`` tuples. States that
    do not appear in ``mdp`` have no outgoing transitions; their value stays 0
    and they get no policy entry.
    """

    def __init__(self, env):
        """Read the MDP from ``env`` and start from a random policy."""
        self.env = env
        self.action_space = env.action_space
        self.statedic, self.mdp = env.getMDP()
        # policy: state id -> action, only for states that have transitions.
        self.policy = {}
        for state, state_id in self.statedic.items():
            if state in self.mdp:
                self.policy[state_id] = self.action_space.sample()

    def act(self, observation, reward, done):
        """Return the policy's action for ``observation``.

        ``reward`` and ``done`` are accepted for interface compatibility and
        are not used.
        """
        return self.policy[self.statedic[self.env.state2str(observation)]]

    def _action_values(self, state, value, gamma):
        """Return ``{action: expected one-step return}`` for ``state`` under ``value``."""
        return {
            action: sum(
                proba * (reward + gamma * value[self.statedic[new_state]])
                for (proba, new_state, reward, done) in transitions
            )
            for action, transitions in self.mdp[state].items()
        }

    def train(self, eps=5e-4, gamma=0.99):  # Value Iteration algorithm
        """Run value iteration, then store the greedy policy in ``self.policy``.

        Args:
            eps: stop once the largest per-state change of the value function
                between two sweeps is at most ``eps``.
            gamma: discount factor applied to successor-state values.
        """
        value = {state_id: 0 for state_id in self.statedic.values()}

        distance = np.inf
        while distance > eps:
            new_value = {}
            for state, state_id in self.statedic.items():
                if state in self.mdp:
                    new_value[state_id] = max(self._action_values(state, value, gamma).values())
                else:
                    # No transitions from this state: its value never changes.
                    new_value[state_id] = value[state_id]

            # Infinity norm of the update; default covers an empty state set.
            distance = max((abs(new_value[sid] - value[sid]) for sid in value), default=0.0)
            value = new_value

        for state, state_id in self.statedic.items():
            if state in self.mdp:
                q_values = self._action_values(state, value, gamma)
                # Store the action KEY, not its position in the dict: the two
                # only coincide when the keys are exactly 0..k-1 in order.
                # max() keeps the first maximal action, like np.argmax did.
                self.policy[state_id] = max(q_values, key=q_values.get)

In [66]:
class OptimizedValueIterationAgent(object):
    """Agent implementing Value Iteration with an efficient implementation (scipy sparse matrices).

    The environment must expose ``nS``, ``nA``, ``getMDP()`` and
    ``state2str(observation)``. ``getMDP()`` returns ``(statedic, mdp)`` where
    ``statedic`` maps a state string to an integer id and ``mdp[state][action]``
    is an iterable of ``(proba, new_state, reward, done)`` tuples. Actions are
    used as list/column indices, so they must be integers in ``range(env.nA)``.

    Attributes:
        probas: one CSC matrix per action, ``probas[a][s, s']`` is the total
            probability of moving from ``s`` to ``s'`` under action ``a``.
        rewards: one CSC matrix per action holding the reward attached to
            ``(s, s')``; kept for compatibility, not used by ``train``.
        expected_rewards: dense ``(nS, nA)`` array of the expected immediate
            reward of taking action ``a`` in state ``s``.
        policy: integer array indexed by state id.
    """

    def __init__(self, env):
        """Translate the dictionary MDP of ``env`` into matrices."""
        self.env = env
        self.statedic, self.mdp = env.getMDP()
        # Integer dtype so act() returns a usable action index even before train().
        self.policy = np.zeros(len(self.statedic), dtype=int)

        # Translation of the MDP as CSC scipy matrices
        # See https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html#scipy.sparse.csc_matrix
        rewards = [lil_matrix((env.nS, env.nS)) for _ in range(env.nA)]
        probas = [lil_matrix((env.nS, env.nS)) for _ in range(env.nA)]
        expected_rewards = np.zeros((env.nS, env.nA))
        for state, actions in self.mdp.items():
            s = self.statedic[state]
            for action, transitions in actions.items():
                for (proba, new_state, reward, done) in transitions:
                    s_next = self.statedic[new_state]
                    rewards[action][s, s_next] = reward
                    probas[action][s, s_next] += proba
                    # Accumulated per tuple, so zero rewards and several tuples
                    # reaching the same successor are both handled exactly.
                    expected_rewards[s, action] += proba * reward

        self.rewards = [x.tocsc() for x in rewards]
        self.probas = [x.tocsc() for x in probas]
        self.expected_rewards = expected_rewards

    def act(self, observation, reward, done):
        """Return the policy's action for ``observation``.

        ``reward`` and ``done`` are accepted for interface compatibility and
        are not used.
        """
        return self.policy[self.statedic[self.env.state2str(observation)]]

    def _action_values(self, value, gamma):
        """Return the ``(nS, nA)`` array ``Q[s, a] = E[r | s, a] + gamma * (P_a @ value)[s]``."""
        nS = self.env.nS
        nA = self.env.nA
        action_values = np.empty((nS, nA))
        for a in range(nA):
            # The discounted term comes from the probability matrix alone, so
            # it does not depend on which reward entries are stored sparsely
            # (a zero reward has no stored entry).
            action_values[:, a] = self.expected_rewards[:, a] + gamma * self.probas[a].dot(value)
        return action_values

    def train(self, eps=5e-4, gamma=0.99):
        """Run value iteration, then store the greedy policy in ``self.policy``.

        Args:
            eps: stop once the infinity norm of the change of the value
                function between two sweeps is at most ``eps``.
            gamma: discount factor applied to successor-state values.
        """
        value = np.zeros(self.env.nS)
        distance = np.inf
        while distance > eps:
            new_value = np.max(self._action_values(value, gamma), axis=1)
            distance = np.linalg.norm(new_value - value, ord=np.inf)
            value = new_value

        self.policy = np.argmax(self._action_values(value, gamma), axis=1)

In [67]:
# Create the gridworld environment and load the map used in this notebook.
env = gym.make("gridworld-v0")
# NOTE(review): the dict presumably maps grid cell codes to their reward — confirm
# against the gridworld package.
env.setPlan("gridworldPlans/plan0.txt", {0: -0.001, 3: 1, 4: 1, 5: -1, 6: -1})
env.seed(0)  # Initialize the pseudo-random seed

[0]

In [68]:
# Build the dictionary-based agent from the environment's MDP and run value
# iteration with the default eps/gamma.
agentVI = ValueIterationAgent(env)
agentVI.train()

In [69]:
# Show the learned policy: a dict from state id to action, with entries only
# for states that have transitions in the MDP.
agentVI.policy

{0: 0, 2: 2, 3: 2, 4: 3, 6: 3, 7: 3, 8: 1, 9: 1, 10: 2}

In [70]:
# Build the sparse-matrix agent; the MDP is converted to matrices in __init__.
agentOVI = OptimizedValueIterationAgent(env)

In [71]:
# Run value iteration with the default eps/gamma, then show the policy as an
# array indexed by state id.
agentOVI.train()
agentOVI.policy

array([0, 0, 2, 2, 3, 0, 3, 3, 1, 1, 2])

In [10]:
# Fetch the state dictionary (state string -> id) and the MDP for inspection.
statedic, mdp = env.getMDP()

In [13]:
# Print every state, decoded from its string key with env.str2state, followed
# by its integer id, to relate policy indices to grid layouts.
for key, value in statedic.items():
    print(env.str2state(key))
    print(value)

[[1 1 1 1 1 1]
 [1 0 0 0 3 1]
 [1 0 1 0 5 1]
 [1 0 0 0 2 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
0
[[1 1 1 1 1 1]
 [1 0 0 0 3 1]
 [1 0 1 0 2 1]
 [1 0 0 0 0 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
1
[[1 1 1 1 1 1]
 [1 0 0 0 3 1]
 [1 0 1 0 5 1]
 [1 0 0 2 0 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
2
[[1 1 1 1 1 1]
 [1 0 0 0 3 1]
 [1 0 1 2 5 1]
 [1 0 0 0 0 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
3
[[1 1 1 1 1 1]
 [1 0 0 2 3 1]
 [1 0 1 0 5 1]
 [1 0 0 0 0 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
4
[[1 1 1 1 1 1]
 [1 0 0 0 2 1]
 [1 0 1 0 5 1]
 [1 0 0 0 0 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
5
[[1 1 1 1 1 1]
 [1 0 2 0 3 1]
 [1 0 1 0 5 1]
 [1 0 0 0 0 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
6
[[1 1 1 1 1 1]
 [1 2 0 0 3 1]
 [1 0 1 0 5 1]
 [1 0 0 0 0 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
7
[[1 1 1 1 1 1]
 [1 0 0 0 3 1]
 [1 2 1 0 5 1]
 [1 0 0 0 0 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
8
[[1 1 1 1 1 1]
 [1 0 0 0 3 1]
 [1 0 1 0 5 1]
 [1 2 0 0 0 1]
 [1 1 1 1 1 1]
 [1 1 1 1 1 1]]
9
[[1 1 1 1 1 1]
 [1 0 0 0 3 1]
 [1 0 1 0 5 1]
 [1 0 2 0 0 1]
 [1 1 1 1 