In [42]:
import numpy as np

In [43]:
class LineWorld:
    def __init__(self, nb_cells=10, start_cell=1, good_end_cell=9, bad_end_cell=0):
        self.done = None
        self.current_state = start_cell  # État actuel
        self.end_good_state = good_end_cell  # État final
        self.end_bad_state = bad_end_cell
        self.reward = 0.0
        self.num_states = nb_cells  # Nombre total d'états
        self.states = [i for i in range(nb_cells)]
        self.actions = [0, 1]
        self.num_actions = 2  # Nombre total d'actions possibles
        self.line_world = ["_"] * (self.num_states - 1)
        self.line_world.insert(self.current_state, "X")

    def reset(self):
        self.reward = 0.0
        self.done = False
        self.current_state = np.random.randint(1, 8)

    def state_description(self):
        return np.array([self.current_state / (self.num_states - 1) * 2.0 - 1.0])

    def state_dim(self):
        return len(self.state_description())

    def step(self, action):
        # Si l'action est 1, on avance à droite
        if (action == 1) and (self.current_state != self.end_good_state) and (self.current_state != self.end_bad_state):
            self.current_state += 1
            self.reward = 0  # Pas de récompense pour avancer
            self.line_world.remove("X")
            self.line_world.insert(self.current_state, "X")
            print(self.line_world)
        # Si l'action est 0, on avance à gauche
        elif (action == 0) and (self.current_state != self.end_good_state) and (
                self.current_state != self.end_bad_state):
            self.current_state -= 1
            self.reward = 0  # Pas de récompense pour avancer
            self.line_world.remove("X")
            self.line_world.insert(self.current_state, "X")
            print(self.line_world)
        # Si l'on atteint l'état final, la partie est terminée
        if self.current_state == self.end_good_state:
            self.reward = 1  # Récompense de 1 pour atteindre l'état final
            # print(self.line_world)
            self.done = True
        elif self.current_state == self.end_bad_state:
            self.reward = -1
            # print(self.line_world)
            self.done = True
        return self.current_state, self.reward, self.done

In [44]:
def initP(line):
    P = {}
    for state in line.states:
        st = line.current_state
        for action in line.actions:
            line.current_state = state
            state_, reward_, done_ = line.step(action)
            stt = line.current_state
            P[(stt, reward_, st, action)] = 1
    return P

In [45]:
def evaluate_policy(env, V, policy, discount_factor=0.9):
    theta=1e-6
    P = initP(env)
    #Initialize the Value function 
    #V = np.zeros((world.grid_size[0],world.grid_size[1]))
    
    while True:
        DELTA = 0
        for s in env.states:
            old_V = V[s]
            weight = 1 / len(policy[s])
            for action in policy[s]:
                total = 0
                for key in P:
                    (newState, reward, oldState, act) = key
                    if oldState == s and act == action:
                        total += weight*P[key]*(reward+discount_factor*V[newState])
            V[s] = total
            DELTA = max(DELTA, np.abs(old_V-V[s]))
            
        if DELTA < theta:
            return V            


In [46]:
def policy_improvement(env, V, policy, discount_factor=0.9):
    policy_stable = True
    newPolicy = {}
    for s in env.states:
        old_actions = policy[s]
        value = []
        newAction = []
        P = initP(env)
        for action in old_actions:
            total = 0
            weight = 1 / len(policy[s])
            for key in P:
                (newState, reward, oldState, act) = key
                if oldState == s and act == action:
                    total += weight*P[key]*(reward+discount_factor*V[newState])
                    value.append(np.round(total, 2))
                    newAction.append(action)  
                    
        value = np.array(value) #Get the list of gotten value actions in the current state
        print(value)
        best = np.where(value == value.max())[0] #Get the position of the best gotten value action
        bestActions = [newAction[item] for item in best]
        newPolicy[s] = bestActions
        
        if old_actions != bestActions:
            policy_stable = False
    return policy_stable, newPolicy


In [47]:
def policy_iteration(env, discount_factor=0.9):
    
    #Initialize the policy
    V = np.random.random((len(env.states),))
    V[0] = 0
    V[env.num_states-1] = 0
    
    policy = {}
    for state in env.states:
        # equiprobable random strategy
        policy[state] = env.actions
    print("Policy : ")
    print(policy)
    
    policy_stable = False
    while not policy_stable:
        old_policy = policy.copy()
        
        print("Start of policy evaluation")
        #Evaluate the policy
        V = evaluate_policy(env, V, policy, discount_factor=0.9)
        print("V = ")
        print(V)
        
        print("Start of policy Improvement")
        #Improve the policy
        policy_stable, policy =  policy_improvement(env, V, policy, discount_factor=0.9)
        print("Policy : ")
        print(policy)

            
    return policy
 

In [48]:
if __name__ == '__main__':
    world = LineWorld()
    policy_iteration(world, discount_factor=0.9)

Policy : 
{0: [0, 1], 1: [0, 1], 2: [0, 1], 3: [0, 1], 4: [0, 1], 5: [0, 1], 6: [0, 1], 7: [0, 1], 8: [0, 1], 9: [0, 1]}
Start of policy evaluation
['X', '_', '_', '_', '_', '_', '_', '_', '_', '_']
['_', '_', 'X', '_', '_', '_', '_', '_', '_', '_']
['_', 'X', '_', '_', '_', '_', '_', '_', '_', '_']
['_', '_', '_', 'X', '_', '_', '_', '_', '_', '_']
['_', '_', 'X', '_', '_', '_', '_', '_', '_', '_']
['_', '_', '_', '_', 'X', '_', '_', '_', '_', '_']
['_', '_', '_', 'X', '_', '_', '_', '_', '_', '_']
['_', '_', '_', '_', '_', 'X', '_', '_', '_', '_']
['_', '_', '_', '_', 'X', '_', '_', '_', '_', '_']
['_', '_', '_', '_', '_', '_', 'X', '_', '_', '_']
['_', '_', '_', '_', '_', 'X', '_', '_', '_', '_']
['_', '_', '_', '_', '_', '_', '_', 'X', '_', '_']
['_', '_', '_', '_', '_', '_', 'X', '_', '_', '_']
['_', '_', '_', '_', '_', '_', '_', '_', 'X', '_']
['_', '_', '_', '_', '_', '_', '_', 'X', '_', '_']
['_', '_', '_', '_', '_', '_', '_', '_', '_', 'X']
V = 
[ 0.00339647 -0.49847159  0.007

ValueError: zero-size array to reduction operation maximum which has no identity