In [1]:
import random
import numpy as np
from collections import defaultdict

In [2]:
# Grid parameters
GRID_SIZE = 4
START_STATE = (0, 0)
TERMINAL_STATE = (3, 3)

# Each action is a (row, column) offset: a move changes exactly one index,
# so the agent stays either on the same row or on the same column.
ACTION_MAP = {
    'UP': (-1, 0),
    'DOWN': (1, 0),
    'LEFT': (0, -1),
    'RIGHT': (0, 1),
}

# Action names, in the insertion order of ACTION_MAP.
ACTIONS = list(ACTION_MAP)

In [3]:
class GridWorld:
    """Square grid environment with one start cell and one terminal cell.

    States are ``(row, column)`` tuples. Moves are looked up in the
    module-level ``ACTION_MAP``; a move that would leave the grid is clamped
    back onto the nearest edge cell. Every step taken from a non-terminal
    state yields a reward of -1.
    """

    def __init__(self, size, start, terminal):
        """Store the grid configuration and place the agent on ``start``.

        Args:
            size: Side length of the grid; valid indices are ``0..size-1``.
            start: ``(row, column)`` state returned by ``reset``.
            terminal: ``(row, column)`` state that ends an episode.
        """
        self.size = size
        self.start = start
        self.terminal = terminal
        self.state = start

    def reset(self):
        """Move the agent back to the start state and return that state."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Args:
            action: A key of ``ACTION_MAP`` ('UP', 'DOWN', 'LEFT', 'RIGHT').

        Returns:
            A tuple ``(next_state, reward, done)``. If the agent is already on
            the terminal state the state is unchanged, the reward is 0 and
            ``done`` is True. Otherwise the reward is -1 and ``done`` tells
            whether the state just entered is the terminal one.

        Raises:
            KeyError: If ``action`` is not a key of ``ACTION_MAP``.
        """
        if self.state == self.terminal:
            return self.state, 0, True  # Reward = 0 once the episode is over

        # Compute the candidate position after the move
        delta = ACTION_MAP[action]
        row = self.state[0] + delta[0]
        col = self.state[1] + delta[1]

        # Keep the agent within limits: clamp the *candidate* position
        # (clamping the current state would leave the agent stuck in place).
        last = self.size - 1
        next_state = (max(0, min(row, last)), max(0, min(col, last)))

        # Reward = -1 for every step taken from a non-terminal state
        reward = -1
        # The episode ends when the state we just entered is the terminal one.
        done = next_state == self.terminal
        self.state = next_state
        return next_state, reward, done

In [4]:
class MonteCarlo:
    """First-visit Monte Carlo state-value estimation under a random policy.

    Attributes are plain ``defaultdict``s keyed by state:
    ``state_values`` holds the running mean return per state and
    ``visit_counts`` holds how many episodes contributed to that mean.
    """

    def __init__(self, env, gamma=0.99):
        """Bind the agent to an environment.

        Args:
            env: Object exposing ``reset()`` and ``step(action)`` where
                ``step`` returns ``(next_state, reward, done)``.
            gamma: Discount factor applied to future rewards.
        """
        self.env = env
        self.gamma = gamma
        self.state_values = defaultdict(float)
        self.visit_counts = defaultdict(int)

    def generate_episode(self):
        """Roll out one episode with a uniformly random policy.

        Returns:
            A list of ``(state, action, reward)`` tuples, where ``state`` is
            the state the action was taken from.
        """
        episode = []
        state = self.env.reset()
        done = False

        while not done:
            action = random.choice(ACTIONS)  # random policy
            next_state, reward, done = self.env.step(action)
            episode.append((state, action, reward))
            state = next_state

        return episode

    def update_values(self, episode):
        """Fold one episode into the value estimates (first-visit MC).

        For each state, only the return following its first occurrence in the
        episode is used, and the estimate is updated as an incremental mean:
        ``V(s) += (G - V(s)) / N(s)``.

        Args:
            episode: List of ``(state, action, reward)`` tuples as produced by
                ``generate_episode``. States must be hashable.
        """
        # Index of the first occurrence of every state in the episode. Returns
        # are accumulated backwards, so a "seen" set filled during the reverse
        # scan would select the *last* visit instead of the first.
        first_visit = {}
        for t, (state, _, _) in enumerate(episode):
            first_visit.setdefault(state, t)

        G = 0
        for t in reversed(range(len(episode))):
            state, _, reward = episode[t]
            G = reward + self.gamma * G  # discounted return from step t

            if first_visit[state] == t:
                self.visit_counts[state] += 1
                # Incremental mean: the whole error term is divided by N.
                self.state_values[state] += (
                    (G - self.state_values[state]) / self.visit_counts[state]
                )

In [5]:
# Seed the stdlib RNG (used by the random policy) so that Restart & Run All
# reproduces the same episodes and therefore the same value estimates.
SEED = 42
random.seed(SEED)

# Environment and agent shared by the training and reporting cells below.
env = GridWorld(size=GRID_SIZE, start=START_STATE, terminal=TERMINAL_STATE)
agent = MonteCarlo(env)

In [None]:
# Training: sample random-policy episodes and fold each one into the agent's
# value estimates.
num_episodes = 100
for _ in range(num_episodes):
    agent.update_values(agent.generate_episode())

In [None]:
# Report the estimated value of every cell, one grid row per output line.
print("Values states :")
for row in range(GRID_SIZE):
    cells = [
        f"{(row, col)}: {agent.state_values[(row, col)]:.2f}"
        for col in range(GRID_SIZE)
    ]
    # Two spaces follow every entry (including the last), then a newline.
    print(*cells, sep="  ", end="  \n")