In [1]:
import numpy as np

In [2]:
class Game():
    def __init__(self, grid, start_state=(0, 0)):
        self.grid = grid
        self.rows, self.cols = grid.shape
        self.start_state = start_state
        self.num_turns = 0
        
    def get_reward(self):
        """
        return reward obtained in current state
        """
        if self.grid[self.state[0]][self.state[1]] == 1:
            return 1
        else:
            return -1
    
    def is_valid(self, state):
        """
        checks if this is a valid state(move)
        """
        if state[0] < 0 or state[0] >= self.rows:
            # outside row bound
            return False
        if state[1] < 0 or state[1] >= self.cols:
            # outside column bound
            return False
        if self.grid[state[0]][state[1]] == np.nan:
            # wall
            return False
        return True
    
    def next_state(self, action):
        """
        update state according to action taken
        """
        if action == "up":
            next_state = (self.state[0]-1, self.state[1])
        elif action == "down":
            next_state = (self.state[0]+1, self.state[1])
        elif action == "right":
            next_state = (self.state[0], self.state[1]+1)
        elif action == "left":
            next_state = (self.state[0], self.state[1]-1)
        else:
            raise ValueError("Invalid action!")
        if self.is_valid(next_state):
            return next_state
        else:
            return self.state
        
    def reset(self):
        """
        reset states
        """
        self.state = self.start_state
        self.num_turns = 0
    
    def is_end(self):
        """
        Returns true if the game has come to an end
        """
        val = self.grid[self.state[0]][self.state[1]]
        return val == 1 or self.num_turns == 1000

In [3]:
class Agent():
    def __init__(self, game):
        self.game = game
        self.lr = 0.2
        self.explore_rate = 0.2
        self.gamma = 0.95
        self.actions = ["up", "down", "left", "right"]
        
        self.state_values = {}
        for i in range(self.game.rows):
            for j in range(self.game.cols):
                self.state_values[(i, j)] = 0
    
    def select_action(self):
        if np.random.uniform(0, 1) <= self.explore_rate:
            action = np.random.choice(self.actions)
        else:
            max_reward = -np.inf
            action = ""
            for a in self.actions:
                exp_reward = self.state_values[self.game.next_state(a)]
                if exp_reward >= max_reward:
                    max_reward = exp_reward
                    action = a
        return action
    
    
    def play_episode(self, max_iter=1000):
        self.game.reset()
        it = 0
        episode_states = []
        episode_states.append(self.game.state)
        while True:
            action = self.select_action()
            self.game.state = self.game.next_state(action)
            self.game.num_turns += 1
            episode_states.append(self.game.state)
            
            if self.game.is_end():
                reward = self.game.get_reward()
                print('Final reward:', reward)
                for state in reversed(episode_states):
                    self.state_values[state] += self.lr * (reward-self.state_values[state])
                    reward *= self.gamma
                return reward, len(episode_states)
    
    def play(self, episodes=1000):
        for _ in range(episodes):
            print(self.play_episode())
            self.explore_rate *= 0.95
        

In [5]:
N = np.nan
grid = np.array([
    [0, 0, 0, 0, 0, 0], 
    [0, N, 0, 0, 1, 0], 
    [0, 0, 0, N, 0, 0], 
    [0, 0, 0, 1, 0, 0]
])
game = Game(grid)
agent = Agent(game)
agent.play(1000)

Final reward: 1
(0.5403600876626365, 12)
Final reward: 1
(0.6983372960937497, 7)
Final reward: 1
(0.0694428401872336, 52)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.6634204312890623, 8)
Final reward: 1
(0.6634204312890623, 8)
Final reward: 1
(0.6983372960937497, 7)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.6983372960937497, 7)
Final reward: 1
(0.6983372960937497, 7)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.6983372960937497, 7)
Final reward: 1
(0.7350918906249998, 6

Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)
Final reward: 1
(0.7350918906249998, 6)


In [17]:
5/np.inf

0.0