In [2]:
import numpy as np
import random

In [3]:
size = 100
grid = np.zeros((size,size))
start = (0,0)
goal = (size-1,size-1)
grid[goal] = 10

In [4]:
num_obstacles = int(0.3*size*size)
for i in range(num_obstacles):
    x,y = random.randint(0,size-1),random.randint(0,size-1)
    if (x,y) != start and (x,y) != goal:
        grid[x,y] = -1

In [5]:
grid

array([[ 0.,  0.,  0., ...,  0.,  0., -1.],
       [ 0.,  0., -1., ...,  0.,  0., -1.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.],
       ...,
       [ 0.,  0.,  0., ..., -1.,  0., -1.],
       [ 0.,  0., -1., ..., -1.,  0.,  0.],
       [ 0., -1.,  0., ...,  0., -1., 10.]])

In [6]:
def state_valid(state):
    x, y = state
    return 0 <= x < size and 0 <= y < size and grid[x, y] != -1
def get_reward(state):
    return 10 if state == goal else -1 if grid[state] == -1 else 0.1
def nxt_state(state, action):
        x, y = state
        if action == 'up':
            x -= 1
        elif action == 'down':
            x += 1
        elif action == 'left':
            y -= 1
        elif action == 'right':
            y += 1
        next_state = (x, y)
        return next_state if state_valid(next_state) else state

In [7]:
gamma = 0.8
theta = 0.01
value_table = np.zeros((size,size))
actions = ['up','down','left','right']

In [8]:
 while True:
     delta = 0
     new_value_table = np.copy(value_table)
     for x in range(size):
         for y in range(size):
             state = (x, y)
             if not state_valid(state):
                 continue
             if state == goal:
                 continue
                    
             max_value = float('-inf')
             for action in actions:
                 next_state = nxt_state(state, action)
                 reward = get_reward(next_state)
                 value = reward + gamma * value_table[next_state]
                 max_value = max(max_value, value)
                 new_value_table[state] = max_value
                 delta = max(delta, abs(new_value_table[state] - value_table[state]))
         value_table = new_value_table
     if delta < theta:
        break

In [9]:
delta

0.00858993459199997

In [10]:
value_table

array([[ 0.49999708,  0.4999988 ,  0.49999904, ...,  0.5       ,
         0.5       ,  0.        ],
       [ 0.4999988 ,  0.49999961,  0.        , ...,  0.5       ,
         0.5       ,  0.        ],
       [ 0.49999951,  0.49999984,  0.49999995, ...,  0.5       ,
         0.5       ,  0.        ],
       ...,
       [ 0.5       ,  0.5       ,  0.5       , ...,  0.        ,
         6.58      ,  0.        ],
       [ 0.5       ,  0.5       ,  0.        , ...,  0.        ,
         8.1       , 10.        ],
       [ 0.5       ,  0.        ,  0.49998885, ...,  0.63690943,
         0.        ,  0.        ]])

In [14]:
policy = {}
actions = ['up', 'down', 'left', 'right']
for x in range(size):
    for y in range(size):
        state = (x, y)
        if not state_valid(state) or state == goal:
            policy[state] = None
            continue
                
        best_action = None
        max_value = float('-inf')
        for action in actions:
            next_state = nxt_state(state, action)
            reward = get_reward(next_state)
            value = reward + gamma * value_table[next_state]
            if value > max_value:
                max_value = value
                best_action = action
                
        policy[state] = best_action

def get_policy():
    return policy

In [15]:
get_policy()

{(0, 0): 'down',
 (0, 1): 'down',
 (0, 2): 'right',
 (0, 3): 'down',
 (0, 4): 'down',
 (0, 5): 'left',
 (0, 6): None,
 (0, 7): 'up',
 (0, 8): None,
 (0, 9): 'down',
 (0, 10): 'down',
 (0, 11): None,
 (0, 12): 'down',
 (0, 13): None,
 (0, 14): 'down',
 (0, 15): None,
 (0, 16): 'down',
 (0, 17): 'down',
 (0, 18): None,
 (0, 19): 'up',
 (0, 20): 'up',
 (0, 21): None,
 (0, 22): 'down',
 (0, 23): 'down',
 (0, 24): 'down',
 (0, 25): 'left',
 (0, 26): 'down',
 (0, 27): 'down',
 (0, 28): 'right',
 (0, 29): 'down',
 (0, 30): 'down',
 (0, 31): None,
 (0, 32): 'up',
 (0, 33): None,
 (0, 34): 'down',
 (0, 35): 'right',
 (0, 36): 'down',
 (0, 37): None,
 (0, 38): 'down',
 (0, 39): None,
 (0, 40): 'right',
 (0, 41): 'right',
 (0, 42): 'right',
 (0, 43): 'down',
 (0, 44): None,
 (0, 45): None,
 (0, 46): 'right',
 (0, 47): 'down',
 (0, 48): 'down',
 (0, 49): 'down',
 (0, 50): None,
 (0, 51): 'down',
 (0, 52): None,
 (0, 53): 'down',
 (0, 54): None,
 (0, 55): 'down',
 (0, 56): 'down',
 (0, 57): 'down',

In [19]:
import numpy as np
import random

class GridWorld:
    def __init__(self, size=100, obstacle_ratio=0.2):
        self.size = size
        self.grid = np.zeros((size, size))
        
        self.start = (0, 0)
        self.goal = (size - 1, size - 1)
        self.grid[self.goal] = 10 
        
        num_obstacles = int(obstacle_ratio * size * size)
        for _ in range(num_obstacles):
            x, y = random.randint(0, size - 1), random.randint(0, size - 1)
            if (x, y) != self.start and (x, y) != self.goal:
                self.grid[x, y] = -1

    def is_valid(self, state):
        x, y = state
        return 0 <= x < self.size and 0 <= y < self.size and self.grid[x, y] != -1

    def get_reward(self, state):
        return 10 if state == self.goal else -1 if self.grid[state] == -1 else -0.1

    def get_next_state(self, state, action):
        x, y = state
        if action == 'up':
            x -= 1
        elif action == 'down':
            x += 1
        elif action == 'left':
            y -= 1
        elif action == 'right':
            y += 1
        next_state = (x, y)
        return next_state if self.is_valid(next_state) else state

grid_world = GridWorld(size=100, obstacle_ratio=0.2)

# Now, define the QLearningAgent class as you have it
class QLearningAgent:
    def __init__(self, grid_world, alpha=0.1, gamma=0.9, epsilon=0.1, episodes=1000):
        self.env = grid_world
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.episodes = episodes
        self.q_table = np.zeros((self.env.size, self.env.size, 4)) 
        self.actions = ['up', 'down', 'left', 'right']
        self.action_map = {a: i for i, a in enumerate(self.actions)}

    def choose_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(self.actions)
        else:
            x, y = state
            return self.actions[np.argmax(self.q_table[x, y])]

    def learn(self):
        for episode in range(self.episodes):
            state = self.env.start
            while state != self.env.goal:
                action = self.choose_action(state)
                next_state = self.env.get_next_state(state, action)
                reward = self.env.get_reward(next_state)
                
                # Update Q-table
                x, y = state
                action_index = self.action_map[action]
                best_next_action = np.max(self.q_table[next_state[0], next_state[1]])
                
                self.q_table[x, y, action_index] += self.alpha * (
                    reward + self.gamma * best_next_action - self.q_table[x, y, action_index]
                )
                
                state = next_state

q_agent = QLearningAgent(grid_world, episodes=1000)
q_agent.learn()

In [20]:
q_policy = np.argmax(q_agent.q_table, axis=2)

In [21]:
q_policy

array([[1, 1, 1, ..., 2, 2, 0],
       [2, 0, 3, ..., 0, 3, 1],
       [2, 0, 0, ..., 2, 3, 1],
       ...,
       [1, 2, 0, ..., 0, 0, 1],
       [2, 0, 2, ..., 0, 3, 1],
       [0, 0, 2, ..., 0, 1, 0]])