In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# Actions
ACTIONS = ['Up', 'Down', 'Left', 'Right']
ACTION_IDX = {action: i for i, action in enumerate(ACTIONS)}

In [3]:
# Parameters
alpha = 0.1  
gamma = 0.9  
epsilon = 0.3  
n_episodes = 25
grid_size = 100

In [4]:
# Rewards
green_cell = 100  
blue_cell = 50  
white_cell = -5  
orange_cell = -15
black_cell = 0  

In [5]:
# Initialize the grid with white cells
Grid = np.full((grid_size, grid_size), white_cell)

In [6]:
# Set obstacles, penalties, goal, and start positions (fixed for consistency)
num_black_cells = random.randint(10, 20)
num_orange_cells = random.randint(10, 20)

In [7]:
for _ in range(num_black_cells):
    x, y = random.randint(0, grid_size - 1), random.randint(0, grid_size - 1)
    Grid[x, y] = black_cell

for _ in range(num_orange_cells):
    x, y = random.randint(0, grid_size - 1), random.randint(0, grid_size - 1)
    if Grid[x, y] == white_cell:
        Grid[x, y] = orange_cell

In [8]:
green_x, green_y = random.randint(0, grid_size - 1), random.randint(0, grid_size - 1)
Grid[green_x, green_y] = green_cell

blue_x, blue_y = random.randint(0, grid_size - 1), random.randint(0, grid_size - 1)
Grid[blue_x, blue_y] = blue_cell

In [9]:
# Initialize Q-table using defaultdict
Q = defaultdict(lambda: np.zeros(len(ACTIONS), dtype=np.float32))

In [10]:
def get_reward(state):
    x, y = state
    return Grid[x, y] if 0 <= x < grid_size and 0 <= y < grid_size else black_cell

def is_valid_state(state):
    x, y = state
    return 0 <= x < grid_size and 0 <= y < grid_size and Grid[x, y] != black_cell


In [11]:
episode_rewards = []

In [12]:
for episode in range(n_episodes):
    state = (blue_x, blue_y)
    total_reward = 0

    while True:
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = random.choice(ACTIONS)
        else:
            action = ACTIONS[np.argmax(Q[state])]

        # Next state calculation
        x, y = state
        moves = {'Up': (-1, 0), 'Down': (1, 0), 'Left': (0, -1), 'Right': (0, 1)}
        next_state = (x + moves[action][0], y + moves[action][1])

        if not is_valid_state(next_state):
            next_state = state
        else:
            # 80% intended move, 20% other valid moves
            if random.random() >= 0.8:
                valid_moves = [a for a in ACTIONS if a != action and is_valid_state((x + moves[a][0], y + moves[a][1]))]
                next_state = (x + moves[random.choice(valid_moves)][0], y + moves[random.choice(valid_moves)][1]) if valid_moves else state

        # Get reward and update Q-value
        reward = get_reward(next_state)
        total_reward += reward
        Q[state][ACTION_IDX[action]] += alpha * (reward + gamma * np.max(Q[next_state]) - Q[state][ACTION_IDX[action]])

        # Transition to the next state
        state = next_state

        # End if goal state is reached
        if Grid[state] == green_cell:
            break

    episode_rewards.append(total_reward)

In [13]:
# Policy extraction
policy = np.full((grid_size, grid_size), -1, dtype=int)
for x in range(grid_size):
    for y in range(grid_size):
        if (x, y) in Q:
            policy[x, y] = np.argmax(Q[(x, y)])


In [14]:
print("Learned Policy (0: UP, 1: DOWN, 2: LEFT, 3: RIGHT):")
print(policy)

Learned Policy (0: UP, 1: DOWN, 2: LEFT, 3: RIGHT):
[[1 2 3 ... 1 2 1]
 [1 1 2 ... 1 2 1]
 [3 0 2 ... 1 0 2]
 ...
 [3 2 0 ... 2 1 3]
 [2 3 0 ... 3 3 3]
 [0 2 1 ... 2 2 3]]


In [15]:
for i in episode_rewards:
    print(i)

-369375
-116040
-57595
-82425
54900
2011190
13467830
12123650
5557845
27089710
70657870
27686400
179269190
133574500
400009785
2074602480
248633295
352035485
1604873340
169397170
1628535855
172854550
1822820320
775288885
1692090985
