In [1]:
import numpy as np
import random

## parameters

In [2]:
# --- Training schedule ---
num_episodes = 1000            # number of episodes run by the training loop
max_steps_per_episode = 100    # step cap so an episode ends even if the goal is never reached

# --- Q-learning update ---
learning_rate = 0.1            # step size (alpha) applied to each Q-value correction
discount_rate = 0.99           # discount factor (gamma) applied to the next state's best Q-value

# --- Epsilon-greedy exploration ---
exploration_rate = 1.0         # current epsilon; starts fully exploratory and is decayed per episode
max_exploration_rate = 1.0     # epsilon at episode 0
min_exploration_rate = 0.01    # floor that epsilon decays towards
exploration_decay_rate = 0.001 # exponential decay constant, per episode

# --- Reproducibility ---
# Training draws from both the stdlib RNG and numpy's legacy global RNG, so
# both are seeded here to make a fresh Restart & Run All repeatable.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

## environmental set-up

In [3]:
# Action encoding used throughout the notebook: 0=up, 1=right, 2=down, 3=left.
action_space_size = 4
# One state per cell of the assumed 4x4 grid.
state_space_size = 16
# Q-table: one row per state, one column per action; every estimate starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

## rewards set-up

In [4]:
# Reward vector indexed by state; the training loop looks up the entry of the
# state the agent lands in. Every state pays 0 except the goal.
rewards = np.zeros(state_space_size, dtype=float)
rewards[15] = 100.0  # goal state

## helper functions

In [5]:
def get_next_state(state, action, grid_size=4):
    """Return the state reached by taking ``action`` from ``state``.

    States are numbered row-major on a ``grid_size`` x ``grid_size`` grid,
    i.e. ``state == row * grid_size + col``. A move that would leave the grid
    is ignored, as is any action code outside 0-3; in both cases the input
    state is returned unchanged.

    Args:
        state: Index of the current state.
        action: Action code: 0=up, 1=right, 2=down, 3=left.
        grid_size: Side length of the square grid. Defaults to 4, which
            reproduces the original hard-coded 4x4 behaviour.

    Returns:
        Index of the resulting state.
    """
    row, col = divmod(state, grid_size)
    last = grid_size - 1  # largest valid row/column index

    if action == 0 and row > 0:          # Up
        row -= 1
    elif action == 1 and col < last:     # Right
        col += 1
    elif action == 2 and row < last:     # Down
        row += 1
    elif action == 3 and col > 0:        # Left
        col -= 1
    return row * grid_size + col

## Q learning algorithm

In [6]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# The goal is the only terminal state; it must match the rewarded entry in `rewards`.
goal_state = 15

for episode in range(num_episodes):
    # np.random.randint excludes its upper bound, so this draws a start state
    # from 0 .. state_space_size - 2 and never begins an episode on the last
    # state (the goal).
    state = np.random.randint(0, state_space_size - 1)
    done = False

    for step in range(max_steps_per_episode):
        # Exploration-exploitation trade-off: act randomly with probability
        # exploration_rate, otherwise take the best-known action for this state.
        if random.uniform(0, 1) < exploration_rate:
            action = np.random.randint(action_space_size)  # Explore
        else:
            action = np.argmax(q_table[state, :])          # Exploit

        # Take action and observe reward and next state
        new_state = get_next_state(state, action)
        reward = rewards[new_state]
        done = new_state == goal_state

        # TD target: the terminal state has no future return, so do not
        # bootstrap from its Q-row (that row is never updated and stays zero,
        # so this equals the plain max while making the intent explicit).
        future_value = 0.0 if done else np.max(q_table[new_state, :])
        td_target = reward + discount_rate * future_value

        # Update Q-table: move the estimate a fraction learning_rate towards the target.
        q_table[state, action] = q_table[state, action] + learning_rate * (
            td_target - q_table[state, action]
        )

        state = new_state  # Transition to next state

        if done:  # Goal reached: end the episode
            break

    # Decay exploration rate exponentially from max towards min as episodes progress
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

## Output table

In [7]:
# Show the learned action values: one row per state, one column per action.
print("Trained Q-table:", q_table, sep="\n")

Trained Q-table:
[[ 93.43416364  95.09874983  95.0561567   94.02932763]
 [ 95.06553942  96.05960098  96.03791914  93.94434586]
 [ 96.05489684  97.02784086  97.0299      95.09195475]
 [ 96.98972644  97.01823599  98.00999999  96.00863178]
 [ 94.03185513  95.97330536  96.05960042  95.01799281]
 [ 95.06246094  97.02989997  97.02186634  95.08459314]
 [ 96.05958389  98.00999878  98.01        96.05955016]
 [ 97.02954427  98.00996767  99.          97.02981783]
 [ 95.0795032   97.0299      97.02716665  96.05565288]
 [ 96.05781661  98.01        98.00973942  96.05942383]
 [ 97.02989998  99.          98.99999997  97.02989975]
 [ 98.00999999  98.99999995 100.          98.00999995]
 [ 96.04336613  98.00999977  97.0241132   96.99592822]
 [ 97.02816579  99.          98.00908358  97.02807474]
 [ 98.00985287 100.          98.99993894  98.00883666]
 [  0.           0.           0.           0.        ]]


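Environment and Q-Table Initialization: The environment is assumed to be a simple 4x4 grid (16 states). The goal is the bottom-right corner (state 15), and moving into it gives a reward of 100; every other state gives 0. The Q-table starts with all entries set to zero.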

Q-Learning Algorithm:
For each episode, the agent starts in a random non-goal state (states 0–14).
It selects actions based on an epsilon-greedy policy (it explores with probability epsilon and exploits with probability 1 - epsilon).
The Q-value update rule is applied after each action.

Exploration Decay: The exploration rate gradually decreases with each episode to encourage the agent to exploit learned knowledge over time.