In [1]:
import numpy as np

### GridWorld Class

The `GridWorld` class represents the environment in which the Q-learning agent operates. It provides the functionality to initialize the grid, generate starting locations, determine the next location based on an action, and check terminal states.

#### Initialization

- `__init__(self, n=5)`: This constructor initializes the grid world with the following parameters:
  - `n`: Size of the grid. By default, the grid is `n x n` where `n=5`.
  - `self.B`: Coordinates of the base (goal location) set to `(n - 1, n - 1)`.
  - `self.actions`: List of possible actions the agent can take (`'up'`, `'right'`, `'down'`, `'left'`).
  - `self.action_space`: Number of possible actions, which is the length of `self.actions`.

#### Methods

- `get_starting_locations(self)`: 
  - Randomly generates starting locations for both the agent and the package, ensuring that neither the agent nor the package starts at the base location.
  - Returns the initial positions of the agent and package as `(agent_row, agent_col, package_row, package_col)`.

- `get_next_location(self, agent_row, agent_col, action_index)`:
  - Computes the next position of the agent based on the action taken.
  - Takes into account grid boundaries to prevent invalid moves.
  - Returns the new position `(new_row, new_col)`.

- `is_terminal_state(self, agent_row, agent_col, carrying)`:
  - Checks if the current state is terminal. The state is terminal if the agent is at the base location and is carrying the package.
  - Returns `True` if it is a terminal state, otherwise `False`.


In [2]:
class GridWorld:
    def __init__(self, n=5):
        self.n = n
        self.B = (n - 1, n - 1) 
        self.actions = ['up', 'right', 'down', 'left']
        self.action_space = len(self.actions)

    def get_starting_locations(self):
        agent_row = np.random.randint(self.n)
        agent_col = np.random.randint(self.n)
        package_row = np.random.randint(self.n)
        package_col = np.random.randint(self.n)
        while (agent_row, agent_col) == self.B or (package_row, package_col) == self.B:
            agent_row = np.random.randint(self.n)
            package_row = np.random.randint(self.n)
            package_col = np.random.randint(self.n)
        return agent_row, agent_col, package_row, package_col

    def get_next_location(self, agent_row, agent_col, action_index):
        new_row, new_col = agent_row, agent_col
        action = self.actions[action_index]
        if action == 'up' and agent_row > 0:
            new_row -= 1
        elif action == 'right' and agent_col < self.n - 1:
            new_col += 1
        elif action == 'down' and agent_row < self.n - 1:
            new_row += 1
        elif action == 'left' and agent_col > 0:
            new_col -= 1
        return new_row, new_col

    def is_terminal_state(self, agent_row, agent_col, carrying):
        return (agent_row, agent_col) == self.B and carrying

### QLearningAgent Class

The `QLearningAgent` class implements the Q-learning algorithm, which is used to train the agent to make decisions within the `GridWorld` environment. It manages Q-values, selects actions, and updates Q-values based on the agent's experiences.

#### Initialization

- `__init__(self, grid_world, learning_rate=0.1, discount_factor=0.95, epsilon=0.9)`:
  - Initializes the Q-learning agent with the following parameters:
    - `grid_world`: An instance of the `GridWorld` class.
    - `self.q_values`: Q-values for all state-action pairs, initialized randomly. The shape of this array is `(n, n, n, n, 2, action_space)`, where `2` corresponds to the carrying state (0 or 1).
    - `self.learning_rate`: Learning rate \( \alpha \) that controls how much new information overrides old values.
    - `self.discount_factor`: Discount factor \( \gamma \) which determines the importance of future rewards.
    - `self.epsilon`: Epsilon for the epsilon-greedy policy, balancing exploration and exploitation.
    - `self.rewards`: Dictionary of rewards for different actions (`'delivery'`, `'move'`, `'pickup'`).

#### Methods

- `get_next_action(self, agent_row, agent_col, package_row, package_col, carrying)`:
  - Selects the next action for the agent based on the epsilon-greedy policy.
  - With probability `epsilon`, chooses the action with the highest Q-value. Otherwise, selects a random action.
  - Returns the index of the selected action.

- `update_q_values(self, old_state, action_index, reward, new_state)`:
  - Updates the Q-values using the Q-learning update rule.
  - Computes the temporal difference and updates the Q-value for the given old state and action based on the reward received and the maximum Q-value for the new state.


In [3]:
class QLearningAgent:
    def __init__(self, grid_world, learning_rate=0.1, discount_factor=0.95, epsilon=0.9):
        self.grid_world = grid_world
        self.q_values = np.random.rand(grid_world.n, grid_world.n, grid_world.n, grid_world.n, 2, grid_world.action_space)
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.rewards = {
            'delivery': 80,
            'move': -1,
            'pickup': 20
        }

    def get_next_action(self, agent_row, agent_col, package_row, package_col, carrying):
        if np.random.random() < self.epsilon:
            return np.argmax(self.q_values[agent_row, agent_col, package_row, package_col, carrying])
        else:
            return np.random.randint(self.grid_world.action_space)

    def update_q_values(self, old_state, action_index, reward, new_state):
        old_q_value = self.q_values[old_state][action_index]
        temporal_difference = reward + (self.discount_factor * np.max(self.q_values[new_state])) - old_q_value
        self.q_values[old_state][action_index] = old_q_value + (self.learning_rate * temporal_difference)

### Training Class

The `Training` class is responsible for training the Q-learning agent within the `GridWorld` environment. This process involves running multiple episodes where the agent interacts with the environment, learns from the rewards, and updates its Q-values to improve performance.

#### Initialization

- `__init__(self, agent, grid_world, num_episodes=100000, max_steps_per_episode=200)`: This constructor initializes the training parameters:
  - `agent`: An instance of the `QLearningAgent` class.
  - `grid_world`: An instance of the `GridWorld` class.
  - `num_episodes`: The number of training episodes to run.
  - `max_steps_per_episode`: The maximum number of steps to take per episode.

#### Training Process

The `train(self)` method executes the training loop, which involves:

1. **Episode Loop**: Iterates through the specified number of episodes (`num_episodes`).
   - **Starting Locations**: For each episode, the agent and package start at random locations obtained from `grid_world.get_starting_locations()`. The agent starts without carrying the package (`carrying = 0`).

2. **Step Loop**: Within each episode, the agent takes actions for a maximum of `max_steps_per_episode` steps:
   - **Action Selection**: The agent selects an action based on its policy using `self.agent.get_next_action()`. This method implements the epsilon-greedy strategy, where the agent chooses the action with the highest Q-value with probability `epsilon`.
   - **State Transition**: The agent moves to a new location based on the selected action using `grid_world.get_next_location()`.

3. **Reward Calculation**: After moving, the agent receives a reward based on its new state:
   - **Pickup**: If the agent reaches the package location and is not carrying the package, it receives a `pickup` reward.
   - **Delivery**: If the agent reaches the base location while carrying the package, it receives a `delivery` reward.
   - **Move**: For all other moves, the agent receives a `move` penalty.

4. **Q-Value Update**: The Q-values are updated using the Q-learning update rule. The update involves the following:

   - Applies the Bellman equation for Q-learning:
    ```
    Q(s, a) = Q(s, a) + α * [r + γ * max_a'(Q(s', a')) - Q(s, a)]
    ```
    where:
    - `Q(s, a)` is the current Q-value.
    - `α` is the learning rate.
    - `r` is the reward.
    - `γ` is the discount factor.
    - `max_a'(Q(s', a'))` is the maximum Q-value for the next state `s'` over all possible actions.

   The `update_q_values()` method in the `QLearningAgent` class applies this formula to adjust the Q-values based on the observed transition from state \( s \) to \( s' \) and the reward \( r \).

5. **Terminal State Check**: If the agent reaches a terminal state, the episode ends early.


In [4]:
class Training:
    def __init__(self, agent, grid_world, num_episodes=100000, max_steps_per_episode=200):
        self.agent = agent
        self.grid_world = grid_world
        self.num_episodes = num_episodes
        self.max_steps_per_episode = max_steps_per_episode

    def train(self):
        for episode in range(self.num_episodes):
            agent_row, agent_col, package_row, package_col = self.grid_world.get_starting_locations()
            carrying = 0

            for step in range(self.max_steps_per_episode):
                action_index = self.agent.get_next_action(agent_row, agent_col, package_row, package_col, carrying)
                new_agent_row, new_agent_col = self.grid_world.get_next_location(agent_row, agent_col, action_index)

                if (new_agent_row, new_agent_col) == (package_row, package_col) and not carrying:
                    reward = self.agent.rewards['pickup']
                    carrying = 1
                elif (new_agent_row, new_agent_col) == self.grid_world.B and carrying:
                    reward = self.agent.rewards['delivery']
                else:
                    reward = self.agent.rewards['move']

                old_state = (agent_row, agent_col, package_row, package_col, carrying)
                new_state = (new_agent_row, new_agent_col, package_row, package_col, carrying)
                self.agent.update_q_values(old_state, action_index, reward, new_state)

                agent_row, agent_col = new_agent_row, new_agent_col

                if self.grid_world.is_terminal_state(agent_row, agent_col, carrying):
                    break
        print('Training complete!')

### Testing Class

The `Testing` class evaluates the performance of the trained Q-learning agent by testing it in the `GridWorld` environment and checking its success rate.

- `__init__(self, agent, grid_world, max_steps_per_episode=200)`: Initializes the testing with the agent, grid world, and maximum steps per episode.
- `test_agent(self, num_tests=10)`: Tests the agent over a specified number of trials and prints the success rate along with the paths taken during the tests.


In [5]:
class Testing:
    def __init__(self, agent, grid_world, max_steps_per_episode=200):
        self.agent = agent
        self.grid_world = grid_world
        self.max_steps_per_episode = max_steps_per_episode

    def test_agent(self, num_tests=10):
        success_count = 0
        for _ in range(num_tests):
            agent_row, agent_col, package_row, package_col = self.grid_world.get_starting_locations()
            carrying = 0
            path = [(agent_row, agent_col)]
            for step in range(self.max_steps_per_episode):
                action_index = self.agent.get_next_action(agent_row, agent_col, package_row, package_col, carrying)
                agent_row, agent_col = self.grid_world.get_next_location(agent_row, agent_col, action_index)
                path.append((agent_row, agent_col))
                if (agent_row, agent_col) == (package_row, package_col) and not carrying:
                    carrying = 1
                if self.grid_world.is_terminal_state(agent_row, agent_col, carrying):
                    success_count += 1
                    print('Success')
                    break
            print(f'Path taken by agent for package location: {(package_row, package_col)} - ')
            print(path)
        print(f'Success rate: {success_count}/{num_tests}')

In [6]:
grid_world = GridWorld()
agent = QLearningAgent(grid_world)
trainer = Training(agent, grid_world)
trainer.train()

tester = Testing(agent, grid_world)
tester.test_agent(num_tests=10)

Training complete!
Success
Path taken by agent for package location: (2, 4) - 
[(4, 0), (4, 1), (3, 1), (3, 2), (3, 3), (3, 4), (2, 4), (3, 4), (4, 4)]
Success
Path taken by agent for package location: (1, 4) - 
[(3, 1), (3, 2), (3, 3), (2, 3), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (0, 4), (1, 4), (2, 4), (1, 4), (2, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (1, 3), (1, 4), (2, 4), (1, 4), (2, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1, 4), (2, 4), (1,