In [27]:
!nvcc --version
# import torch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device)

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_03:03:05_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [28]:
import numpy as np

class Grid:
    def __init__(self, n=5):
        # Parameters
        self.n = n  # Grid size n x n
        self.actions = ['up', 'right', 'down', 'left']  # Actions the agent can take
        self.action_space = len(self.actions)
        self.B = (n - 1, n - 1) # Define the target location (B)
        
        # Initialize Q-table: (agent_row, agent_col, package_row, package_col, carrying_flag, action) 
        self.q_values = np.zeros((n, n, n, n, 2, self.action_space))
    
    # Function to check if the state is terminal (i.e., package delivered)
    def is_terminal_state(self, agent_row, agent_col, carrying):
        return (agent_row, agent_col) == self.B and carrying
    
    # Function to choose a random, non-terminal starting location for the agent and package
    def get_starting_locations(self):
        agent_row = np.random.randint(self.n)
        agent_col = np.random.randint(self.n)
        package_row = np.random.randint(self.n)
        package_col = np.random.randint(self.n)
        while (agent_row, agent_col) == self.B or (package_row, package_col) == self.B:
            agent_row = np.random.randint(self.n)
            package_row = np.random.randint(self.n)
            package_col = np.random.randint(self.n)
        return agent_row, agent_col, package_row, package_col

    # Function to get the next location based on the chosen action
    def get_next_location(self, agent_row, agent_col, action_index):
        new_row, new_col = agent_row, agent_col
        if self.actions[action_index] == 'up' and agent_row > 0:
            new_row -= 1
        elif self.actions[action_index] == 'right' and agent_col < self.n - 1:
            new_col += 1
        elif self.actions[action_index] == 'down' and agent_row < self.n - 1:
            new_row += 1
        elif self.actions[action_index] == 'left' and agent_col > 0:
            new_col -= 1
        return new_row, new_col

In [29]:
class QLearningAgent:
    def __init__(self, grid_env, epsilon=0.9, discount_factor=0.9, learning_rate=0.1):
        # Epsilon-greedy algorithm for choosing the next action
        self.grid_env = grid_env 
        self.epsilon = epsilon # Epsilon for epsilon-greedy strategy
        self.discount_factor = discount_factor # Discount factor for future rewards
        self.learning_rate = learning_rate  # Learning rate
    
    # Epsilon-greedy algorithm for choosing the next action
    def get_next_action(self, agent_row, agent_col, package_row, package_col, carrying):
        if np.random.random() < self.epsilon:
            return np.argmax(self.grid_env.q_values[agent_row, agent_col, package_row, package_col, carrying])
        else:
            return np.random.randint(self.grid_env.action_space)
    
    # Function to update the Q-values during training
    def update_q_values(self, old_state, action_index, reward, new_state):
        old_q_value = self.grid_env.q_values[old_state][action_index]
        temporal_difference = reward + (self.discount_factor * np.max(self.grid_env.q_values[new_state])) - old_q_value
        self.grid_env.q_values[old_state][action_index] = old_q_value + (self.learning_rate * temporal_difference)

In [30]:
class QLearningTraining:
    def __init__(self, grid_env, agent, num_episodes=100000, max_steps_per_episode=200):
        self.grid_env = grid_env
        self.agent = agent
        # Training parameters
        self.num_episodes = num_episodes # Number of training episodes
        self.max_steps_per_episode = max_steps_per_episode # Limit the steps per episode
    
    def train(self):
        # Training loop
        for episode in range(self.num_episodes):
            
            # Initialize starting locations
            agent_row, agent_col, package_row, package_col = self.grid_env.get_starting_locations()
            carrying = 0  # Agent starts without carrying the package
            
            for step in range(self.max_steps_per_episode):
                
                # Choose action
                action_index = self.agent.get_next_action(agent_row, agent_col, package_row, package_col, carrying)
                
                # Get next location
                new_agent_row, new_agent_col = self.grid_env.get_next_location(agent_row, agent_col, action_index)
                
                # Determine the reward and update carrying status
                if (new_agent_row, new_agent_col) == (package_row, package_col) and not carrying:
                    reward = 20  # pickup_reward
                    carrying = 1  # Now the agent is carrying the package
                elif (new_agent_row, new_agent_col) == self.grid_env.B and carrying:
                    reward = 80  # delivery_reward
                else:
                    reward = -1  # move_reward
                
                # Update Q-values
                old_state = (agent_row, agent_col, package_row, package_col, carrying)
                new_state = (new_agent_row, new_agent_col, package_row, package_col, carrying)
                self.agent.update_q_values(old_state, action_index, reward, new_state)
                
                # Transition to the new state
                agent_row, agent_col = new_agent_row, new_agent_col
                
                # Check if the task is complete
                if self.grid_env.is_terminal_state(agent_row, agent_col, carrying):
                    break
        
        print('Training complete!')


In [31]:
# Test the agent's performance after training
class QLearningTester:
    def __init__(self, grid_env, agent, max_steps_per_episode=200):
        self.grid_env = grid_env
        self.agent = agent
        self.max_steps_per_episode = max_steps_per_episode
    
    def test(self, num_tests=10):
        success_count = 0
        for _ in range(num_tests):
            agent_row, agent_col, package_row, package_col = self.grid_env.get_starting_locations()
            carrying = 0
            path = [(agent_row, agent_col)]
            for step in range(self.max_steps_per_episode):
                action_index = self.agent.get_next_action(agent_row, agent_col, package_row, package_col, 1)
                agent_row, agent_col = self.grid_env.get_next_location(agent_row, agent_col, action_index)
                path.append((agent_row, agent_col))
                if (agent_row, agent_col) == (package_row, package_col) and not carrying:
                    carrying = 1
                if self.grid_env.is_terminal_state(agent_row, agent_col, carrying):
                    success_count += 1
                    print('Success')
                    break
            print(f'Path taken by agent for package location: {(package_row, package_col)} - ')
            print(path)
        print(f'Success rate: {success_count}/{num_tests}')

In [33]:
# Instantiate classes and run training/testing
grid_env = Grid()
agent = QLearningAgent(grid_env)
training = QLearningTraining(grid_env, agent)
training.train()

tester = QLearningTester(grid_env, agent)
tester.test()


Training complete!
Success
Path taken by agent for package location: (0, 0) - 
[(1, 0), (0, 0), (0, 1), (1, 1), (1, 2), (1, 3), (2, 3), (3, 3), (3, 4), (4, 4)]
Path taken by agent for package location: (1, 2) - 
[(3, 2), (3, 3), (3, 4), (4, 4), (3, 4), (4, 4), (4, 3), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (3, 3), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (4, 4), (3, 4), (4, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (4, 3), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4, 4), (3, 4), (4,