# Agent and Environment
We should first come up with a way to represent states and actions in the game. Each action simply maps to a move in one of the four directions. The current state of the agent is represented by the eight cells immediately surrounding it.

In [2]:
# Define the state space and the action space
# A state is the agent's 8 neighbouring cells; encode() maps each neighbour to
# one of three values (Wall, Dot, Empty), which gives 3**8 distinct states.
state_space = 3**8 # The number of possible states
action_space = 4 # The number of possible actions (up, down, left, right)

# Q-table
Now we initialize the Q-table with shape state_space × action_space, initially filled with zeros.

In [3]:
import numpy as np

# Initialize the Q-table with zeros: one row per encoded state, one column per action
Q = np.zeros((state_space, action_space))

def encode(state, isShortened=False):
    """
    Encode the cells around the agent as a base-3 integer.

    Each cell contributes one base-3 digit: 'W' -> 0, 'D' -> 1, 'E' -> 2.
    Any other cell value also contributes 0. Cells are folded in row-major
    order, so the top-left neighbour ends up as the most significant digit.

    Args:
        state (list(list(string))): either the whole playable area (the
            neighbours of every 'A' cell are encoded) or, when isShortened
            is set, a grid whose cell at row 1, column 1 is skipped.
        isShortened (bool): selects between the two input layouts above.

    Returns:
        int: the base-3 encoding of the neighbouring cells
    """
    digit_of = {'W': 0, 'D': 1, 'E': 2}
    code = 0

    # Compared with == on purpose so the accepted flag values stay unchanged.
    if isShortened == False:
        # Whole playable area: locate the agent and read its 8 neighbours.
        for r, line in enumerate(state):
            for c, cell in enumerate(line):
                if cell != 'A':
                    continue
                above = [state[r - 1][k] for k in (c - 1, c, c + 1)]
                beside = [line[c - 1], line[c + 1]]
                below = [state[r + 1][k] for k in (c - 1, c, c + 1)]
                for neighbour in above + beside + below:
                    code = code * 3 + digit_of.get(neighbour, 0)
        return code

    # Shortened input: every cell except the one at (1, 1) is a digit.
    for r, line in enumerate(state):
        for c, cell in enumerate(line):
            if (r, c) == (1, 1):
                continue
            code = code * 3 + digit_of.get(cell, 0)
    return code

def decode(encoding):
    """
    Rebuild the 3x3 neighbourhood grid from a base-3 encoding.

    Digits are consumed from least significant to most significant and
    written from the bottom-right cell back to the top-left cell, skipping
    the centre, which always holds 'A'. Digit 0 -> 'W', 1 -> 'D', 2 -> 'E'.

    Args:
        encoding (int): a value in the format produced by encode()

    Returns:
        list(list(string)): a 3x3 grid with 'A' in the centre
    """
    symbol_of = {0: 'W', 1: 'D', 2: 'E'}
    grid = [['A'] * 3 for _ in range(3)]

    # Reverse row-major order, without the centre cell.
    positions = [(r, c) for r in (2, 1, 0) for c in (2, 1, 0) if (r, c) != (1, 1)]
    for r, c in positions:
        encoding, digit = divmod(encoding, 3)
        grid[r][c] = symbol_of.get(digit, 'A')

    return grid

# Parameters
Before training the model, we define our parameters.

In [4]:
# Define the learning rate, the discount factor, and the maximum number of episodes
learning_rate = 0.2 # How much to update the Q-value (Alpha)
discount_factor = 0.9 # How much to discount the future reward (Gamma)
max_episodes = 1000 # How many episodes to train the agent
max_iteration = 1000 # Maximum number of steps per episode
# NOTE: get_next_action() takes a random move when random.random() > epsilon,
# so epsilon acts as the probability of taking the greedy (best-known) move,
# not as the probability of exploring.
epsilon = 0.1 # Probability of taking the greedy move (see note above)
# TODO: make epsilon change gradually as a function of max_episodes

In [5]:
import random

def get_possible_reward(state, action):
    """
    Compute the immediate reward for taking an action from a state.

    The destination cell is looked up in the decoded 3x3 neighbourhood:
    a dot is worth +10, an empty cell costs 2, anything else yields 0.

    Args:
        state (int_encoding): an encoding of the current state of the agent
        action (string): the planned action: 'up', 'down', 'left' or 'right'

    Returns:
        int: the reward obtained by taking the action
    """
    # Position of the destination cell inside the 3x3 neighbourhood.
    destination_of = {
        'up': (0, 1),
        'down': (2, 1),
        'right': (1, 2),
        'left': (1, 0),
    }

    target = decode(state)
    if action in destination_of:
        r, c = destination_of[action]
        target = target[r][c]

    # NOTE: the destination is expected never to be a wall.
    if target == 'D':
        return 10
    if target == 'E':
        return -2
    return 0

def move_next_state(environment, action):
    """
    Apply an action to the environment by moving the agent one cell.

    The first 'A' found (row 0 and column 0 are not searched) is replaced
    by 'E' and the neighbouring cell in the direction of the action becomes
    'A'. The environment is modified in place.

    Args:
        environment (list(list(string))): the current environment grid
        action (string): the action to take: 'up', 'down', 'left' or 'right'

    Returns:
        list(list(string)): the same environment object after the move,
        or None when no agent cell was found
    """
    offsets = {
        'up': (-1, 0),
        'down': (1, 0),
        'right': (0, 1),
        'left': (0, -1),
    }

    for r in range(1, len(environment)):
        line = environment[r]
        for c in range(1, len(line)):
            if line[c] != 'A':
                continue

            # Clear the cell the agent is leaving.
            line[c] = 'E'

            # Place the agent on its destination; an unrecognised action
            # leaves the grid without an agent.
            step = offsets.get(action)
            if step is not None:
                dr, dc = step
                environment[r + dr][c + dc] = 'A'

            return environment

    # No agent on the grid.
    return None

def get_next_action(state):
    """
    Choose the next action for the agent in the given state.

    Only moves whose destination cell is not a wall are considered. A
    uniformly random legal move is returned when random.random() > epsilon;
    otherwise the legal move with the highest Q-value is returned. Note that
    this makes the module-level `epsilon` the probability of the greedy
    choice (a value above 1 forces the greedy move every time).

    Ties between Q-values are resolved in the order up, down, left, right.

    Args:
        state (int_encoding): the encoding of the current state of the agent

    Returns:
        string: (random/best) action to take in the current state

    Raises:
        RuntimeError: when all four neighbouring cells are walls
    """
    neighbourhood = decode(state)
    q_row = Q[state]

    # (action, destination cell, Q-value), listed in Q-table column order.
    candidates = (
        ('up', neighbourhood[0][1], q_row[0]),
        ('down', neighbourhood[2][1], q_row[1]),
        ('left', neighbourhood[1][0], q_row[2]),
        ('right', neighbourhood[1][2], q_row[3]),
    )
    legal = [(name, score) for name, cell, score in candidates if cell != 'W']

    # Fail the same way in both branches instead of letting the random
    # branch die with an opaque error from the random module.
    if not legal:
        raise RuntimeError("THE AGENT HAS GOTTEN STUCK BETWEEN 4 WALLS", neighbourhood)

    # RANDOM ACTION
    if random.random() > epsilon:
        return random.choice(legal)[0]

    # BEST ACTION: max() keeps the first of equally scored moves.
    return max(legal, key=lambda move: move[1])[0]
    
def update_Q(prev_state, prev_action, reward, new_state):
    """
    Apply one Q-learning update to the module-level Q table.

    The entry for (prev_state, prev_action) becomes
    (1 - learning_rate) * old + learning_rate * (reward + discount_factor * best),
    where best is the largest Q-value stored for new_state.

    Args:
        prev_state (int_encoding): an encoding of the state the agent was in
        prev_action (string): the action the agent took: 'up', 'down',
            'left' or 'right'
        reward (int): the immediate reward gathered by taking that action
        new_state (int_encoding): an encoding of the state the agent is in now
    """
    column_of = {'up': 0, 'down': 1, 'left': 2, 'right': 3}
    # Anything that is not a known action name is used as the column as-is.
    column = column_of.get(prev_action, prev_action)

    # NOTE(review): the maximum runs over all four columns, including
    # directions that may be blocked by a wall in new_state.
    future = Q[new_state]
    best_future = future[3]
    for candidate in (future[0], future[1], future[2]):
        best_future = max(candidate, best_future)

    old_value = Q[prev_state][column]
    target = reward + (discount_factor * best_future)
    Q[prev_state][column] = ((1 - learning_rate) * old_value) + (learning_rate * target)

In [6]:
import pygame
from pygame.locals import *
def test_game(environment):
    """
    Play the agent on the built-in map and draw every step with pygame.

    The loop runs until no dots are left or the window is closed. Each step
    picks an action with epsilon set to 1.1, applies it, updates the Q table
    and redraws the board, then waits one second.

    Args:
        environment (list(list(string))): ignored; the board is always reset
            to the fixed map defined inside this function
    """
    global epsilon

    pygame.init()

    # Window geometry: an 11 x 9 board of 100-pixel tiles.
    tile = 100
    columns = 11
    rows = 9
    window = pygame.display.set_mode((tile * columns, tile * rows))
    pygame.display.set_caption("Pacman")

    label_font = pygame.font.Font(None, 24)
    white = (255, 255, 255)
    black = (0, 0, 0)
    blue = (0, 0, 255)
    green = (0, 255, 0)
    # Walls are black, dots blue, the agent green; everything else is white.
    tile_colours = {'W': black, 'D': blue, 'A': green}

    environment = [
        ['W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W'],
        ['W', 'A', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'W'],
        ['W', 'D', 'W', 'W', 'W', 'D', 'W', 'W', 'W', 'D', 'W'],
        ['W', 'D', 'W', 'D', 'D', 'D', 'D', 'D', 'W', 'D', 'W'],
        ['W', 'D', 'D', 'D', 'W', 'E', 'W', 'D', 'D', 'D', 'W'],
        ['W', 'D', 'W', 'D', 'W', 'E', 'W', 'D', 'W', 'D', 'W'],
        ['W', 'D', 'W', 'D', 'D', 'W', 'D', 'D', 'W', 'D', 'W'],
        ['W', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'W'],
        ['W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W'],
    ]

    epsilon = 0.1
    remaining = sum(line.count('D') for line in environment)
    while remaining > 0:
        # Dots are counted before the move of this iteration.
        remaining = sum(line.count('D') for line in environment)
        epsilon = 1.1

        # Choose an action, score it, apply it and learn from it.
        current_state = encode(environment)
        action = get_next_action(current_state)
        reward = get_possible_reward(current_state, action)
        environment = move_next_state(environment, action)
        update_Q(current_state, action, reward, encode(environment))

        window.fill(black)

        # Stop as soon as the window is closed.
        if any(event.type == pygame.QUIT for event in pygame.event.get()):
            break

        # Render the map, one rectangle per cell.
        for x in range(rows):
            for y in range(columns):
                colour = tile_colours.get(environment[x][y], white)
                area = pygame.Rect(y * tile, x * tile, tile, tile)
                pygame.draw.rect(window, colour, area)

        score_text = label_font.render(f"Current Move Score: {reward}", True, white)
        window.blit(score_text, (10, 10))

        # Update the display and pause so each move is visible.
        pygame.display.flip()
        pygame.time.delay(1000)
        

pygame 2.5.2 (SDL 2.28.3, Python 3.11.5)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [None]:
# Train the agent: every episode starts from a fresh copy of the map, because
# move_next_state() modifies the grid in place.
for episode in range(max_episodes):
    environment = [['W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W'],
               ['W', 'A', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'W'],
               ['W', 'D', 'W', 'W', 'W', 'D', 'W', 'W', 'W', 'D', 'W'],
               ['W', 'D', 'W', 'D', 'D', 'D', 'D', 'D', 'W', 'D', 'W'],
               ['W', 'D', 'D', 'D', 'W', 'E', 'W', 'D', 'D', 'D', 'W'],
               ['W', 'D', 'W', 'D', 'W', 'E', 'W', 'D', 'W', 'D', 'W'],
               ['W', 'D', 'W', 'D', 'D', 'W', 'D', 'D', 'W', 'D', 'W'],
               ['W', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'D', 'W'],
               ['W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W', 'W']]

    # epsilon restarts at 0.1 each episode and grows by 0.001 per step (capped
    # at 0.9). No `global` statement is needed: this code is at module level.
    epsilon = 0.1
    # The loop variable is deliberately not called `list`, which would rebind
    # the builtin for the rest of the session.
    dot_count = sum(row.count('D') for row in environment)
    stepcount = 0
    while (dot_count > 0 and stepcount < max_iteration):
        stepcount += 1
        # Debug hook: only reachable when max_episodes is larger than 9999.
        if episode == 9999:
            epsilon = 1.1
            print("IN MAIN:")
            print(*environment, sep="\n")
            print(*decode(encode(environment)), sep="\n")
        current_state = encode(environment)
        # Choose action
        action = get_next_action(current_state)
        # Get Reward
        reward = get_possible_reward(current_state, action)
        # Update Environment
        environment = move_next_state(environment, action)
        # Update Q
        update_Q(current_state, action, reward, encode(environment))
        # Recount after the move so the episode stops as soon as the last dot
        # is eaten instead of taking one extra step.
        dot_count = sum(row.count('D') for row in environment)
        epsilon = min(epsilon + 0.001, 0.9)
# Number of steps taken in the final episode
print(stepcount)
test_game(environment)
