In [None]:
import pickle

import numpy as np
import tensorflow as tf

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
# Grid dimensions (number of columns / rows in the environment).
env_col = 21
env_row = 21

# Tabular Q-learning hyper-parameters used by generate_q_table().
# NOTE: get_next_action() takes the greedy action when random() < epsilon, so
# epsilon is the probability of *exploiting*, not of exploring.
epsilon = 0.8
discount_factor = 0.9
learning_rate = 0.1

# Number of lidar readings fed to the network. generate_memory() samples the
# stored scan with a stride of 360 // lidar_sample_size
# (presumably the scan holds 360 readings, one per degree - confirm against the data file).
lidar_sample_size = 360 // 15 # Should always be an integer

# Action names; the index into this list is the action id used everywhere else.
actions = ['up', 'down', 'left', 'right']

# Obstacle cells as [row, col] pairs: three straight walls on rows 3, 8 and 14.
obstacles = (
    [[3, col] for col in range(0, 10)]
    + [[8, col] for col in range(9, 21)]
    + [[14, col] for col in range(0, 10)]
)

In [None]:
# Reward grid: every step costs -1, stepping onto an obstacle cell costs -100.
# generate_q_table() temporarily writes +100 into the target cell of this grid.
rewards = np.full(shape=(env_row, env_col), fill_value=-1)
for x, y in obstacles:
    rewards[x][y] = -100

In [None]:
def create_model(input_shape, num_actions):
    """Build and compile the fully connected Q-value regression network.

    Args:
        input_shape: shape tuple of one input sample, passed to ``Input``.
        num_actions: width of the linear output layer (one Q-value per action).

    Returns:
        A compiled ``Sequential`` model with ReLU hidden layers of 25, 100 and
        25 units, trained with MSE loss and Adam (learning rate 0.001).
    """
    hidden_units = (25, 100, 25)

    layers = [Input(shape=input_shape)]
    layers += [Dense(units, activation='relu') for units in hidden_units]
    # Linear head: Q-values are unbounded regression targets.
    layers.append(Dense(num_actions, activation='linear'))

    model = Sequential(layers)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return model

In [None]:
def get_starting_location():
    """Return a uniformly random ``(row, col)`` cell that is not an obstacle.

    Candidates are drawn (row first, then column) and rejected until one falls
    outside the ``obstacles`` list.
    """
    while True:
        row = np.random.randint(env_row)
        col = np.random.randint(env_col)
        # obstacles stores [row, col] lists, so compare against a list
        if [row, col] not in obstacles:
            return row, col

In [None]:
def is_final_state(row, col, goal_x, goal_y):
    """Return True when ``(row, col)`` is exactly the goal cell ``(goal_x, goal_y)``.

    Note that ``goal_x`` is compared against the row and ``goal_y`` against the
    column.
    """
    return bool(row == goal_x and col == goal_y)

In [None]:
def get_next_action(q_values, current_row_index, current_column_index, epsilon):
    """Pick an action index for the given cell with an epsilon-style policy.

    Note the convention used in this notebook: ``epsilon`` is the probability
    of taking the *greedy* action (not the exploration probability).

    Args:
        q_values: array indexed as ``[row, col, action]``.
        current_row_index: row of the current cell.
        current_column_index: column of the current cell.
        epsilon: probability of choosing the action with the highest Q-value.

    Returns:
        The index of the chosen action.
    """
    # With probability epsilon exploit the best known action for this state.
    if np.random.random() < epsilon:
        return np.argmax(q_values[current_row_index, current_column_index])
    # Otherwise explore uniformly over every action the table has, rather than
    # a hard-coded 4, so tables with a different action count are handled.
    return np.random.randint(np.shape(q_values)[-1])

In [None]:
def get_next_location(current_row_index, current_column_index, action_index):
    """Return the cell reached by applying ``actions[action_index]``.

    'up' increases the row index and 'down' decreases it; 'right' increases
    the column index and 'left' decreases it. A move that would leave the
    grid is ignored and the current cell is returned unchanged.
    """
    move = actions[action_index]
    row, col = current_row_index, current_column_index

    if move == 'up':
        if row < env_row - 1:
            row += 1
    elif move == 'down':
        if row > 0:
            row -= 1
    elif move == 'right':
        if col < env_col - 1:
            col += 1
    elif move == 'left':
        if col > 0:
            col -= 1

    return row, col

In [None]:
def generate_q_table(target):
    """Learn a tabular Q-function for reaching ``target`` and return it.

    Runs 1000 Q-learning episodes from random non-obstacle start cells. While
    training, the shared ``rewards`` grid has +100 written into the target
    cell; the previous value is put back before returning, even if training
    is interrupted, so later calls never see a stale goal reward.

    Args:
        target: ``[row, col]`` (or tuple) of the goal cell.

    Returns:
        Array of shape ``(env_row, env_col, len(actions))`` holding Q-values.
    """
    target_row, target_col = target[0], target[1]

    # Remember what was in the goal cell so it can be restored exactly
    # instead of assuming it was a plain -1 step cost.
    original_reward = rewards[target_row, target_col]
    rewards[target_row, target_col] = 100

    q_values = np.zeros((env_row, env_col, len(actions)))

    try:
        for episode in range(1000):
            # get the starting location for this episode
            row_index, column_index = get_starting_location()
            # Keep moving until the target is reached. Obstacles are only
            # penalised through the reward grid; they do not end the episode.
            while not is_final_state(row_index, column_index, target_row, target_col):
                # choose which action to take (i.e., where to move next)
                action_index = get_next_action(q_values, row_index, column_index, epsilon)
                # store the old position, then move to the next location
                old_row_index, old_column_index = row_index, column_index
                row_index, column_index = get_next_location(row_index, column_index, action_index)
                # receive the reward for the new state and compute the temporal difference
                reward = rewards[row_index, column_index]
                old_q_value = q_values[old_row_index, old_column_index, action_index]
                temporal_difference = reward + (discount_factor * np.max(q_values[row_index, column_index])) - old_q_value
                # update the Q-value for the previous state and action pair
                new_q_value = old_q_value + (learning_rate * temporal_difference)
                q_values[old_row_index, old_column_index, action_index] = new_q_value
    finally:
        # Always undo the temporary goal reward on the shared grid.
        rewards[target_row, target_col] = original_reward

    return q_values

In [None]:
# Load the cached per-target Q-tables, or build and cache them when the cache
# file is missing or unreadable. q_tables maps a target cell (row, col) to the
# Q-table learned for reaching that target.
try:
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # q_tables.bin that this notebook produced itself.
    with open("q_tables.bin", "rb") as filee:
        q_tables = pickle.load(filee)
# Only a missing, truncated or corrupt cache triggers regeneration; a bare
# `except:` would also swallow KeyboardInterrupt and unrelated bugs.
except (FileNotFoundError, EOFError, pickle.UnpicklingError): # Takes about 1 minute to generate q_tables for 21x21 grid
    print("No q_tables.bin found, initializing new q_tables")
    q_tables = {}
    for i in range(env_row):
        for j in range(env_col):
            if [i, j] not in obstacles:
                q_tables[(i, j)] = generate_q_table([i, j])

    # The context manager guarantees the cache is flushed and closed.
    with open("q_tables.bin", "wb") as filee:
        pickle.dump(q_tables, filee)
    print("Q tables initialized and saved to q_tables.bin")

In [None]:
# Load the precomputed lidar scans; generate_memory() indexes this object by a
# (row, col) state tuple.
try:
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # trusted lidar_data_21x21.bin.
    with open("lidar_data_21x21.bin", "rb") as lidar_file:
        lidar_data = pickle.load(lidar_file)
# Only a missing file is reported as "not found"; any other failure (corrupt
# data, interrupt) now surfaces instead of being mislabelled.
except FileNotFoundError:
    # lidar_data stays undefined here, so generate_memory() will fail later.
    print("lidar_data_21x21.bin not found")

In [None]:
# Resume from a previously saved network when one can be loaded; otherwise
# start from a freshly initialised model.
try:
    model = tf.keras.models.load_model(f'models/ql_fed_dqn_with_lidar_{lidar_sample_size}.h5')
    print('Loaded model from disk')
# Deliberately broad (the loader's error type varies by Keras version), but
# `Exception` rather than a bare except so Ctrl-C / kernel interrupts are not
# turned into "create a new model".
except Exception:
    print('No model found, creating new one')
    # Input width: 4 position values (state row/col, target row/col) plus the
    # sampled lidar readings - matches the concatenation done in train()/test().
    model = create_model((int(4 + lidar_sample_size),), len(actions))

In [None]:
def generate_memory(batch_size):
    """Sample ``batch_size`` supervised examples from the precomputed tables.

    Each example is a tuple ``(state, target, q_values, lidar_values)`` where
    ``state`` and ``target`` are distinct random non-obstacle cells,
    ``q_values`` is the Q-table row of ``state`` for that ``target``, and
    ``lidar_values`` is the strided lidar scan of ``state`` with infinite
    readings replaced by 15.

    Returns:
        A list of ``batch_size`` such tuples.
    """
    samples = []
    for _ in range(batch_size):
        state = get_starting_location()
        # Redraw the target until it differs from the start cell.
        target = get_starting_location()
        while target == state:
            target = get_starting_location()

        q_values = q_tables[target][state[0]][state[1]]

        # Keep every stride-th reading so the scan is reduced to the
        # configured sample count.
        stride = int(360//lidar_sample_size)
        readings = lidar_data[state][::stride]

        # Replace infinite readings with the finite stand-in value 15.
        lidar_values = tuple(
            15 if reading == float('inf') else reading
            for reading in readings
        )

        samples.append((state, target, q_values, lidar_values))
    return samples

In [None]:
def train(batch_size, epoch=1):
    """Fit the global ``model`` on one freshly sampled batch.

    Args:
        batch_size: number of examples drawn via ``generate_memory``.
        epoch: number of epochs to fit on that batch.
    """
    batch = generate_memory(batch_size)

    # Split the (state, target, q_values, lidar_values) tuples into one
    # array per field.
    states, targets, q_targets, lidar = (
        np.array([sample[field] for sample in batch]) for field in range(4)
    )

    # Network input = state coordinates, target coordinates, lidar readings.
    features = np.concatenate((states, targets, lidar), axis=1)

    # Regress the network outputs onto the tabular Q-values.
    model.fit(features, q_targets, epochs=epoch, verbose=0)

In [None]:
def test(test_size):
    """Estimate how often the network picks the same action as the Q-tables.

    Draws ``test_size`` random examples, predicts Q-values for all of them in
    a single batched call, and compares the argmax action of the prediction
    with the argmax action of the tabular Q-values.

    Args:
        test_size: number of examples to evaluate.

    Returns:
        Fraction (0.0 - 1.0) of examples where both argmax actions agree.
    """
    # Generate a random batch of experiences
    mem = generate_memory(test_size)
    state = np.array([sample[0] for sample in mem])
    target = np.array([sample[1] for sample in mem])
    q_values = np.array([sample[2] for sample in mem])
    lidar_values = np.array([sample[3] for sample in mem])

    # Network input = state coordinates, target coordinates, lidar readings.
    inputt = np.concatenate((state, target, lidar_values), axis=1)

    # One batched forward pass instead of a separate predict() call per
    # sample, which carries a large fixed overhead on every call.
    q_values_pred = model.predict(inputt, verbose=0)

    predicted_actions = np.argmax(q_values_pred, axis=1)
    expected_actions = np.argmax(q_values, axis=1)
    correct_action = int(np.sum(predicted_actions == expected_actions))

    # Return the accuracy
    return correct_action / test_size

In [None]:
# Main training loop: every round draws 10000 fresh samples and fits the
# network on them for 30 epochs.
max_episodes = 500
for i in range(max_episodes):
    train(10000, 30)
    if i % 25:
        continue
    # Every 25th round: report accuracy on 50 samples and checkpoint the model.
    print(f"{i}/{max_episodes}: {test(50)}")
    model.save(f'models/ql_fed_dqn_with_lidar_{lidar_sample_size}.h5')

# Persist the final weights once training is finished.
model.save(f'models/ql_fed_dqn_with_lidar_{lidar_sample_size}.h5')

In [None]:
# Final check: fraction of 100 random samples where the network's best action
# matches the Q-table's best action (displayed as the cell output).
test(100)