In [None]:
import timeit

import matplotlib
import numpy as np
from matplotlib import colors
from matplotlib import pyplot as plt
from matplotlib.table import Table
from tqdm import tqdm

# The style must be applied after pyplot is imported; calling it earlier
# raises NameError on a fresh kernel because `plt` is not bound yet.
plt.style.use("ggplot")

# Hyperparameters.
# epsilon is the exploration rate: get_next_action takes a random action with
# this probability and the greedy (highest Q-value) action otherwise.
epsilon = 0.1
discount_factor = 0.9 #discount factor for future rewards
learning_rate = 0.9 #step size applied to each temporal-difference update
num_episode = 1000 #episode count (train() takes its own num_episode argument)

# The position of a name in this list is the action index used for the last
# axis of q_values, so the action count is derived from it to keep them in sync.
actions = ['up', 'right', 'down', 'left']
NUM_ACTION = len(actions)

# Setup environment
env_row = 4
env_column = 5
q_values = np.zeros((env_row, env_column, NUM_ACTION))

# Every move costs -1 by default. The bottom row holds the start cell (first
# column), a run of -100 penalty cells, and the +1000 cell in the last column.
env_reward = np.full((env_row, env_column), -1)
bottom_row = env_row - 1
last_column = env_column - 1
env_reward[bottom_row, 1:last_column] = -100
env_reward[bottom_row, 0] = -1 # start cell keeps the default step cost
env_reward[bottom_row, last_column] = 1000
print(env_reward)

In [None]:
def is_terminal_state(current_row_index, current_column_index):
  """Return True when the given cell ends an episode.

  Every cell of the bottom row except the one in the first column is
  terminal; all other cells are not.

  Args:
    current_row_index: row of the cell, 0 <= row < env_row.
    current_column_index: column of the cell, 0 <= column < env_column.

  Returns:
    bool: True for a terminal cell, False otherwise.

  Raises:
    AssertionError: if the location lies outside the grid.
  """
  # Check the lower bounds too: a negative index would otherwise be accepted
  # here and then silently wrap around when used to index a NumPy array.
  assert 0 <= current_row_index < env_row and 0 <= current_column_index < env_column # Illegal location
  # The upper column bound is already guaranteed by the assertion above.
  return bool(current_row_index == env_row - 1 and current_column_index > 0)

def get_starting_location():
  """Return the (row, column) where every episode begins: bottom row, first column."""
  start_row = env_row - 1
  start_column = 0
  return start_row, start_column

def get_next_action(current_row_index, current_column_index, epsilon):
  """Choose an action index for the given cell with an epsilon-greedy policy.

  Args:
    current_row_index: row of the current cell.
    current_column_index: column of the current cell.
    epsilon: probability of exploring, i.e. of returning a uniformly random
      action index instead of the one with the highest Q-value.

  Returns:
    An integer action index in the range [0, NUM_ACTION).
  """
  # np.random.random() samples from [0.0, 1.0). Using >= makes epsilon = 0
  # strictly greedy (a draw of exactly 0.0 no longer explores) and
  # epsilon = 1 strictly random.
  if np.random.random() >= epsilon:
    return np.argmax(q_values[current_row_index, current_column_index])
  # Explore: draw from the configured number of actions, not a literal 4.
  return np.random.randint(NUM_ACTION)

def get_next_location(current_row_index, current_column_index, action_index):
  """Return the (row, column) reached by taking an action from a cell.

  A move that would leave the grid is ignored, so the returned location is
  then the same as the current one.
  """
  move = actions[action_index]
  row, column = current_row_index, current_column_index

  if move == 'up':
    if row > 0:
      row = row - 1
  elif move == 'right':
    if column < env_column - 1:
      column = column + 1
  elif move == 'down':
    if row < env_row - 1:
      row = row + 1
  elif move == 'left':
    if column > 0:
      column = column - 1

  return row, column

def get_shortest_path():
    """Follow the greedy policy from the start cell and return the visited cells.

    Starting at get_starting_location(), the highest-valued action is taken
    repeatedly (epsilon = 0) until is_terminal_state() reports a terminal cell.
    Note that terminal does not necessarily mean the +1000 cell: any terminal
    cell ends the walk.

    The greedy policy is deterministic for a fixed Q-table, so arriving at a
    cell that is already on the path (including staying put after bumping
    into a wall) means the walk would repeat forever. In that case a message
    is printed and the path collected so far is returned.

    Returns:
        list of [row, column] pairs, beginning with the start cell.
    """
    current_row_index, current_column_index = get_starting_location()
    shortest_path = [[current_row_index, current_column_index]]
    visited = {(current_row_index, current_column_index)}
    #continue moving along the path until we reach a terminal state
    while not is_terminal_state(current_row_index, current_column_index):
      #get the best action to take
      action_index = get_next_action(current_row_index, current_column_index, epsilon = 0)
      #move to the next location on the path
      current_row_index, current_column_index = get_next_location(current_row_index, current_column_index, action_index)
      if (current_row_index, current_column_index) in visited:
        #the greedy policy revisits a cell: stop instead of looping forever
        print(f'Greedy policy loops back to {[current_row_index, current_column_index]}; stopping without reaching a terminal state')
        break
      visited.add((current_row_index, current_column_index))
      #add the new location to the list
      shortest_path.append([current_row_index, current_column_index])
      print([current_row_index, current_column_index], action_index)
    return shortest_path

In [None]:
def train(num_episode, epsilon, discount_factor, learning_rate):
  """Run Q-learning episodes, updating the module-level q_values in place.

  Each episode starts at get_starting_location() and runs until
  is_terminal_state() is true. The elapsed wall-clock time is printed in
  milliseconds once all episodes are done.

  Args:
    num_episode: number of episodes to run.
    epsilon: exploration rate forwarded to get_next_action.
    discount_factor: weight given to the best Q-value of the next cell.
    learning_rate: fraction of the temporal difference applied per update.
  """
  started_at = timeit.default_timer()

  for _ in range(num_episode):
    row, column = get_starting_location()

    # Step until the episode reaches a terminal cell.
    while not is_terminal_state(row, column):
      action = get_next_action(row, column, epsilon)
      next_row, next_column = get_next_location(row, column, action)

      # Reward is collected for the cell that was entered.
      reward = env_reward[next_row, next_column]
      current_q = q_values[row, column, action]
      best_next_q = np.max(q_values[next_row, next_column])
      temporal_difference = reward + (discount_factor * best_next_q) - current_q

      # Move the Q-value of the (cell, action) just taken toward the target.
      q_values[row, column, action] = current_q + (learning_rate * temporal_difference)

      row, column = next_row, next_column

  finished_at = timeit.default_timer()
  print(f'Training complete in: {(finished_at - started_at)*1000} ms')

In [None]:
# Train the agent, then show the learned Q-table rounded to two decimals.
# NOTE(review): these hyperparameters differ from the module-level epsilon /
# discount_factor / learning_rate / num_episode constants, which are never
# passed to train() - confirm which set is intended.
train(
    num_episode=100,
    epsilon=0.1,
    discount_factor=0.99,
    learning_rate=0.01,
)
print(np.round(q_values, 2))
# Index of the greedy action at cell (3, 0), the start cell.
np.argmax(q_values[3, 0])

In [None]:
# Walk the greedy policy and mark every cell on the path with 1 on a zero grid.
path = get_shortest_path()
simulation = np.zeros((env_row, env_column))
for row_index, column_index in path:
  simulation[row_index, column_index] = 1
print(simulation)

In [None]:
# Show the Q-table rounded to two decimals, then the greedy action index at
# cell (3, 0), the start cell.
print(np.round(q_values, 2))
np.argmax(q_values[3, 0])