In [None]:
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from matplotlib.table import Table
from matplotlib import colors
plt.style.use("ggplot")
import random
from tqdm import tqdm
import timeit

# Hyper-parameters.
epsilon = 0.1  # probability of taking a random move; the greedy move is taken otherwise (see move_to_next_state)
discount_factor = 0.9  # weight given to the value of the next state
learning_rate = 0.9  # step size of each value update
num_episode = 1000

NUM_ACTION = 4
actions = ['up', 'right', 'down', 'left']

# Grid world: every cell costs -1, except the bottom-row cells strictly
# between the first and last column, which cost -100.
env_row = 4
env_column = 5
env_reward = np.full((env_row, env_column), -1)
env_reward[-1, 1:-1] = -100

# Value table starts at zero; the -100 cells are seeded with -100 so that a
# greedy comparison of neighbouring values steers away from them immediately.
state_values = np.zeros((env_row, env_column))
state_values[-1, 1:-1] = -100

print(env_reward)
print(state_values)
# print()
# state_values[2, 2] = 1
# a = state_values[2][2]
# a = 2
# print(state_values)
# for i in range (10):
#   print(explore(3, 0))


In [None]:
def exploit(current_row_index, current_column_index):
  """Greedy step: return the in-bounds neighbour with the highest state value.

  Neighbours are considered in the fixed order up, down, left, right. When
  several neighbours share the highest value, the earliest one in that order
  is returned (``max`` keeps the first maximal item).

  Reads the module-level ``state_values``, ``env_row`` and ``env_column``.

  Args:
    current_row_index: row of the cell the agent is standing on.
    current_column_index: column of the cell the agent is standing on.

  Returns:
    ``[row, column]`` of the chosen neighbouring cell.

  Raises:
    ValueError: if the cell has no in-bounds neighbour (e.g. a 1x1 grid).
  """
  row, col = current_row_index, current_column_index

  # Collect the legal destinations; the order fixes the tie-break.
  neighbours = []
  if row > 0:
    neighbours.append([row - 1, col])  # up
  if row < env_row - 1:
    neighbours.append([row + 1, col])  # down
  if col > 0:
    neighbours.append([row, col - 1])  # left
  if col < env_column - 1:
    neighbours.append([row, col + 1])  # right

  if not neighbours:
    raise ValueError(
      f'no legal move from ({row}, {col}) on a {env_row}x{env_column} grid')

  return max(neighbours, key=lambda cell: state_values[cell[0]][cell[1]])

def explore(current_row_index, current_column_index):
  """Exploratory step: return one in-bounds neighbour picked at random.

  Candidates are gathered in the order up, down, left, right and one of them
  is drawn with the standard-library ``random`` module.

  Reads the module-level ``env_row`` and ``env_column``.

  Returns:
    ``[row, column]`` of the chosen neighbouring cell.
  """
  row, col = current_row_index, current_column_index
  # (is the move allowed?, destination) for each direction.
  guarded_moves = (
    (row > 0, [row - 1, col]),
    (row < env_row - 1, [row + 1, col]),
    (col > 0, [row, col - 1]),
    (col < env_column - 1, [row, col + 1]),
  )
  legal_moves = [move for allowed, move in guarded_moves if allowed]
  return random.choice(legal_moves)

In [None]:
def get_starting_location():
  """Return the ``(row, column)`` every episode starts from: the bottom-left cell."""
  bottom_row = env_row - 1
  first_column = 0
  return (bottom_row, first_column)

def is_terminal_state(current_row_index, current_column_index):
  """Tell whether standing on the given cell ends the episode.

  A cell is terminal when it lies on the bottom row in any column other than
  column 0. That covers the -100 cells and the bottom-right cell (presumably
  the goal).

  Raises:
    AssertionError: if either index is at or beyond the grid size.
  """
  assert current_row_index < env_row and current_column_index < env_column # Illegal location
  on_bottom_row = current_row_index == env_row - 1
  return bool(on_bottom_row and 0 < current_column_index < env_column)

def move_to_next_state(current_row_index, current_column_index, epsilon):
  """Choose the next cell with an epsilon-greedy rule.

  A number is drawn from ``np.random.random()``; when it exceeds ``epsilon``
  the greedy ``exploit`` move is taken, otherwise a random ``explore`` move.
  ``epsilon`` is therefore the probability of exploring.

  Returns:
    Whatever the selected strategy returns for the current cell.
  """
  draw = np.random.random()
  strategy = exploit if draw > epsilon else explore
  return strategy(current_row_index, current_column_index)

In [None]:
def run(num_episode, epsilon, discount_factor, learning_rate):
  """Train the module-level ``state_values`` table with tabular TD(0).

  Each episode starts at ``get_starting_location()`` and follows
  ``move_to_next_state`` (epsilon-greedy) until ``is_terminal_state`` reports
  a terminal cell. For every transition ``s -> s'`` the value of ``s`` is
  moved towards ``env_reward[s'] + discount_factor * V(s')``. On the
  transition into the terminal cell the target is the reward alone, because
  a terminal cell has no successor.

  Args:
    num_episode: number of episodes to play.
    epsilon: exploration probability passed to ``move_to_next_state``.
    discount_factor: weight of the next state's value in the TD target.
    learning_rate: step size of each value update.

  Returns:
    A list with one entry per episode: the sum of ``env_reward`` over every
    cell visited in that episode, the start cell included.

  Side effects:
    Updates ``state_values`` in place and prints the elapsed wall-clock time.
  """
  start = timeit.default_timer()
  list_of_marks = list()
  for _ in range(num_episode):
    row_index, column_index = get_starting_location()
    new_row_index, new_column_index = move_to_next_state(row_index, column_index, epsilon)
    # The episode total counts the start cell's own reward in addition to
    # the reward of every cell entered afterwards.
    sum_of_reward = 0
    sum_of_reward += env_reward[row_index, column_index] + env_reward[new_row_index, new_column_index]

    # Keep stepping until the cell just chosen is terminal.
    while not is_terminal_state(new_row_index, new_column_index):
      old_row_index, old_column_index = row_index, column_index
      row_index, column_index = new_row_index, new_column_index

      # The next move is picked before the update below, so the greedy choice
      # still sees the previous cell's value from before this step.
      new_row_index, new_column_index = move_to_next_state(row_index, column_index, epsilon)

      # TD(0) update of the cell just left, using the reward of the cell entered.
      reward = env_reward[row_index, column_index]
      old_state_value = state_values[old_row_index, old_column_index]
      state_value = state_values[row_index, column_index]
      temporal_difference = reward + (discount_factor * state_value) - old_state_value
      state_values[old_row_index, old_column_index] = old_state_value + (learning_rate * temporal_difference)

      sum_of_reward += env_reward[new_row_index, new_column_index]

    # Final transition: (row_index, column_index) -> terminal cell. Without
    # this update the last non-terminal cell never learns what the terminal
    # move was worth. The terminal cell's own table entry is left untouched.
    terminal_reward = env_reward[new_row_index, new_column_index]
    last_state_value = state_values[row_index, column_index]
    state_values[row_index, column_index] = last_state_value + learning_rate * (terminal_reward - last_state_value)

    list_of_marks.append(sum_of_reward)

  end = timeit.default_timer()
  print(f'Running in: {(end - start)*1000} ms')
  return list_of_marks

In [None]:
# Train for a handful of episodes with explicit hyper-parameters.
episode = 10
list_of_marks = run(
  num_episode=episode,
  epsilon=0.1,
  discount_factor=0.9,
  learning_rate=0.8,
)

# Learned state values, rounded to two decimals for readability.
a = np.around(state_values, decimals=2)
print(pd.DataFrame(a))

# Total reward collected in each episode, as numbers and as a curve.
print(list_of_marks)
plt.plot(range(episode), list_of_marks)

In [None]:
# Show the rounded state-value grid `a` as a table; as the cell's last
# expression it is rendered as the cell output.
pd.DataFrame(a)