In [169]:
import random
import numpy as np

# 1. Criação do grid

In [170]:
def randomCell(lines=8, cols=8):
  """Draw a uniformly random cell coordinate inside a grid.

  Args:
    lines: number of grid lines (rows); defaults to 8.
    cols: number of grid columns; defaults to 8.

  Returns:
    A two-element list ``[line, col]`` with ``0 <= line < lines`` and
    ``0 <= col < cols``.

  Raises:
    ValueError: propagated from ``random.randint`` when ``lines`` or ``cols``
      is smaller than 1.
  """
  # randint is inclusive on both ends, hence the "- 1"
  line = random.randint(0, lines - 1)
  col = random.randint(0, cols - 1)
  return [line, col]

In [171]:
def gridInit(lines=8, cols=8):
  """Build the grid world and place the four special cells on it.

  Every cell is a dict holding one quality value per action (``valueUp``,
  ``valueDown``, ``valueLeft``, ``valueRight``, all starting at 0) and a
  ``type`` marker (``"-"`` for an ordinary cell).

  The start (``"S"``), end (``"E"``), mountain (``"M"``) and quicksand
  (``"Q"``) cells are sampled WITHOUT replacement, so they never share a
  position. Drawing them independently allowed one marker to overwrite
  another (e.g. a grid with no end cell).

  Args:
    lines: number of grid lines (rows); defaults to 8.
    cols: number of grid columns; defaults to 8.

  Returns:
    A tuple ``(grid, randomStart, randomEnd, randomMountain, randomQuicksand)``
    where ``grid`` is a list of ``lines`` rows with ``cols`` cell dicts each and
    every position is a ``[line, col]`` list.

  Raises:
    ValueError: propagated from ``random.sample`` when the grid has fewer
      than four cells.
  """
  # each cell gets its own dict so updating one cell never affects another
  grid = [
      [
          {
              "valueUp": 0,
              "valueDown": 0,
              "valueLeft": 0,
              "valueRight": 0,
              "type": "-"
          }
          for _ in range(cols)
      ]
      for _ in range(lines)
  ]

  # placing the random cells: four distinct coordinates
  allCells = [[line, col] for line in range(lines) for col in range(cols)]
  randomStart, randomEnd, randomMountain, randomQuicksand = random.sample(allCells, 4)

  markers = (
      (randomStart, "S"),
      (randomEnd, "E"),
      (randomMountain, "M"),
      (randomQuicksand, "Q"),
  )
  for position, cellType in markers:
    grid[position[0]][position[1]]["type"] = cellType

  return grid, randomStart, randomEnd, randomMountain, randomQuicksand

# 2. Print do grid

In [172]:
def gridPrint(grid):
  """Print the grid, one text line per grid row.

  Each cell contributes its ``"type"`` marker followed by a single space, so
  every printed line ends with a trailing space.

  The dimensions are taken from ``grid`` itself instead of being fixed at
  8x8, so grids of any size are printed in full.

  Args:
    grid: list of rows, each row a list of cell dicts with a ``"type"`` key.

  Returns:
    None. The grid is written to standard output.
  """
  for row in grid:
    print("".join(cell["type"] + " " for cell in row))

# 3. Conjunto de ações que o agente pode realizar
O ambiente assume que o agente não pode tomar a decisão de ir para fora do grid ou mover-se para uma posição de montanha. Essas ações não estarão disponíveis para escolha dependendo de sua posição.
Se, por acaso, o número de montanhas aumentar, a função `availableActions()` contempla mais de uma montanha por grid.
A função não contempla grids diferentes de 8x8, o código teria que ser editado para isso.

In [173]:
def availableActions(agentPosition, mountainPosition):
  """Tell which of the four moves are allowed from ``agentPosition``.

  A move is forbidden when it would leave the 8x8 grid or step onto a
  mountain cell. A mountain only blocks a move when it sits on the cell
  DIRECTLY adjacent in that direction (same column for up/down, same line
  for left/right). Comparing a single coordinate would wrongly block moves
  for mountains anywhere in the neighbouring line or column.

  Args:
    agentPosition: ``[line, col]`` of the agent.
    mountainPosition: either a single ``[line, col]`` position or a
      list/tuple of such positions when the grid has several mountains.
      An empty sequence means "no mountains".

  Returns:
    A tuple of booleans ``(up, down, left, right)``.
  """
  # positions have the form [line, col]
  agentLine = agentPosition[0]
  agentCol = agentPosition[1]

  # grid borders (the grid is assumed to be 8x8: indices 0..7)
  up = agentLine != 0
  down = agentLine != 7
  left = agentCol != 0
  right = agentCol != 7

  # accept one mountain ([line, col]) or several ([[line, col], ...])
  if len(mountainPosition) == 0:
    mountains = []
  elif isinstance(mountainPosition[0], (list, tuple)):
    mountains = mountainPosition
  else:
    mountains = [mountainPosition]

  for mountain in mountains:
    mountainLine = mountain[0]
    mountainCol = mountain[1]
    if mountainCol == agentCol:
      if mountainLine == agentLine + 1: # mountain right below the agent
        down = False
      if mountainLine == agentLine - 1: # mountain right above the agent
        up = False
    if mountainLine == agentLine:
      if mountainCol == agentCol + 1: # mountain immediately to the right
        right = False
      if mountainCol == agentCol - 1: # mountain immediately to the left
        left = False

  return up, down, left, right

# 4. Recompensas
- Entrar na areia movediça = -10
- Mover-se para qualquer posição que não seja final = -1
- Mover-se para estado final = +100


In [174]:
def rewardBasedOnAction(action, agentPosition, endPosition, quicksandPosition):
  """Apply ``action`` to ``agentPosition`` and score the resulting cell.

  Action codes: 0 = up, 1 = down, 2 = left, 3 = right.

  Args:
    action: action code in 0..3.
    agentPosition: current ``[line, col]`` of the agent.
    endPosition: ``[line, col]`` of the final cell.
    quicksandPosition: ``[line, col]`` of the quicksand cell.

  Returns:
    ``(newPosition, reward)`` where ``newPosition`` is a new ``[line, col]``
    list and ``reward`` is +100 when the end cell is reached, -10 for the
    quicksand cell and -1 for any other cell. The end cell is checked first.

  Raises:
    ValueError: if ``action`` is not one of the four action codes.
  """
  # (line offset, column offset) for each action code, in code order
  offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))

  newPosition = None
  for code, (lineOffset, colOffset) in enumerate(offsets):
    if action == code:
      newPosition = [agentPosition[0] + lineOffset, agentPosition[1] + colOffset]
      break
  if newPosition is None:
    raise ValueError("Action out of range")

  reward = -1
  if newPosition == endPosition:
    reward = +100
  elif newPosition == quicksandPosition:
    reward = -10
  return newPosition, reward

# Atualização de qualidades com Q-learning
O agente tentará aprender a melhor política utilizando o algoritmo Q-learning

In [175]:
def updateCellQuality(oldQuality, reward, maxNextActionValue, discountFactor=0.9, learningRate=0.1):
  """Return the Q-learning update of a single state/action quality.

  Computes
  ``oldQuality + learningRate * (reward + discountFactor * maxNextActionValue - oldQuality)``.

  Args:
    oldQuality: current quality of the state/action pair being updated.
    reward: reward received for taking the action.
    maxNextActionValue: highest quality among the actions of the next state
      (use 0 when the next state is terminal).
    discountFactor: weight given to future value; defaults to 0.9.
    learningRate: step size of the update; defaults to 0.1.

  Returns:
    The updated quality value.
  """
  temporalDifference = reward + (discountFactor * maxNextActionValue) - oldQuality
  return oldQuality + (learningRate * temporalDifference)

In [176]:
def checkIfQualitiesAreEqual(upValue, downValue, leftValue, rightValue):
  """Return True when the four action qualities all compare equal."""
  qualities = (upValue, downValue, leftValue, rightValue)
  # compare each quality with its neighbour: up/down, down/left, left/right
  return all(first == second for first, second in zip(qualities, qualities[1:]))

In [177]:
def checkIfRandomActionIsPossible(action, up, down, left, right):
    """Pick a uniformly random action among the ones currently allowed.

    Action codes: 0 = up, 1 = down, 2 = left, 3 = right.

    The incoming ``action`` is not used: a fresh action is always drawn.
    The parameter is kept so existing callers keep working.

    Args:
      action: ignored (kept for backward compatibility).
      up, down, left, right: whether each move is allowed.

    Returns:
      The code of an allowed action.

    Raises:
      ValueError: if no move is allowed. Retrying random draws until one is
        allowed would never terminate in that situation.
    """
    possibleActions = [
        code for code, allowed in enumerate((up, down, left, right)) if allowed
    ]
    if not possibleActions:
      raise ValueError("No action is possible from the current position")
    # drawing directly from the allowed actions is uniform over them
    return random.choice(possibleActions)


In [178]:
def actionChooser(grid, agentPosition, up, down, left, right):
  """Choose the agent's next action among the allowed ones.

  Action codes: 0 = up, 1 = down, 2 = left, 3 = right.

  With probability ``exploitationRate`` % the agent exploits: it takes the
  allowed action with the highest quality in the current cell (the first one
  in code order on ties). Otherwise, or when all allowed actions have the
  same quality, it takes a uniformly random allowed action.

  The returned value is always a real action code. Qualities are paired with
  their codes before taking the maximum, because an index into the filtered
  list of allowed qualities does not match the action code once any move is
  blocked.

  Args:
    grid: list of rows of cell dicts holding the ``value*`` qualities.
    agentPosition: current ``[line, col]`` of the agent.
    up, down, left, right: whether each move is allowed from this cell.

  Returns:
    The chosen action code as an ``int``.

  Raises:
    ValueError: if no move is allowed.
  """
  exploitationRate = 60 # in %

  cell = grid[agentPosition[0]][agentPosition[1]]
  qualityKeys = ("valueUp", "valueDown", "valueLeft", "valueRight")

  # (action code, quality) for every allowed action, in code order
  candidates = [
      (code, cell[qualityKeys[code]])
      for code, allowed in enumerate((up, down, left, right))
      if allowed
  ]
  if not candidates:
    raise ValueError("No action available from position " + str(agentPosition))

  # draws in 1..exploitationRate exploit, the rest explore (exactly 60/40)
  whatWillIDo = random.randint(1, 100)
  exploring = whatWillIDo > exploitationRate

  # if every allowed action has the same quality there is nothing to exploit
  firstQuality = candidates[0][1]
  allEqual = all(quality == firstQuality for _, quality in candidates)

  if exploring or allEqual:
    return random.choice(candidates)[0]

  # otherwise select the allowed action with the highest quality
  bestCode, _ = max(candidates, key=lambda candidate: candidate[1])
  return bestCode

In [179]:
def maxNextStateActionQualityCalculator(grid, nextPosition, up, down, left, right):
  """Return the highest quality among the allowed actions of ``nextPosition``.

  This is the ``max_a Q(s', a)`` term of the Q-learning update, so the
  result is a quality VALUE, not the position of that value in a list.

  Args:
    grid: list of rows of cell dicts holding the ``value*`` qualities.
    nextPosition: ``[line, col]`` of the state the agent is moving into.
    up, down, left, right: whether each move should be considered. To match
      the Q-learning definition these should describe the moves allowed from
      ``nextPosition``.

  Returns:
    The largest quality among the considered actions, or 0 when no action is
    considered (no future value to bootstrap from).
  """
  cell = grid[nextPosition[0]][nextPosition[1]]
  considered = (
      ("valueUp", up),
      ("valueDown", down),
      ("valueLeft", left),
      ("valueRight", right),
  )
  allValues = [cell[key] for key, allowed in considered if allowed]
  return max(allValues, default=0)

# Execução do ambiente

In [180]:
# environment initialization: build the grid, keep the [line, col] positions
# of the start, end, mountain and quicksand cells, then print the cell types
grid, randomStart, endPosition, mountainPosition, quicksandPosition = gridInit()
gridPrint(grid)

- - - - - - Q S 
- - - - - - - - 
- - - - - - - - 
- - - - - - - - 
- - - - - - - - 
- - - - - - - - 
- - - - M - - - 
- - - E - - - - 


## Rodando Q-learning

In [181]:
def turnActionToString(action):
  """Map an action code to the cell key that stores its quality.

  0 -> "valueUp", 1 -> "valueDown", 2 -> "valueLeft", 3 -> "valueRight".
  Any other value yields ``None``.
  """
  qualityKeys = ("valueUp", "valueDown", "valueLeft", "valueRight")
  for code, key in enumerate(qualityKeys):
    if action == code:
      return key
  return None

In [184]:
# show the agent's starting cell as [line, col]
print(randomStart)

[0, 7]


In [None]:
# Q-learning training: every episode starts at randomStart and runs until the
# agent ENTERS the end cell or the quicksand cell.
agentPosition = randomStart
iterations = 100
qualityTable = [] # not used in this cell; kept in case other cells read it
cumulativeReward = 0
rewardsForEachEpisode = []
done = False
i = 0
n = 0 # not used in this cell; kept in case other cells read it

for i in range(iterations):
  print("EPISODE " + str(i) + " -----------------------------------------------------------------------")
  while not done:
    # finds the available actions for the current position
    up, down, left, right = availableActions(agentPosition, mountainPosition)

    # chooses action
    action = actionChooser(grid, agentPosition, up, down, left, right)
    print("Step action" + str(action))
    nextPosition, reward = rewardBasedOnAction(action, agentPosition, endPosition, quicksandPosition)
    print(reward, nextPosition)
    print(up, down, left, right)
    cumulativeReward += reward

    # stopping criterion: tested on the cell just entered, so the episode
    # ends on arrival instead of taking one more step out of a terminal cell
    episodeFinished = nextPosition == quicksandPosition or nextPosition == endPosition

    # value of the best action in the next state
    if episodeFinished:
      # a terminal state has no future reward to bootstrap from
      maxNextStateActionQuality = 0
    else:
      # the max must range over the actions allowed in the NEXT cell,
      # not over the ones allowed in the current cell
      nextUp, nextDown, nextLeft, nextRight = availableActions(nextPosition, mountainPosition)
      maxNextStateActionQuality = maxNextStateActionQualityCalculator(grid, nextPosition, nextUp, nextDown, nextLeft, nextRight)

    # uses the information to update the quality of the action just taken
    actionString = turnActionToString(action)
    currentStateActionQuality = grid[agentPosition[0]][agentPosition[1]][actionString]
    updatedStateActionQuality = updateCellQuality(currentStateActionQuality, reward, maxNextStateActionQuality)
    grid[agentPosition[0]][agentPosition[1]][actionString] = updatedStateActionQuality

    # updates the agent position
    agentPosition = nextPosition

    if episodeFinished:
      done = True

  # episode bookkeeping and reset for the next episode
  rewardsForEachEpisode.append(cumulativeReward)
  print(cumulativeReward)
  agentPosition = randomStart
  cumulativeReward = 0
  done = False

