Generación de un entorno de juego simple

In [1]:
class Environment:
    def __init__(self):
      self.state_space = [0, 1, 2, 3]
      self.action_space = [0, 1]
      self.rewards = {0: -1, 1: -1, 2: -1, 3: 10}


env = Environment()
print("Estados: ", env.state_space)
print("Acciones: ", env.action_space)
print("Recompensas: ", env.rewards)

Estados:  [0, 1, 2, 3]
Acciones:  [0, 1]
Recompensas:  {0: -1, 1: -1, 2: -1, 3: 10}


Q-Learning

In [7]:
from ast import AsyncFunctionDef
import numpy as np

Q = np.zeros((len(env.state_space), len(env.action_space)))

alpha = 0.1
gamma = 0.9

for _ in range(1000):
  state = np.random.choice(env.state_space)
  while state != 3:
    action = np.random.choice(env.action_space)
    next_state = state + action
    reward = env.rewards[next_state]
    Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state]) - Q[state, action])
    state = next_state

print("Función Q-Valor aprendida: ")
print(Q)

Función Q-Valor aprendida: 
[[ 4.58  6.2 ]
 [ 6.2   8.  ]
 [ 8.   10.  ]
 [ 0.    0.  ]]


Sarsa

In [6]:
Q = np.zeros((len(env.state_space), len(env.action_space)))

for _ in range(1000):
  state = np.random.choice(env.state_space)
  action = np.random.choice(env.action_space)

  while state != 3:
    next_state = state + action
    next_action = np.random.choice(env.action_space)
    reward = env.rewards[next_state]
    Q[state, action] = Q[state, action] + alpha * (reward + gamma * Q[next_state, next_action] - Q[state, action])
    state = next_state
    action = next_action

print("Función Q-Valor aprendida: ")
print(Q)

Función Q-Valor aprendida: 
[[ 1.46723377  3.4931794 ]
 [ 3.72767503  6.64694481]
 [ 6.89827986 10.        ]
 [ 0.          0.        ]]


Política de Gradiente de Montecarlo

In [10]:
policy = np.ones((len(env.state_space), len(env.action_space))) / len(env.action_space)

def average_reward(Q):
  return np.mean([Q[state, np.argmax(policy[state])] for state in env.state_space])
  for _ in range(1000):
    state = np.random.choice(env.state_space)
    while state != 3:
      action = np.random.choice(env.action_space, p=policy[state])
      next_state = state + action
      reward = env.rewards[next_state]
      gradient = np.zeros_like(policy[state])
      gradient[action] = 1
      policy[state] += alpha * gradient * (reward - average_reward(Q))
      state = next_state

print("Política aprendida: ")
print(policy)

Política aprendida: 
[[0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]]
