In [1]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


# Q-Learning y Deep Q-Learning.

- ### Pablo Melendez
- ### Hector Magaña

Definimos el algoritmo de Q-Learning.

In [2]:
def sel_accion(Q, s, eps=0.1):
    if np.random.uniform(0,1) < eps:
        return np.random.randint(Q.shape[1])
    else:
        return np.argmax(Q[s])

def QL(env, n_iter=10000, gamma=0.95, alfa=0.01, eps=0.3, d_eps=0.00005):
    
    n_a = env.action_space.n
    n_s = env.observation_space.n

    Q = np.zeros((n_s, n_a))
#     print(Q.shape)
    
    for it in range(n_iter):
        done = False        
        state = env.reset()
        
        if eps > 0.01:
            eps -= d_eps
        
        while not done:
            action = sel_accion(Q, state, eps)
            next_state, rew, done, info = env.step(action)
#             print('-----')
#             print(action)
#             print(rew)
#             print(info)
#             print('-----')

            Q[state,action] = Q[state,action] + alfa*(rew + gamma*np.max(Q[next_state]) - Q[state,action])
            state = next_state
            
    return Q

Definimos el algoritmo de Deep Q-Learning.

In [12]:
def DeepQL(env, mem_limit = 1000, batch = 50, warmup = 2000, steps = 50000):

    # Semilla para valores aleatorios
    np.random.seed(123)
    env.seed(123)
    
    # Obtenemos la cantidad de acciones
    nb_actions = env.action_space.n

    # Primera capa
    model = Sequential()
    model.add(Dense(nb_actions, input_shape=(env.observation_space.shape,))
              
    # Capas siguientes
#     model.add(Dense(16))
#     model.add(Activation('relu'))
    model.add(Dense(16))
#     model.add(Activation('relu'))
    model.add(Dense(16))
#     model.add(Activation('relu'))
    model.add(Dense(nb_actions))
#     model.add(Activation('linear'))
              
    # Estructura de la red
    print(model.summary())

    # Creamos la poliza inicial y compilamos el agente
    memory = SequentialMemory(limit = mem_limit, window_length = 1)
              
    policy = BoltzmannQPolicy()
              
    dqn = DQNAgent(model=model, nb_actions = nb_actions, memory = memory, nb_steps_warmup = warmup,
                   target_model_update = 1e-2, policy = policy)
              
    dqn.compile(Adam(lr = 1e-3), metrics = ['mae'])

    # Entrenamos al agente
    dqn.fit(env, nb_steps = steps, visualize = False, verbose = 0)

    # Respaldo de los pesos finales
    # dqn.save_weights('dqn_{}_weights.h5f'.format(name), overwrite = True)

    # Testeamos el resultado
    dqn.test(env, nb_episodes = 50, visualize = False)
    
    # Regresamos el agente entrenado
    return dqn

SyntaxError: invalid syntax (<ipython-input-12-00a6e5a2a694>, line 17)

Frozen Lake

In [4]:
test_env = gym.make('FrozenLake-v0')

test_env.reset()

print(test_env.observation_space)
print(test_env.action_space)


Discrete(16)
Discrete(4)


In [13]:
test_env.reset()
DQr = DeepQL(test_env, mem_limit = 1000, batch = 10, warmup = 10, steps = 10)
# print(DQr.policy.get_config())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 4)                 8         
_________________________________________________________________
dense_14 (Dense)             (None, 16)                80        
_________________________________________________________________
dense_15 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_16 (Dense)             (None, 4)                 68        
Total params: 428
Trainable params: 428
Non-trainable params: 0
_________________________________________________________________
None
Testing for 50 episodes ...
Episode 1: reward: 0.000, steps: 3
Episode 2: reward: 0.000, steps: 5
Episode 3: reward: 0.000, steps: 4
Episode 4: reward: 0.000, steps: 9
Episode 5: reward: 0.000, steps: 2
Episode 6: reward: 0.000, steps: 6
Episode 7: 

In [14]:
test_env.reset()
Qr = QL(test_env, n_iter = 5000, gamma = 0.95, alfa = .1, eps = 0.5, d_eps = 0.001)
print(Qr)

[[1.87754319e-01 1.45798469e-01 1.46779020e-01 1.39356087e-01]
 [9.88156193e-02 5.06736616e-02 5.82002416e-03 1.32689344e-04]
 [1.47338169e-01 7.04748094e-03 1.81604922e-02 1.49877929e-02]
 [8.69438169e-03 0.00000000e+00 0.00000000e+00 9.17740289e-04]
 [2.19304199e-01 1.29715881e-01 1.31506027e-01 1.42139963e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.68380351e-01 0.00000000e+00 7.38355849e-02 1.40788651e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.70795323e-01 1.63489997e-01 1.50266774e-01 2.91159532e-01]
 [2.37426909e-01 3.89945047e-01 1.19987124e-01 1.88387631e-01]
 [4.76912793e-01 1.27023120e-01 1.77637967e-01 5.73585075e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.60096698e-01 3.18045364e-01 5.70826552e-01 2.74698721e-01]
 [4.23530106e-01 8.11434175e-01 5.80452691e-01 2.97552916e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

Roulette

In [7]:
test_env_2 = gym.make('Roulette-v0')

print(test_env_2.observation_space)
print(test_env_2.action_space)

Discrete(1)
Discrete(38)


In [8]:
test_env_2.reset()

Qri = QL(test_env_2, n_iter=1000, gamma=1, alfa=.1, eps=0.5, d_eps=0.000001)
print(Qri)

[[35.83734247 36.0819095  36.04641939 36.10423133 36.14041359 36.41371308
  35.55204465 36.36031831 35.29431239 35.88718556 35.58217124 35.18170958
  36.11397648 36.28251724 35.28996592 35.71305149 36.0469198  35.81580764
  36.04107616 37.21724014 35.75472604 36.15179887 35.69435257 35.21636144
  35.56206542 35.43074812 36.060634   36.20612077 35.30084144 35.81825881
  36.04297716 36.38604943 35.61958097 36.38657552 35.54482869 35.70735098
  36.62546875 36.17791533]]


In [15]:
test_env_2.reset()
DQr = DeepQL(test_env_2, mem_limit = 1000, batch = 10, warmup = 10, steps = 10)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_17 (Dense)             (None, 38)                76        
_________________________________________________________________
dense_18 (Dense)             (None, 16)                624       
_________________________________________________________________
dense_19 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_20 (Dense)             (None, 38)                646       
Total params: 1,618
Trainable params: 1,618
Non-trainable params: 0
_________________________________________________________________
None
Testing for 50 episodes ...
Episode 1: reward: 11.000, steps: 100
Episode 2: reward: -26.000, steps: 100
Episode 3: reward: 85.000, steps: 100
Episode 4: reward: -63.000, steps: 100
Episode 5: reward: 11.000, steps: 100
Episode 6: reward: -26.00