#Demo con ambiente Gymnasium para jugar juegos de Atari usando una red DQN 

Nota: debido a que TF-Agents todavía no es compatible con Gymnasium, no se lo utiliza y la red se genera usando Keras puro.

Basado en los tutoriales: 

  https://farama.org/Announcing-The-Farama-Foundation 
  
  https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t
  
  https://keras.io/examples/rl/deep_q_network_breakout/ 

  https://github.com/moduIo/Deep-Q-network/blob/master/DQN.ipynb

  


In [None]:
#@title Instalar Paquete Gymnasium para acceder a juegos Atari
!pip install gymnasium[atari,accept-rom-license]     
print("Gymnasium para ATARI instalado.")

In [None]:
#@title Cargar Librerías

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import tensorflow as tf
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from random import randint

import random
import pandas as pd

from collections import deque
import csv

print("Librerías cargadas.")

## Entorno para Juego de Atari

In [None]:
#@title Preparar funciones auxiliares para visualizar juegos Atari

import gymnasium as gym
from gym import logger as gymlogger
from gym.wrappers import RecordVideo
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
import os
from IPython import display as ipythondisplay

##from pyvirtualdisplay import Display
##display = Display(visible=0, size=(1400, 900))
##display.start()


"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""
def show_env_video(env):
  """Display inline a recorded video for ``env``, or report that none exists.

  The file looked for first is
  ``./EnvVideos/<game>-episode-<env.episode_id - 1>.mp4``. If that exact file
  is not found, the most recently modified ``.mp4`` inside ``./EnvVideos`` is
  used instead. When no video exists at all, a message naming the expected
  file is printed.

  Args:
    env: Object exposing ``spec.kwargs["game"]`` and ``episode_id`` (the
      recording wrapper built by ``wrap_env_recorder``).
  """
  # File expected for the last episode that was recorded.
  mp4 = './EnvVideos/' + env.spec.kwargs["game"] + '-episode-' + str(env.episode_id-1) + '.mp4'
  encuentraVideo = len(glob.glob(mp4)) == 1
  if not encuentraVideo:
    # Fall back to the newest video in the folder (by modification time).
    mp4list = glob.glob('./EnvVideos/*.mp4')
    if mp4list:
      mp4 = max(mp4list, key=os.path.getmtime)
      encuentraVideo = True
  if not encuentraVideo:
    print("No se encuentra video " + mp4 + " del juego!")
    return
  print("Video: ", mp4)
  # Read-only access is enough, and the context manager closes the handle
  # even if reading fails.
  with open(mp4, 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    
def func_episode_trigger(ep):
  """Episode trigger for the video recorder: record every episode.

  Args:
    ep: Episode index supplied by the caller; it is not inspected.

  Returns:
    Always ``True``.
  """
  return True

def wrap_env_recorder(env):
  """Return ``env`` wrapped in ``RecordVideo``.

  Videos are written to ``./EnvVideos``, prefixed with the game name taken
  from ``env.spec.kwargs["game"]``; ``func_episode_trigger`` decides which
  episodes are recorded.
  """
  return RecordVideo(
      env,
      video_folder='./EnvVideos',
      episode_trigger=func_episode_trigger,
      video_length=0,
      name_prefix=env.spec.kwargs["game"],
  )


print("\nWrapper para generar video preparado.")

def simular_entorno(env, agente, mostrarRecompensa=True, num_episodes=1, mostrar_video=True):
  """Play ``num_episodes`` complete episodes of ``env`` with ``agente``.

  Each episode is reset with ``seed=123``. If a module-level
  ``env_start_action`` exists and is not ``None``, it is used as the very
  first action of every episode; all other actions come from
  ``agente.action(observation, info)``.

  Args:
    env: Environment whose ``reset(seed=...)`` returns ``(observation, info)``
      and whose ``step(action)`` returns either the five-element
      ``(obs, reward, terminated, truncated, info)`` tuple or the legacy
      four-element ``(obs, reward, done, info)`` tuple.
    agente: Object with an ``action(observation, info)`` method.
    mostrarRecompensa: When true, print the accumulated reward per episode.
    num_episodes: Number of episodes to play.
    mostrar_video: When true, call ``show_env_video(env)`` after each episode.

  Returns:
    None.
  """
  # The start action is defined by a later notebook cell; look it up at call
  # time and treat it as absent if that cell has not been run yet.
  start_action = globals().get("env_start_action")
  for i in range(num_episodes):
    if num_episodes > 1:
      print("Generando episodio " + str(i+1) + "...")
    else:
      print("Generando...")
    # Start a new episode.
    observation, info = env.reset(seed=123)
    sumR = 0.00
    termino = False
    step = 0
    while not termino:
      if (step == 0) and (start_action is not None):
        # Forced first action.
        action_step = start_action
      else:
        # Otherwise the agent decides.
        action_step = agente.action(observation, info)
      resultado = env.step(action_step)
      if len(resultado) == 5:
        # Five-element step API: the episode ends on termination OR truncation.
        observation, step_reward, terminated, truncated, info = resultado
        termino = terminated or truncated
      else:
        # Legacy four-element step API.
        observation, step_reward, termino, info = resultado
      sumR += step_reward
      step += 1
    # Report the accumulated reward.
    if mostrarRecompensa:
      if num_episodes > 1:
        print("Recompensa Acumulada del episodio " + str(i+1) + ": ", sumR)
      else:
        print("Recompensa Acumulada: ", sumR)
    if mostrar_video:
      show_env_video(env)
  return 

print("\nFunción para simular entorno definida.")

# Clase para Agente que juega al azar 
class randomAgentClass():
  """Baseline agent that plays whatever ``action_space.sample()`` returns."""

  def __init__(self, observation_space, action_space):
    # The observation space is accepted for interface parity but not stored.
    self._action_space = action_space

  def action(self, observation=None, info=None):
    """Return a sampled action; ``observation`` and ``info`` are ignored."""
    sampled = self._action_space.sample()
    return sampled

print("\nClase randomAgentClass definida.")     
  

In [None]:
#@title Selecciona el juego de Atari
seleccionaJuego = "Breakout" #@param ["Pong", "Freeway", "Enduro", "Asteroids", "Breakout", "Space Invaders"]
#@markdown Ver informacion en https://www.gymlibrary.dev/environments/atari/complete_list/
tipoObsTS = "Grayscale Screen" #@param ["Game RAM", "Grayscale Screen", "RGB Screen"]
entornoDeterministico = True #@param{type:"boolean"}

# Game name -> (environment id, forced first action or None).
_juegos_atari = {
  "Freeway": ('ALE/Freeway-v5', None),
  "Enduro": ('ALE/Enduro-v5', None),
  "Pong": ('ALE/Pong-v5', None),
  "Asteroids": ('ALE/Asteroids-v5', None),
  "Breakout": ('ALE/Breakout-v5', 1),  # action 1 is needed to launch the ball
  "Space Invaders": ('ALE/SpaceInvaders-v5', None),
}
env_start_action = None
if seleccionaJuego not in _juegos_atari:
  raise ValueError("No se puede defnir gym_env_name!!!")
gym_env_name, env_start_action = _juegos_atari[seleccionaJuego]

# Form label -> observation type passed to the environment.
_tipos_obs = {
  "Game RAM": 'ram',
  "RGB Screen": 'rgb',
  "Grayscale Screen": 'grayscale',
}
if tipoObsTS not in _tipos_obs:
  raise ValueError("No se puede defnir obsType!!!")  
obsType = _tipos_obs[tipoObsTS]


# función para inicializar juego
def inicializar_gym_env(gym_env_name, obs_type, entornoDeterministico=False):
    """Create the Atari environment ``gym_env_name`` via ``gym.make``.

    Args:
      gym_env_name: Environment id handed to ``gym.make``.
      obs_type: Observation type forwarded as the ``obs_type`` keyword.
      entornoDeterministico: When true, the environment is additionally
        created with ``frameskip=1`` and ``repeat_action_probability=0.0``;
        otherwise the environment's own defaults are used for both.

    Returns:
      The created environment, with ``metadata['render_fps']`` set to 30.
    """
    # Use the obs_type ARGUMENT (not a module-level global) so the function
    # honours what the caller asked for.
    env_kwargs = {"obs_type": obs_type}
    if entornoDeterministico:
        env_kwargs["frameskip"] = 1
        # A probability, so pass a float rather than a boolean.
        env_kwargs["repeat_action_probability"] = 0.0
    env_kwargs["full_action_space"] = False
    env_kwargs["render_mode"] = 'rgb_array'
    env = gym.make(gym_env_name, **env_kwargs)
    # NOTE(review): presumably read by the video recorder as the output frame
    # rate; confirm against the wrapper in use.
    env.metadata['render_fps'] = 30
    return env



 # crea el entorno para entrenar (no graba video)
train_atari_env = inicializar_gym_env(gym_env_name, obsType, entornoDeterministico)

# Evaluation environment: same settings, wrapped so that episodes are
# recorded to video.
eval_atari_env = wrap_env_recorder(
    inicializar_gym_env(gym_env_name, obsType, entornoDeterministico)
)

# Print a summary of the training environment.
print("\n")
print('- Entorno: ', gym_env_name)
print('\n- Specification:')
for det, valor in train_atari_env.spec.kwargs.items():
  print("  ", det, "=", valor)
print('\n- Observation space:')
print("  ", train_atari_env.observation_space)
print('\n-Action space:')
print("  ", train_atari_env.action_space)
print('\n-Reward range:')
print("  ", train_atari_env.reward_range)

# Random baseline agent for this environment.
randomAg = randomAgentClass(
    train_atari_env.observation_space,
    train_atari_env.action_space,
)

# Show a sample frame; the final expression is the cell's displayed output.
print("\n-Ejemplo pantalla: ")
train_atari_env.reset()
import PIL.Image
PIL.Image.fromarray(train_atari_env.render())

In [None]:
#@title Ejemplo de juego jugando al Azar
# Play one recorded episode with the random agent and show its video.
simular_entorno(
    eval_atari_env,
    randomAg,
    mostrarRecompensa=True,
    num_episodes=1,
    mostrar_video=True,
)

##DQN

In [None]:
#@title Definir el Agente tipo DQN

#@markdown ### Parámetros generales:
agent_max_memoria = 1000 #@param {type:"integer"}
        # 100000

#@markdown ### Parámetros de las capas ConvNet:
convNet_usar_DeepMind_config = True #@param {type:"boolean"}
convNet_tamaño_kernel_N =  3 #@param {type:"integer"}
convNet_tamaño_pooling_M = 3 #@param {type:"integer"}
convNet_cantidad_capas_ocultas =  3#@param {type:"integer"}

#@markdown ### Parámetros de las capas Lineales:
lineal_cant_neuronas_capas_ocultas = '512' #@param {type:"string"}
#@markdown (Nota: se puede indicar Cantidad de neuronas, D para DropOut, BN para BatchNormalization)
lineal_porc_capa_DropOut = 0.4 #@param {type:"number"}


# Configuration dictionary handed to the Q-network builder. With the DeepMind
# preset only the flag is stored; the remaining keys are filled in solely for
# the user-defined architecture.
config_q_network = {"cnn_deepmind_config": bool(convNet_usar_DeepMind_config)}

if not convNet_usar_DeepMind_config:
  # Kernel and pooling sizes (shared by every conv layer), clamped to valid
  # minimums; a pooling size of 0 means "no pooling layer".
  convNet_tamaño_kernel_N = max(convNet_tamaño_kernel_N, 1)
  config_q_network["cnn_kernel_shape"] = convNet_tamaño_kernel_N
  convNet_tamaño_pooling_M = max(convNet_tamaño_pooling_M, 0)
  config_q_network["cnn_pooling_shape"] = convNet_tamaño_pooling_M

  # One filter count per conv layer, decreasing powers of two: 2**(n+2) ... 8.
  convNet_cantidad_capas_ocultas = max(convNet_cantidad_capas_ocultas, 1)
  cnn_filters = [2 ** (i + 2) for i in range(convNet_cantidad_capas_ocultas, 0, -1)]
  config_q_network["cnn_filters"] = cnn_filters

  # Dropout rate: non-positive values become 0.10, values above 0.9 are capped.
  if lineal_porc_capa_DropOut <= 0:
    lineal_porc_capa_DropOut = 0.10
  else:
    lineal_porc_capa_DropOut = min(lineal_porc_capa_DropOut, 0.9)
  config_q_network["lineal_porc_capa_DropOut"] = lineal_porc_capa_DropOut

  # Dense-stack description: numeric strings are neuron counts, "D" and "BN"
  # are expanded to their layer names, anything else is reported and dropped.
  _alias_capas = {"D": "DropOut", "BN": "BatchNormalization"}
  hidden_layers = []
  for val in (v.strip() for v in lineal_cant_neuronas_capas_ocultas.split(',')):
    if val in _alias_capas:
      hidden_layers.append(_alias_capas[val])
    elif val.isnumeric():
      hidden_layers.append(val)
    else:
      print("Capa ", val, "descartada!")  
  config_q_network["hidden_layers"] = hidden_layers

# Define clase de Agente DQN 
class dqnAgentClass():
    """DQN agent built from plain Keras models.

    The agent keeps two networks with the same architecture: ``q_model``,
    which is trained on experiences sampled from a bounded replay memory, and
    ``target_model``, a copy used to compute the bootstrap targets and
    refreshed through ``update_target_model``. Actions during training follow
    an epsilon-greedy policy whose epsilon decays after every replay step.

    NOTE: observations are normalised in floating point. Models saved by a
    version of this class that normalised directly on unsigned 8-bit arrays
    were trained on differently scaled inputs and may need retraining.
    """

    def __init__(self, observation_space, action_space, config_q_network, agent_memory):
        """Store the spaces, set the hyperparameters and build both networks.

        Args:
          observation_space: Space whose ``shape`` defines the network input.
          action_space: Discrete space; ``n`` is the number of actions.
          config_q_network: Dict with at least ``"cnn_deepmind_config"``; the
            custom architecture additionally reads ``"cnn_filters"``,
            ``"cnn_kernel_shape"``, ``"cnn_pooling_shape"``,
            ``"hidden_layers"`` and ``"lineal_porc_capa_DropOut"``.
          agent_memory: Requested replay-memory size; at least 1000 entries
            are always kept.
        """
        self._action_space = action_space
        self._observation_space = observation_space
        self._num_actions = self._action_space.n
        # Hyperparameters
        self.gamma = 1.0            # Discount rate
        self.epsilon = 1.0          # Exploration rate
        self.epsilon_min = 0.1      # Minimal exploration rate (epsilon-greedy)
        self.epsilon_decay = 0.995  # Decay rate for epsilon
        self.update_rate = 1000     # Number of steps until updating the target network
        # Replay memory (oldest experiences are dropped once it is full).
        self.memory = deque(maxlen=max(agent_memory, 1000))
        # Build the DQN models.
        self._config_q_network = config_q_network
        self.create_q_models()

    def create_q_models(self):
        """Build ``q_model`` and ``target_model`` and compile ``q_model``."""
        if self._config_q_network["cnn_deepmind_config"]:
            # Architecture taken from the DeepMind paper.
            builder = self._create_q_model_deepmind
        else:
            # Architecture described by the user configuration.
            builder = self._create_q_model_custom
        self.q_model = builder("Q-Model",
                               self._observation_space.shape,
                               self._action_space.n)

        # The target network starts as an exact copy of the Q-network.
        self.target_model = tf.keras.models.clone_model(self.q_model)
        self.target_model._name = "Reward-Model"
        self.target_model.set_weights(self.q_model.get_weights())

        # Only the Q-network is trained, so only it needs an optimizer.
        _optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
        self.q_model.compile(loss='mse', optimizer=_optimizer)

        # Both models share the structure, so one summary is enough.
        self.q_model.summary()

    def _prepare_obsState(self, obsState, haceArray=True):
        """Normalise an observation as ``(obs - 128) / 127`` in float32.

        Args:
          obsState: Raw observation (array-like of numbers).
          haceArray: When true, wrap the result in a leading batch axis of
            size 1 so it can be fed directly to ``predict``.

        Returns:
          A float32 array: the normalised observation, with an extra leading
          axis when ``haceArray`` is true.
        """
        # Convert to float BEFORE subtracting: on an unsigned 8-bit array (the
        # usual dtype of Atari frames/RAM) ``obs - 128`` wraps around instead
        # of producing negative values.
        normObs = (np.asarray(obsState, dtype="float32") - 128) / 127.0
        if haceArray:
            return np.array([normObs], dtype="float32")
        return normObs

    def _determine_obs_matrixShape(self, inputShape):
        """Return a 2-D shape for a 1-D input so Conv1D layers can be applied.

        A flat input of ``cols`` values becomes ``[16, cols//16]``,
        ``[3, cols//3]``, ``[2, cols//2]`` or ``[1, cols]`` (first divisor
        that fits). Inputs that already have more than one axis are returned
        unchanged.
        """
        if len(inputShape) != 1:
            return inputShape
        cols = inputShape[0]
        for filas in (16, 3, 2):
            if cols % filas == 0:
                return [filas, cols // filas]
        return [1, cols]

    def _create_q_model_deepmind(self, modelName, inputShape, num_actions):
        """Build the network layout defined by the DeepMind paper.

        1-axis inputs are reshaped and get a single Conv1D layer; inputs with
        more than two axes get three Conv2D layers; two-axis inputs get three
        Conv1D layers. All variants end in Flatten, Dense(512, relu) and a
        linear output with one unit per action.
        """
        inputLay = tf.keras.layers.Input(shape=inputShape, name="input")
        eachLay = inputLay

        if len(inputShape) == 1:
            # Flat input (RAM): reshape to 2-D and use a single Conv1D layer.
            eachLay = tf.keras.layers.Reshape(self._determine_obs_matrixShape(inputShape), name="reshapeRAM")(eachLay)
            eachLay = tf.keras.layers.Conv1D(32, 8, strides=4, activation="relu", name="conv_1")(eachLay)
        elif len(inputShape) > 2:
            # Convolutions on the frames of the RGB screen.
            eachLay = tf.keras.layers.Conv2D(32, 8, strides=4, activation="relu", name="conv_1")(eachLay)
            eachLay = tf.keras.layers.Conv2D(64, 4, strides=2, activation="relu", name="conv_2")(eachLay)
            eachLay = tf.keras.layers.Conv2D(64, 3, strides=1, activation="relu", name="conv_3")(eachLay)
        else:
            # Convolutions on the frames of the grayscale screen.
            eachLay = tf.keras.layers.Conv1D(32, 8, strides=4, activation="relu", name="conv_1")(eachLay)
            eachLay = tf.keras.layers.Conv1D(64, 4, strides=2, activation="relu", name="conv_2")(eachLay)
            eachLay = tf.keras.layers.Conv1D(64, 3, strides=1, activation="relu", name="conv_3")(eachLay)

        # Flatten and dense head.
        eachLay = tf.keras.layers.Flatten(name="flat")(eachLay)
        eachLay = tf.keras.layers.Dense(512, activation="relu", name="lineal")(eachLay)

        # One linear output per action.
        outputLay = tf.keras.layers.Dense(num_actions, activation="linear", name="output")(eachLay)

        return tf.keras.Model(name=modelName+"_DeepMind", inputs=inputLay, outputs=outputLay)

    def _create_q_model_custom(self, modelName, inputShape, num_actions):
        """Build the network described by ``self._config_q_network``.

        One conv layer (plus optional max-pooling) is added per entry of
        ``"cnn_filters"``; then Flatten; then one layer per entry of
        ``"hidden_layers"`` (Dense, Dropout or BatchNormalization); finally a
        linear output with one unit per action.
        """
        cfg = self._config_q_network
        inputLay = tf.keras.layers.Input(shape=inputShape, name="input")
        eachLay = inputLay

        if len(inputShape) == 1:
            # Flat input: reshape to 2-D so Conv1D layers can be used.
            eachLay = tf.keras.layers.Reshape(self._determine_obs_matrixShape(inputShape), name="reshapeRAM")(eachLay)

        # 2-D layers for inputs with more than two axes, 1-D layers otherwise.
        if len(inputShape) > 2:
            conv_cls = tf.keras.layers.Conv2D
            pool_cls = tf.keras.layers.MaxPooling2D
        else:
            conv_cls = tf.keras.layers.Conv1D
            pool_cls = tf.keras.layers.MaxPooling1D

        for i, n_filters in enumerate(cfg["cnn_filters"]):
            auxlayerName = 'conv_'+str(i+1)
            eachLay = conv_cls(n_filters, cfg["cnn_kernel_shape"], activation='relu', padding='same', name='c_'+auxlayerName)(eachLay)
            # A pooling size of 0 means "no pooling layer".
            if cfg["cnn_pooling_shape"] > 0:
                eachLay = pool_cls(cfg["cnn_pooling_shape"], padding='same', name='p_'+auxlayerName)(eachLay)

        eachLay = tf.keras.layers.Flatten(name='flat')(eachLay)

        # Dense stack; the running id keeps every layer name unique.
        auxName = 'lineal_'
        auxId = 1
        for val_hid in cfg["hidden_layers"]:
            if val_hid == "DropOut":
                auxlayerName = "d_"+str(auxId)
                auxId = auxId + 1
                eachLay = tf.keras.layers.Dropout(cfg["lineal_porc_capa_DropOut"], name=auxlayerName)(eachLay)
            elif val_hid == "BatchNormalization":
                auxlayerName = "bn_"+str(auxId)
                auxId = auxId + 1
                eachLay = tf.keras.layers.BatchNormalization(name=auxlayerName)(eachLay)
            elif val_hid.isnumeric():
                auxlayerName = auxName+str(auxId)
                auxId = auxId + 1
                eachLay = tf.keras.layers.Dense(int(val_hid), name=auxlayerName)(eachLay)

        # One linear output per action.
        outputLay = tf.keras.layers.Dense(num_actions, activation="linear", name="output")(eachLay)

        return tf.keras.Model(name=modelName+"_custom", inputs=inputLay, outputs=outputLay)

    def train_act(self, state):
        """Choose an action with the epsilon-greedy policy.

        With probability ``epsilon`` a random action index is returned;
        otherwise the index of the largest Q-value predicted by ``q_model``.
        """
        if np.random.rand() <= self.epsilon:
            # Random exploration.
            return random.randrange(self._action_space.n)
        # Exploit: pick the action with the best predicted Q-value.
        input_data = self._prepare_obsState(state)
        act_values = self.q_model.predict(input_data, verbose=0)
        return np.argmax(act_values[0])

    def remember(self, state, action, reward, next_state, done):
        """Append one experience to the replay memory.

        States are normalised here, once, so ``replay`` does not have to
        normalise them again every time they are sampled.
        """
        state = self._prepare_obsState(state, False)
        next_state = self._prepare_obsState(next_state, False)
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        """Train ``q_model`` on ``batch_size`` experiences sampled from memory.

        Experiences are drawn with replacement. For each one the target of the
        action taken is ``reward`` when the experience is marked done, and
        ``reward + gamma * max(target_model(next_state))`` otherwise; the
        Q-values of the other actions are left as currently predicted. After
        fitting, epsilon is decayed while it is above ``epsilon_min``.
        """
        rndindices = np.random.choice(len(self.memory), size=batch_size)

        # Each memory entry is (state, action, reward, next_state, done).
        batch = [self.memory[i] for i in rndindices]
        sample_states = np.array([exp[0] for exp in batch])
        sample_actions = np.array([exp[1] for exp in batch])
        sample_rewards = np.array([exp[2] for exp in batch])
        sample_next_states = np.array([exp[3] for exp in batch])
        sample_done = np.array([exp[4] for exp in batch], dtype=bool)

        # One forward pass per model over the whole batch.
        model_rewards = self.target_model.predict(sample_next_states, verbose=0)
        model_qvalues = np.array(self.q_model.predict(sample_states, verbose=0))

        # Bootstrap only from experiences that did not end the episode.
        max_next_q = np.amax(model_rewards, axis=1)
        targets = np.where(sample_done,
                           sample_rewards,
                           sample_rewards + self.gamma * max_next_q)

        # Overwrite just the Q-value of the action that was actually taken.
        model_qvalues[np.arange(batch_size), sample_actions] = targets

        self.q_model.fit(sample_states, model_qvalues, epochs=1, verbose=0)

        # Decay exploration.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_model(self):
        """Copy the current ``q_model`` weights into ``target_model``."""
        self.target_model.set_weights(self.q_model.get_weights())

    def load(self, name):
        """Load a saved Q-network and its epsilon.

        ``.h5`` is appended to ``name`` when the name does not contain it. The
        target network is rebuilt as a copy of the loaded model, and epsilon
        is read from the companion ``<name>.csv`` file.
        """
        if ".h5" not in name:
            name = name + ".h5"
        self.q_model = tf.keras.models.load_model( name )
        # Rebuild the target network from the loaded model.
        self.target_model = tf.keras.models.clone_model(self.q_model)
        self.target_model._name = "Reward-Model"
        self.target_model.set_weights(self.q_model.get_weights())
        # Restore epsilon; the file is closed by the context manager.
        with open(name + ".csv", 'r', newline='') as f:
            for row in csv.reader(f):
                # Skip blank/short rows instead of failing on them.
                if len(row) > 1:
                    self.epsilon = float(row[1])

    def save(self, name):
        """Save the Q-network in HDF5 format and epsilon in ``<name>.csv``.

        ``.h5`` is appended to ``name`` when the name does not contain it.
        """
        if ".h5" not in name:
            name = name + ".h5"
        self.q_model.save(name, save_format="h5")
        # newline='' is what the csv module expects for files it writes.
        with open(name + ".csv", 'w', newline='') as f:
            csv.writer(f).writerow(["epsilon", self.epsilon])

    def action(self, observation, info):
        """Return the greedy action for ``observation`` (``info`` is unused)."""
        input_data = self._prepare_obsState(observation)
        pred_modelo = self.q_model.predict(input_data, verbose=0)
        # Use the action id with the largest Q-value.
        accionModelo = np.argmax(pred_modelo[0])
        return accionModelo

print("\nClase dqnClass definida.")   

# inicializa Agente DQN
dqnAg = dqnAgentClass(train_atari_env.observation_space, 
                      train_atari_env.action_space, 
                      config_q_network,
                      agent_max_memoria)

print("\nAgente DQN inicializado. ")


In [None]:
#@title Entrenar al Agente DQN
import gc
import time

#@markdown Parámetros del entrenamiento:
entrenar_DQN = True # @param {type:"boolean"}
  # 300    
train_episodes = 10 #@param {type:"integer"}
train_batch_size = 16 #@param {type:"integer"}
min_recompensa_promedio_finalizar = 10.0 #@param {type:"number"}
mostrar_estado_cada_steps = 200

#@markdown Parámetros para grabar/recuperar modelo DQN:
directorio_modelo = '/content/gdrive/MyDrive/IA/demoRL/Modelos' #@param {type:"string"}
nombre_modelo_grabar = "keras-Atari" #@param {type:"string"}
recuperar_modelo_entrenado = True #@param {type:"boolean"}
grabar_modelo_mientras_entrena = True #@param {type:"boolean"}

if recuperar_modelo_entrenado or grabar_modelo_mientras_entrena:
  import os
  from google.colab import drive
  drive.mount('/content/gdrive')
  # Path of the model file: <directorio>/<nombre>/<juego>.h5
  dqn_modelo_dir = os.path.join(directorio_modelo, nombre_modelo_grabar)
  dqn_modelo_dir = dqn_modelo_dir + "/" + seleccionaJuego 
  if ".h5" not in dqn_modelo_dir:
    dqn_modelo_dir = dqn_modelo_dir + ".h5"
  if recuperar_modelo_entrenado:
    if os.path.isfile(dqn_modelo_dir):
      # Resume from a previously saved model.
      dqnAg.load(dqn_modelo_dir)
      print("\n++ Modelo recuperado de " + dqn_modelo_dir + " ++")
    else:
      print("\n++ No se encuentra modelo grabado en " + dqn_modelo_dir + "!!! ++\n")

if entrenar_DQN:

  # Bookkeeping across episodes.
  total_step = 0   # Total number of steps taken so far
  all_rewards = 0  # Sum of game scores of finished episodes
  avg_reward = -1  # Average game score per episode (-1 until one finishes)

  # Train on the environment that does not record videos.
  env = train_atari_env

  print("\n** Comienza Entrenamiento **")

  for e in range(1, train_episodes+1):
      total_reward = 0
      game_score = 0
      state, info = env.reset(seed=123)
          
      print("\ntotal steps: {} * comienza episode {}/{} " 
            .format(total_step, e, train_episodes))

      for step in range(1, 20001):
          total_step += 1
          
          # Every update_rate steps, sync the target network with the Q-network.
          if (total_step % dqnAg.update_rate) == 0:
              dqnAg.update_target_model()

          if (step <= 1) and (env_start_action is not None):
              # Forced first action of the episode.
              action = env_start_action
          else:
              # Epsilon-greedy action from the agent.
              action = dqnAg.train_act(state)

          # Apply the action.
          next_state, reward, terminated, truncated, _ = env.step(action)
          
          # The episode is over when the game really ended OR the environment
          # cut it short.
          episode_over = terminated or truncated
                    
          # Store the transition; only a real termination is recorded as done.
          dqnAg.remember(state, action, reward, next_state, terminated)

          state = next_state
          game_score += reward
          reward -= 1  # Punish behavior which does not accumulate reward
          total_reward += reward
          
          if episode_over:
              all_rewards += game_score
              # e is 1-based, so e episodes have been played at this point.
              avg_reward = all_rewards / e
              print("total step: {} / ep step: {} * fin episode: {}/{} -> game score: {}, total reward: {}, avg reward: {}"
                    .format(total_step, step, e, train_episodes, game_score, total_reward, round(avg_reward,3)))                            
              break 
          else:
              # Still playing: print progress every few steps.
              if (step % mostrar_estado_cada_steps)==0:
                print("total step: {} / ep step: {} * sigue episode: {}/{} -> game score: {} "
                    .format(total_step, step, e, train_episodes, game_score))                            
              # Force a garbage-collection pass to release memory.
              gc.collect()
          
          # Train on past experiences once enough of them are stored.
          if len(dqnAg.memory) > train_batch_size:
              t1 = time.time()
              dqnAg.replay(train_batch_size)
              t2 = time.time()
              if (t2 - t1) > 2:
                 print("total step: {} / ep step: {} !! replay tarda mucho: {}" 
                    .format(total_step, step, round(t2-t1,3)))                            

      if grabar_modelo_mientras_entrena:
          # Checkpoint after every episode.
          dqnAg.save(dqn_modelo_dir)
          print("++ Modelo grabado en " + dqn_modelo_dir + " ++")
      
      # Stop early once the average score reaches the requested minimum.
      if avg_reward >= min_recompensa_promedio_finalizar:
          print("total step: {} * fin episode: {}/{} -- se alcanza mínima recompensa para finalizar entrenamiento!"
            .format(total_step, e, train_episodes))
          break

  print("\n** Entrenamiento Finalizado **\n")
else:
  print("No se ejecuta entrenamiento de Agente DQN.")                        

In [None]:
#@title Probar el Agente DQN Entrenado
# Play one recorded episode with the trained DQN agent and show its video.
simular_entorno(
    eval_atari_env,
    dqnAg,
    mostrarRecompensa=True,
    num_episodes=1,
    mostrar_video=True,
)