In [None]:
# 1. Run this cell on its own.
# 2. Restart the environment when it finishes (Run -> Restart and clear cell outputs).
# 3. Then run the notebook from the next cell onwards.

# gym, keras-rl2 and tensorflow are pinned to exact versions below.
# NOTE(review): atari-py is installed straight from the Git repository with no
# commit/tag pinned, so the installed revision can change between runs.
%pip install gym==0.17.3
%pip install git+https://github.com/Kojoley/atari-py.git
%pip install keras-rl2==1.0.5
%pip install tensorflow==2.12

In [1]:
from __future__ import division

from PIL import Image
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from tensorflow.keras.optimizers.legacy import Adam
import tensorflow.keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

import os
import time
import json
import re

In [2]:
# Size each observation is resized to by AtariProcessor, and the number of
# consecutive observations stacked into one network input.
INPUT_SHAPE = (84, 84)
WINDOW_LENGTH = 4

# Single source of truth for the random seed; previously the literal 123 was
# repeated for NumPy and for the environment.
SEED = 123

env_name = 'SpaceInvaders-v0'
env = gym.make(env_name)

# Seed NumPy's global RNG and the environment with the same value.
np.random.seed(SEED)
env.seed(SEED)

# Number of discrete actions; used as the width of the network's output layer.
nb_actions = env.action_space.n

In [3]:
class AtariProcessor(Processor):
    """Processor hooks that adapt raw environment data for the DQN agent.

    Observations are resized to ``INPUT_SHAPE`` and converted to single-channel
    ``uint8`` images, state batches are rescaled to floats in [0, 1], and
    rewards are clipped to [-1, 1].
    """

    def process_observation(self, observation):
        """Return ``observation`` as a ``uint8`` array of shape ``INPUT_SHAPE``.

        The input must be a 3-dimensional array (height, width, channel).
        """
        assert observation.ndim == 3  # (height, width, channel)
        # Resize first, then collapse to PIL mode 'L' (8-bit, single channel).
        frame = Image.fromarray(observation).resize(INPUT_SHAPE).convert('L')
        frame_array = np.array(frame)
        assert frame_array.shape == INPUT_SHAPE
        return frame_array.astype('uint8')

    def process_state_batch(self, batch):
        """Cast a batch to ``float32`` and divide by 255."""
        return batch.astype('float32') / 255.

    def process_reward(self, reward):
        """Clip ``reward`` to the closed interval [-1, 1]."""
        lower, upper = -1., 1.
        return np.clip(reward, lower, upper)

In [5]:
# (filters, kernel size, strides) for the three convolutional stages.
conv_specs = [
    (32, (8, 8), (4, 4)),
    (64, (4, 4), (2, 2)),
    (64, (3, 3), (1, 1)),
]

# The input is (WINDOW_LENGTH, *INPUT_SHAPE); Permute moves the first axis last
# so the stacked frames become the channel axis seen by the convolutions.
network_layers = [Permute((2, 3, 1), input_shape=(WINDOW_LENGTH,) + INPUT_SHAPE)]

# Each convolution is followed by its own ReLU activation layer.
for n_filters, kernel_size, stride in conv_specs:
    network_layers.append(Convolution2D(n_filters, kernel_size, strides=stride))
    network_layers.append(Activation('relu'))

# Fully connected head: 512 hidden units, then one linear output per action.
network_layers.extend([
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])

model = Sequential(network_layers)

In [None]:
# ===================== CONFIGURATION =====================
# Each execution of this cell trains one "block" of `block_steps` environment
# steps, resuming from the newest checkpoint found in `log_path` (if any).
block_steps = 100_000
log_path = "/kaggle/working"

weights_prefix = f"{log_path}/dqn_{env_name}_weights"
state_prefix = f"{log_path}/dqn_state"
log_prefix = f"{log_path}/log_block"

# Exploration schedule, expressed over the WHOLE training run (all blocks).
EPS_MAX = 1.0
EPS_MIN = 0.1
EPS_TEST = 0.05
EPS_ANNEAL_STEPS = 1_000_000

# ===================== CHECKPOINT DETECTION =====================
# env_name is escaped so any regex metacharacters in it stay literal, and the
# dot before "h5f" is escaped as well.
# The pattern is deliberately NOT anchored at the end (and is used with
# match()): depending on the Keras save format, save_weights() may produce
# "<name>.h5f.index" / "<name>.h5f.data-*" files instead of a single "<name>.h5f".
pattern = re.compile(rf'dqn_{re.escape(env_name)}_weights_(\d+)\.h5f')
checkpoint_files = [f for f in os.listdir(log_path) if pattern.match(f)]

if checkpoint_files:
    # The step count embedded in the file name identifies the newest checkpoint.
    last_step = max(int(pattern.match(f).group(1)) for f in checkpoint_files)
    weights_path = f"{weights_prefix}_{last_step}.h5f"
    state_path = f"{state_prefix}_{last_step}.json"

    # The state file is treated as optional so that a missing JSON does not
    # make an otherwise valid weights checkpoint unusable.
    state = {}
    if os.path.exists(state_path):
        with open(state_path, "r") as f:
            state = json.load(f)
    else:
        print(f"Aviso: no se encontró {state_path}; se asume el paso {last_step}")

    # "total_steps" is the cumulative number of steps trained so far. State
    # files written by the earlier version of this cell only contain "step",
    # which is the per-block counter, so fall back to the file-name step.
    restored_step = int(state.get("total_steps", last_step))
    nb_warmup = 1000  # short warm-up on resume (the replay memory below starts empty)
    print(f"Retomando desde paso {restored_step}")
else:
    last_step = 0
    restored_step = 0
    nb_warmup = 50000
    print("Entrenamiento nuevo")

# ===================== BUILD COMPONENTS =====================
# A fresh, empty replay memory is created on every run; it is not checkpointed.
memory = SequentialMemory(limit=1_000_000, window_length=WINDOW_LENGTH)

# keras-rl's Agent.fit() resets the agent's step counter to 0 when it starts,
# and LinearAnnealedPolicy derives epsilon from that counter. To continue the
# global schedule instead of restarting at EPS_MAX in every block, start this
# block at the epsilon already reached after `restored_step` steps and anneal
# over the remaining steps; this keeps the original slope unchanged.
eps_slope = (EPS_MAX - EPS_MIN) / EPS_ANNEAL_STEPS
eps_start = max(EPS_MIN, EPS_MAX - eps_slope * restored_step)
eps_remaining_steps = max(EPS_ANNEAL_STEPS - restored_step, 1)  # never 0: it is used as a divisor

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                              value_max=eps_start, value_min=EPS_MIN, value_test=EPS_TEST,
                              nb_steps=eps_remaining_steps)

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=AtariProcessor(),
               nb_steps_warmup=nb_warmup,
               gamma=0.99,
               target_model_update=10000,
               train_interval=4,
               delta_clip=1.0)

dqn.compile(Adam(learning_rate=0.00025), metrics=['mae'])

# ===================== RESTORE CHECKPOINT =====================
# Only the network weights are restored. Assigning dqn.step here would have no
# effect because fit() overwrites it; the resumed position is encoded in the
# policy built above.
if last_step > 0:
    dqn.load_weights(weights_path)

# ===================== TRAIN ONE BLOCK =====================
current_step = last_step + block_steps
current_block = current_step // block_steps
print(f"\nEntrenando bloque {current_block} (pasos {last_step + 1} → {current_step})")

# One log file per block, named with the zero-padded block index.
log_file = f"{log_prefix}_{current_block:03d}.json"
log_callback = FileLogger(log_file, interval=100)

dqn.fit(env, nb_steps=block_steps, visualize=False, verbose=2, callbacks=[log_callback])

# ===================== SAVE CHECKPOINT =====================
weights_out = f"{weights_prefix}_{current_step}.h5f"
state_out = f"{state_prefix}_{current_step}.json"

dqn.save_weights(weights_out, overwrite=True)

# "step" keeps its previous meaning (steps run by this fit() call);
# "total_steps" adds the cumulative count that the next run resumes from.
steps_this_block = int(dqn.step)
with open(state_out, "w") as f:
    json.dump({"step": steps_this_block,
               "total_steps": restored_step + steps_this_block}, f)

print(f"\nBloque {current_block} completado - step: {dqn.step}")


Entrenamiento nuevo

Entrenando bloque 1 (pasos 1 → 100000)
Training for 100000 steps ...


  updates=self.state_updates,


   803/100000: episode: 1, duration: 3.834s, episode steps: 803, steps per second: 209, episode reward: 17.000, mean reward:  0.021 [ 0.000,  1.000], mean action: 2.587 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --
  1194/100000: episode: 2, duration: 1.608s, episode steps: 391, steps per second: 243, episode reward:  6.000, mean reward:  0.015 [ 0.000,  1.000], mean action: 2.570 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --
  2019/100000: episode: 3, duration: 3.349s, episode steps: 825, steps per second: 246, episode reward: 10.000, mean reward:  0.012 [ 0.000,  1.000], mean action: 2.476 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --
  2608/100000: episode: 4, duration: 2.377s, episode steps: 589, steps per second: 248, episode reward: 10.000, mean reward:  0.017 [ 0.000,  1.000], mean action: 2.390 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --
  3201/100000: episode: 5, duration: 2.442s, episode steps: 593, steps per secon

In [None]:
# Evaluation: run the agent for 10 test episodes to estimate the mean reward.
# `weights_out` is the checkpoint path written by the training cell; the name
# used here before (`final_weights`) is not defined in any visible cell and
# raised a NameError.
dqn.load_weights(weights_out)
dqn.test(env, nb_episodes=10, visualize=False)

In [33]:
# WARNING (destructive): the second command deletes everything under
# /kaggle/working, the directory the training cell uses for weights, state
# files and logs, and the directory the plotting cell below reads from.
# Run it only when those files are no longer needed.
%ls -l /kaggle/working
%rm -rf /kaggle/working/*

total 0


In [None]:
import glob
import json
import os

import numpy as np
import matplotlib.pyplot as plt

# The training cell writes one FileLogger file per block, named
# "log_block_NNN.json", into /kaggle/working. Collect them in block order
# (the index is zero-padded to three digits, so a name sort is a block sort).
history_dir = '/kaggle/working'
history_files = sorted(glob.glob(os.path.join(history_dir, 'log_block_*.json')))

# Fall back to the single-file name this cell used to read, in case such a log exists.
legacy_history_file = os.path.join(history_dir, f'dqn_{env_name}_log.json')
if not history_files and os.path.exists(legacy_history_file):
    history_files = [legacy_history_file]

if not history_files:
    raise FileNotFoundError(
        f"No training logs found in {history_dir} "
        f"(expected log_block_*.json or dqn_{env_name}_log.json)"
    )

# Concatenate the logs into one history.
# Assumes each file is a JSON object mapping a metric name to a list with one
# entry per episode (the layout keras-rl's FileLogger writes) - TODO confirm.
history = {}
for history_file in history_files:
    with open(history_file, 'r') as f:
        block_history = json.load(f)
    for metric_name, metric_values in block_history.items():
        history.setdefault(metric_name, []).extend(metric_values)

# Extract the metrics to plot; only the episode reward is mandatory.
rewards = history['episode_reward']
losses = history.get('loss', [])
mean_qs = history.get('mean_q', [])
mean_eps = history.get('mean_eps', [])

# === Cumulative (running) mean ===
def cumulative_average(data):
    """Return the running mean of ``data``, ignoring NaN entries.

    Element ``i`` of the result is the mean of the non-NaN values among
    ``data[0..i]``. Positions before the first non-NaN value are NaN. For
    input without NaN this equals ``cumsum(data) / arange(1, len(data) + 1)``.

    A plain cumulative sum would turn every element after the first NaN into
    NaN, which hides the whole curve when the early entries of a metric are
    NaN.

    Parameters
    ----------
    data : sequence of numbers
        Values to average; may be empty.

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``data``.
    """
    values = np.asarray(data, dtype=float)
    is_valid = ~np.isnan(values)

    # NaN entries contribute neither to the sum nor to the count.
    running_sum = np.cumsum(np.where(is_valid, values, 0.0))
    running_count = np.cumsum(is_valid)

    result = np.full(values.shape, np.nan)
    np.divide(running_sum, running_count, out=result, where=running_count > 0)
    return result

# === Training curves ===
def plot_metric(values, series_label, mean_label, title, ylabel):
    """Draw one per-episode metric together with its cumulative mean (in red)."""
    plt.figure(figsize=(12, 4))
    plt.plot(values, label=series_label)
    plt.plot(cumulative_average(values), label=mean_label, color='red', linewidth=2)
    plt.title(title)
    plt.xlabel('Episodio')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.legend()
    plt.show()


# The reward curve is always drawn.
plot_metric(rewards, 'Reward', 'Mean Reward (cumulative)',
            'Recompensa por episodio', 'Recompensa')

# Loss, mean Q and epsilon are drawn only when the log contains values for them.
optional_curves = [
    (losses, 'Loss', 'Mean Loss (cumulative)',
     'Pérdida (loss) durante el entrenamiento', 'Loss'),
    (mean_qs, 'Mean Q', 'Mean Q (cumulative)',
     'Valor medio Q (mean_q)', 'Q'),
    (mean_eps, 'Epsilon', 'Mean Epsilon (cumulative)',
     'Exploración (epsilon) por episodio', 'Epsilon'),
]
for curve_values, *curve_labels in optional_curves:
    if curve_values:
        plot_metric(curve_values, *curve_labels)
