In [None]:
# 1. Run this cell on its own
# 2. Restart the environment when it finishes (Run -> Restart and clear cell outputs)
# 3. Run from the next cell onwards

%pip install gym==0.17.3
%pip install git+https://github.com/Kojoley/atari-py.git
%pip install keras-rl2==1.0.5
%pip install tensorflow==2.12

In [1]:
from __future__ import division

from PIL import Image
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from tensorflow.keras.optimizers.legacy import Adam
import tensorflow.keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

import os
import time
import json
import re

In [2]:
# --- Network input geometry ---
# Frames are resized to INPUT_SHAPE by the processor; WINDOW_LENGTH consecutive
# frames are stacked to form one network input.
WINDOW_LENGTH = 4
INPUT_SHAPE = (84, 84)

# --- Environment ---
env_name = 'SpaceInvaders-v0'
env = gym.make(env_name)
nb_actions = env.action_space.n  # number of actions exposed by the environment

# --- Reproducibility: seed both NumPy and the environment ---
np.random.seed(123)
env.seed(123)

In [3]:
class AtariProcessor(Processor):
    """Adapts raw frames, state batches and rewards before the agent uses them."""

    def process_observation(self, observation):
        """Return the frame resized to INPUT_SHAPE as an 8-bit greyscale array."""
        assert observation.ndim == 3  # (height, width, channel)
        # Resize first, then collapse the colour channels to luminance ('L').
        frame = Image.fromarray(observation).resize(INPUT_SHAPE).convert('L')
        grey_frame = np.array(frame)
        assert grey_frame.shape == INPUT_SHAPE
        # Frames are returned as uint8; scaling to floats happens per batch
        # in process_state_batch.
        return grey_frame.astype('uint8')

    def process_state_batch(self, batch):
        """Convert a batch to float32 and scale it by 1/255."""
        return batch.astype('float32') / 255.

    def process_reward(self, reward):
        """Clip the reward to the interval [-1, 1]."""
        lower, upper = -1., 1.
        return np.clip(reward, lower, upper)

In [5]:
# Convolutional Q-network: a stack of WINDOW_LENGTH frames goes in and one
# linear output per action comes out.
frame_stack_shape = (WINDOW_LENGTH,) + INPUT_SHAPE

model = Sequential([
    # Move the frame axis last: (WINDOW_LENGTH, 84, 84) -> (84, 84, WINDOW_LENGTH).
    Permute((2, 3, 1), input_shape=frame_stack_shape),
    Convolution2D(32, (8, 8), strides=(4, 4)),
    Activation('relu'),
    Convolution2D(64, (4, 4), strides=(2, 2)),
    Activation('relu'),
    Convolution2D(64, (3, 3), strides=(1, 1)),
    Activation('relu'),
    Flatten(),
    Dense(512),
    Activation('relu'),
    # Output head: one unit per action, no squashing.
    Dense(nb_actions),
    Activation('linear'),
])

In [None]:
# ===================== CONFIGURATION =====================
# Each execution of this cell trains one "block" of `block_steps` environment
# steps, resuming from the newest checkpoint found in `log_path` (if any).
block_steps = 10_000
log_path = "/kaggle/working"

weights_prefix = f"{log_path}/dqn_{env_name}_weights"
state_prefix = f"{log_path}/dqn_state"
log_prefix = f"{log_path}/log_block"

# Epsilon-greedy exploration schedule for the WHOLE run (across all blocks).
eps_max, eps_min, eps_test = 1.0, 0.1, 0.05
eps_anneal_steps = 1_000_000

# ===================== CHECKPOINT DETECTION =====================
# The environment name and the dot are escaped so they match literally.
# match() only anchors at the start, so a plain ".h5f" file as well as
# suffixed files such as ".h5f.index" / ".h5f.data-..." are recognised.
pattern = re.compile(rf'dqn_{re.escape(env_name)}_weights_(\d+)\.h5f')
checkpoint_files = [f for f in os.listdir(log_path) if pattern.match(f)]

if checkpoint_files:
    last_step = max(int(pattern.match(f).group(1)) for f in checkpoint_files)
    weights_path = f"{weights_prefix}_{last_step}.h5f"
    state_path = f"{state_prefix}_{last_step}.json"
    # The state file stores the cumulative number of steps trained so far.
    # If it is missing, fall back to the step count encoded in the file name.
    if os.path.exists(state_path):
        with open(state_path, "r") as f:
            state = json.load(f)
    else:
        state = {}
    restored_step = int(state.get("step", last_step))
    nb_warmup = 1000  # shorter warm-up, only so the replay buffer is not empty
    print(f" Retomando desde paso {restored_step}")
else:
    last_step = 0
    restored_step = 0
    nb_warmup = 5000
    print(" Entrenamiento nuevo")

# ===================== BUILD COMPONENTS =====================
# NOTE: the replay memory is not persisted between blocks; every block starts
# with an empty buffer, which is why a warm-up is still needed when resuming.
memory = SequentialMemory(limit=1_000_000, window_length=WINDOW_LENGTH)

# Agent.fit() resets the agent's step counter to 0 on every call, and
# LinearAnnealedPolicy derives epsilon from that counter. Assigning dqn.step
# before fit() therefore has no effect and epsilon would restart at eps_max on
# every block. Instead, start this block at the epsilon the global schedule has
# reached after `restored_step` steps and anneal over the remaining steps; this
# keeps the slope of the original schedule unchanged.
anneal_progress = min(restored_step / eps_anneal_steps, 1.0)
eps_start = eps_max - (eps_max - eps_min) * anneal_progress
remaining_anneal_steps = max(eps_anneal_steps - restored_step, 1)

policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                              value_max=eps_start, value_min=eps_min, value_test=eps_test,
                              nb_steps=remaining_anneal_steps)

dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               policy=policy,
               memory=memory,
               processor=AtariProcessor(),
               nb_steps_warmup=nb_warmup,
               gamma=0.99,
               target_model_update=10000,
               train_interval=4,
               delta_clip=1.0)

dqn.compile(Adam(learning_rate=0.00025), metrics=['mae'])

if last_step > 0:
    dqn.load_weights(weights_path)

# ===================== TRAIN ONE BLOCK =====================
current_step = last_step + block_steps
current_block = current_step // block_steps
print(f"\n Entrenando bloque {current_block} (pasos {last_step + 1} → {current_step})")

log_file = f"{log_prefix}_{current_block:03d}.json"
log_callback = FileLogger(log_file, interval=100)

dqn.fit(env, nb_steps=block_steps, visualize=False, verbose=2, callbacks=[log_callback])

# ===================== SAVE PROGRESS =====================
# dqn.step only counts the steps of this fit() call, so add the restored
# offset to obtain the cumulative number of training steps.
total_steps = restored_step + int(dqn.step)

weights_out = f"{weights_prefix}_{current_step}.h5f"
state_out = f"{state_prefix}_{current_step}.json"

dqn.save_weights(weights_out, overwrite=True)
with open(state_out, "w") as f:
    json.dump({
        "step": total_steps
    }, f)

print(f"\n Bloque {current_block} completado - step: {total_steps}, epsilon: {dqn.policy.inner_policy.eps:.4f}")


 Entrenamiento nuevo

 Entrenando bloque 1 (pasos 1 → 10000)
Training for 10000 steps ...


  updates=self.state_updates,


 1639/10000: episode: 1, duration: 6.914s, episode steps: 1639, steps per second: 237, episode reward: 25.000, mean reward:  0.015 [ 0.000,  1.000], mean action: 2.581 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --
 2172/10000: episode: 2, duration: 2.172s, episode steps: 533, steps per second: 245, episode reward:  7.000, mean reward:  0.013 [ 0.000,  1.000], mean action: 2.597 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --
 3143/10000: episode: 3, duration: 3.914s, episode steps: 971, steps per second: 248, episode reward: 14.000, mean reward:  0.014 [ 0.000,  1.000], mean action: 2.551 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --
 4420/10000: episode: 4, duration: 5.119s, episode steps: 1277, steps per second: 249, episode reward: 21.000, mean reward:  0.016 [ 0.000,  1.000], mean action: 2.581 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --


  updates=self.state_updates,


 5248/10000: episode: 5, duration: 11.925s, episode steps: 828, steps per second:  69, episode reward: 13.000, mean reward:  0.016 [ 0.000,  1.000], mean action: 2.581 [0.000, 5.000],  loss: 0.008325, mae: 0.279835, mean_q: 0.368011, mean_eps: 0.995388
 6279/10000: episode: 6, duration: 33.534s, episode steps: 1031, steps per second:  31, episode reward: 16.000, mean reward:  0.016 [ 0.000,  1.000], mean action: 2.502 [0.000, 5.000],  loss: 0.008283, mae: 0.286480, mean_q: 0.370034, mean_eps: 0.994814
 7145/10000: episode: 7, duration: 28.344s, episode steps: 866, steps per second:  31, episode reward:  9.000, mean reward:  0.010 [ 0.000,  1.000], mean action: 2.513 [0.000, 5.000],  loss: 0.007154, mae: 0.292439, mean_q: 0.380669, mean_eps: 0.993959
 7618/10000: episode: 8, duration: 15.204s, episode steps: 473, steps per second:  31, episode reward:  9.000, mean reward:  0.019 [ 0.000,  1.000], mean action: 2.467 [0.000, 5.000],  loss: 0.005792, mae: 0.296186, mean_q: 0.384979, mean_e

In [None]:
# Evaluate the agent and report its mean reward per episode.
# `weights_out` is the checkpoint path written by the training cell; the
# previously used name `final_weights` was never defined (NameError).
dqn.load_weights(weights_out)
test_history = dqn.test(env, nb_episodes=10, visualize=False)
print(f"Mean reward: {np.mean(test_history.history['episode_reward']):.2f}")

In [19]:
%ls -l /kaggle/working
%rm -rf /kaggle/working/*

total 13220
-rw-r--r-- 1 root root     133 Jun 21 06:54 checkpoint
-rw-r--r-- 1 root root 6750266 Jun 21 06:54 dqn_SpaceInvaders-v0_weights_10000.h5f.data-00000-of-00001
-rw-r--r-- 1 root root     781 Jun 21 06:54 dqn_SpaceInvaders-v0_weights_10000.h5f.index
-rw-r--r-- 1 root root 6750266 Jun 21 06:54 dqn_SpaceInvaders-v0_weights_20000.h5f.data-00000-of-00001
-rw-r--r-- 1 root root     781 Jun 21 06:54 dqn_SpaceInvaders-v0_weights_20000.h5f.index
-rw-r--r-- 1 root root      15 Jun 21 06:54 dqn_state_10000.json
-rw-r--r-- 1 root root      14 Jun 21 06:54 dqn_state_20000.json
-rw-r--r-- 1 root root    1562 Jun 21 06:54 log_block_001.json
-rw-r--r-- 1 root root     243 Jun 21 06:54 log_block_002.json


In [None]:
import numpy as np
import json
import matplotlib.pyplot as plt

# The training cell writes one FileLogger file per block
# (log_block_001.json, log_block_002.json, ...); there is no single
# dqn_<env>_log.json file. Collect every block log in order and concatenate
# them into one history. Separate names are used here so that `log_path`
# (the working directory used by the training cell) is not overwritten.
log_dir = '/kaggle/working'
block_log_paths = sorted(
    os.path.join(log_dir, name)
    for name in os.listdir(log_dir)
    if name.startswith('log_block_') and name.endswith('.json')
)
if not block_log_paths:
    raise FileNotFoundError(f"No log_block_*.json files found in {log_dir}")

# Merge the per-block logs: every key maps to a list with one entry per episode.
history = {}
for block_log_path in block_log_paths:
    with open(block_log_path, 'r') as f:
        block_history = json.load(f)
    for key, values in block_history.items():
        history.setdefault(key, []).extend(values)

# Extract metrics (only the episode reward is mandatory).
rewards = history['episode_reward']
losses = history.get('loss', [])
mean_qs = history.get('mean_q', [])
mean_eps = history.get('mean_eps', [])

# === Cumulative (running) average helper ===
def cumulative_average(data):
    """Return the running mean of `data`, ignoring NaN entries.

    Element i of the result is the mean of all non-NaN values in data[:i + 1].
    Positions before the first valid value are NaN. For input without NaN this
    equals np.cumsum(data) / np.arange(1, len(data) + 1).

    NaN entries must be skipped because a plain cumulative sum would turn every
    later element into NaN (e.g. metrics logged as NaN for early episodes).

    Args:
        data: 1-D sequence of numbers (may be empty, may contain NaN).

    Returns:
        A float NumPy array with the same length as `data`.
    """
    values = np.asarray(data, dtype=float)
    valid = ~np.isnan(values)
    counts = np.cumsum(valid)                         # valid samples seen so far
    totals = np.cumsum(np.where(valid, values, 0.0))  # NaN contributes nothing
    averages = np.full(values.shape, np.nan)
    np.divide(totals, counts, out=averages, where=counts > 0)
    return averages

# === Per-episode training curves ===
def plot_with_running_mean(series, label, mean_label, title, ylabel):
    """Plot one per-episode metric together with its cumulative average (red)."""
    plt.figure(figsize=(12, 4))
    plt.plot(series, label=label)
    plt.plot(cumulative_average(series), label=mean_label, color='red', linewidth=2)
    plt.title(title)
    plt.xlabel('Episodio')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.legend()
    plt.show()


# The reward curve is always drawn.
plot_with_running_mean(rewards, 'Reward', 'Mean Reward (cumulative)',
                       'Recompensa por episodio', 'Recompensa')

# The remaining metrics are drawn only when the log contains them.
optional_curves = [
    (losses, 'Loss', 'Mean Loss (cumulative)',
     'Pérdida (loss) durante el entrenamiento', 'Loss'),
    (mean_qs, 'Mean Q', 'Mean Q (cumulative)',
     'Valor medio Q (mean_q)', 'Q'),
    (mean_eps, 'Epsilon', 'Mean Epsilon (cumulative)',
     'Exploración (epsilon) por episodio', 'Epsilon'),
]
for series, label, mean_label, title, ylabel in optional_curves:
    if series:
        plot_with_running_mean(series, label, mean_label, title, ylabel)
