# Deep Q-Network

### José Pablo Kiesling Lange - 21581

In [1]:
import matplotlib.pyplot as plt
import numpy as np

import gymnasium as gym

import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError

In [2]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all
# up front. The stable `tf.config.list_physical_devices` API is used here,
# matching the GPU-count check elsewhere in this notebook.
gpus = tf.config.list_physical_devices('GPU')
try:
    # With no GPU present the loop body simply never runs.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # TensorFlow raises RuntimeError when memory growth is requested after
    # the runtime has already been initialized; report it and continue.
    print(e)

In [3]:
# Report how many GPU devices TensorFlow can see.
print(
    "Num GPUs Available: ",
    len(tf.config.list_physical_devices(device_type='GPU')),
)

# Turn on logging of the device each operation is placed on.
# NOTE(review): this is verbose during training; consider disabling it once
# device placement has been confirmed.
tf.debugging.set_log_device_placement(True)

Num GPUs Available:  1


In [4]:
# Fixed seed so that draws from the NumPy generator below are repeatable.
SEED = 327

# NumPy random generator seeded with SEED.
rng = np.random.default_rng(seed=SEED)

In [5]:
# CartPole-v1 environment created with the "rgb_array" render mode.
env = gym.make(
    "CartPole-v1",
    render_mode="rgb_array",
)

El entorno de CartPole-v1 tiene un espacio de acciones discreto:

| Acción | Descripción                       |
|--------|-----------------------------------|
| 0      | Mover el carro a la izquierda     |
| 1      | Mover el carro a la derecha       |

Y el espacio de estados es continuo, con los siguientes valores:

| Estado        | Descripción                       | Mínimo | Máximo |
|---------------|-----------------------------------|--------|--------|
| Cart Position | Posición del carro                | -4.8   | 4.8    |
| Cart Velocity | Velocidad del carro               | -Inf   | Inf    |
| Pole Angle    | Ángulo del poste                 | -24°   | 24°    |
| Pole Velocity | Velocidad angular del poste       | -Inf   | Inf    |

## Q-Network

Dado que la entrada de la red será el estado, tendrá 4 nodos de entrada. La salida será el valor Q estimado para cada acción, por lo que tendrá 2 nodos de salida (uno por acción); la acción a tomar es la que tenga el mayor valor Q.

In [6]:
class QNetwork(Model):
    """Fully connected network that maps a state to one Q-value per action.

    The network has two 128-unit ReLU hidden layers followed by a linear
    output layer with ``output_dim`` units. Each instance also holds an Adam
    optimizer (``self.optimizer``) and a mean-squared-error loss
    (``self.loss_fn``) for use by training code.

    Note:
        ``predict`` is redefined here with a different signature from
        ``keras.Model.predict``: it takes a single state and returns a 1-D
        NumPy array of Q-values.
    """

    def __init__(self, input_dim=4, output_dim=2, lr=0.0001):
        """Create the layers, optimizer and loss.

        Args:
            input_dim: Number of features in one state vector.
            output_dim: Number of output units (one Q-value per action).
            lr: Learning rate passed to the Adam optimizer.
        """
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.lr = lr

        self.hidden1 = Dense(128, activation='relu')
        self.hidden2 = Dense(128, activation='relu')
        self.out = Dense(self.output_dim, activation='linear')

        self.optimizer = Adam(learning_rate=self.lr)
        self.loss_fn = MeanSquaredError()

        # Build with a batched input shape; the intent is for the weights to
        # exist before the first forward pass so that get_weights() and
        # set_weights() can be used right after construction.
        self.build((None, self.input_dim))

    def call(self, state):
        """Run a forward pass and return the Q-values as a tensor.

        Args:
            state: A single state vector (rank 1) or a batch of states
                (rank 2); anything ``tf.convert_to_tensor`` accepts as
                float32.

        Returns:
            A tensor with one row per state and ``output_dim`` columns. A
            rank-1 input is treated as a batch of one.
        """
        x = tf.convert_to_tensor(state, dtype=tf.float32)

        # Promote a single state to a batch of one. The rank is read from the
        # static shape so the condition is a plain Python bool rather than a
        # tensor, which keeps the check valid in graph mode as well as in
        # eager mode.
        if x.shape.rank == 1:
            x = tf.expand_dims(x, 0)

        x = self.hidden1(x)
        x = self.hidden2(x)
        q_values = self.out(x)
        return q_values

    def predict(self, state):
        """Return the Q-values of the first (or only) state as a NumPy array.

        Args:
            state: A single state vector. If a batch is passed, only the
                first row of the result is returned.

        Returns:
            A 1-D NumPy array with ``output_dim`` entries.
        """
        q_values = self(state)
        return q_values.numpy()[0]

    def predict_batch(self, states):
        """Return the Q-values for every state in ``states`` as a NumPy array.

        Args:
            states: A batch of state vectors.

        Returns:
            A 2-D NumPy array with one row per state.
        """
        q_values = self(states)
        return q_values.numpy()

    def hard_update(self, target_network):
        """Overwrite this network's weights with those of ``target_network``.

        Despite its name, ``target_network`` is the network the weights are
        read from; the network this method is called on is the one modified.

        Args:
            target_network: Network whose weights are copied into this one.
        """
        self.set_weights(target_network.get_weights())

    def soft_update(self, target_network, tau=0.1):
        """Blend the weights of ``target_network`` into this network.

        Every weight becomes ``tau * other + (1 - tau) * own``. Despite its
        name, ``target_network`` is the network the weights are read from;
        the network this method is called on is the one modified.

        Args:
            target_network: Network whose weights are mixed into this one.
            tau: Interpolation factor in [0, 1]. ``0`` leaves this network
                unchanged; ``1`` is equivalent to ``hard_update``.

        Raises:
            ValueError: If ``tau`` is outside [0, 1].
        """
        if not 0.0 <= tau <= 1.0:
            raise ValueError(f"tau must be in [0, 1], got {tau}")

        own_weights = self.get_weights()
        source_weights = target_network.get_weights()
        blended = [
            tau * source_w + (1 - tau) * own_w
            for own_w, source_w in zip(own_weights, source_weights)
        ]
        self.set_weights(blended)