In [None]:
# paquetes
import numpy as np
import gymnasium as gym

In [None]:
# Pareto dominance
def domina(v1, v2):
    """Return True if ``v1`` is <= ``v2`` everywhere and < ``v2`` somewhere.

    This is Pareto dominance of ``v1`` over ``v2`` when objectives are
    minimised; when objectives are maximised, a True result means that
    ``v2`` dominates ``v1``.

    Args:
        v1: Iterable of comparable components.
        v2: Iterable of comparable components.

    Returns:
        bool: True when every compared component satisfies
        ``v1_i <= v2_i`` and at least one satisfies ``v1_i < v2_i``.
        Empty inputs yield False. Components are paired with ``zip``, so
        if the lengths differ only the common prefix is compared.
    """
    strictly_less = False
    # Single pass over the pairs, so one-shot iterables (iterators,
    # generators) are handled correctly instead of being exhausted by a
    # first traversal.
    for a, b in zip(v1, v2):
        if not a <= b:
            return False
        if a < b:
            strictly_less = True
    return strictly_less

In [None]:
# set of non-dominated vectors
def no_dominado(vectores):
    """Filter ``vectores`` down to those for which ``domina`` never fires.

    A vector ``v`` is kept when ``domina(v, u)`` is False for every ``u`` in
    ``vectores`` that is not element-wise equal to ``v``; i.e. ``v`` is not
    component-wise <= (with one strict inequality) any other vector.

    Args:
        vectores: Collection of vectors (anything ``np.array_equal`` and
            ``domina`` accept).

    Returns:
        list: The kept vectors, in their original order. Vectors that are
        equal to each other are all kept (duplicates are not removed).
    """
    def _pierde_contra_otro(candidato):
        # True as soon as some *different* vector triggers `domina`.
        for otro in vectores:
            if np.array_equal(candidato, otro):
                continue
            if domina(candidato, otro):
                return True
        return False

    return [vec for vec in vectores if not _pierde_contra_otro(vec)]

In [None]:
# Parameters
# Module-level discount factor. Note that the PQL class below keeps its own
# `gamma` (constructor default 0.9) and does not read this global, so it must
# be passed explicitly, e.g. PQL(env, gamma=gamma).
gamma = 1

In [None]:
# Pareto Q-Learning (PQL)

class PQL:
    """Tabular multi-objective agent that stores a set of vectors per (s, a).

    The environment must expose ``n_states``, ``n_actions`` and
    ``reward_dim`` (used to build the tables), ``action_space.sample()``
    (used for random actions), and ``reset()`` / ``step(action)``.
    States are used as dictionary keys, so they must be hashable.

    Attributes:
        env: The wrapped environment.
        gamma: Discount factor applied to successor vectors.
        Q_set: ``Q_set[(s, a)]`` is a list of distinct, mutually
            non-dominated value vectors (larger is better).
        R_avg: ``R_avg[(s, a)]`` is the running mean of observed rewards.
        visits: ``visits[(s, a)]`` is the number of updates of that pair.

    NOTE(review): published Pareto Q-learning builds the Q-set from the
    *average* reward plus the discounted non-dominated successor set. Here
    ``R_avg`` is tracked but ``Q_set`` is built from the sampled reward and
    merged with the previous set. That behaviour is kept as-is; confirm it
    is intended.
    """

    def __init__(self, env, gamma=0.9):
        self.env = env
        self.gamma = gamma
        self.Q_set = {}   # Q_set[(s, a)] = [vec1, vec2, ...]
        self.R_avg = {}   # R_avg[(s, a)] = mean reward vector
        self.visits = {}  # visits[(s, a)] = counter

    @staticmethod
    def _pareto_front(vectors):
        """Return the distinct vectors not dominated by another (maximising).

        Self-contained so the class does not depend on a module-level
        helper. Exact duplicates are collapsed; otherwise equal vectors
        (which never dominate each other) would pile up on every update.
        """
        unique = []
        for vec in vectors:
            vec = np.asarray(vec, dtype=float)
            if not any(np.array_equal(vec, kept) for kept in unique):
                unique.append(vec)
        return [
            vec for vec in unique
            if not any(np.all(other >= vec) and np.any(other > vec)
                       for other in unique)
        ]

    def initialize(self):
        """Reset all tables to empty sets, zero mean rewards, zero visits."""
        for s in range(self.env.n_states):
            for a in range(self.env.n_actions):
                self.Q_set[(s, a)] = []
                self.R_avg[(s, a)] = np.zeros(self.env.reward_dim)
                self.visits[(s, a)] = 0

    def select_action(self, s, weights, epsilon=0.0):
        """Pick an action in ``s`` by linear scalarisation of the Q-sets.

        Args:
            s: Current state.
            weights: Weight vector; each stored vector ``q`` is scored as
                ``np.dot(weights, q)``.
            epsilon: Probability of returning a random action instead of
                the greedy one. The default 0.0 is purely greedy.

        Returns:
            The action owning the highest-scoring vector (first one wins on
            ties), or ``env.action_space.sample()`` when exploring or when
            no vector is stored for ``s``.
        """
        if epsilon > 0 and np.random.random() < epsilon:
            return self.env.action_space.sample()

        best_score = -np.inf
        best_action = None
        for a in range(self.env.n_actions):
            # .get: a state outside the initialised table simply has no vectors.
            for q in self.Q_set.get((s, a), ()):
                score = np.dot(weights, q)
                if score > best_score:
                    best_score = score
                    best_action = a
        # Fall back to a random action when there is nothing to rank.
        if best_action is None:
            return self.env.action_space.sample()
        return best_action

    def update(self, s, a, r, s_next):
        """Update the tables for the transition ``(s, a) -> s_next``.

        ``(s, a)`` must already exist in the tables (see ``initialize``);
        otherwise a ``KeyError`` is raised.

        Args:
            s: State the action was taken in.
            a: Action taken.
            r: Reward vector observed.
            s_next: Resulting state; if it has no stored vectors the bare
                reward is used as the only candidate.
        """
        key = (s, a)
        r = np.asarray(r, dtype=float)

        self.visits[key] += 1
        self.R_avg[key] += (r - self.R_avg[key]) / self.visits[key]

        # Non-dominated vectors reachable from s_next over all actions.
        successors = [
            q
            for a_prime in range(self.env.n_actions)
            for q in self.Q_set.get((s_next, a_prime), ())
        ]
        future = self._pareto_front(successors)

        # reward + gamma * q' for each successor vector
        candidates = [r + self.gamma * q for q in future] if future else [r]
        self.Q_set[key] = self._pareto_front(self.Q_set[key] + candidates)

    def _reset_env(self):
        """Reset the env and return the state for both Gym API flavours."""
        out = self.env.reset()
        # Gymnasium returns (observation, info-dict); legacy Gym returns
        # the observation alone.
        if isinstance(out, tuple) and len(out) == 2 and isinstance(out[1], dict):
            return out[0]
        return out

    def _step_env(self, action):
        """Step the env and return ``(s_next, reward, done)``."""
        out = self.env.step(action)
        if len(out) == 5:
            # Gymnasium: (obs, reward, terminated, truncated, info)
            s_next, r, terminated, truncated, _ = out
            return s_next, r, bool(terminated or truncated)
        # Legacy Gym: (obs, reward, done, info)
        s_next, r, done, _ = out
        return s_next, r, done

    def train(self, episodes, weights, *, epsilon=0.0, max_steps=None):
        """Run ``episodes`` episodes, re-initialising the tables first.

        Args:
            episodes: Number of episodes to run.
            weights: Scalarisation weights passed to ``select_action``.
            epsilon: Exploration probability passed to ``select_action``.
            max_steps: Optional cap on steps per episode; ``None`` means an
                episode only ends when the environment reports it is done.
        """
        self.initialize()
        for _ in range(episodes):
            s = self._reset_env()
            done = False
            steps = 0
            while not done and (max_steps is None or steps < max_steps):
                a = self.select_action(s, weights, epsilon)
                s_next, r, done = self._step_env(a)
                r_vec = np.array(r)  # reward vector
                self.update(s, a, r_vec, s_next)
                s = s_next
                steps += 1