In [7]:
# paquetes

import numpy as np
import gymnasium as gym

In [8]:
# dominancia de Pareto

def domina(v1, v2):
    """Component-wise Pareto comparison of two vectors.

    Returns True when every paired component of ``v1`` is less than or equal
    to the matching component of ``v2`` and at least one is strictly smaller.
    Components are paired with ``zip``, so extra trailing components of the
    longer vector are ignored.

    Note: ``no_dominado`` discards ``v`` whenever ``domina(v, u)`` is True, so
    under a maximisation reading a True result means ``v1`` is dominated by
    ``v2``.

    Args:
        v1: first vector (any re-iterable sequence of comparable numbers).
        v2: second vector.

    Returns:
        bool: result of the comparison described above.
    """
    # First pass: any component of v1 that is not <= its partner rules it out.
    for x, y in zip(v1, v2):
        if not x <= y:
            return False
    # Second pass: a single strictly smaller component is enough.
    for x, y in zip(v1, v2):
        if x < y:
            return True
    return False

In [9]:
# conjunto de vectores no dominados

def no_dominado(vectores):
    """Return the non-dominated subset of a collection of vectors.

    A vector ``v`` is kept when there is no ``u`` in ``vectores`` for which
    ``domina(v, u)`` is True. Exact duplicates (as judged by
    ``np.array_equal``) are collapsed to their first occurrence so that the
    result behaves like a set and cannot grow without bound when the same
    vector is re-added on every update.

    Args:
        vectores: iterable of vectors (lists, tuples or NumPy arrays).

    Returns:
        list: the surviving vectors, in their original order.
    """
    # Materialise once: the input is scanned repeatedly and may be a generator.
    candidatos = list(vectores)
    frente = []
    for v in candidatos:
        # domina(v, u) is False for equal vectors (no strict component), so
        # no explicit "skip identical vector" filter is needed here.
        if any(domina(v, u) for u in candidatos):
            continue
        # Equal vectors never dominate each other; drop repeats explicitly.
        if any(np.array_equal(v, visto) for visto in frente):
            continue
        frente.append(v)
    return frente

In [10]:
# parametros

# Presumably the discount factor handed to PQL(env, gamma); no call site is
# visible in this part of the notebook, so confirm against the cell that builds the agent.
gamma = 0.9

In [11]:
# Pareto Q-Learning (PQL)

class PQL:
    """Tabular Pareto Q-Learning (PQL) agent for vector-valued rewards.

    For every ``(state, action)`` pair the agent stores:

    * ``Q_set``  - list of non-dominated value vectors,
    * ``R_avg``  - running mean of the immediate reward vector,
    * ``visits`` - number of updates applied to that pair.

    The environment must expose ``n_states``, ``n_actions``, ``reward_dim``,
    ``action_space.sample()``, ``reset()`` and ``step(action)``. States are
    used as dictionary keys, so they must be hashable, and the tables are
    created for the states in ``range(env.n_states)``.
    """

    def __init__(self, env, gamma):
        """Store the environment and discount factor; tables start empty.

        Args:
            env: environment object (see the class docstring for what is used).
            gamma: multiplier applied to successor value vectors in
                ``actualizar``.
        """
        self.env = env
        self.gamma = gamma
        self.Q_set = dict()   # (s, a) -> list of value vectors
        self.R_avg = dict()   # (s, a) -> mean immediate reward vector
        self.visits = dict()  # (s, a) -> number of updates of that pair

    def start(self):
        """(Re)initialise all tables for every state-action pair."""
        for s in range(self.env.n_states):
            for a in range(self.env.n_actions):
                self.Q_set[(s, a)] = []
                self.R_avg[(s, a)] = np.zeros(self.env.reward_dim)
                self.visits[(s, a)] = 0

    def escalarizacion(self, s, pesos):
        """Choose an action for state ``s`` by linear scalarisation.

        Every vector ``q`` in ``Q_set[(s, a)]`` is scored with
        ``np.dot(pesos, q)``; the action owning the highest score wins. Ties
        keep the lowest action index because the comparison is strict.

        Args:
            s: current state.
            pesos: weight vector with one entry per objective.

        Returns:
            The best-scoring action, or ``env.action_space.sample()`` when no
            action of ``s`` has any stored vector yet.
        """
        # NOTE(review): selection is purely greedy; random actions only happen
        # while every Q-set of the state is still empty.
        best_score = -np.inf
        best_action = None
        for a in range(self.env.n_actions):
            for q in self.Q_set[(s, a)]:
                score = np.dot(pesos, q)
                if score > best_score:
                    best_score = score
                    best_action = a
        if best_action is None:
            return self.env.action_space.sample()
        return best_action

    def actualizar(self, s, a, r, ss):
        """Update the statistics and Q-set of ``(s, a)`` after one transition.

        Args:
            s: state the action was taken in.
            a: action taken.
            r: reward vector received (array-like, one entry per objective).
            ss: successor state.
        """
        r = np.asarray(r)
        self.visits[(s, a)] += 1
        n = self.visits[(s, a)]
        # Incremental mean of the immediate reward vector.
        self.R_avg[(s, a)] = self.R_avg[(s, a)] + (r - self.R_avg[(s, a)]) / n

        # Non-dominated value vectors of the successor state, over all actions.
        sucesores = [
            q
            for a_succ in range(self.env.n_actions)
            for q in self.Q_set[(ss, a_succ)]
        ]
        prox = no_dominado(sucesores)

        # Candidate returns: reward plus discounted successor vectors, or the
        # bare reward when the successor has nothing stored yet.
        # NOTE(review): the raw reward ``r`` is used here, not ``R_avg``;
        # confirm that this is intended for stochastic rewards.
        retornos = [r + self.gamma * q for q in prox] if prox else [r]
        self.Q_set[(s, a)] = no_dominado(self.Q_set[(s, a)] + retornos)

    def _reiniciar_entorno(self):
        """Reset the environment and return only the initial state."""
        resultado = self.env.reset()
        # Gymnasium-style reset() returns (observation, info_dict); older
        # environments return the observation alone.
        if (
            isinstance(resultado, tuple)
            and len(resultado) == 2
            and isinstance(resultado[1], dict)
        ):
            return resultado[0]
        return resultado

    def _avanzar_entorno(self, a):
        """Apply action ``a`` and return ``(next_state, reward, done)``."""
        resultado = self.env.step(a)
        if len(resultado) == 5:
            # Gymnasium-style: (obs, reward, terminated, truncated, info).
            ss, r, terminated, truncated, _ = resultado
            return ss, r, bool(terminated or truncated)
        # Older style: (obs, reward, done, info).
        ss, r, done, _ = resultado
        return ss, r, done

    def entrenamiento(self, episodios, pesos):
        """Run ``episodios`` training episodes, acting via ``escalarizacion``.

        The tables are re-initialised with ``start()`` before training, so any
        previously learned values are discarded.

        Args:
            episodios: number of episodes to run.
            pesos: weight vector forwarded to ``escalarizacion``.
        """
        self.start()
        for _ in range(episodios):
            s = self._reiniciar_entorno()
            done = False
            while not done:
                a = self.escalarizacion(s, pesos)
                ss, r, done = self._avanzar_entorno(a)
                self.actualizar(s, a, np.array(r), ss)
                s = ss