# Semantic Text Similarity
Este modelo utiliza gensim para convertir pares de vectores + puntuaciones en vectores (word embeddings).
Dado un dataset, infiere la puntuación de similitud entre ambas frases.

In [1]:
# Requisitos
from gensim.models import TfidfModel
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
import numpy as np

In [2]:
# Tipado
from typing import Tuple, List

Definir base de datos:

In [3]:
# Cargar stopwords en Catalan
# STOPWORDS_CA = {"a", "abans", "ací", "ah", "així", "això", "al", "aleshores", "algun", "alguna", "algunes", "alguns", "alhora", "allà", "allí", "allò", "als", "altra", "altre", "altres", "amb", "ambdues", "ambdós", "anar", "ans", "apa", "aquell", "aquella", "aquelles", "aquells", "aquest", "aquesta", "aquestes", "aquests", "aquí", "baix", "bastant", "bé", "cada", "cadascuna", "cadascunes", "cadascuns", "cadascú", "com", "consegueixo", "conseguim", "conseguir", "consigueix", "consigueixen", "consigueixes", "contra", "d'un", "d'una", "d'unes", "d'uns", "dalt", "de", "del", "dels", "des", "des de", "després", "dins", "dintre", "donat", "doncs", "durant", "e", "eh", "el", "elles", "ells", "els", "em", "en", "encara", "ens", "entre", "era", "erem", "eren", "eres", "es", "esta", "estan", "estat", "estava", "estaven", "estem", "esteu", "estic", "està", "estàvem", "estàveu", "et", "etc", "ets", "fa", "faig", "fan", "fas", "fem", "fer", "feu", "fi", "fins", "fora", "gairebé", "ha", "han", "has", "haver", "havia", "he", "hem", "heu", "hi", "ho", "i", "igual", "iguals", "inclòs", "ja", "jo", "l'hi", "la", "les", "li", "li'n", "llarg", "llavors", "m'he", "ma", "mal", "malgrat", "mateix", "mateixa", "mateixes", "mateixos", "me", "mentre", "meu", "meus", "meva", "meves", "mode", "molt", "molta", "moltes", "molts", "mon", "mons", "més", "n'he", "n'hi", "ne", "ni", "no", "nogensmenys", "només", "nosaltres", "nostra", "nostre", "nostres", "o", "oh", "oi", "on", "pas", "pel", "pels", "per", "per que", "perquè", "però", "poc", "poca", "pocs", "podem", "poden", "poder", "podeu", "poques", "potser", "primer", "propi", "puc", "qual", "quals", "quan", "quant", "que", "quelcom", "qui", "quin", "quina", "quines", "quins", "què", "s'ha", "s'han", "sa", "sabem", "saben", "saber", "sabeu", "sap", "saps", "semblant", "semblants", "sense", "ser", "ses", "seu", "seus", "seva", "seves", "si", "sobre", "sobretot", "soc", "solament", "sols", "som", "son", "sons", "sota", "sou", "sóc", "són", "t'ha", "t'han", "t'he", "ta", "tal", "també", "tampoc", "tan", "tant", "tanta", "tantes", "te", "tene", "tenim", "tenir", "teniu", "teu", "teus", "teva", "teves", "tinc", "ton", "tons", "tot", "tota", "totes", "tots", "un", "una", "unes", "uns", "us", "va", "vaig", "vam", "van", "vas", "veu", "vosaltres", "vostra", "vostre", "vostres", "érem", "éreu", "és", "éssent", "últim", "ús"}
STOPWORDS_CA = {"a", "al", "el", "la", "els", "les", "de", "un", "una", "algun", "alguna", }

In [4]:
# Definir función de pre-procesado
def preprocess(sentence: str) -> List[str]:
    preprocessed = simple_preprocess(sentence) # Tokenización y normalización, lematización, minúsculas
    # Eliminar stopwords
    preprocessed = [token for token in preprocessed if token not in STOPWORDS_CA]
    return preprocessed

In [5]:
from datasets import load_dataset
# Text Similarity (STS) dataset (principal per la Pràctica 4)
train = load_dataset("projecte-aina/sts-ca", split="train")
test = load_dataset("projecte-aina/sts-ca", split="test")
val = load_dataset("projecte-aina/sts-ca", split="validation")
all_data = load_dataset("projecte-aina/sts-ca", split="all")
all_data

Dataset({
    features: ['id', 'sentence_1', 'sentence_2', 'label'],
    num_rows: 3073
})

In [55]:
def map_corpus(corpus):
    sentences_1_preproc = [simple_preprocess(d["sentence_1"]) for d in corpus] #lista de listas que son oraciones lematizadas
    sentences_2_preproc = [simple_preprocess(d["sentence_2"]) for d in corpus]
    scores = [d["label"] for d in corpus]
    sentence_pairs = list(zip(sentences_1_preproc, sentences_2_preproc, scores))
    return sentence_pairs

train_preproc = map_corpus(train)
test_preproc = map_corpus(test)
val_preproc = map_corpus(val)

Word Embedding pre-entrenado:

In [56]:
"""
# Modelos pre-entrenados
# WV_MODEL_PATH = "/Users/salva/Downloads/cc.ca.300.bin.gz"
WV_MODEL_PATH = '/Users/salva/Downloads/cc.ca.300.vec.gz'
import gensim
wv_model =  gensim.models.KeyedVectors.load_word2vec_format(WV_MODEL_PATH, binary=False)
wv_model
"""

'\n# Modelos pre-entrenados\n# WV_MODEL_PATH = "/Users/salva/Downloads/cc.ca.300.bin.gz"\nWV_MODEL_PATH = \'/Users/salva/Downloads/cc.ca.300.vec.gz\'\nimport gensim\nwv_model =  gensim.models.KeyedVectors.load_word2vec_format(WV_MODEL_PATH, binary=False)\nwv_model\n'

In [57]:
from gensim.models.fasttext import FastTextKeyedVectors
#cargar como map:
wv_model = FastTextKeyedVectors.load('/home/taya/Desktop/cc.ca.gensim.bin', mmap='r')

de dimensión reducida:

In [58]:
wv_model_50d = {
    word: wv_model[word][:50]
    for word in wv_model.index_to_key
}

wv_model_100d = {
    word: wv_model[word][:100]
    for word in wv_model.index_to_key
}
wv_model_150d = {
    word: wv_model[word][:150]
    for word in wv_model.index_to_key
}

Construccion del diccionario:

In [10]:
# Preprocesamiento de las oraciones y creación del diccionario
sentences_1_preproc = [simple_preprocess(d["sentence_1"]) for d in all_data] #lista de listas que son oraciones lematizadas
sentences_2_preproc = [simple_preprocess(d["sentence_2"]) for d in all_data]
scores = [d["label"] for d in all_data]
sentence_pairs = list(zip(sentences_1_preproc, sentences_2_preproc, scores))#lista de tuplas que son ([palabras or1], [pal or 2], score)
# Versión aplanada para poder entrenar el modelo
sentences_pairs_flattened = sentences_1_preproc + sentences_2_preproc #todas las oraciones juntas
diccionario = Dictionary(sentences_pairs_flattened) # diccionario donde cada palabra tiene un indice unico
diccionario

<gensim.corpora.dictionary.Dictionary at 0x7cccca037fb0>

Construccion de la metriz TF-IDF:

In [60]:
# Cálculo de los pesos TF-IDF para las oraciones pre-procesadas
corpus = [diccionario.doc2bow(sent) for sent in sentences_pairs_flattened]
"""
Por ejemplo, si sent es ['hola', 'mundo', 'hola'], el resultado de diccionario.doc2bow(sent) podría ser [(0, 2), (1, 1)], donde 0 es el índice de "hola" y 1 es el índice de "mundo", indicando que "hola" aparece 2 veces y "mundo" aparece 1 vez.
corpus = El resultado es una lista de representaciones de bolsa de palabras, donde cada elemento corresponde a una oración en el conjunto de datos.
"""
modelo_tfidf = TfidfModel(corpus) #transformar el corpus en una representación que refleja la importancia de las palabras en cada documento en relación con el corpus completo.

Aregación:

In [61]:
def map_tf_idf(sentence_preproc: List[str], dictionary: Dictionary, tf_idf_model: TfidfModel, model = wv_model) -> Tuple[List[np.ndarray], List[float]]:
    """
    lo que hace es que coge una oracion preprocesada, para cada palabra saca sus pesos TF-IDF y su vector en el embeding
    """
    bow = dictionary.doc2bow(sentence_preproc)#cuenta la frecuencia de cada palabra en la oracion
    tf_idf = tf_idf_model[bow] 
    vectors, weights = [], []
    for word_index, weight in tf_idf:
        word = dictionary.get(word_index)
        if word in model:
            vectors.append(model[word])
            weights.append(weight)
    return vectors, weights

def map_pairs(wv_model2, sentence_pairs: List[Tuple[str, str, float]],dictionary: Dictionary = None, tf_idf_model: TfidfModel = None,) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    """
    Mapea los tripletes de oraciones a listas de (x, y), (pares de vectores, score)
    :param sentence_pairs: lista de tuplas que son ([palabras or1], [palabras or2], score)
    :param dictionary: diccionario donde cada palabra tiene un indice unico
    :param tf_idf_model: objeto TfidfModel que da los pesos de las palabras (se puede indexar con un bag of words)
    :return: lista de ((vector1, vector2), similitud), donde vector1 y vector2 cambian en funcion de:
        si tf_idf_model is not None:
                para cada elemento de sentence_pairs devuelve el vector embeding promediado de manera ponderada por los pesos de la matriz TF-IDF de las palabras de las oraciones 1 y 2.
        si tf_idf_model is not None
            el promedio de los vectores de embeding de las palabras que componen cada una de las oraciones
    """
    # Mapeo de los pares de oraciones a pares de vectores
    pares_vectores = []
    for i, (sentence_1, sentence_2, similitud) in enumerate(sentence_pairs):
        sentence_1_preproc = preprocess(sentence_1) if isinstance(sentence_1, str) else sentence_1 # se procesa el texto antes de aplicar map_pairs entonces sentence_1 es una lista de tokens y ya nose vuelve a preprocesar
        sentence_2_preproc = preprocess(sentence_2) if isinstance(sentence_2, str) else sentence_2
        # Si usamos TF-IDF
        if tf_idf_model is not None:
            # Cálculo del promedio ponderado por TF-IDF de los word embeddings
            vectors1, weights1 = map_tf_idf(sentence_1_preproc, dictionary=dictionary, tf_idf_model=tf_idf_model,model =  wv_model2, )
            vectors2, weights2 = map_tf_idf(sentence_2_preproc, dictionary=dictionary, tf_idf_model=tf_idf_model, model = wv_model2 )
            vector1 = np.average(vectors1, weights=weights1, axis=0, ) #Esta función calcula el promedio de un conjunto de valores. Si se proporciona un argumento weights, el promedio se calcula de manera ponderada, lo que significa que cada valor contribuye al promedio de acuerdo con su peso correspondiente.
            vector2 = np.average(vectors2, weights=weights2, axis=0, )
        else:
            # Cálculo del promedio de los word embeddings
            vectors1 = [wv_model2[word] for word in sentence_1_preproc if word in wv_model2]
            vectors2 = [wv_model2[word] for word in sentence_2_preproc if word in wv_model2]
            vector1 = np.mean(vectors1, axis=0)
            vector2 = np.mean(vectors2, axis=0)
        # Añadir a la lista
        pares_vectores.append(((vector1, vector2), similitud))
    return pares_vectores

In [None]:
mapped_no_tfidf = map_pairs(wv_model, train_preproc, tf_idf_model=None, dictionary=diccionario, )
mapped_train_tfidf = map_pairs(wv_model,train_preproc, tf_idf_model=modelo_tfidf, dictionary=diccionario, )

mapped_val_no_tfidf = map_pairs(wv_model,val_preproc, tf_idf_model=None, dictionary=diccionario, )
mapped_val_tfidf = map_pairs(wv_model,val_preproc, tf_idf_model=modelo_tfidf, dictionary=diccionario, )

mapped_train_tfidf[0]# Imprimir los pares de vectores y la puntuación de similitud asociada

((array([-7.22131877e-03, -4.88683421e-03,  2.71017708e-02,  2.32332627e-02,
         -9.60097482e-03, -3.16095435e-03,  2.90225599e-02, -1.75413820e-02,
          2.93095319e-02, -1.60574403e-02, -6.04936805e-03,  1.49908545e-02,
          1.00934507e-02,  1.84449753e-02,  2.16156266e-02,  2.13238810e-02,
          8.69774586e-03,  6.13958934e-02,  2.18719114e-02,  1.01426407e-02,
          1.16837708e-02,  3.24588305e-03, -1.32856906e-02,  5.77104877e-02,
          1.54627187e-02,  2.13443526e-02, -3.82471218e-02,  6.23983637e-03,
          7.31161386e-04,  8.99225741e-03, -4.87626204e-03,  1.08773269e-02,
          1.30313145e-02, -3.86450925e-03,  7.23370200e-03, -1.75266524e-02,
         -9.09778208e-03,  4.43412138e-02, -4.31998409e-04,  6.25044253e-04,
         -1.13920750e-02, -1.82465011e-02, -8.11444328e-03, -8.18518457e-03,
         -3.54176235e-03, -1.60820262e-01,  7.74505797e-03,  9.80261699e-03,
          8.53058034e-03, -1.23878019e-02,  1.24134202e-02, -2.39070017e-03,

De dimensión reducida: <sb>

sí usando pesos TF-IDF

In [27]:
mapped_50 = map_pairs(wv_model_50d, train_preproc, tf_idf_model=modelo_tfidf, dictionary=diccionario, )
mapped_100 = map_pairs(wv_model_100d, train_preproc, tf_idf_model=modelo_tfidf, dictionary=diccionario, )
mapped_150 = map_pairs(wv_model_150d, train_preproc, tf_idf_model=modelo_tfidf, dictionary=diccionario, )


mapped_val_50 = map_pairs(wv_model_50d, val_preproc, tf_idf_model=modelo_tfidf, dictionary=diccionario, )
mapped_val_100 = map_pairs(wv_model_100d, val_preproc, tf_idf_model=modelo_tfidf, dictionary=diccionario, )
mapped_val_150 = map_pairs(wv_model_150d, val_preproc, tf_idf_model=modelo_tfidf, dictionary=diccionario, )

In [28]:
print(mapped_no_tfidf[0][0][0].shape)
print(mapped_train_tfidf[0][0][0].shape)
print(mapped_50[0][0][0].shape)
print(mapped_100[0][0][0].shape)
print(mapped_150[0][0][0].shape)

(300,)
(300,)
(50,)
(100,)
(150,)


Diferentes modelos:

In [None]:
# Definir el Modelo
import tensorflow as tf

def build_and_compile_model(hidden_size: int = 128, embedding_size: int = 300, learning_rate: float = 0.001) -> tf.keras.Model:
    """
    Esto crea una red neuronal de manera que al entrenarla las distancias coseno cuadren con la etiqueta real
    hidden_size: Tamaño de capas ocultas (no se usa en este código)
    embedding_size: Dimensión de los vectores de entrada (300)
    learning_rate: Tasa de aprendizaje para el optimizador
    """
    # Capa de entrada para los pares de vectores
    input_1 = tf.keras.Input(shape=(embedding_size,)) #los pares de vectores a comparar
    input_2 = tf.keras.Input(shape=(embedding_size,))

    # Capa oculta, con funcion de activacion lineal, tiene como objetivo proyectar los vectores de entrada en un nuevo espacio.
    """
    La capa oculta (en este caso, la capa densa) tiene pesos que se ajustan durante el entrenamiento.
    Estos pesos son los que transforman los vectores de entrada en los vectores proyectados
    """
    first_projection = tf.keras.layers.Dense(
        embedding_size,
        # activation='tanh',
        kernel_initializer=tf.keras.initializers.Identity(),# inicializa los pesos de la capa como una matriz identidad
        bias_initializer=tf.keras.initializers.Zeros(),
    )
    #aplica la capa de proyeccion a los dos vectores de entrada
    projected_1 = first_projection(input_1)
    projected_2 = first_projection(input_2)
    """
    # Compute the cosine distance
    projected_1 = tf.linalg.l2_normalize(projected_1, axis=1, ) #Normaliza ambos vectores para que tengan magnitud 1, necesario para el cálculo de similitud coseno
    projected_2 = tf.linalg.l2_normalize(projected_2, axis=1, )
    output = 2.5 * (1.0 + tf.reduce_sum(projected_1 * projected_2, axis=1, ))
    """ 
    #lo comentado es del profe y no va. Esto es del Chat############################################################################################
    normalize = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))
    projected_1 = normalize(projected_1)
    projected_2 = normalize(projected_2)
    output = tf.keras.layers.Lambda(lambda tensors: 2.5 * (1.0 + tf.reduce_sum(tensors[0] * tensors[1], axis=1)))([projected_1, projected_2])
    ############################################################################################################################################
    # Definir el modelo con las capas de entrada y salida
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output) #Durante el entrenamiento, Keras ajusta los pesos de la capa oculta para minimizar la función de pérdida definida (en este caso, el error absoluto medio).

    # Compilar el modelo
    model.compile(loss='mean_absolute_error',
                  optimizer=tf.keras.optimizers.Adam(learning_rate))

    return model

En las transparencias para el modelo 1 pone este codigo:

In [30]:
def build_model_aggregated(embedding_dim: int, hidden_size: int = 128, dropout_rate: float = 0.3) -> tf.keras.Model:
    input_1 = tf.keras.Input(shape=(embedding_dim,), name="input_vector_1")
    input_2 = tf.keras.Input(shape=(embedding_dim,), name="input_vector_2")
    concatenated = tf.keras.layers.Concatenate(axis=-1)([input_1, input_2])
    x = tf.keras.layers.BatchNormalization()(concatenated)
    x = tf.keras.layers.Dense(hidden_size, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(dropout_rate)(x)
    output = tf.keras.layers.Dense(1)(x) # Activació lineal per a regressió
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)
    model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  metrics=['mae', tf.keras.metrics.RootMeanSquaredError()])
    return model

#model_agg.fit([X1_train, X2_train], Y_train, epochs=..., batch_size=...)

🧠 Modelo build_model_aggregated: Concatenación + Red Neuronal Densa
🏗️ Arquitectura:

    Toma dos vectores de entrada (input_vector_1, input_vector_2).

    Los concatena (Concatenate), por lo que la dimensión del vector combinado es el doble del embedding_dim.

    Aplica:

        BatchNormalization

        Dense con ReLU

        Otro BatchNormalization

        Dropout

        Una capa de salida densa sin activación (regresión lineal).

    Se entrena con MSE (Mean Squared Error).

🧾 Objetivo implícito:

    Aprender una función no lineal entre los vectores concatenados y la puntuación objetivo (p. ej., similitud STS, afinidad, etc.).

    Aprende una transformación compleja basada en composición conjunta de los dos vectores.

📌 Ventajas:

    Flexibilidad para aprender patrones complejos.

    Permite capturar interacciones no lineales entre los dos vectores.

❗ Consideraciones:

    Puede sobreajustarse si el dataset es pequeño.

    Requiere más parámetros, por lo tanto más datos para entrenar bien.

🧠 build_and_compile_model: Proyección + Similitud Coseno
🏗️ Arquitectura:

    Aplica una capa densa compartida (misma proyección) a cada vector por separado. Inicialmente es una matriz identidad.

    Los normaliza a magnitud 1.

    Calcula la similitud coseno como cos(θ) = dot(product).

    La salida es 2.5 * (1 + similitud_coseno), lo cual transforma el rango [-1, 1] a [0, 5].

    Se entrena con MAE (Mean Absolute Error).

🧾 Objetivo implícito:

    Aprender una proyección donde la similitud coseno refleje la puntuación deseada (por ejemplo, cuán similares son dos textos o frases).

    Optimiza directamente sobre una función interpretable (similitud coseno), útil para tareas tipo semantic textual similarity (STS).

📌 Ventajas:

    Muy interpretativo.

    Más simple y menos propenso a sobreajuste.

    Funciona bien cuando la relación entre embeddings y similitud es principalmente angular (coseno).

❗ Consideraciones:

    Menor capacidad expresiva que el Modelo 1.

    Asume que la similitud se puede modelar bien con una proyección lineal + coseno.

In [31]:

def build_and_compile_model2(embedding_size: int = 300, learning_rate: float = 0.001) -> tf.keras.Model:
    # Input layer
    input_1 = tf.keras.Input(shape=(embedding_size,), name="input_vector_1")
    input_2 = tf.keras.Input(shape=(embedding_size,), name="input_vector_2")

    # hidden layer
    first_projection_layer = tf.keras.layers.Dense(
        embedding_size,
        activation='tanh',
        kernel_initializer=tf.keras.initializers.Identity(),
        bias_initializer=tf.keras.initializers.Zeros(),
        name="projection_layer"
    )
    dropout = tf.keras.layers.Dropout(0.3, name="projection_dropout")
    projected_1_dense = dropout(first_projection_layer(input_1))
    projected_2_dense = dropout(first_projection_layer(input_2))

    # Normalize the projected vectors using Lambda layers
    normalized_1 = tf.keras.layers.Lambda(
        lambda x: tf.linalg.l2_normalize(x, axis=1), name="normalize_1"
    )(projected_1_dense)
    normalized_2 = tf.keras.layers.Lambda(
        lambda x: tf.linalg.l2_normalize(x, axis=1), name="normalize_2"
    )(projected_2_dense)

    # Compute the custom similarity score using a Lambda layer
    similarity_sum = tf.keras.layers.Lambda(
        lambda x: tf.reduce_sum(x[0] * x[1], axis=1, keepdims=True), name="similarity_sum"
    )([normalized_1, normalized_2])

    output = tf.keras.layers.Lambda(
        lambda x: 0.5 * (1.0 + x), name="output_scaling" #cambiar 0.5 por 2.5 para que este entre 0 y 5 
    )(similarity_sum)

    # Definir el modelo con las capas de entrada y salida
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output, name="similarity_model")

    # Compilar el modelo
    model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    )

    return model


| Característica             | Modelo 1: Cosine (`build_and_compile_model2`) | Modelo 2: MLP (`build_model_aggregated`) |
| -------------------------- | -------------------------------------------- | ---------------------------------------- |
| Tipo de entrada            | 2 vectores                                   | 2 vectores concatenados                  |
| Proyección                 | Capa densa compartida                        | Dense normal (no compartida)             |
| Normalización L2           | ✅ sí                                         | ❌ no                                     |
| Métrica implícita          | Cosine similarity                            | No definida; aprende desde los datos     |
| Salida                     | Escalado de coseno (rango 0-1 o 0-5)         | Escalar libre (regresión lineal)         |
| Capacidad expresiva        | Limitada (coseno + proyección)               | Alta (MLP)                               |
| Interpretabilidad          | Alta                                         | Media                                    |
| Velocidad de entrenamiento | Más rápido                                   | Más lento                                |


.

| Aspecto                         | build_and_compile_model                  | `build_and_compile_model2`                              |
| ------------------------------- | ----------------------------------------- | ------------------------------------------------------ |
| **Activación en la proyección** | Ninguna (lineal)                          | `tanh` (no lineal)                                     |
| **Regularización**              | No hay                                    | Sí, con `Dropout`                                      |
| **Normalización**               | Directamente con `tf.linalg.l2_normalize` | Igual, pero encapsulada en `Lambda` layers con nombres |
| **Similitud coseno**            | `2.5 * (1.0 + coseno)`                    | `0.5 * (1.0 + coseno)`                                 |
| **Escalado de salida**          | Rango `[0, 5]`                            | Rango `[0, 1]`                                         |
| **Perdida**                     | `mean_absolute_error`                     | `mean_squared_error`                                   |
| **Estilo**                      | Más directo, menos modular                | Más claro, modular, con nombres de capas               |


.

Training y evaluación:

In [37]:
# Definir constantes de entrenamiento
batch_size: int = 64
num_epochs: int = 64
train_val_split: float = 0.8

In [38]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr

def evaluate_model(model, X1_test, X2_test, Y_test, name=""):
    y_pred = model.predict([X1_test, X2_test]).squeeze()
    y_true = Y_test.squeeze()

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    pearson, _ = pearsonr(y_true, y_pred)
    spearman, _ = spearmanr(y_true, y_pred)

    print(f"\n🔎 Resultados para el modelo '{name}':")
    print(f"MSE:      {mse:.4f}")
    print(f"RMSE:     {rmse:.4f}")
    print(f"MAE:      {mae:.4f}")
    print(f"Pearson:  {pearson:.4f}")
    print(f"Spearman: {spearman:.4f}")

    return {
        "name": name,
        "mse": mse,
        "rmse": rmse,
        "mae": mae,
        "pearson": pearson,
        "spearman": spearman
    }


In [None]:
# Obtener x_train e y_train
#train_slice: int = int(len(mapped) * train_val_split)

Obtener Train y Validation:

In [33]:
def pair_list_to_x_y(pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], int]]) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    """
    Obtiene las matrices X_1 (N x d) , X_2 (N x d), e Y (n) a partir de listas de parejas de vectores de oraciones - Listas de (d, d, 1)
    :param pair_list: lista que devuelve map_pairs(), lista de ((vector1, vector2), similitud), sonde vector1 y 2 son vectores agregados
    :return:
    transforma una lista de pares de vectores y puntuaciones en el formato adecuado para alimentar a un modelo de aprendizaje automático.
    """
    _x, _y = zip(*pair_list) #_x: lista de tuplas (embedding_1, embedding_2), _y: lista de etiquetas
    _x_1, _x_2 = zip(*_x)#_x_1: todos los embedding_1,  _x_2: todos los embedding_2
    return (np.array(_x_1), np.array(_x_2)), np.array(_y, dtype=np.float32, ) / 5.0

In [35]:
# Obtener las listas de train y test USANDO TF-IDF
x_train, y_train = pair_list_to_x_y(mapped_train_tfidf)
x_val, y_val = pair_list_to_x_y(mapped_val_tfidf)


# Obtener las listas de train y test SIN USAR TF-IDF
x_train_normal, y_train_normal = pair_list_to_x_y(mapped_no_tfidf)
x_val_normal, y_val_normal = pair_list_to_x_y(mapped_val_no_tfidf)

In [34]:
#De dimension reducida CON TF-IDF
x_train_50, y_train_50 = pair_list_to_x_y(mapped_50)
x_val_50, y_val_50 = pair_list_to_x_y(mapped_val_50)

x_train_100, y_train_100 = pair_list_to_x_y(mapped_100)
x_val_100, y_val_100 = pair_list_to_x_y(mapped_val_100)

x_train_150, y_train_150 = pair_list_to_x_y(mapped_150)
x_val_150, y_val_150 = pair_list_to_x_y(mapped_val_150)

In [39]:
# Preparar los conjuntos en forma de tensor
#Con TF-IDF
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=len(x_train)).batch(batch_size)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)


#Sin TF-IDF
train_dataset_normal = tf.data.Dataset.from_tensor_slices((x_train_normal, y_train_normal))
train_dataset_normal = train_dataset_normal.shuffle(buffer_size=len(x_train_normal)).batch(batch_size)

val_dataset_normal = tf.data.Dataset.from_tensor_slices((x_val_normal, y_val_normal))
val_dataset_normal = val_dataset_normal.batch(batch_size)

In [108]:
#Dimensión 50
train_dataset_50 = tf.data.Dataset.from_tensor_slices((x_train_50, y_train_50))
train_dataset_50 = train_dataset_50.shuffle(buffer_size=len(x_train_50)).batch(batch_size)

val_dataset_50 = tf.data.Dataset.from_tensor_slices((x_val_50, y_val_50))
val_dataset_50 = val_dataset_50.batch(batch_size)

#Dimensión 100
train_dataset_100 = tf.data.Dataset.from_tensor_slices((x_train_100, y_train_100))
train_dataset_100 = train_dataset_100.shuffle(buffer_size=len(x_train_100)).batch(batch_size)

val_dataset_100 = tf.data.Dataset.from_tensor_slices((x_val_100, y_val_100))
val_dataset_100 = val_dataset_100.batch(batch_size)


#Dimensión 150
train_dataset_150 = tf.data.Dataset.from_tensor_slices((x_train_150, y_train_150))
train_dataset_150 = train_dataset_150.shuffle(buffer_size=len(x_train_150)).batch(batch_size)

val_dataset_150 = tf.data.Dataset.from_tensor_slices((x_val_150, y_val_150))
val_dataset_150 = val_dataset_150.batch(batch_size)


### Evaluacion

# Modelo COS:

In [None]:
# Podemos evaluar el modelo si sólo utilizamos COS similarity. (Depende completamente de los Word Embeddings)
from scipy import spatial

In [None]:
"""
val_normal = mapped_no_tfidf[train_slice:]
val = mapped[train_slice:]
val_50 = mapped_50[train_slice:]
val_100 = mapped_100[train_slice:]
val_150 = mapped_150[train_slice:]
"""

In [94]:
y_pred_baseline = []
y_pred_normal = []
y_pred_50 = []
y_pred_100 = []
y_pred_150 = []
for j in range(len(mapped_val_tfidf)):
    i = mapped_val_tfidf[j]
    v1= i[0][0] 
    v2 = i[0][1]
    d = 1.0 - spatial.distance.cosine(v1, v2)
    y_pred_baseline.append(d)

    k = mapped_val_no_tfidf[j]
    v1 = k[0][0]
    v2 = k[0][1] 
    d = 1.0 - spatial.distance.cosine(v1, v2)
    y_pred_normal.append(d)

    m = mapped_val_50[j]
    v1 = m[0][0] 
    v2 = m[0][1]
    d = 1.0 - spatial.distance.cosine(v1, v2)
    y_pred_50.append(d)

    l = mapped_val_100[j]
    v1 = l[0][0]
    v2 = l[0][1] 
    d = 1.0 - spatial.distance.cosine(v1, v2)
    y_pred_100.append(d)

    e = mapped_val_150[j]
    v1 = e[0][0]
    v2 = e[0][1] 
    d = 1.0 - spatial.distance.cosine(v1, v2)
    y_pred_150.append(d)

# Calcular la correlación de Pearson entre las predicciones y los datos de prueba
correlation, _ = pearsonr(np.array(y_pred_baseline), y_val.flatten())
correlation_normal, _ = pearsonr(np.array(y_pred_normal), y_val_normal.flatten())
correlation_50, _ = pearsonr(np.array(y_pred_50), y_val_50.flatten())
correlation_100, _ = pearsonr(np.array(y_pred_100), y_val_100.flatten())
correlation_150, _ = pearsonr(np.array(y_pred_150), y_val_150.flatten())

# Imprimir el coeficiente de correlación de Pearson
print(f"Usando media clásica la correlación de Pearson es: {correlation}")
print(f"Usando media ponderada con TF-IDF la correlación de Pearson es: {correlation_normal}")
print(f"Usando media ponderada con TF-IDF y dimensión 50, la correlación de Pearson es: {correlation_50}")
print(f"Usando media ponderada con TF-IDF y dimensión 100, la correlación de Pearson es: {correlation_100}")
print(f"Usando media ponderada con TF-IDF y dimensión 150, la correlación de Pearson es: {correlation_150}")


Usando media clásica la correlación de Pearson es: 0.4023026679065351
Usando media ponderada con TF-IDF la correlación de Pearson es: 0.24358562692308788
Usando media ponderada con TF-IDF y dimensión 50, la correlación de Pearson es: 0.3632097710213902
Usando media ponderada con TF-IDF y dimensión 100, la correlación de Pearson es: 0.38166246524470904
Usando media ponderada con TF-IDF y dimensión 150, la correlación de Pearson es: 0.39754565403044134


# Modelos de Regresion 1:

#### Prueba del embeding aprendido:

In [None]:
#Con TF-IDF
model_no_cos = build_model_aggregated(embedding_dim=300)
model_no_cos.fit([x_train], y_train, epochs=num_epochs, batch_size=batch_size)

In [96]:
X1, X2 = x_val
evaluate_model(model_no_cos, X1, X2, y_val, name="Linealmodel no cos()")

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step

🔎 Resultados para el modelo 'Linealmodel no cos()':
MSE:      0.0320
RMSE:     0.1788
MAE:      0.1369
Pearson:  0.2092
Spearman: 0.1943


{'name': 'Linealmodel no cos()',
 'mse': 0.03196687,
 'rmse': 0.1787928,
 'mae': 0.13685906,
 'pearson': 0.2092202309167135,
 'spearman': 0.19433486204195968}

In [98]:
#Sin TF-IDF
model_no_cos_no_tfidf = build_model_aggregated(embedding_dim=300)
model_no_cos.fit([x_train_normal], y_train_normal, epochs=num_epochs, batch_size=batch_size)
X1, X2 = x_val_normal
evaluate_model(model_no_cos, X1, X2, y_val_normal, name="Linealmodel no cos()")

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0067 - mae: 0.0634 - root_mean_squared_error: 0.0821
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0064 - mae: 0.0634 - root_mean_squared_error: 0.0803
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0075 - mae: 0.0670 - root_mean_squared_error: 0.0867
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0060 - mae: 0.0590 - root_mean_squared_error: 0.0769
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0070 - mae: 0.0627 - root_mean_squared_error: 0.0836
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0069 - mae: 0.0644 - root_mean_squared_error: 0.0832
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0067 - mae: 0.0640 - roo

{'name': 'Linealmodel no cos()',
 'mse': 0.03233151,
 'rmse': 0.17980966,
 'mae': 0.1360344,
 'pearson': 0.24413916476490077,
 'spearman': 0.24416001198023862}

Este modelo no tiene mucho sentido ya que no uncorpora la distancia coseno
TF-IDF no mejora el rendimiento

In [103]:
model_lin = build_and_compile_model()
model_lin.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)
X1, X2 = x_val
evaluate_model(model_lin, X1, X2, y_val)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 3.9947 - val_loss: 3.7057
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 3.6517 - val_loss: 3.6115
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 3.5287 - val_loss: 3.5615
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3.4427 - val_loss: 3.5288
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 3.3737 - val_loss: 3.5053
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3.3145 - val_loss: 3.4877
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 3.2658 - val_loss: 3.4735
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 3.2221 - val_loss: 3.4620
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

{'name': '',
 'mse': 12.146353,
 'rmse': 3.4851618,
 'mae': 3.3287277,
 'pearson': 0.24669281061111503,
 'spearman': 0.2983922507157079}

In [104]:
model_lin_no_tfidf = build_and_compile_model()
model_lin_no_tfidf.fit(train_dataset_normal, epochs=num_epochs, validation_data=val_dataset)
X1, X2 = x_val_normal
evaluate_model(model_lin_no_tfidf, X1, X2, y_val_normal)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 4.0665 - val_loss: 4.1367
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3.5981 - val_loss: 4.2425
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3.3846 - val_loss: 4.2708
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3.2885 - val_loss: 4.2727
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3.2256 - val_loss: 4.2666
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 3.1790 - val_loss: 4.2597
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 3.1418 - val_loss: 4.2527
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 3.1083 - val_loss: 4.2514
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[

{'name': '',
 'mse': 13.436124,
 'rmse': 3.6655319,
 'mae': 3.3258212,
 'pearson': 0.10653028615519422,
 'spearman': 0.19407947929338235}

In [None]:
"""
y_pred: tf.RaggedTensor = model_lin.predict(x_val)
# Calcular la correlación de Pearson entre las predicciones y los datos de prueba
correlation, _ = pearsonr(y_pred.flatten(), y_val.flatten())
# Imprimir el coeficiente de correlación de Pearson
print(f"Correlación de Pearson: {correlation}")
"""

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Correlación de Pearson: 0.341672256901909


In [101]:
# Construir y compilar el modelo
model_no_lin = build_and_compile_model2()
# tf.keras.utils.plot_model(model, show_shapes=True, show_layer_activations=True, )
#print(model.summary())
# Entrenar el modelo
model_no_lin.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.1033 - val_loss: 0.1450
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0828 - val_loss: 0.1348
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0770 - val_loss: 0.1300
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0721 - val_loss: 0.1256
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0691 - val_loss: 0.1224
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.0660 - val_loss: 0.1201
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0632 - val_loss: 0.1188
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0608 - val_loss: 0.1168
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x7958f44db950>

In [102]:
#from scipy.stats import pearsonr
# Obtener las predicciones del modelo para los datos de prueba. En este ejemplo vamos a utilizar el corpus de training.
y_pred: tf.RaggedTensor = model_no_lin.predict(x_val)
# Calcular la correlación de Pearson entre las predicciones y los datos de prueba
correlation, _ = pearsonr(y_pred.flatten(), y_val.flatten())
# Imprimir el coeficiente de correlación de Pearson
print(f"Correlación de Pearson: {correlation}")


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Correlación de Pearson: 0.471240722830828


In [105]:
X1, X2 = x_val
evaluate_model(model_no_lin, X1, X2, y_val)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

🔎 Resultados para el modelo '':
MSE:      0.1029
RMSE:     0.3208
MAE:      0.2866
Pearson:  0.4712
Spearman: 0.4744


{'name': '',
 'mse': 0.102900855,
 'rmse': 0.32078162,
 'mae': 0.2865823,
 'pearson': 0.471240722830828,
 'spearman': 0.47435332252489665}

In [106]:
model_no_lin_no_tfidf = build_and_compile_model2()
model_no_lin_no_tfidf.fit(train_dataset_normal, epochs=num_epochs, validation_data=val_dataset_normal)
X1, X2 = x_val_normal
evaluate_model(model_no_lin_no_tfidf, X1, X2, y_val_normal)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.1106 - val_loss: 0.1626
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0855 - val_loss: 0.1433
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0771 - val_loss: 0.1374
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0732 - val_loss: 0.1326
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0701 - val_loss: 0.1299
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0681 - val_loss: 0.1265
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0659 - val_loss: 0.1241
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0639 - val_loss: 0.1225
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[

{'name': '',
 'mse': 0.105113015,
 'rmse': 0.32421136,
 'mae': 0.28485468,
 'pearson': 0.37460832093705954,
 'spearman': 0.4084685621007067}

Usando TF-IDF dan mejores resultados. <sb>

Para la evaluación del impacto de la dimensionalidad se va a usar el mejor modelo hasta el momento, que es `build_and_compile_model2()`

In [114]:
model_no_lin_50 = build_and_compile_model2(embedding_size = 50)
model_no_lin_50.fit(train_dataset_50, epochs=num_epochs, validation_data=val_dataset_50)
X1, X2 = x_val_50
evaluate_model(model_no_lin_50, X1, X2, y_val_50)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.1285 - val_loss: 0.2004
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1123 - val_loss: 0.1637
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0947 - val_loss: 0.1440
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0880 - val_loss: 0.1417
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0839 - val_loss: 0.1398
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0838 - val_loss: 0.1388
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0826 - val_loss: 0.1383
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0837 - val_loss: 0.1370
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

{'name': '',
 'mse': 0.120492496,
 'rmse': 0.34712029,
 'mae': 0.3123578,
 'pearson': 0.4167082626695549,
 'spearman': 0.4359996013610936}

In [113]:
model_no_lin_100 = build_and_compile_model2(embedding_size = 100)
model_no_lin_100.fit(train_dataset_100, epochs=num_epochs, validation_data=val_dataset_100)
X1, X2 = x_val_100
evaluate_model(model_no_lin_100, X1, X2, y_val_100)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.1154 - val_loss: 0.1709
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0976 - val_loss: 0.1452
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0852 - val_loss: 0.1384
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0802 - val_loss: 0.1359
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0806 - val_loss: 0.1335
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0787 - val_loss: 0.1315
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0769 - val_loss: 0.1301
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0746 - val_loss: 0.1286
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

{'name': '',
 'mse': 0.11015916,
 'rmse': 0.33190233,
 'mae': 0.29832277,
 'pearson': 0.45988185739766835,
 'spearman': 0.48614688707672987}

In [115]:
model_no_lin_150 = build_and_compile_model2(embedding_size = 150)
model_no_lin_150.fit(train_dataset_150, epochs=num_epochs, validation_data=val_dataset_150)
X1, X2 = x_val_150
evaluate_model(model_no_lin_150, X1, X2, y_val_150)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.1095 - val_loss: 0.1614
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0901 - val_loss: 0.1407
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0817 - val_loss: 0.1360
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0788 - val_loss: 0.1327
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0755 - val_loss: 0.1300
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0746 - val_loss: 0.1277
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0725 - val_loss: 0.1261
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0700 - val_loss: 0.1241
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

{'name': '',
 'mse': 0.10725653,
 'rmse': 0.32750043,
 'mae': 0.293617,
 'pearson': 0.46553533868662045,
 'spearman': 0.4831638315459866}

# Modelo de Regresión 2

Por que surge la necesidad de pasar el texto entero y no una agregación.

# Altres:

## RoBERTa fine-tuned por STS

Un modelo con arquitectura BERT que genera embeddings de frases tales que la distancia coseno entre ellos refleja la similitud semántica real (p. ej., según anotaciones STS), y que se puede evaluar usando la correlación de Pearson entre esa distancia y los valores reales.

In [9]:
from transformers import pipeline, AutoTokenizer
from scipy.special import logit

2025-05-30 19:17:28.456725: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748625448.486908   72976 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748625448.495970   72976 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748625448.518469   72976 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748625448.518492   72976 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748625448.518495   72976 computation_placer.cc:177] computation placer alr

In [None]:
model = 'projecte-aina/roberta-base-ca-v2-cased-sts'

Device set to use cpu


In [25]:
def prepare(sentence_pairs):
    sentence_pairs_prep = []
    for s1, s2 in sentence_pairs:
        sentence_pairs_prep.append(f"{tokenizer.cls_token} {s1}{tokenizer.sep_token}{tokenizer.sep_token} {s2}{tokenizer.sep_token}")
    return sentence_pairs_prep

sentence_pairs_ = [(item['sentence_1'], item['sentence_2']) for item in val]
predictions = pipe(prepare(sentence_pairs_), add_special_tokens=False)

In [6]:
sentence_pairs_ = [(item['sentence_1'], item['sentence_2']) for item in val]

In [15]:
def version_corregida_huggingface(sentence_pairs):
    model = 'projecte-aina/roberta-base-ca-v2-cased-sts'
    tokenizer = AutoTokenizer.from_pretrained(model)
    pipe = pipeline('text-classification', model=model, tokenizer=tokenizer)
    
    # Preparar pares usando el tokenizer correctamente
    texts = [f"{s1} [SEP] {s2}" for s1, s2 in sentence_pairs]
    
    predictions = pipe(texts)
    
    # Convertir a escala 0-5 correctamente
    #for prediction in predictions:
        # El pipeline ya devuelve probabilidades, solo escalamos
        #prediction['score'] = prediction['score'] * 5
    
    return predictions

In [17]:
predicted_scores = version_corregida_huggingface(sentence_pairs_)

Device set to use cpu


In [20]:
predicted_scores = [item['score'] for item in predicted_scores]

In [13]:
true_scores = [item['label'] for item in val]

In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr, spearmanr

# Calcular errores
mse = mean_squared_error(true_scores, predicted_scores)
mae = mean_absolute_error(true_scores, predicted_scores)

# Calcular correlaciones
pearson_corr, _ = pearsonr(true_scores, predicted_scores)
spearman_corr, _ = spearmanr(true_scores, predicted_scores)

# Mostrar resultados
print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"Pearson correlation: {pearson_corr:.4f}")
print(f"Spearman correlation: {spearman_corr:.4f}")


MSE: 0.3242
MAE: 0.4224
Pearson correlation: 0.7496
Spearman correlation: 0.7304
