In [2]:
# Environment setup, disabled by wrapping the commands in a string literal:
# running this cell only echoes the string back as the cell output.
# NOTE(review): the numpy==1.23.5 pin looks incompatible with tensorflow==2.19.0,
# which appears to require a newer numpy — confirm before re-enabling.
'''!pip uninstall -y numpy gensim
!pip install numpy==1.23.5  
!pip install gensim         

# Also install TensorFlow for later use
!pip install tensorflow==2.19.0  

# Force restart the kernel after installing packages
print("Please restart the kernel after running this cell.")'''

'!pip uninstall -y numpy gensim\n!pip install numpy==1.23.5  \n!pip install gensim         \n\n# Also install TensorFlow for later use\n!pip install tensorflow==2.19.0  \n\n# Force restart the kernel after installing packages\nprint("Please restart the kernel after running this cell.")'

# Semantic Text Similarity
Aquest model utilitza gensim per convertir parells de frases (amb la seva puntuació de similitud) en vectors (word embeddings).
Donat un dataset, infereix la puntuació de similitud entre ambdues frases.

In [3]:
import os
# Requisits
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
import numpy as np

In [4]:
# Tipat
from typing import Tuple, List, Optional

In [5]:
# Pre-trained models
# WV_MODEL_PATH = "/Users/salva/Downloads/cc.ca.300.bin.gz"


# Disabled alternative, kept as a string literal so none of it is executed:
# it would load the vectors from the text-format .vec.gz file.
# NOTE(review): the path inside the string is an absolute, machine-specific location.
'''WV_MODEL_PATH = '/Users/salva/Downloads/cc.ca.300.vec.gz'
import gensim
wv_model =  gensim.models.KeyedVectors.load_word2vec_format(WV_MODEL_PATH, binary=False)
wv_model'''

"WV_MODEL_PATH = '/Users/salva/Downloads/cc.ca.300.vec.gz'\nimport gensim\nwv_model =  gensim.models.KeyedVectors.load_word2vec_format(WV_MODEL_PATH, binary=False)\nwv_model"

In [6]:
# The model can then be loaded as a memory map (mmap='r' is forwarded to gensim's load).
# wv_model is used further down as a module-level global.
from gensim.models.fasttext import FastTextKeyedVectors
wv_model = FastTextKeyedVectors.load('../cc.ca.gensim.bin', mmap='r')

In [7]:
# Ten example sentence pairs, each with its associated similarity score
input_pairs = [
    ("M'agrada el futbol", "Disfruto veient partits de futbol", 4),
    ("El cel està despejat", "Fa un dia bonic", 4.5),
    ("M'encanta viatjar", "Explorar nous llocs és una passió", 3.5),
    ("Prefereixo l'estiu", "No m'agrada el fred de l'hivern", 2.5),
    ("Tinc gana", "Què hi ha per sopar?", 2),
    ("La música em relaxa", "Escoltar música és una teràpia", 3),
    ("El llibre és emocionant", "No puc deixar de llegir-lo", 4),
    ("M'agrada la pizza", "És el meu menjar preferit", 4.5),
    ("Estic cansat", "Necessito fer una migdiada", 1.5),
    ("Avui fa molta calor", "És un dia sofocant", 3.5),
]

In [8]:
# REMAP_EMBEDDINGS: when True, token ids are taken from the corpus Dictionary
# (`diccionario`) instead of the pre-trained model's own vocabulary indices.
REMAP_EMBEDDINGS: bool = True
# USE_PRETRAINED: when True, an embedding matrix is filled from the pre-trained vectors;
# when False, `pretrained_weights` stays None.
USE_PRETRAINED: bool = True

In [9]:
from datasets import load_dataset
# Text Similarity (STS) dataset (the main one for Practical 4)
train = load_dataset("projecte-aina/sts-ca", split="train")
test = load_dataset("projecte-aina/sts-ca", split="test")
val = load_dataset("projecte-aina/sts-ca", split="validation")

# split="all" is requested in addition to the three individual splits above;
# the cells below iterate over all_data.
all_data = load_dataset("projecte-aina/sts-ca", split="all")
# NOTE(review): bare expression in the middle of the cell; it is not the last
# statement, so nothing is displayed for it.
all_data
print(train[0]['label'])

3.5


In [10]:
# Pre-process the sentences and build the dictionary
sentences_1_preproc = [simple_preprocess(d["sentence_1"]) for d in all_data]
sentences_2_preproc = [simple_preprocess(d["sentence_2"]) for d in all_data]

scores = [d["label"] for d in all_data]
# One (tokens_1, tokens_2, score) triple per dataset row
sentence_pairs = list(zip(sentences_1_preproc, sentences_2_preproc, scores))
# Flattened version (all first sentences followed by all second sentences),
# which is what the Dictionary is built from
sentences_pairs_flattened = sentences_1_preproc + sentences_2_preproc
diccionario = Dictionary(sentences_pairs_flattened)
diccionario

<gensim.corpora.dictionary.Dictionary at 0x23d0ff08a90>

## Alternative Baseline : One-Hot Encoding

Una altra alternativa és utilitzar One-Hot Encoding per representar les frases. Això pot ser útil per comparar la similitud entre frases de manera més directa, encara que no captura la semàntica de les paraules com ho fan els word embeddings. La similitud es pot calcular amb la similitud de Jaccard (nombre de paraules en comú dividit pel nombre total de paraules diferents entre les dues frases) o amb el cosinus entre els vectors resultants.

In [11]:
# Usant la distància Jaccard per avaluar la similitud entre oracions
from typing import List, Set, Union
import numpy as np

def one_hot_evaluation(
        sent1: List[Union[List[str], Set[str]]],
        sent2: List[Union[List[str], Set[str]]]
) -> List[float]:
    """
    Compute the Jaccard similarity of each sentence pair.

    Pair ``i`` is ``(sent1[i], sent2[i])``. Its score is the number of distinct
    tokens shared by both sentences divided by the number of distinct tokens
    that appear in either of them.

    Args:
        sent1: First sentence of every pair, each tokenised as a list (or set) of words.
        sent2: Second sentence of every pair, same layout; it is indexed with the
            positions of ``sent1``, so it needs at least ``len(sent1)`` entries.

    Returns:
        List[float]: One score in [0, 1] per pair, in input order (not an average).
        A pair whose two sentences are both empty scores 0.0.
    """
    scores = []
    for i, tokens_1 in enumerate(sent1):
        # Convert to sets unless they already are sets
        set1 = tokens_1 if isinstance(tokens_1, set) else set(tokens_1)
        set2 = sent2[i] if isinstance(sent2[i], set) else set(sent2[i])

        union_size = len(set1 | set2)
        if union_size == 0:
            # Both sentences are empty: the ratio is undefined (0/0), so score it
            # as 0.0, the same value the cosine variant gives to zero vectors.
            scores.append(0.0)
        else:
            scores.append(len(set1 & set2) / union_size)

    return scores

In [12]:
# Usando la distancia coseno para calcular la similitud entre dos oraciones
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np

def one_hot_cosine_similarity(
        sent1: List[Union[List[str], Set[str]]],
        sent2: List[Union[List[str], Set[str]]],
        vocabulary: Optional["Dictionary"] = None
) -> List[float]:
    """
    Compute the cosine similarity of each sentence pair under a binary
    bag-of-words (one-hot) representation.

    For 0/1 vectors the dot product is the number of shared active positions and
    each norm is the square root of the number of active positions, so the score
    is computed from set sizes without building vocabulary-sized dense vectors.

    Args:
        sent1: First sentence of every pair, each tokenised as a list (or set) of words.
        sent2: Second sentence of every pair, same layout; it is indexed with the
            positions of ``sent1``.
        vocabulary: Optional object exposing a ``token2id`` mapping. When given,
            words missing from it are ignored; when ``None``, every word of the
            pair counts (equivalent to a vocabulary built from the pair itself).

    Returns:
        List[float]: One cosine score per pair, in input order. A pair in which
        either sentence has no usable word scores 0.0.
    """
    scores = []

    for i, tokens_1 in enumerate(sent1):
        # Work on sets: repeated words do not change a binary vector
        set1 = tokens_1 if isinstance(tokens_1, set) else set(tokens_1)
        set2 = sent2[i] if isinstance(sent2[i], set) else set(sent2[i])

        # Explicit None check: a vocabulary that evaluates as falsy (e.g. an empty
        # one) must not silently switch to the ad-hoc, per-pair vocabulary.
        if vocabulary is not None:
            token2id = vocabulary.token2id
            # Active positions are the ids of the in-vocabulary words
            active_1 = {token2id[word] for word in set1 if word in token2id}
            active_2 = {token2id[word] for word in set2 if word in token2id}
        else:
            active_1, active_2 = set1, set2

        if not active_1 or not active_2:
            # A zero vector has no direction; define the similarity as 0
            scores.append(0.0)
        else:
            shared = len(active_1 & active_2)
            scores.append(shared / (np.sqrt(len(active_1)) * np.sqrt(len(active_2))))

    return scores

# Ejemplo de uso
# similitudes = one_hot_cosine_similarity(sentences_1_preproc[:5], sentences_2_preproc[:5], diccionario)
# print(similitudes)

## Model de regressió amb atenció



In [13]:
# Longest tokenised sentence on each side of the pairs
print("Max Len:", max([len(s) for s in sentences_1_preproc]), max([len(s) for s in sentences_2_preproc]))
# Dictionary indices of the tokens of the first sentence
print(list(diccionario.doc2idx(sentences_1_preproc[0])))

Max Len: 30 30
[0, 11, 13, 1, 9, 10, 5, 14, 8, 7, 2, 8, 12, 2, 6, 4, 3, 15]


In [14]:
from typing import Union


def map_word_embeddings(
        sentence: Union[str, List[str]],
        sequence_len: int = 32,
        fixed_dictionary: Optional["Dictionary"] = None
        ) -> np.ndarray:
    """
    Map a sentence to a fixed-length vector of word-embedding indices.

    Known words are written left to right as ``index + 1`` (0 is reserved for
    padding); unknown words are skipped. Sentences with more known words than
    ``sequence_len`` are truncated, shorter ones are right-padded with 0.

    :param sentence: Raw string (tokenised with ``simple_preprocess``) or an
        already tokenised list of words.
    :param sequence_len: Length of the returned vector.
    :param fixed_dictionary: Object exposing ``token2id``; when ``None`` the
        indices come from the global ``wv_model.key_to_index``.
    :return: int32 array of shape ``(sequence_len,)``.
    """
    if not isinstance(sentence, list):
        sentence_preproc = simple_preprocess(sentence)
    else:
        sentence_preproc = sentence

    # Pick the lookup table once instead of re-testing it for every word
    if fixed_dictionary is not None:
        token_to_index = fixed_dictionary.token2id
    else:
        token_to_index = wv_model.key_to_index

    _vectors = np.zeros(sequence_len, dtype=np.int32)
    index = 0
    for word in sentence_preproc:
        if index >= sequence_len:
            # Vector is full: truncate instead of writing past the end
            break
        if word in token_to_index:
            # Add 1 because the value 0 is reserved for padding
            _vectors[index] = token_to_index[word] + 1
            index += 1
    return _vectors


def map_pairs(
        sentence_pairs: List[Tuple[str, str, float]],
        sequence_len: int = 32,
        fixed_dictionary: Optional[Dictionary] = None
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    """
    Turn (sentence, sentence, score) triples into ((vector, vector), score) items.

    :param sentence_pairs: Triples of two sentences and their similarity score.
    :param sequence_len: Length of each index vector, forwarded to map_word_embeddings.
    :param fixed_dictionary: Optional dictionary, forwarded to map_word_embeddings.
    :return: One ((vector_1, vector_2), score) item per input triple, in input order.
    """
    def _encode(sentence):
        # Both sentences of a pair are encoded with the same settings
        return map_word_embeddings(sentence, sequence_len, fixed_dictionary)

    return [
        ((_encode(first), _encode(second)), score)
        for first, second, score in sentence_pairs
    ]

In [15]:
# Map every sentence pair to its two index vectors plus the similarity score.
# With REMAP_EMBEDDINGS the corpus dictionary supplies the indices; otherwise
# None is passed and map_word_embeddings uses wv_model's own indices.
mapped = map_pairs(sentence_pairs, fixed_dictionary=diccionario if REMAP_EMBEDDINGS else None)
# for vectors, similitud in mapped:
#     print(f"Pares de vectores: {vectors[0].shape}, {vectors[1].shape}")
#     print(f"Puntuació de similitud: {similitud}")
print(mapped[0])

((array([ 1, 12, 14,  2, 10, 11,  6, 15,  9,  8,  3,  9, 13,  3,  7,  5,  4,
       16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), array([10010,     9,  2784,     6,    15,     9,     8,     3,     9,
          13,     3,     7,     5,     4,    16,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0])), 3.5)


In [16]:
# Training constants
batch_size: int = 64
num_epochs: int = 128
# Fraction of `mapped` used for training; the remainder becomes the validation set
train_val_split: float = 0.8

In [17]:
len(mapped)

3073

In [18]:
# Index at which `mapped` is cut into its training and validation parts
train_slice: int = int(len(mapped) * train_val_split)

def pair_list_to_x_y(
        pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], float]],
        max_score: float = 5.0
) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    """
    Build the matrices X_1 (N x d), X_2 (N x d) and the target vector Y (N) from
    a list of ((vector_1, vector_2), score) items.

    :param pair_list: Non-empty list of ((vector_1, vector_2), score) items.
    :param max_score: Value the scores are divided by to normalise the targets
        (default 5.0).
    :return: ``((X_1, X_2), Y)`` with the vectors stacked as rows and
        ``Y = scores / max_score``.
    :raises ValueError: If ``pair_list`` is empty.
    """
    if not pair_list:
        raise ValueError("pair_list must contain at least one pair")
    _x, _y = zip(*pair_list)
    _x_1, _x_2 = zip(*_x)
    # np.vstack replaces np.row_stack, an alias that NumPy 2.0 deprecates
    return (np.vstack(_x_1), np.vstack(_x_2)), np.array(_y) / max_score

# Build the training and validation arrays (first train_slice items vs. the rest)
x_train, y_train = pair_list_to_x_y(mapped[:train_slice])
x_val, y_val = pair_list_to_x_y(mapped[train_slice:])

In [19]:
import tensorflow as tf

# Prepare the training and validation datasets.
# x_train / x_val are (X_1, X_2) tuples, so len(x_train) is 2 rather than the
# number of examples. The shuffle buffer is sized from y_train so that it spans
# the whole training set and the data is actually shuffled every epoch.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=len(y_train)).batch(batch_size)

val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)

In [20]:
# Debugging aid: show the TensorFlow version and its top-level attributes
import tensorflow as tf
print(tf.__version__)
print(dir(tf))  # does "data" appear in this list?


2.19.0
['AggregationMethod', 'Assert', 'CriticalSection', 'DType', 'DeviceSpec', 'GradientTape', 'Graph', 'IndexedSlices', 'IndexedSlicesSpec', 'Module', 'Operation', 'OptionalSpec', 'RaggedTensor', 'RaggedTensorSpec', 'RegisterGradient', 'SparseTensor', 'SparseTensorSpec', 'Tensor', 'TensorArray', 'TensorArraySpec', 'TensorShape', 'TensorSpec', 'TypeSpec', 'UnconnectedGradients', 'Variable', 'VariableAggregation', 'VariableSynchronization', '_API_MODULE', '_KerasLazyLoader', '__all__', '__builtins__', '__cached__', '__compiler_version__', '__cxx11_abi_flag__', '__cxx_version__', '__doc__', '__file__', '__git_version__', '__internal__', '__loader__', '__monolithic_build__', '__name__', '__operators__', '__package__', '__path__', '__spec__', '__version__', '_api', '_compat', '_current_file_location', '_current_module', '_fi', '_initializers', '_inspect', '_kernel_dir', '_ll', '_losses', '_major_api_version', '_metrics', '_module_dir', '_module_util', '_name', '_names_with_underscore', '

In [21]:
# Embedding matrix handed to the model; stays None when USE_PRETRAINED is False.
# Row 0 is left as zeros in both branches because index 0 is the padding index.
pretrained_weights: Optional[np.ndarray] = None
if USE_PRETRAINED:
    if REMAP_EMBEDDINGS:
        # One row per corpus-dictionary token, shifted by 1 for the padding row
        pretrained_weights = np.zeros(
            (len(diccionario.token2id) + 1, wv_model.vector_size),  dtype=np.float32)
        for token, _id in diccionario.token2id.items():
            # NOTE(review): wv_model is loaded as FastTextKeyedVectors; its `in`
            # check may accept any token through subword n-grams, which would make
            # the else branch unreachable — confirm.
            if token in wv_model:
                pretrained_weights[_id + 1] = wv_model[token]
            else:
                # In W2V, OOV will not have a representation. We will use 0.
                pass
    else:
        # Not recommended (this will consume A LOT of RAM) because it copies every vector of the model.
        pretrained_weights = np.zeros((wv_model.vectors.shape[0] + 1, wv_model.vector_size,),  dtype=np.float32)
        pretrained_weights[1:, :] = wv_model.vectors


In [22]:
pretrained_weights[:5]

array([[ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [-0.0307,  0.0032,  0.0128, ..., -0.0154,  0.0374,  0.0234],
       [ 0.0519, -0.0079, -0.0013, ..., -0.0154, -0.0353, -0.0235],
       [ 0.0058, -0.0161,  0.062 , ...,  0.0129,  0.019 ,  0.0177],
       [-0.042 , -0.0113,  0.0837, ..., -0.0396, -0.0253, -0.0045]],
      dtype=float32)

In [23]:
import tensorflow as tf
import numpy as np
from typing import Optional

class SimpleAttention(tf.keras.layers.Layer):
    """
    Attention pooling layer: collapses a sequence of embeddings into one vector.

    Each timestep is scored by a small network (Dense(units, tanh) followed by a
    bias-free Dense(1)), the scores are softmax-normalised over the sequence axis
    and the inputs are summed with those weights. Masked (padded) timesteps are
    excluded from the softmax.

    Input:  (batch_size, sequence_length, embedding_dim), optionally with a
            boolean mask of shape (batch_size, sequence_length).
    Output: (batch_size, embedding_dim); no mask is propagated further.
    """

    def __init__(self, units: int, **kwargs):
        """
        :param units: Hidden size of the attention scoring network.
        :param kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
        """
        super().__init__(**kwargs)
        self.units = units
        self.dropout_s1 = tf.keras.layers.Dropout(0.3)
        self.dropout_s2 = tf.keras.layers.Dropout(0.2)
        self.W_s1 = tf.keras.layers.Dense(units, activation='tanh', use_bias=True, name="attention_transform")
        # Dense layer to compute attention scores (context vector)
        self.W_s2 = tf.keras.layers.Dense(1, use_bias=False, name="attention_scorer")
        self.supports_masking = True  # Declare that this layer supports masking

    def call(self, inputs: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
        """
        :param inputs: Tensor of shape (batch_size, sequence_length, embedding_dim).
        :param mask: Optional boolean tensor of shape (batch_size, sequence_length);
            False marks padded timesteps.
        :return: Context vectors of shape (batch_size, embedding_dim).
        """
        # Attention hidden states
        hidden_states = self.dropout_s1(self.W_s1(inputs))

        # Compute attention scores, shape (batch_size, sequence_length, 1)
        scores = self.dropout_s2(self.W_s2(hidden_states))

        if mask is not None:
            # Replace the scores of masked (padded) timesteps with the most negative
            # value of the scores' own dtype before the softmax. tf.where avoids
            # mixing a hard-coded float32 mask with scores of another compute dtype
            # (e.g. under a mixed-precision policy), where -1e9 is not representable.
            keep = tf.expand_dims(tf.cast(mask, dtype=tf.bool), axis=-1)
            masked_value = tf.constant(scores.dtype.min, dtype=scores.dtype)
            scores = tf.where(keep, scores, masked_value)

        # Compute attention weights over the sequence axis
        attention_weights = tf.nn.softmax(scores, axis=1)

        # Compute the context vector (weighted sum of input embeddings)
        context_vector = tf.reduce_sum(inputs * attention_weights, axis=1)

        return context_vector

    def get_config(self) -> dict:
        """Return the layer config, including ``units``, for serialisation."""
        config = super().get_config()
        config.update({"units": self.units})
        return config

    def compute_mask(self, inputs: tf.Tensor, mask: Optional[tf.Tensor] = None) -> Optional[tf.Tensor]:
        """The sequence axis is reduced away, so no mask is passed downstream."""
        return None


def build_and_compile_model_2(
        input_length: int = 32,
        dictionary_size: int = 1000,
        embedding_size: int = 300,
        learning_rate: float = 0.001,
        trainable_embedding: bool = False,
        pretrained_weights: Optional[np.ndarray] = None,
        attention_units: int = 4,
) -> tf.keras.Model:
    """
    Build and compile a siamese sentence-similarity regression model.

    Both inputs go through the same embedding, attention-pooling and projection
    layers. The output is the cosine similarity of the two projected sentence
    vectors, rescaled from [-1, 1] to [0, 1].

    :param input_length: Number of token indices in each input sequence.
    :param dictionary_size: Number of embedding rows; used only when
        ``pretrained_weights`` is None.
    :param embedding_size: Embedding width; used only when ``pretrained_weights``
        is None.
    :param learning_rate: Learning rate of the Adam optimizer.
    :param trainable_embedding: Whether a pre-trained embedding is fine-tuned.
        A randomly initialised embedding is always trainable.
    :param pretrained_weights: Optional (rows, width) matrix used to initialise
        the embedding; its shape overrides ``dictionary_size``/``embedding_size``.
    :param attention_units: Hidden size of the attention scoring network.
    :return: Model compiled with MSE loss and the MAE metric.
    """
    input_1 = tf.keras.Input((input_length,), dtype=tf.int32, name="input_1")
    input_2 = tf.keras.Input((input_length,), dtype=tf.int32, name="input_2")

    # Determine effective embedding parameters
    if pretrained_weights is not None:
        effective_dictionary_size = pretrained_weights.shape[0]
        effective_embedding_size = pretrained_weights.shape[1]
        embedding_initializer = tf.keras.initializers.Constant(pretrained_weights)
        is_embedding_trainable = trainable_embedding
        embedding_layer_name = "embedding_pretrained"
    else:
        effective_dictionary_size = dictionary_size
        effective_embedding_size = embedding_size
        embedding_initializer = 'uniform'
        is_embedding_trainable = True
        embedding_layer_name = "embedding"

    # Shared Embedding Layer.
    # `input_length` is no longer passed: Keras 3 deprecates that argument, and the
    # sequence length is already fixed by the Input tensors above.
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=effective_dictionary_size,
        output_dim=effective_embedding_size,
        mask_zero=True,  # index 0 is padding; the mask is consumed by the attention layer
        embeddings_initializer=embedding_initializer,
        trainable=is_embedding_trainable,
        name=embedding_layer_name
    )

    # Apply embedding layer to both inputs
    embedded_1 = embedding_layer(input_1)  # Shape: (batch_size, input_length, effective_embedding_size)
    embedded_2 = embedding_layer(input_2)  # Shape: (batch_size, input_length, effective_embedding_size)

    # Shared Attention Layer
    # Input: (batch_size, input_length, effective_embedding_size) with a mask
    # Output: (batch_size, effective_embedding_size)
    sentence_attention_layer = SimpleAttention(units=attention_units, name="sentence_attention")
    # sentence_attention_layer = tf.keras.layers.GlobalAveragePooling1D(name="sentence_attention_layer")

    sentence_vector_1 = sentence_attention_layer(embedded_1)
    sentence_vector_2 = sentence_attention_layer(embedded_2)

    # Shared projection layer, initialised to the identity so that training starts
    # from tanh of the pooled embeddings
    first_projection_layer = tf.keras.layers.Dense(
        effective_embedding_size,
        activation='tanh',
        kernel_initializer=tf.keras.initializers.Identity(),
        bias_initializer=tf.keras.initializers.Zeros(),
        name="projection_layer"
    )
    dropout = tf.keras.layers.Dropout(0.2, name="projection_dropout")
    projected_1 = dropout(first_projection_layer(sentence_vector_1))
    projected_2 = dropout(first_projection_layer(sentence_vector_2))

    # Normalize the projected vectors (L2 normalization)
    normalized_1 = tf.keras.layers.Lambda(
        lambda x: tf.linalg.l2_normalize(x, axis=1), name="normalize_1"
    )(projected_1)
    normalized_2 = tf.keras.layers.Lambda(
        lambda x: tf.linalg.l2_normalize(x, axis=1), name="normalize_2"
    )(projected_2)

    # Compute Cosine Similarity = X * Y / (||X|| * ||Y||)
    # (the inputs are already unit-length, so a plain dot product is enough)
    similarity_score = tf.keras.layers.Lambda(
        lambda x: tf.reduce_sum(x[0] * x[1], axis=1, keepdims=True), name="cosine_similarity"
    )([normalized_1, normalized_2])

    # Scale similarity from [-1, 1] to [0, 1]
    output_layer = tf.keras.layers.Lambda(
        lambda x: 0.5 * (1.0 + x), name="output_scaling"
    )(similarity_score)

    # Define the Keras Model
    model = tf.keras.Model(
        inputs=[input_1, input_2],
        outputs=output_layer,
        name="sequence_similarity_attention_model"
    )

    # Compile the model
    model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['mae'],
    )

    return model

In [24]:
# Build and compile the model.
# dictionary_size only takes effect when pretrained_weights is None (USE_PRETRAINED
# is False). It must cover every index produced by map_word_embeddings, whose ids
# are shifted by 1 because 0 is the padding index; the function default of 1000
# would be too small for this vocabulary.
vocab_size = len(diccionario.token2id) if REMAP_EMBEDDINGS else len(wv_model.key_to_index)
model = build_and_compile_model_2(
    dictionary_size=vocab_size + 1,
    pretrained_weights=pretrained_weights,
    learning_rate=1e-3,
)
# Train the model
model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)




Epoch 1/128
Epoch 1/128
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 0.1322 - mae: 0.3259 - val_loss: 0.1420 - val_mae: 0.3403
Epoch 2/128
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 0.1322 - mae: 0.3259 - val_loss: 0.1420 - val_mae: 0.3403
Epoch 2/128
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0907 - mae: 0.2633 - val_loss: 0.1263 - val_mae: 0.3158
Epoch 3/128
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0907 - mae: 0.2633 - val_loss: 0.1263 - val_mae: 0.3158
Epoch 3/128
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0780 - mae: 0.2379 - val_loss: 0.1208 - val_mae: 0.3083
Epoch 4/128
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0780 - mae: 0.2379 - val_loss: 0.1208 - val_mae: 0.3083
Epoch 4/128
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

<keras.src.callbacks.history.History at 0x23d1d8bec10>

In [25]:
model.summary()

In [26]:
from scipy.stats import pearsonr
# Get the model predictions for the validation split (x_val)
y_pred = model.predict(x_val)
# Pearson correlation between the predictions and the validation targets
correlation, _ = pearsonr(y_pred.flatten(), y_val.flatten())
# Print the Pearson correlation coefficient
print(f"Correlación de Pearson: {correlation}")


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Correlación de Pearson: 0.5325664179830356
Correlación de Pearson: 0.5325664179830356


In [27]:
from scipy.stats import pearsonr
# Get the model predictions for the training split (x_train)
y_pred = model.predict(x_train)
# Pearson correlation between the predictions and the training targets
correlation, _ = pearsonr(y_pred.flatten(), y_train.flatten())
# Print the Pearson correlation coefficient
print(f"Correlación de Pearson: {correlation}")


[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Correlación de Pearson: 0.7845078833995395
Correlación de Pearson: 0.7845078833995395


In [28]:
tf.__version__

'2.19.0'

In [29]:
print(y_train[:10])

[0.7   0.25  0.734 0.45  0.4   0.55  0.534 0.5   0.5   0.6  ]


In [30]:
# Jaccard baseline evaluated on the training portion
# The issue is that one_hot_evaluation returns a list of scores for all pairs,
# while y_train is only for the training portion

# Calculate scores for train data only (same length as y_train)
train_scores = one_hot_evaluation(sentences_1_preproc[:train_slice], sentences_2_preproc[:train_slice])

# Convert the list to a numpy array for proper comparison
train_scores = np.array(train_scores)

print(f"Length of scores: {len(train_scores)}, Length of y_train: {len(y_train)}")

# Now both arrays have the same length
correlation, p_value = pearsonr(train_scores, y_train.flatten())
print(f"Correlación de Pearson: {correlation}")
print(f"P-value: {p_value}")

Length of scores: 2458, Length of y_train: 2458
Correlación de Pearson: 0.5332081129168438
P-value: 1.2050755660899114e-180


#### TODO

- Model difícil de millorar
- Truncar les mesures dels embeddings extreient les paraules que pertanyen al corpus (compte amb l'overfitting al train).
- Agregació amb altres TF-IDF. (Modificant la capa Atenció)
- 