In [12]:
!python -m spacy download ca_core_news_trf

Collecting ca-core-news-trf==3.7.2
  Downloading https://github.com/explosion/spacy-models/releases/download/ca_core_news_trf-3.7.2/ca_core_news_trf-3.7.2-py3-none-any.whl (457.1 MB)
     ---------------------------------------- 0.0/457.1 MB ? eta -:--:--
     -------------------------------------- 0.0/457.1 MB 320.0 kB/s eta 0:23:49
     -------------------------------------- 0.1/457.1 MB 777.7 kB/s eta 0:09:48
     ---------------------------------------- 0.6/457.1 MB 5.0 MB/s eta 0:01:32
     ---------------------------------------- 1.4/457.1 MB 8.1 MB/s eta 0:00:57
     --------------------------------------- 2.4/457.1 MB 10.7 MB/s eta 0:00:43
     --------------------------------------- 3.5/457.1 MB 12.9 MB/s eta 0:00:36
     --------------------------------------- 4.8/457.1 MB 15.4 MB/s eta 0:00:30
      -------------------------------------- 6.4/457.1 MB 17.8 MB/s eta 0:00:26
      -------------------------------------- 8.4/457.1 MB 20.6 MB/s eta 0:00:22
      ------------------

In [1]:
from typing import Tuple, List, Optional
import spacy
import numpy as np
from model_bàsic import build_and_compile_model_better

In [14]:
# Cargar el modelo de spaCy con integración de transformers para catalán
roberta_model = spacy.load("ca_core_news_trf")

def get_roberta_embedding(text):
    return np.mean(roberta_model(text)._.trf_data.last_hidden_layer_state.data[:-1], axis=0)

In [15]:
def map_pairs(
    sentence_pairs: List[Tuple[str, str, float]],
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    # Mapeo de los pares de oraciones a pares de vectores
    pares_vectores = []
    for i, (sentence_1, sentence_2, similitud) in enumerate(sentence_pairs):
        vector1 = get_roberta_embedding(sentence_1)
        vector2 = get_roberta_embedding(sentence_2)
        # Añadir a la lista
        
        pares_vectores.append(((vector1, vector2), similitud))
    return pares_vectores

In [16]:
from importació_data import read_all_ts_data, reformat_data

# Get data
train, val, test = read_all_ts_data()

#Transform data
train_pairs, val_pairs, test_pairs = reformat_data(train, val, test)

In [17]:
mapped_train = map_pairs(train_pairs)
mapped_val = map_pairs(val_pairs)
mapped_test = map_pairs(test_pairs)

In [18]:
# Prepare the data
from importació_data import pair_list_to_x_y

X_train, y_train = pair_list_to_x_y(mapped_train)
X_val, y_val = pair_list_to_x_y(mapped_val)
X_test, y_test = pair_list_to_x_y(mapped_test)

In [None]:
model = build_and_compile_model_better(768)

model.summary()

In [None]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=len(X_train)).batch(64)

val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.batch(64)

In [None]:
train_dataset

<_BatchDataset element_spec=((TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), TensorSpec(shape=(None, 768), dtype=tf.float32, name=None)), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>

In [None]:
model.fit(train_dataset, epochs=100, validation_data=val_dataset)

Epoch 1/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 0.7260 - val_loss: 0.7254
Epoch 2/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.7177 - val_loss: 0.7222
Epoch 3/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.7147 - val_loss: 0.7194
Epoch 4/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.7070 - val_loss: 0.7190
Epoch 5/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.6909 - val_loss: 0.7211
Epoch 6/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.6802 - val_loss: 0.7243
Epoch 7/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.6650 - val_loss: 0.7286
Epoch 8/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.6546 - val_loss: 0.7336
Epoch 9/100
[1m32/32[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x175518dce10>

In [None]:
from scipy.stats import pearsonr

x_test, y_test = pair_list_to_x_y(mapped_test)
def compute_pearson(x_, y_, model):
    # Get predictions for the model
    y_pred = model.predict(x_)
    # Compute pearson correlation
    correlation, _ = pearsonr(y_pred.flatten(), y_.flatten())
    return correlation

In [None]:
print(f"Correlación de Pearson (train): {compute_pearson(X_train, y_train, model)}")
print(f"Correlación de Pearson (validation): {compute_pearson(X_val, y_val, model)}")
print(f"Correlación de Pearson (test): {compute_pearson(X_test, y_test, model)}")

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Correlación de Pearson (train): 0.7361164836771393
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Correlación de Pearson (validation): 0.15813986630961854
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Correlación de Pearson (test): 0.23395487116758615
