In [3]:
%pip install tf-keras

Collecting tf-kerasNote: you may need to restart the kernel to use updated packages.

  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.7 MB 640.0 kB/s eta 0:00:03
   -- ------------------------------------- 0.1/1.7 MB 1.3 MB/s eta 0:00:02
   --------- ------------------------------ 0.4/1.7 MB 3.2 MB/s eta 0:00:01
   ----------------- ---------------------- 0.8/1.7 MB 4.4 MB/s eta 0:00:01
   --------------------------- ------------ 1.2/1.7 MB 5.3 MB/s eta 0:00:01
   -------------------------------------- - 1.7/1.7 MB 6.7 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 6.1 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.16.0


In [4]:
from typing import Tuple, List, Optional
import spacy
import numpy as np
from model_bàsic import build_and_compile_model_better

In [5]:
from transformers import pipeline, AutoTokenizer
from scipy.special import logit

# Checkpoint identifier shared by the tokenizer and the classification pipeline.
model = "projecte-aina/roberta-base-ca-v2-cased-sts"

# The tokenizer is kept as its own global because `prepare` reads its special tokens.
tokenizer = AutoTokenizer.from_pretrained(model)
pipe = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

def prepare(sentence_pairs):
    """Join each sentence pair into one string with the tokenizer's special tokens.

    Args:
        sentence_pairs: Iterable of two-item (sentence_1, sentence_2) entries.

    Returns:
        A list with one string per pair, laid out as
        ``<cls> sentence_1<sep><sep> sentence_2<sep>`` using the ``cls_token``
        and ``sep_token`` of the notebook-level ``tokenizer``.
    """
    cls_tok = tokenizer.cls_token
    sep_tok = tokenizer.sep_token
    return [
        f"{cls_tok} {first}{sep_tok}{sep_tok} {second}{sep_tok}"
        for first, second in sentence_pairs
    ]

# Small set of example pairs used to try the pipeline out.
sentence_pairs = [
    ("El llibre va caure per la finestra.", "El llibre va sortir volant."),
    ("M'agrades.", "T'estimo."),
    ("M'agrada el sol i la calor", "A la Garrotxa plou molt."),
]

# `prepare` has already inserted the special tokens, so the pipeline must not add them again.
predictions = pipe(prepare(sentence_pairs), add_special_tokens=False)

# Apply the logit to every reported score; this is intended to bring the scores
# back to the original 0-5 similarity interval.
predictions = [
    {**item, 'score': logit(item['score'])}
    for item in predictions
]

print(predictions)


[{'label': 'SIMILARITY', 'score': 2.1183004307689126}, {'label': 'SIMILARITY', 'score': 2.179974932297432}, {'label': 'SIMILARITY', 'score': 0.9511617858568939}]


In [7]:
# Display the exact strings that are handed to the pipeline for the example pairs.
prepare(sentence_pairs)

['<s> El llibre va caure per la finestra.</s></s> El llibre va sortir volant.</s>',
 "<s> M'agrades.</s></s> T'estimo.</s>",
 "<s> M'agrada el sol i la calor</s></s> A la Garrotxa plou molt.</s>"]

In [8]:
from importació_data import read_all_ts_data, reformat_data

# Load the three dataset splits through the project helper.
train, val, test = read_all_ts_data()

# Reformat each split through the project helper. Later cells index every item as
# item[0:2] (the two sentences) and item[2] (the target) — presumably that is the
# layout `reformat_data` returns; confirm in importació_data.
train_pairs, val_pairs, test_pairs = reformat_data(train, val, test)

In [37]:
def _split_features_targets(pairs):
    """Split indexable records into model inputs and targets.

    Each record contributes its first two fields (``record[0:2]``) to the
    inputs and its third field (``record[2]``) to the targets.

    Returns:
        A ``(inputs, targets)`` tuple of two lists, in record order.
    """
    count = len(pairs)
    inputs = [pairs[idx][0:2] for idx in range(count)]
    targets = [pairs[idx][2] for idx in range(count)]
    return inputs, targets


X_train, y_train = _split_features_targets(train_pairs)
X_val, y_val = _split_features_targets(val_pairs)
X_test, y_test = _split_features_targets(test_pairs)
    

In [39]:
# Score every split once. `prepare` has already inserted the special tokens,
# so the pipeline is told not to add them a second time.
predictions_train = pipe(prepare(X_train), add_special_tokens=False)
predictions_val = pipe(prepare(X_val), add_special_tokens=False)
predictions_test = pipe(prepare(X_test), add_special_tokens=False)

# Backward-compatible alias for the previously misspelled name, in case other cells still use it.
predicitions_val = predictions_val

In [35]:
from scipy.stats import pearsonr

def compute_pearson(x_, y_, model):
    """Return the Pearson correlation between pipeline scores for ``x_`` and ``y_``.

    Args:
        x_: Sequence of (sentence_1, sentence_2) pairs, in the form accepted
            by ``prepare``.
        y_: Reference scores, one per pair in ``x_``.
        model: Unused; kept so existing call sites keep working. Scoring always
            goes through the notebook-level ``pipe``.

    Returns:
        The correlation coefficient reported by ``scipy.stats.pearsonr``.

    Raises:
        ValueError: Propagated from ``pearsonr`` when the number of scored
            pairs and the number of targets differ.
    """
    # Score the pairs that were passed in, not the notebook-level demo
    # `predictions` (which covers other sentences and may already be transformed).
    scored = pipe(prepare(x_), add_special_tokens=False)
    # Apply the logit to each reported score, the same transform the demo cell uses.
    y_pred = [logit(item['score']) for item in scored]
    # Only the coefficient is needed; the p-value is discarded.
    correlation, _ = pearsonr(y_pred, y_)
    return correlation

In [36]:
# Evaluate on the training split, then report the coefficient.
train_pearson = compute_pearson(X_train, y_train, model)
print(f"Correlación de Pearson (train): {train_pearson}")

[3.9050083169777956, 1.293888566880435, 4.201401910346833, 2.7064105493285155, 1.6543471716644051, 2.850589694845317, 3.0933511866798256, 2.7604804476504947, 2.5479786848723025, 3.131580894689497, 3.1964559619397312, 0.5483302696277794, 1.9513823807674204, 3.5313567310954768, 3.027013568262699, 2.890897104232176, 1.352541099739381, 3.247333131821523, 1.8342967685285183, 3.846831654193334, 3.0524080901557915, 1.6391532619572518, 4.2301754656599835, 1.6332861961750043, 2.784129407742889, 2.6385938475876727, 1.7459373878342639, 2.7093184012717004, 2.328370444217892, 0.7426231963870426, 4.2256860898754205, 2.4371843649782665, 3.297299099272809, 3.150575356694871, 2.521659699842311, 2.1959686101029523, 1.8317354496213982, 3.5566350775599944, 3.1544294844363963, 1.8677334247236062, 1.7186315323564008, 2.035867188884981, 4.221885241221951, 2.990966521627886, 2.6806734166044643, 1.3669705463777473, 2.3747336587967602, 1.9974553029389792, 1.9153525359599939, 2.0754111387250154, 1.32809911598289

In [None]:
def map_pairs(
    sentence_pairs: List[Tuple[str, str, float]],
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    """Replace both sentences of every pair with their embeddings.

    Args:
        sentence_pairs: Triples of (sentence_1, sentence_2, similarity).

    Returns:
        One ``((embedding_1, embedding_2), similarity)`` entry per input
        triple, in input order. Each embedding is whatever
        ``get_roberta_embedding`` returns for the corresponding sentence.
    """
    return [
        ((get_roberta_embedding(first), get_roberta_embedding(second)), score)
        for first, second, score in sentence_pairs
    ]