# Pràctica 4 PLH - Rubén Álvarez Aragonés i Pol Pérez Prades

____

# Stuff

## Imports 

In [None]:
%pip install -r requirements.txt

In [None]:
import tensorflow as tf
import numpy as np
import nltk
import re
import spacy
from spacy.lang.ca.examples import sentences 
from gensim.models import word2vec
import torch
import plotly.express as px
import pandas as pd
from sklearn.manifold import TSNE

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Requisites
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
import numpy as np


nltk.download('punkt')
nltk.download('stopwords')

spacy.cli.download("ca_core_news_sm")


## Entrenament model Word2Vec

In [None]:
class Word2VecEmbedder:
    def __init__(self, corpus_path, corpus_size, load_model=False, model_path=None):
        if not load_model:
            self.corpus_path = corpus_path
            self.corpus_size = int(corpus_size * 2**30) if corpus_size else None  # Convert GB to bytes
            self.corpus = self.get_corpus(corpus_path)
            self.fit()
        else:
            try:
                self.load(model_path)
            except FileNotFoundError:
                print("Model not found. Please check the path.")
                return

    def get_corpus(self, corpus_path):
        with open(corpus_path, 'r', encoding='utf-8') as f:
            print("Reading corpus...")
            corpus = f.read(self.corpus_size) if self.corpus_size else f.read()
            print("Preprocessing corpus...")
            corpus = self.preprocess(corpus)  # Preprocess the corpus and tokenize it
        return corpus

    def fit(self, window_size=15, vector_size=300, min_count=10, workers=8, epochs=10):
        # Initialize the Word2Vec model with gensim
        print("Initializing Word2Vec model...")
        self.model = word2vec.Word2Vec(sentences=[self.corpus], vector_size=vector_size, window=window_size, min_count=min_count, workers=workers, epochs=epochs)
        print("Model training completed.")

    def save(self, model_path):
        # Save the model
        self.model.save(model_path)

    def load(self, model_path):
        # Load the model
        self.model = word2vec.Word2Vec.load(model_path)

    def preprocess(self, corpus):
        # Lowercase the corpus
        print("Lowercasing...")
        corpus = corpus.lower()
        
        # Remove special characters
        print("Removing special characters...")
        corpus = re.sub(r'[^a-záàéèíìóòúùñüç\s]', ' ', corpus)
        
        # Tokenize the corpus
        print("Tokenizing...")
        corpus = nltk.word_tokenize(corpus)
        
        # Eliminate last token (probably incomplete word)
        corpus = corpus[:-1]
        
        return corpus

    def get_embedding(self, word):
        # Get the embedding of a word
        try:
            return self.model.wv[word]
        except KeyError:
            print(f"Word '{word}' not in vocabulary.")
            return None

    def print_vocab(self):
        print("Vocabulary:", list(self.model.wv.index_to_key))
        
    

### 1. Model amb 100MB de dades

In [None]:
Word2Vec_model = Word2VecEmbedder('corpus\catalan_general_crawling.txt', 0.1)

In [None]:
Word2Vec_model.save('models/word2vec_model.bin')

In [None]:
Word2Vec_model.print_vocab()

In [None]:
Word2Vec_model.get_embedding("de")

In [None]:
Word2Vec_model.model.wv.most_similar("negre")

### 2. Model amb 500MB de dades

In [None]:
Word2Vec_model_500 = Word2VecEmbedder('corpus\catalan_general_crawling.txt', 0.5)

In [None]:
Word2Vec_model.get_embedding("hola")

In [None]:
Word2Vec_model_500.model.wv.most_similar("inshalla")

### 3. Model amb 1GB de dades

In [None]:
Word2Vec_model_1024 = Word2VecEmbedder('corpus\catalan_general_crawling.txt', 1)

In [None]:
Word2Vec_model.get_embedding("hola")

### 4. Model amb totes les dades

In [None]:
Word2Vec_model_full_data = Word2VecEmbedder('corpus\catalan_general_crawling.txt', None)

## Entrenament model de Similitud de Text Semàntic

### Imports

In [None]:
from importació_data import read_all_ts_data, reformat_data, create_corpus, preprocess, flattened_corpus_count, stopwords_cat
from importació_data import pair_list_to_x_y
from model_bàsic import build_and_compile_model_better
import tensorflow as tf
from model_bàsic import compute_pearson

Definim les stopwords del català i la funció del preprocessament del text, que les tindrà en compte a la hora de tokenitzar i natejar el text.

In [None]:
stpw_cat = stopwords_cat()
prepro = lambda x: preprocess(x, stpw_cat)

Per altra banda, llegim les dades i definim les variables més importants per a la creació dels models de similitud de text semàntic.
- Llegim totes les dades de text similarity dividint-les en train, test i val. 
- Reformatejem les dades per a que siguin l'estructura List[Tuple[str, str, float]]. 
- Definim el corpus i el diccionari amb totes les paraules.
- Creem un diccionari de python amb tots els indexs com a claus i amb la repetició de les paraules com a valor.

In [None]:
train, test, val = read_all_ts_data()
train, test, val = reformat_data(train, test, val)
corpus, dictionary = create_corpus(train, test, val, preprocess=prepro)
flat_corpus = flattened_corpus_count(corpus)

### Compartació amb diferents models de Word Embeddings

#### 1. One Hot

In [None]:
from onehot import map_one_hot

Un embedding OneHot té tamany igual a la llargada del diccionari. En la importació de dades ja hem eliminat les stopwords per reduïr la dimensió, però ara també eliminarem del embedding aquelles paraules que es repeteixen masses poques vegades o massa sovint. 

Per aconseguir això creem una llista que conté els indexs de les paraules que sí que utilitzarem i la passem com a argument a la funció *map_one_hot()*, per reduïr la dimensió del embedding.

In [None]:
# Eliminate from the dictionary the words that are repeated very few times or too many times
keys_preprocess = [index  for index in dictionary if flat_corpus[index] > 10 and flat_corpus[index] < 50]

Convertim les paraules a vectors OneHot amb la funció *map_one_hot()*. Aquesta funció crea un vector de zeros de la mida del diccionari i posa un 1 a la posició de la paraula en el diccionari.

In [None]:
mapped_one_hot_train = map_one_hot(train, dictionary, keys_preprocess)
mapped_one_hot_test = map_one_hot(test, dictionary, keys_preprocess)
mapped_one_hot_val = map_one_hot(val, dictionary, keys_preprocess)

Separem el X i Y

In [None]:
x_train_oh, y_train_oh = pair_list_to_x_y(mapped_one_hot_train)
x_val_oh, y_val_oh = pair_list_to_x_y(mapped_one_hot_val)
x_test_oh, y_test_oh = pair_list_to_x_y(mapped_one_hot_test)

Entrenem el model

In [None]:
batch_size_oh: int = 64
num_epochs_oh: int = 64

train_dataset_oh = tf.data.Dataset.from_tensor_slices((x_train_oh, y_train_oh))
train_dataset_oh = train_dataset_oh.shuffle(buffer_size=len(x_train_oh)).batch(batch_size_oh)

val_dataset_oh = tf.data.Dataset.from_tensor_slices((x_val_oh, y_val_oh))
val_dataset_oh = val_dataset_oh.batch(batch_size_oh)

In [None]:
embedding_size_oh = len(keys_preprocess)
model_oh = build_and_compile_model_better(embedding_size = embedding_size_oh)
tf.keras.utils.plot_model(model_oh, show_shapes=True, show_layer_activations=True, )
print(model_oh.summary())

In [None]:
model_oh.fit(train_dataset_oh, epochs=num_epochs_oh, validation_data=val_dataset_oh)

Evaluem el model amb la partició de validació

In [None]:
print(f"Correlación de Pearson (train): {compute_pearson(model_oh, x_train_oh, y_train_oh)}")
print(f"Correlación de Pearson (validation): {compute_pearson(model_oh, x_val_oh, y_val_oh)}")

Provem el model amb la partició de test

In [None]:
print(f"Correlación de Pearson (test): {compute_pearson(model_oh, x_test_oh, y_test_oh)}")

##### Conclusions de One-hot

Com podem observar a la hora de fer l'entrenament, el model fa molt *overfit*, així que no és capaç de generalitzar bé. Això és degut a que el model OneHot no té en compte la semàntica de les paraules, ja que cada paraula és tractada com a un element únic i no es té en compte el context en el que apareixen les paraules. Una altra raó per la que el model OneHot no és bó és perquè la mida dels vectors és molt gran, ja que la mida dels vectors és igual a la mida del diccionari, i això fa que el model sigui molt ineficient.

#### 2. Word2Vec preentrenats 

In [None]:
from word2vec_tf_idf import map_pairs_w2v

Llegim el model de Word2Vec preentrenat en català.

In [None]:
WORD_EMBEDDING_FILE = "C:/Users/Pol/Downloads/cc.ca.300.bin.gz"


USE_MMAP = False
if USE_MMAP:
    from gensim.models.fasttext import FastTextKeyedVectors
    MMAP_PATH = 'cc.ca.300.bin'
    # wv_model.save(MMAP_PATH)
    wv_model = FastTextKeyedVectors.load(MMAP_PATH, mmap='r')
else:
    from gensim.models import fasttext
    wv_model = fasttext.load_facebook_vectors(WORD_EMBEDDING_FILE)

##### 2.1 Mitjana

Convertim a vectors les frases amb la funció *map_w2v()*. Aquesta funció obté l'embedding de Word2Vec de cada paraula de la frase i en fa la mitjana.

In [None]:
mapped_w2v_mean_train = map_pairs_w2v(train, wv_model, dictionary=dictionary, preprocess=prepro)
mapped_w2v_mean_test = map_pairs_w2v(test, wv_model, dictionary=dictionary, preprocess=prepro)
mapped_w2v_mean_val = map_pairs_w2v(val, wv_model, dictionary=dictionary, preprocess=prepro)

Separem les dades en X i Y.

In [None]:
x_train_w2v_mean, y_train_w2v_mean = pair_list_to_x_y(mapped_w2v_mean_train)
x_val_w2v_mean, y_val_w2v_mean = pair_list_to_x_y(mapped_w2v_mean_val)
x_test_w2v_mean, y_test_w2v_mean = pair_list_to_x_y(mapped_w2v_mean_test)

Entrenem el model

In [None]:
batch_size_w2v_mean: int = 64
num_epochs_w2v_mean: int = 64

train_dataset_w2v_mean = tf.data.Dataset.from_tensor_slices((x_train_w2v_mean, y_train_w2v_mean))
train_dataset_w2v_mean = train_dataset_w2v_mean.shuffle(buffer_size=len(x_train_w2v_mean)).batch(batch_size_w2v_mean)

val_dataset_w2v_mean = tf.data.Dataset.from_tensor_slices((x_val_w2v_mean, y_val_w2v_mean))
val_dataset_w2v_mean = val_dataset_w2v_mean.batch(batch_size_w2v_mean)

In [None]:
embedding_size_w2v_mean = 300
model_w2v_mean = build_and_compile_model_better(embedding_size = embedding_size_w2v_mean)
tf.keras.utils.plot_model(model_w2v_mean, show_shapes=True, show_layer_activations=True)
print(model_w2v_mean.summary())

In [None]:
model_w2v_mean.fit(train_dataset_w2v_mean, epochs=num_epochs_w2v_mean, validation_data=val_dataset_w2v_mean)

Evaluem el model amb la partició de validació

In [None]:
print(f"Correlación de Pearson (train): {compute_pearson(model_w2v_mean, x_train_w2v_mean, y_train_w2v_mean)}")
print(f"Correlación de Pearson (validation): {compute_pearson(model_w2v_mean, x_val_w2v_mean, y_val_w2v_mean)}")

Provem el model amb la partició de test

In [None]:
print(f"Correlación de Pearson (test): {compute_pearson(model_w2v_mean, x_test_w2v_mean, y_test_w2v_mean)}")

##### Conclusions de vectors preentrenats amb mitjana

Podem observar com amb el model Word2vec i la mitjana el model segueix fent sobreajust però funciona molt millor que el OneHot. Això és degut a que el model Word2Vec té en compte la semàntica de les paraules, ja que les paraules que tenen un significat similar tenen un embedding similar. Això fa que el model sigui capaç de generalitzar millor.

##### 2.2 Tf-idf

Definim el model de Tf-idf amb el corpus definit al inici.

In [None]:
from gensim.models import TfidfModel
modelo_tfidf = TfidfModel(corpus)

Fent servir la funció *map_w2v_tfidf()* s'obté l'embedding de Word2Vec de cada paraula de la frase i s'aplica Tf-idf per obtenir un vector de la frase.

In [None]:
mapped_w2v_tfidf_train = map_pairs_w2v(train, wv_model, dictionary=dictionary,tf_idf_model=modelo_tfidf, preprocess=prepro)
mapped_w2v_tfidf_test = map_pairs_w2v(test, wv_model, dictionary=dictionary,tf_idf_model=modelo_tfidf, preprocess=prepro)
mapped_w2v_tfidf_val = map_pairs_w2v(val, wv_model, dictionary=dictionary,tf_idf_model=modelo_tfidf, preprocess=prepro)

Separem les dades en X i Y.

In [None]:
x_train_w2v_tfidf, y_train_w2v_tfidf = pair_list_to_x_y(mapped_w2v_tfidf_train)
x_val_w2v_tfidf, y_val_w2v_tfidf = pair_list_to_x_y(mapped_w2v_tfidf_val)
x_test_w2v_tfidf, y_test_w2v_tfidf = pair_list_to_x_y(mapped_w2v_tfidf_test)

Entrenem el model

In [None]:
batch_size_w2v_tfidf: int = 64
num_epochs_w2v_tfidf: int = 64

train_dataset_w2v_tfidf = tf.data.Dataset.from_tensor_slices((x_train_w2v_tfidf, y_train_w2v_tfidf))
train_dataset_w2v_tfidf = train_dataset_w2v_tfidf.shuffle(buffer_size=len(x_train_w2v_tfidf)).batch(batch_size_w2v_tfidf)

val_dataset_w2v_tfidf = tf.data.Dataset.from_tensor_slices((x_val_w2v_tfidf, y_val_w2v_tfidf))
val_dataset_w2v_tfidf = val_dataset_w2v_tfidf.batch(batch_size_w2v_tfidf)

In [None]:
embedding_size_w2v_tfidf = 300
model_w2v_tfidf = build_and_compile_model_better(embedding_size = embedding_size_w2v_tfidf)
tf.keras.utils.plot_model(model_w2v_tfidf, show_shapes=True, show_layer_activations=True)
print(model_w2v_tfidf.summary())

In [None]:
model_w2v_tfidf.fit(train_dataset_w2v_tfidf, epochs=num_epochs_w2v_tfidf, validation_data=val_dataset_w2v_tfidf)

Evaluem el model amb la partició de validació

In [None]:
print(f"Correlación de Pearson (train): {compute_pearson(model_w2v_tfidf, x_train_w2v_tfidf, y_train_w2v_tfidf)}")
print(f"Correlación de Pearson (validation): {compute_pearson(model_w2v_tfidf, x_val_w2v_tfidf, y_val_w2v_tfidf)}")

Provem el model amb la partició de test

In [None]:
print(f"Correlación de Pearson (test): {compute_pearson(model_w2v_tfidf, x_test_w2v_tfidf, y_test_w2v_tfidf)}")

##### Conclusions vectors preentrenats amb Tf-idf

Podem observar com, fent us de Tf-idf, el model millora una mica més. Això és degut a que Tf-idf té en compte la freqüència de les paraules en el corpus, i això fa que les paraules que apareixen molt sovint tinguin un pes més baix. Això fa que el model sigui capaç de generalitzar millor. Tot i així el model segueix sobreajustant

#### 3. SpaCy

Importem tot allò necessari per a fer servir el model de SpaCy en català, inclòs el model preentrenat.

In [None]:
import spacy
from spacy_embed import map_spacy_embed

In [None]:
!python -m spacy download ca_core_news_md

Carreguem el model de SpaCy en català.

In [None]:
spacy_model = spacy.load("ca_core_news_md")

Passem les particions de train, test i val a vectors amb la funció *map_spacy()*. Aquesta funció utilitza el model de SpaCy per a convertir les frases a vectors.

In [None]:
mapped_spacy_train = map_spacy_embed(train, spacy_model)
mapped_spacy_test = map_spacy_embed(test, spacy_model)
mapped_spacy_val = map_spacy_embed(val, spacy_model)

Dividim les particions en X i Y.

In [None]:
x_train_sp, y_train_sp = pair_list_to_x_y(mapped_spacy_train)
x_val_sp, y_val_sp = pair_list_to_x_y(mapped_spacy_val)
x_test_sp, y_test_sp = pair_list_to_x_y(mapped_spacy_test)

Entrenem el model

In [None]:
batch_size_sp: int = 64
num_epochs_sp: int = 64

train_dataset_sp = tf.data.Dataset.from_tensor_slices((x_train_sp, y_train_sp))
train_dataset_sp = train_dataset_sp.shuffle(buffer_size=len(x_train_sp)).batch(batch_size_sp)

val_dataset_sp = tf.data.Dataset.from_tensor_slices((x_val_sp, y_val_sp))
val_dataset_sp = val_dataset_sp.batch(batch_size_sp)

In [None]:
embedding_size_sp = 300
model_sp = build_and_compile_model_better(embedding_size = embedding_size_sp)
tf.keras.utils.plot_model(model_sp, show_shapes=True, show_layer_activations=True, )
print(model_sp.summary())

In [None]:
model_sp.fit(train_dataset_sp, epochs=num_epochs_sp, validation_data=val_dataset_sp)

Evaluem el model amb al partició de validation

In [None]:
print(f"Correlación de Pearson (train): {compute_pearson(model_sp, x_train_sp, y_train_sp)}")
print(f"Correlación de Pearson (validation): {compute_pearson(model_sp, x_val_sp, y_val_sp)}")

Provem el model amb la partició de test

In [None]:
print(f"Correlación de Pearson (train): {compute_pearson(model_sp, x_test_sp, y_test_sp)}")

##### Conclusions de SpaCy

Utilitzant el model de SpaCy, el model segueix fent sobreajust, i funciona de manera molt similar al Word2Vec, però una mica pitjor. Això és degut a que el model de SpaCy també té en compte la semàntica de les paraules, ja que les paraules que tenen un significat similar tenen un embedding similar. Això fa que el model sigui capaç de generalitzar millor.

#### 4. RoBERTa 

In [None]:
from roberta_base import map_roberta_embed
import spacy

In [None]:
!python -m spacy download ca_core_news_trf

Carreguem el model de RoBERTa preentrenat en català.

In [None]:
roberta_model = spacy.load("ca_core_news_trf")

Passem les particions de train, test i val a vectors amb la funció *map_roberta()*. Aquesta funció utilitza el model de RoBERTa per a convertir les frases a vectors.

In [None]:
mapped_roberta_train = map_roberta_embed(train, roberta_model)
mapped_roberta_test = map_roberta_embed(test, roberta_model)
mapped_roberta_val = map_roberta_embed(val, roberta_model)

Dividim les particions en X i Y.

In [None]:
x_train_roberta, y_train_roberta = pair_list_to_x_y(mapped_roberta_train)
x_val_roberta, y_val_roberta = pair_list_to_x_y(mapped_roberta_val)
x_test_roberta, y_test_roberta = pair_list_to_x_y(mapped_roberta_test)

Entrenem el model.

In [None]:
batch_size_roberta: int = 64
num_epochs_roberta: int = 64

train_dataset_roberta = tf.data.Dataset.from_tensor_slices((x_train_roberta, y_train_roberta))
train_dataset_roberta = train_dataset_roberta.shuffle(buffer_size=len(x_train_roberta)).batch(batch_size_roberta)

val_dataset_roberta = tf.data.Dataset.from_tensor_slices((x_val_roberta, y_val_roberta))
val_dataset_roberta = val_dataset_roberta.batch(batch_size_roberta)

In [None]:
embedding_size_roberta = 768
model_roberta = build_and_compile_model_better(embedding_size = embedding_size_roberta)
tf.keras.utils.plot_model(model_roberta, show_shapes=True, show_layer_activations=True)
print(model_roberta.summary())

In [None]:
model_roberta.fit(train_dataset_roberta, epochs=num_epochs_roberta, validation_data=val_dataset_roberta)

Evaluar el model amb la partició de validació.

In [None]:
print(f"Correlación de Pearson (train): {compute_pearson(model_roberta, x_train_roberta, y_train_roberta)}")
print(f"Correlación de Pearson (validation): {compute_pearson(model_roberta, x_val_roberta, y_val_roberta)}")

Provar el model amb la partició de test.

In [None]:
print(f"Correlación de Pearson (test): {compute_pearson(model_roberta, x_test_roberta, y_test_roberta)}")

##### Conclusions Roberta

Veiem com el model de RoBERTa és un dels que millor funciona, ja que és el model més avançat i el que millor semàntica té. Això fa que el model sigui capaç de generalitzar millor.
Per altra banda, els seus vectors són més grans així que és més pesat d'executar que els altres models.

#### 5. RoBERTa fine-tuned

Instal·lem les llibreries necessàries

In [None]:
%pip install tf-keras
%pip install transformers

Fem els imports de les funcions.

In [None]:
from roberta_fine_tuned import prepare_roberta_ft, compute_pearson_roberta_ft, x_y_split_roberta_ft
from transformers import pipeline, AutoTokenizer

Creem el model de RoBERTa fine-tuned.

In [None]:
model_roberta_ft = 'projecte-aina/roberta-base-ca-v2-cased-sts'
tokenizer_roberta_ft = AutoTokenizer.from_pretrained(model_roberta_ft)
pipe_roberta_ft = pipeline('text-classification', model=model_roberta_ft, tokenizer=tokenizer_roberta_ft)

Separem en X i Y.

In [None]:
X_train_roberta_ft, y_train_roberta_ft = x_y_split_roberta_ft(train)
X_val_roberta_ft, y_val_roberta_ft = x_y_split_roberta_ft(val)
X_test_roberta_ft, y_test_roberta_ft = x_y_split_roberta_ft(test)

In [None]:
X = X_train_roberta_ft + X_val_roberta_ft + X_test_roberta_ft
y = y_train_roberta_ft + y_val_roberta_ft + y_test_roberta_ft

Obtenim els resultats del model

In [None]:
predictions = pipe_roberta_ft(prepare_roberta_ft(X, tokenizer_roberta_ft), add_special_tokens=False)

In [None]:
print(f"Correlación de Pearson (all data): {compute_pearson_roberta_ft(predictions, y)}")

In [None]:
print(f"Correlación de Pearson (test): {compute_pearson_roberta_ft(predictions[-len(y_test_roberta_ft):], y_test_roberta_ft)}")

##### Conclusions Roberta fine-tuned

Sens dubte el model de RoBERTa fine-tuned és el que millor funciona, ja que és el model més avançat i el que millor semàntica té. Això fa que el model sigui capaç de generalitzar millor.

## Models amb embeddings entrenables

In [63]:
REMAP_EMBEDDINGS: bool = True
USE_PRETRAINED: bool = True
MAX_LEN: int = 96

In [65]:
from trainable_embed_model import map_w2v_trainable, model_2, pair_list_to_x_y

ImportError: cannot import name 'map_w2v_trainable' from 'trainable_embed_model' (c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\trainable_embed_model.py)

In [66]:
import tensorflow as tf
from typing import List, Tuple, Optional
import numpy as np
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

def map_word_embeddings(
        sentence: str,
        sequence_len: int,
        fixed_dictionary: Optional[Dictionary] = None,
        wv_model: Optional[tf.keras.layers.Embedding] = None
) -> np.ndarray:
    """
    Map to word-embedding indices
    :param sentence:
    :param sequence_len:
    :param fixed_dictionary:
    :return:
    """
    sentence_preproc = simple_preprocess(sentence)[:sequence_len]
    _vectors = np.zeros(sequence_len, dtype=np.int32)
    index = 0
    for word in sentence_preproc:
        if fixed_dictionary is not None:
            if word in fixed_dictionary.token2id:
                # Sumo 1 porque el valor 0 está reservado a padding
                _vectors[index] = fixed_dictionary.token2id[word] + 1
                index += 1
        else:
            if word in wv_model.key_to_index:
                _vectors[index] = wv_model.key_to_index[word] + 1
                index += 1
    return _vectors

def map_w2v_trainable(
        wv_model: tf.keras.layers.Embedding,
    sentence_pairs: List[Tuple[str, str, float]],
    sequence_len: int,
    fixed_dictionary: Optional[Dictionary] = None
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    '''
    Mapea los pares de oraciones a pares de vectores
    
    Parameters:
    - wv_model: Modelo de embeddings entrenable
    - sentence_pairs: Lista de pares de oraciones
    - sequence_len: Longitud de las secuencias
    - fixed_dictionary: Diccionario fijo de palabras
    
    Returns:
    - pares_vectores: Lista de pares de vectores
    '''
    # Mapeo de los pares de oraciones a pares de vectores
    pares_vectores = []
    for i, (sentence_1, sentence_2, similitud) in enumerate(sentence_pairs):
        vector1 = map_word_embeddings(sentence_1, sequence_len, fixed_dictionary, wv_model)
        vector2 = map_word_embeddings(sentence_2, sequence_len, fixed_dictionary, wv_model)
        # Añadir a la lista
        pares_vectores.append(((vector1, vector2), similitud))
    return pares_vectores

def pair_list_to_x_y(pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], int]]) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    '''
    Convierte una lista de pares de vectores a un par de arrays de vectores y un array de etiquetas
    
    Parameters:
    - pair_list: Lista de pares de vectores
    
    Returns:
    - x: Par de arrays de vectores
    - y: Array de etiquetas
    '''
    _x, _y = zip(*pair_list)
    _x_1, _x_2 = zip(*_x)
    return (np.row_stack(_x_1), np.row_stack(_x_2)), np.array(_y)

class MyLayer_mask(tf.keras.layers.Layer):
    def call(self, x):
        return tf.not_equal(x, 0)
    
class MyLayer_exp(tf.keras.layers.Layer):
    def call(self, x):
        return tf.exp(x)
    
class MyLayer_cast(tf.keras.layers.Layer):
    def call(self, x):
        return tf.cast(x, tf.float32)
    
class MyLayer_reduce_sum(tf.keras.layers.Layer):
    def call(self, x):
        return tf.reduce_sum(x, axis=1, keepdims=True)
def model_2(
    input_length: int,
    dictionary_size: int = 1000,
    embedding_size: int = 16,
    learning_rate: float = 1e-3,
    pretrained_weights: Optional[np.ndarray] = None,
    trainable: bool = False,
    use_cosine: bool = False,
) -> tf.keras.Model:
    # Inputs
    input_1 = tf.keras.Input((input_length,), dtype=tf.int32)
    input_2 = tf.keras.Input((input_length,), dtype=tf.int32)

    # Embedding Layer
    if pretrained_weights is None:
        embedding = tf.keras.layers.Embedding(
            dictionary_size, embedding_size, input_length=input_length, mask_zero=True
        )
    else:
        dictionary_size = pretrained_weights.shape[0]
        embedding_size = pretrained_weights.shape[1]
        initializer = tf.keras.initializers.Constant(pretrained_weights)
        embedding = tf.keras.layers.Embedding(
            dictionary_size,
            embedding_size,
            input_length=input_length,
            mask_zero=True,
            embeddings_initializer=initializer,
            trainable=trainable,
        )

    # Embed the inputs
    embedded_1 = embedding(input_1)
    embedded_2 = embedding(input_2)
    # Pass through the embedding layer
    _input_mask_1, _input_mask_2 = MyLayer_mask()(input_1), MyLayer_mask()(input_2)

    # Attention Mechanism
    attention_mlp = tf.keras.Sequential([
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(16, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1)
    ])
    # Apply attention to each embedding
    attention_weights_1 = attention_mlp(embedded_1)  
    attention_weights_2 = attention_mlp(embedded_2) 
    # Mask the attention weights
    attention_weights_1 = MyLayer_exp()(attention_weights_1) * MyLayer_cast()(_input_mask_1[:, :, None])
    attention_weights_2 = MyLayer_exp()(attention_weights_2) * MyLayer_cast()(_input_mask_2[:, :, None])
    # Normalize attention weights
    attention_weights_1 = attention_weights_1 / MyLayer_reduce_sum()(attention_weights_1)
    attention_weights_2 = attention_weights_2 / MyLayer_reduce_sum()(attention_weights_2)
    # Compute context vectors
    projected_1 = MyLayer_reduce_sum()(embedded_1 * attention_weights_1) 
    projected_2 = MyLayer_reduce_sum()(embedded_2 * attention_weights_2) 
    
    if use_cosine:
        # Compute the cosine distance using a Lambda layer
        def cosine_distance(x):
            x1, x2 = x
            x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
            x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
            return 2.5 * (1.0 + tf.reduce_sum(x1_normalized * x2_normalized, axis=1))
        output = tf.keras.layers.Lambda(cosine_distance)([projected_1, projected_2])
    else:
         # Compute the cosine distance using a Lambda layer
        def normalized_product(x):
            x1, x2 = x
            x1_normalized = tf.keras.backend.l2_normalize(x1, axis=1)
            x2_normalized = tf.keras.backend.l2_normalize(x2, axis=1)
            return x1_normalized * x2_normalized
    
        output = tf.keras.layers.Lambda(normalized_product)([projected_1, projected_2])
        output = tf.keras.layers.Dropout(0.1)(output)
        output = tf.keras.layers.Dense(
            16,
            activation="relu",
        )(output)
        output = tf.keras.layers.Dropout(0.2)(output)
        output = tf.keras.layers.Dense(
            1,
            activation="sigmoid",
        )(output)
        
        output = tf.keras.layers.Lambda(lambda x: x * 5)(output)
    # Model Definition
    model = tf.keras.Model(inputs=(input_1, input_2), outputs=output)
    model.compile(
        loss="mean_squared_error", optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate)
    )
    return model


Passem les particions de train, test i val a vectors amb la funció *map_w2v_trainable()*. Aquesta funció utilitza el model de Word2Vec per a convertir les frases a vectors.

In [67]:
mapped_train_trainable = map_w2v_trainable(wv_model, train, MAX_LEN, dictionary)
mapped_test_trainable = map_w2v_trainable(wv_model, test, MAX_LEN, fixed_dictionary=dictionary)
mapped_val_trainable = map_w2v_trainable(wv_model, val, MAX_LEN, fixed_dictionary=dictionary)

Partim en X i Y.

In [68]:
x_train_trainable, y_train_trainable = pair_list_to_x_y(mapped_train_trainable)
x_val_trainable, y_val_trainable = pair_list_to_x_y(mapped_val_trainable)
x_test_trainable, y_test_trainable = pair_list_to_x_y(mapped_test_trainable)

Reformatejem les dades per a que siguin l'estructura correcte.

In [69]:
batch_size_trainable: int = 64
num_epochs_trainable: int = 128

train_dataset_trainable = tf.data.Dataset.from_tensor_slices((x_train_trainable, y_train_trainable))
train_dataset_trainable = train_dataset_trainable.shuffle(buffer_size=len(x_train_trainable)).batch(batch_size_trainable)

val_dataset_trainable = tf.data.Dataset.from_tensor_slices((x_val_trainable, y_val_trainable))
val_dataset_trainable = val_dataset_trainable.batch(batch_size_trainable)

### Random embeddings

Creem el model amb embeddings aleatoris, sense cap tipus de preentrenament.

In [70]:
model_random = model_2(MAX_LEN,pretrained_weights=None, trainable=False, use_cosine=False)
model_random.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 96)]                 0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 96)]                 0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 96, 16)               16000     ['input_5[0][0]',             
                                                                     'input_6[0][0]']             
                                                                                                  
 my_layer_mask (MyLayer_mas  (None, 96)                   0         ['input_5[0][0]']       

Entrenem el model

In [72]:
model_random.fit(train_dataset_trainable, epochs=num_epochs_trainable, validation_data=val_dataset_trainable)

Epoch 1/128





InvalidArgumentError: Graph execution error:

Detected at node model_2/embedding_2/embedding_lookup defined at (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 196, in _run_module_as_main

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 86, in _run_code

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\asyncio\base_events.py", line 603, in run_forever

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\asyncio\base_events.py", line 1909, in _run_once

  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\asyncio\events.py", line 80, in _run

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\Pol\AppData\Local\Temp\ipykernel_6984\1647691059.py", line 1, in <module>

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\utils\traceback_utils.py", line 65, in error_handler

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 1804, in fit

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 1398, in train_function

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 1381, in step_function

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 1370, in run_step

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 1147, in train_step

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 553, in error_handler

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 558, in error_handler

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 588, in __call__

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 553, in error_handler

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 558, in error_handler

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\base_layer.py", line 1047, in __call__

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\base_layer.py", line 1136, in __call__

  File "C:\Users\Pol\AppData\Local\Temp\__autograph_generated_filemh31qqft.py", line 34, in error_handler

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\functional.py", line 514, in call

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\functional.py", line 661, in _run_internal_graph

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\functional.py", line 663, in _run_internal_graph

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\functional.py", line 663, in _run_internal_graph

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\functional.py", line 663, in _run_internal_graph

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\functional.py", line 671, in _run_internal_graph

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 553, in error_handler

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\training.py", line 558, in error_handler

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\base_layer.py", line 1047, in __call__

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\engine\base_layer.py", line 1136, in __call__

  File "C:\Users\Pol\AppData\Local\Temp\__autograph_generated_filemh31qqft.py", line 34, in error_handler

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\layers\core\embedding.py", line 225, in call

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\layers\core\embedding.py", line 263, in call

  File "c:\Users\Pol\Desktop\1_Uni\Q4\PLH\PLH-WordEmbeddings\PLH-WE-env\lib\site-packages\tf_keras\src\layers\core\embedding.py", line 273, in call

indices[16,5] = 2327 is not in [0, 1000)
	 [[{{node model_2/embedding_2/embedding_lookup}}]] [Op:__inference_train_function_4221]

Evaluem el model amb la partició de validació.

In [None]:
print(f"Correlación de Pearson (train): {compute_pearson(model_random, x_train_trainable, y_train_trainable)}")
print(f"Correlación de Pearson (validation): {compute_pearson(model_random, x_val_trainable, y_val_trainable)}")

Provem amb la partició de test.

In [None]:
print(f"Correlación de Pearson (test): {compute_pearson(model_random, x_test_trainable, y_test_trainable)}")

### Word2Vec

Definim els pesos preentrenats. Cal destacar que el model de Word2Vec *wv_model* ja ha estat importat en la secció [2. Word2Vec preentrenats](#2-word2vec-preentrenats).

In [None]:
_pretrained_weights = None
if USE_PRETRAINED:
    if REMAP_EMBEDDINGS:
        _pretrained_weights = np.zeros(
            (len(dictionary.token2id) + 1, wv_model.vector_size),  dtype=np.float32)
        for token, _id in dictionary.token2id.items():
            if token in wv_model:
             _pretrained_weights[_id + 1] = wv_model[token]

            else:
                # In W2V, OOV will not have a representation. We will use 0.
                pass
    else:
        # Not recommended (this will consume A LOT of RAM)
        _pretrained_weights = np.zeros((wv_model.vectors.shape[0] + 1, wv_model.vector_size,),  dtype=np.float32)
        _pretrained_weights[1:, :] = wv_model.vectors

Creem el model amb embeddings de Word2Vec preentrenats.

In [None]:
model2 = model_2(MAX_LEN,pretrained_weights=_pretrained_weights, trainable=False, use_cosine=False)
model2.summary()

Entrenem el model.

In [None]:
model2.fit(train_dataset_trainable, epochs=num_epochs_trainable, validation_data=val_dataset_trainable)

Evaluem el model amb la partició de validació.

In [None]:
print(f"Correlación de Pearson (train): {compute_pearson(x_train_trainable, y_train_trainable, model2)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val_trainable, y_val_trainable, model2)}")

Provem amb la partició de test.

In [None]:
print(f"Correlación de Pearson (test): {compute_pearson(x_test_trainable, y_test_trainable, model2)}")