# Pràctica 4 PLH - Rubén Álvarez Aragonés i Pol Pérez Prades

____

# Stuff

In [None]:
%pip install nltk

## Imports 

In [8]:
%pip install -r requirements.txt

Collecting torch (from -r requirements.txt (line 6))
  Downloading torch-2.3.0-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting filelock (from torch->-r requirements.txt (line 6))
  Downloading filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy (from torch->-r requirements.txt (line 6))
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch->-r requirements.txt (line 6))
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting fsspec (from torch->-r requirements.txt (line 6))
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch->-r requirements.txt (line 6))
  Downloading mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch->-r requirements.txt (line 6))
  Downloading intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.2 kB)
Collecting tbb==2021.* (from mkl<=2021.4.0,>=202

In [11]:
import tensorflow as tf
import numpy as np
import nltk
import re
import spacy
from spacy.lang.ca.examples import sentences 
from gensim.models import Word2Vec
import torch

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ralva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ralva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Entrenament model Word2Vec

In [31]:
import re
import nltk
from nltk.corpus import stopwords
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np

class Word2VecEmbedder:
    def __init__(self, corpus_path, corpus_size, load_model=False, model_path=None):
        if not load_model:
            self.corpus_path = corpus_path
            self.corpus_size = int(corpus_size * 2**30)  # Convert GB to bytes
            self.corpus = self.get_corpus(corpus_path)
            self.build_vocab()
            self.fit()
        else:
            try:
                self.load(model_path)
            except:
                print("Model not found. Please check the path.")
                return

    def get_corpus(self, corpus_path):
        with open(corpus_path, 'r', encoding='utf-8') as f:
            corpus = f.read(self.corpus_size)
            corpus = self.preprocess(corpus)  # Preprocess the corpus and tokenize it
        return corpus

    def preprocess(self, corpus):
        corpus = corpus.lower()
        corpus = re.sub(r'[^a-záéíóúñü\s]', '', corpus)
        corpus = nltk.word_tokenize(corpus)
        corpus = corpus[:-1]
        return corpus

    def build_vocab(self):
        self.word_counts = Counter(self.corpus)
        self.vocab = {word: i for i, word in enumerate(self.word_counts.keys())}
        self.inv_vocab = {i: word for word, i in self.vocab.items()}
        self.vocab_size = len(self.vocab)

    def generate_training_data(self, window_size):
        training_data = []
        for i, word in enumerate(self.corpus):
            target_word = self.vocab[word]
            context_words = [self.vocab[self.corpus[i+j]] for j in range(-window_size, window_size+1) if j != 0 and 0 <= i+j < len(self.corpus)]
            for context_word in context_words:
                training_data.append((target_word, context_word))
        return training_data

    def fit(self, window_size=5, vector_size=100, epochs=10, lr=0.01):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = Word2VecModel(self.vocab_size, vector_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = nn.CrossEntropyLoss()

        training_data = self.generate_training_data(window_size)
        for epoch in range(epochs):
            total_loss = 0
            for target, context in training_data:
                target = torch.tensor([target], dtype=torch.long).to(self.device)
                context = torch.tensor([context], dtype=torch.long).to(self.device)
                self.optimizer.zero_grad()
                output = self.model(target)
                loss = self.loss_fn(output, context)
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
            print(f'Epoch {epoch+1}, Loss: {total_loss/len(training_data)}')

    def save(self, model_path):
        torch.save(self.model.state_dict(), model_path)

    def load(self, model_path):
        self.model = Word2VecModel(self.vocab_size, vector_size).to(self.device)
        self.model.load_state_dict(torch.load(model_path))

    def get_embedding(self, word):
        word_idx = self.vocab.get(word)
        if word_idx is not None:
            return self.model.input_embeddings.weight[word_idx].cpu().detach().numpy()
        else:
            print("Word not in vocabulary.")
            return None

class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, vector_size):
        super(Word2VecModel, self).__init__()
        self.input_embeddings = nn.Embedding(vocab_size, vector_size)
        self.output_embeddings = nn.Linear(vector_size, vocab_size)

    def forward(self, inputs):
        embeds = self.input_embeddings(inputs)
        output = self.output_embeddings(embeds)
        return output


In [16]:
class Word2VecEmbedder:
    def __init__(self, corpus_path, corpus_size, load_model=False, model_path=None):
        if not load_model:
            self.corpus_path = corpus_path
            self.corpus_size = int(corpus_size * 2**30)  # Convert GB to bytes
            self.corpus = self.get_corpus(corpus_path)
            self.fit()
        else:
            try:
                self.load(model_path)
            except:
                print("Model not found. Please check the path.")
                return

    def get_corpus(self, corpus_path):
        with open(corpus_path, 'r', encoding='utf-8') as f:
            print("Reading corpus...")
            corpus = f.read(self.corpus_size)
            print("Preprocessing corpus...")
            corpus = self.preprocess(corpus)  # Preprocess the corpus and tokenize it
        return corpus

    def fit(self, window_size=5, vector_size=100, min_count=10, workers=8, epochs=10):
        # Initialize the Word2Vec model with gensim
        print("Initializing Word2Vec model...")
        self.model = Word2Vec(vector_size=vector_size, window=window_size, min_count=min_count, workers=workers)
        
        # Build vocabulary from the corpus
        print("Building vocabulary...")
        self.model.build_vocab(self.corpus)
        
        # Train the model on the corpus, using GPU if available
        print("Training model...")
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(device)
        self.model.train(self.corpus, total_examples=self.model.corpus_count, epochs=epochs, compute_loss=True)
        
        # Move model to GPU if available
        if torch.cuda.is_available():
            self.model.wv.vectors = torch.tensor(self.model.wv.vectors, device=device)

    def save(self, model_path):
        # Save the model
        self.model.save(model_path)

    def load(self, model_path):
        # Load the model
        self.model = Word2Vec.load(model_path)

    def preprocess(self, corpus):
        # Lowercase the corpus
        print("Lowercasing...")
        corpus = corpus.lower()
        
        # Remove special characters
        print("Removing special characters...")
        corpus = re.sub(r'[^a-záéíóúñü\s]', '', corpus)
        
        # Tokenize the corpus
        print("Tokenizing...")
        corpus = nltk.word_tokenize(corpus)
        
        # Eliminate last token (probably incomplete word)
        corpus = corpus[:-1]
        
        return corpus

    def get_embedding(self, word):
        # Get the embedding of a word
        return self.model.wv[word]
    

### 1. Model amb 100MB de dades

In [32]:
model = Word2VecEmbedder('corpus\catalan_general_crawling.txt', 0.1)

In [24]:
model.corpus

['reduu',
 'els',
 'costos',
 'dels',
 'processos',
 'administratius',
 'al',
 'vostre',
 'organisme',
 'públic',
 'eviteu',
 'els',
 'desplaaments',
 'i',
 'prdua',
 'de',
 'temps',
 'als',
 'ciutadans',
 'en',
 'les',
 'seves',
 'gestions',
 'oferiu',
 'una',
 'administració',
 'més',
 'transparent',
 'a',
 'ciutadans',
 'i',
 'empreses',
 'ens',
 'grans',
 'i',
 'petits',
 'experimenten',
 'aquesta',
 'transformació',
 'amb',
 'xit',
 'grcies',
 'al',
 'suport',
 'de',
 'laoc',
 'departament',
 'de',
 'sistemes',
 'dinformació',
 'i',
 'processos',
 'via',
 'oberta',
 'ens',
 'ha',
 'perms',
 'fer',
 'efectiu',
 'el',
 'dret',
 'dels',
 'ciutadans',
 'a',
 'no',
 'aportar',
 'documents',
 'eliminant',
 'paper',
 'i',
 'simplificant',
 'procediments',
 'efact',
 'proporciona',
 'informació',
 'indispensable',
 'per',
 'a',
 'la',
 'realització',
 'de',
 'les',
 'auditories',
 'del',
 'registre',
 'comptable',
 'de',
 'factures',
 'de',
 'les',
 'administracions',
 'públiques',
 'cata

### 2. Model amb 500MB de dades

### 3. Model amb 1GB de dades

## Entrenament model de Similitud de Text Semàntic

### Implementació

In [None]:
def build_and_compile_model(hidden_size: int = 64) -> tf.keras.Model:
  model = tf.keras.Sequential([
      tf.keras.layers.Concatenate(axis=-1, ),
      tf.keras.layers.Dense(hidden_size, activation='relu'),
      tf.keras.layers.Dense(1)
  ])
  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
  return model

In [None]:
baseline_model = build_and_compile_model()

In [None]:
y = baseline_model((np.ones((1, 100)), np.ones((1,100)), ), )

### Compartació amb diferents models de Word Embeddings

#### 1. One Hot

#### 2. Word2Vec preentrenats 

In [1]:
model.save('model.bin')

from gensim.models import KeyedVectors
kv = KeyedVectors.load_word2vec_format('model.bin', binary=True, mmap='r')

FileNotFoundError: [Errno 2] No such file or directory: 'model.bin'

In [None]:
word1 = 'casa'
word2 = 'cotxe'

word1_vector = kv[word1]
word2_vector = kv[word2]

In [None]:
w2v = build_and_compile_model()
result = w2v((word1_vector, word2_vector))

#### 3. SpaCy

In [None]:
nlp = spacy.load("en_core_web_md")
sentence = nlp("I sit on a bank.")
sentence[4].vector
# -> NDArray

#### 4. RoBERTa 

In [None]:
nlp = spacy.load("ca_core_news_trf")
doc = nlp(sentences[0])
print(doc.text)
for token in doc:
    print(token.text, token.pos_, token.dep_)

#### 5. RoBERTa fine-tuned

In [None]:
model = 'projecte-aina/roberta-base-ca-v2-cased-sts'
tokenizer = AutoTokenizer.from_pretrained(model)
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer)

def prepare(sentence_pairs):
    sentence_pairs_prep = []
    for s1, s2 in sentence_pairs:
        sentence_pairs_prep.append(f"{tokenizer.cls_token} {s1}{tokenizer.sep_token}{tokenizer.sep_token} {s2}{tokenizer.sep_token}")
    return sentence_pairs_prep

sentence_pairs = [("El llibre va caure per la finestra.", "El llibre va sortir volant."),
                  ("M'agrades.", "T'estimo."),
                  ("M'agrada el sol i la calor", "A la Garrotxa plou molt.")]

predictions = pipe(prepare(sentence_pairs), add_special_tokens=False)

# convert back to scores to the original 0 and 5 interval
for prediction in predictions:
    prediction['score'] = logit(prediction['score'])
print(predictions)