In [7]:
!pip install transformers torch peft

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
Installing collected packages: accelerate, peft
Successfully installed accelerate-1.1.1 peft-0.13.2


In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from peft import PeftModel
import torch

MODEL_VERSION = 1

base_model_name = "bert-base-uncased"
adapter_path = f"./models/model_{MODEL_VERSION}/fine_tuned_lora_mlm"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

base_model = AutoModelForMaskedLM.from_pretrained(base_model_name, output_hidden_states=True)
model = PeftModel.from_pretrained(base_model, adapter_path)

  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a mod

In [2]:
# Função para codificar as entradas usando o modelo LoRA
def encoder(inputs):
    embeddings = []
    for input_text in inputs:
        # Tokenizar a entrada
        tokens = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=128)

        # Certifique-se de que os tensores estejam na mesma device do modelo
        tokens = {key: val for key, val in tokens.items()}

        # Passar os tokens pelo modelo (incluindo os adaptadores LoRA)
        with torch.no_grad():
            outputs = model(**tokens)  # Chamada ao modelo
            # Extração do embedding da última camada oculta
            embedding = outputs.hidden_states[-1].mean(dim=1).squeeze().cpu().numpy()
            embeddings.append(embedding)

    return embeddings

# Gerar embeddings para uma lista de descrições de CVE
cve_descriptions = ["Teste descrição"]
embeddings_train = encoder(cve_descriptions)

# Verificar o comprimento do embedding gerado
print(embeddings_train[0])



[-1.59184411e-01  1.04749784e-01  4.89766821e-02  1.19697057e-01
  4.64422703e-01 -3.61814559e-01  2.74628580e-01  3.36516291e-01
  1.00488737e-01 -5.50551116e-01  2.26543695e-01  1.48407355e-01
  5.83740994e-02  2.97494531e-01 -3.78306448e-01  4.50280786e-01
  8.03930163e-02  2.43607253e-01  1.84037805e-01 -2.21060850e-02
  5.97796738e-02 -2.07727894e-01 -1.73781350e-01  2.87904680e-01
 -4.92344409e-01 -4.28109355e-02  3.28049771e-02 -1.19974285e-01
 -5.02179027e-01  2.61972636e-01 -1.28775567e-01  1.28682896e-01
 -2.71517754e-01 -2.17973709e-01 -4.35077846e-01 -3.79266925e-02
  2.73012370e-02 -8.87770876e-02 -1.45556986e-01  6.98257014e-02
 -2.31916815e-01 -2.23698884e-01 -4.42977622e-02  1.62013501e-01
  8.07240754e-02  3.35303918e-02 -3.23557884e-01  4.20951471e-03
  3.30137610e-01  2.22963184e-01 -3.84423167e-01  3.87669891e-01
  4.24453840e-02 -2.50005890e-02  3.79839718e-01  1.48208797e-01
 -2.00261235e-01 -4.12309647e-01  9.54100676e-03 -1.80492356e-01
  3.26197237e-01  2.54425