In [1]:
!pip install -U sentence_transformers
!pip install xformers
!pip install bitsandbytes
!pip install peft
!pip install huggingface_hub
!pip install datasets

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1
Collecting xformers
  Downloading xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting torch==2.5.1 (from xformers)
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->xformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->xformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collectin

In [26]:
# This cell sits above the main import cell, so torch is imported here as well;
# without it a fresh "Restart Kernel -> Run All" fails with NameError on `torch`.
import torch

# Release cached, currently unused GPU memory held by PyTorch.
torch.cuda.empty_cache()

In [4]:
import os
import json
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoModel, AutoTokenizer
from safetensors.torch import load_file
from peft import PeftModel
from sklearn.preprocessing import normalize
from datasets import load_dataset, concatenate_datasets, Dataset
from typing import Optional
from sentence_transformers.evaluation import TripletEvaluator, InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim, mine_hard_negatives
import json
import time
#from langchain_community.document_loaders import PyPDFLoader
#from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
def load_finetune_dataset(data_file: str, data_config_type: str, train_test_split: Optional[float]=0.8, seed: Optional[int]=None):
    """
    Load the dataset for finetuning embedding models

    The JSON file is read as a single split, its columns are renamed to the
    anchor/positive(/negative) convention, an integer "id" column is added, and the
    rows are split into train / validation / test. The held-out part
    (1 - train_test_split) is divided evenly between validation and test.

    Args:
        data_file (str): Path to the dataset file.
        data_config_type (str): Data format type (e.g., "triplets", "pair").
            Only "triplets" triggers the rename of 'negative_answer' to 'negative'.
        train_test_split(Optional[float]): Ratio of training set, strictly between 0 and 1.
            By default, 0.8. None falls back to the default.
        seed(Optional[int]): Seed passed to both shuffled splits so the partition can be
            reproduced. By default, None (a different split on every call).

    Return:
        dataset(dict): dataset with train/validation/test split

    Raises:
        ValueError: If train_test_split is not strictly between 0 and 1.
    """
    if train_test_split is None:
        train_test_split = 0.8
    # Fail early with a clear message instead of a less direct error from the split call.
    if not 0.0 < train_test_split < 1.0:
        raise ValueError(
            f"train_test_split must be strictly between 0 and 1, got {train_test_split!r}"
        )

    ds = load_dataset("json", data_files=data_file, split="train")
    # Rename columns
    ds = ds.rename_columns({'user_query': 'anchor', 'positive_answer':'positive'})
    if data_config_type == "triplets":
        ds = ds.rename_column('negative_answer', 'negative')
    # Add an id column to the dataset (materialised as a list, the documented column type)
    ds = ds.add_column("id", list(range(len(ds))))
    train_val_split = ds.train_test_split(test_size=1-train_test_split, shuffle=True, seed=seed)
    # Second split halves the held-out rows into validation and test.
    val_test_split = train_val_split["test"].train_test_split(test_size=0.5, shuffle=True, seed=seed)
    dataset = {
        'train': train_val_split['train'],
        'validation': val_test_split['train'],
        'test': val_test_split['test']
    }
    return dataset


def get_embedding(text: str, iTokenizer: AutoTokenizer, iModel: AutoModel, iVector: torch.nn.Linear):
    """
    Obtain the text embedding vectors from the hugging face transformers pipeline.

    The text is tokenized (truncated to 512 tokens), passed through the base model,
    mean-pooled over the non-padding tokens, projected by the linear layer and finally
    normalized row-wise with sklearn's `normalize`.

    Args:
        text: input text sequences (callers in this notebook pass a list of strings)
        iTokenizer: tokenizer
        iModel: base model
        iVector: vector linear layer

    Return:
        query_vectors: embedding vectors as a numpy array, one row per input text
    """
    # Run each stage on the device its weights already live on instead of assuming CPU,
    # so the function also works when the model and/or the linear head sit on a GPU.
    first_param = next(iModel.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    head_device = iVector.weight.device

    with torch.no_grad():
        input_data = iTokenizer(text, padding="longest", truncation=True, max_length=512, return_tensors="pt")
        input_data = {k: v.to(model_device) for k, v in input_data.items()}
        attention_mask = input_data["attention_mask"]
        last_hidden_state = iModel(**input_data)[0]
        # Zero out padding positions so they do not contribute to the mean.
        last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
        # Mean pooling: sum of token vectors divided by the number of real tokens.
        query_vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
        query_vectors = normalize(iVector(query_vectors.to(head_device)).cpu().numpy())
        return query_vectors

def triplet_evaluator(test_dataset, model):
    """
    Run a TripletEvaluator over a triplet dataset and print its primary metric.

    Args:
        test_dataset: dataset exposing "anchor", "positive" and "negative" columns
        model: model handed to the evaluator

    Return:
        results: the metrics returned by the evaluator call
    """
    evaluator_kwargs = dict(
        anchors=test_dataset["anchor"],
        positives=test_dataset["positive"],
        negatives=test_dataset["negative"],
        name="triplet_evaluation_test",
        similarity_fn_names=['cosine', 'euclidean'],
    )
    evaluator = TripletEvaluator(**evaluator_kwargs)
    scores = evaluator(model)
    headline_metric = evaluator.primary_metric
    print(f"{headline_metric}: {scores[headline_metric]}")
    return scores

def information_retrieval_evaluator(test_dataset, corpus_dataset, model):
    """
    Run an InformationRetrievalEvaluator and print its primary metric.

    Args:
        test_dataset: dataset exposing "id" and "anchor" columns (the queries)
        corpus_dataset: dataset exposing "id" and "positive" columns (the corpus)
        model: model handed to the evaluator

    Return:
        results: the metrics returned by the evaluator call
    """
    id_to_passage = dict(zip(corpus_dataset["id"], corpus_dataset["positive"]))
    id_to_query = dict(zip(test_dataset["id"], test_dataset["anchor"]))
    # The only relevant document for a query is the corpus entry that shares its id.
    relevance = {query_id: [query_id] for query_id in id_to_query}
    evaluator = InformationRetrievalEvaluator(
        queries=id_to_query,
        corpus=id_to_passage,
        relevant_docs=relevance,
        name="eval_finetune_embed",
        score_functions={"cosine": cos_sim},
    )
    scores = evaluator(model)
    headline_metric = evaluator.primary_metric
    print(f"{headline_metric}: {scores[headline_metric]}")
    return scores

## Pretrained model
We choose the `stella_en_400M_v5` model from the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).

In [27]:
# Toy evaluation set shared by the comparison cells below: queries[i] is paired with docs[i].
# Pairs 0-2 are used as the "positive answer" cases and pairs 3-5 as the "negative answer" cases.

# Prompt name for stella-style query encoding; only referenced by the commented-out encode calls.
query_prompt_name = "s2p_query"
queries = [
    "What material is the rear wing of the 718 Cayman GT4 RS made of?",
    "What is the unique feature of the Cayenne Turbo GT?",
    "What customization options are available for the Taycan Cross Turismo?",
    "What is the combined CO₂ emissions for the 718 Cayman GTS 4.0?",
    "What is the GTS model?",
    "What makes the Cayenne iconic in its category?"
]

docs = [
    "Carbon fiber reinforced plastic (CFRP).",
    "It offers 471 kW (640 PS) and is optimized for high performance.",
    "Exclusive paint finishes, interior trims, and wheel designs.",
    "The 718 Cayman GTS 4.0 produces zero CO₂ emissions because it is fully electric.",
    "GTS model is a great coaching system for MCQ practicing, which G stands for Guardian, T for Teacher and S for Student.",
    "The Cayenne is iconic for being the capital of French Guiana, known for its colonial architecture and a thriving pepper trade."
]

# Note for stella_en_400M_v5: the default dimension is 1024. For another dimension, clone the model and edit `modules.json` to replace `2_Dense_1024` with e.g. `2_Dense_256` or `2_Dense_8192`.
# on gpu
# model = SentenceTransformer("dunzhang/stella_en_400M_v5", trust_remote_code=True).cuda()
# The model can also be used without `use_memory_efficient_attention` and `unpad_inputs`, which lets it run on CPU.
#model = SentenceTransformer(
#    "dunzhang/stella_en_400M_v5",
#    trust_remote_code=True,
#    device="cpu",
#    config_kwargs={"use_memory_efficient_attention": False, "unpad_inputs": False}
#)
#model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=512)
#model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
#model = SentenceTransformer('BAAI/bge-large-en-v1.5', trust_remote_code=True)
# Active choice: the pretrained UAE-Large-V1 checkpoint (the alternatives above are commented out).
model = SentenceTransformer('WhereIsAI/UAE-Large-V1')
# The timer starts after the model is constructed, so only the two encode calls are measured.
start_time = time.time()
#query_embeddings = model.encode(queries, prompt_name=query_prompt_name)
#query_embeddings = model.encode(queries, prompt_name="query")
query_embeddings = model.encode(queries)
doc_embeddings = model.encode(docs)
print(f"Inference time: {time.time() - start_time} sec.")
print(query_embeddings.shape, doc_embeddings.shape)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.2k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inference time: 1.7747178077697754 sec.
(6, 1024) (6, 1024)


## Fine-tuned model

In [None]:
# Extract the model archive into the current working directory.
# NOTE(review): the archive lives under /content, while later cells load models from
# ../input/models/models/ - confirm both refer to the same files.
!unzip /content/models.zip -d .

In [8]:
# Load the LoRA fine-tuned stella_en_400M_v5 checkpoint with plain transformers + peft,
# rebuild its dense projection layer by hand, and embed the toy queries/docs on CPU.
fine_tuned_model_path = "../input/models/models/stella_en_400M_v5/finetune_triplets_2025-01-02_18-06-49"
# Derived from the checkpoint directory so the two paths cannot drift apart.
dense_path = os.path.join(fine_tuned_model_path, "2_Dense", "model.safetensors")
base_model = AutoModel.from_pretrained("dunzhang/stella_en_400M_v5",
                            trust_remote_code=True,
                            device_map='cpu',
                            use_memory_efficient_attention=False,
                            unpad_inputs=False)

# Attach the fine-tuned adapter weights on top of the pretrained base model.
lora_model = PeftModel.from_pretrained(base_model, fine_tuned_model_path)

# device_map and the attention/unpadding flags are model options, not tokenizer
# settings, so they are not forwarded to the tokenizer loader.
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path, trust_remote_code=True)

# Projection from the transformer's hidden size to the 1024-dimensional embedding.
vector_linear = torch.nn.Linear(in_features=lora_model.config.hidden_size, out_features=1024)
# The saved tensors are prefixed with "linear."; strip it to match nn.Linear's own keys.
vector_linear_dict = {
    k.replace("linear.", ""): v for k, v in
    load_file(dense_path).items()
}
vector_linear.load_state_dict(vector_linear_dict)
vector_linear.to("cpu")
# The timer starts after loading, so only the two embedding calls are measured.
start_time = time.time()
fine_tuned_query_embeddings = get_embedding(queries, tokenizer, lora_model, vector_linear)
fine_tuned_doc_embeddings = get_embedding(docs, tokenizer, lora_model, vector_linear)
# Alternative kept for reference (not executed):
# fine_tuned_model = SentenceTransformer(
#     fine_tuned_model_path,
#     device="cuda" if torch.cuda.is_available() else "cpu",
#     trust_remote_code=True,
# )
# fine_tuned_query_embeddings = fine_tuned_model.encode(queries, prompt_name=query_prompt_name)
# fine_tuned_doc_embeddings = fine_tuned_model.encode(docs)
print(f"Inference time: {time.time() - start_time} sec.")
print(fine_tuned_query_embeddings.shape, fine_tuned_doc_embeddings.shape)

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Inference time: 11.011378526687622 sec.
(6, 1024) (6, 1024)


In [28]:
# Fine-tuned UAE-Large-V1, kept under its own names. The "Pretrained model" comparison
# cells further down read `model`, `query_embeddings` and `doc_embeddings`; reusing
# those names here would make a top-to-bottom run report fine-tuned scores as the
# pretrained baseline.
finetuned_uae_model = SentenceTransformer("../input/models/models/UAE-Large-V1/finetune_triplets_2025-01-12_15-52-10")
# The timer starts after the model is constructed, so only the two encode calls are measured.
start_time = time.time()
finetuned_uae_query_embeddings = finetuned_uae_model.encode(queries)
finetuned_uae_doc_embeddings = finetuned_uae_model.encode(docs)
print(f"Inference time: {time.time() - start_time} sec.")
print(finetuned_uae_query_embeddings.shape, finetuned_uae_doc_embeddings.shape)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inference time: 5.085451602935791 sec.
(6, 1024) (6, 1024)


## Comparison between fine-tuned model and pre-trained model
Issues with the pretrained model:
1. Positive example sentence pairs receive a low similarity score.
2. Negative example sentence pairs receive a high similarity score.

After fine-tuning, the model improves on both kinds of cases.

### Case 1: Positive answer but low similarity score
#### Pretrained model

In [10]:
# Positive cases (pairs 0-2): score each query against its own answer with the
# pretrained model's similarity function.
for i in range(3):
    # The result is indexed as a 2-D tensor, hence [0][0] to read the single score.
    similarities = model.similarity(query_embeddings[i], doc_embeddings[i])
    print(f"User Query: {queries[i]}")
    print(f"Answer: {docs[i]}")
    print(f"Similarity score: {similarities.data.cpu().numpy()[0][0]} \n\n")

Uesr Query: What material is the rear wing of the 718 Cayman GT4 RS made of?
Answer: Carbon fiber reinforced plastic (CFRP).
Similarity score: 0.5579702258110046 


Uesr Query: What is the unique feature of the Cayenne Turbo GT?
Answer: It offers 471 kW (640 PS) and is optimized for high performance.
Similarity score: 0.41068923473358154 


Uesr Query: What customization options are available for the Taycan Cross Turismo?
Answer: Exclusive paint finishes, interior trims, and wheel designs.
Similarity score: 0.5345100164413452 




#### Fine-tuned model

In [17]:
# Query-by-document score matrix for the fine-tuned model: entry [i, j] is the dot
# product of query i and doc j. get_embedding passes its output through sklearn's
# `normalize`, so (assuming the default L2 norm) these dot products are cosine similarities.
fine_tuned_similarities = fine_tuned_query_embeddings @ fine_tuned_doc_embeddings.T

In [18]:
# Positive cases (pairs 0-2) scored by the fine-tuned model.
for i in range(3):
    print(f"User Query: {queries[i]}")
    print(f"Answer: {docs[i]}")
    # The diagonal entry [i, i] is the score of query i against its own answer.
    print(f"Similarity score: {fine_tuned_similarities[i, i]} \n\n")

Uesr Query: What material is the rear wing of the 718 Cayman GT4 RS made of?
Answer: Carbon fiber reinforced plastic (CFRP).
Similarity score: 0.8132065534591675 


Uesr Query: What is the unique feature of the Cayenne Turbo GT?
Answer: It offers 471 kW (640 PS) and is optimized for high performance.
Similarity score: 0.908795177936554 


Uesr Query: What customization options are available for the Taycan Cross Turismo?
Answer: Exclusive paint finishes, interior trims, and wheel designs.
Similarity score: 0.7588391900062561 




### Case 2: Negative answer but high similarity score
#### Pretrained model

In [19]:
# Negative cases (pairs 3-5): score each query against its deliberately wrong answer
# with the pretrained model's similarity function.
for i in range(3, 6):
    # The result is indexed as a 2-D tensor, hence [0][0] to read the single score.
    similarities = model.similarity(query_embeddings[i], doc_embeddings[i])
    print(f"User Query: {queries[i]}")
    print(f"Answer: {docs[i]}")
    print(f"Similarity score: {similarities.data.cpu().numpy()[0][0]} \n\n")

Uesr Query: What is the combined CO₂ emissions for the 718 Cayman GTS 4.0?
Answer: The 718 Cayman GTS 4.0 produces zero CO₂ emissions because it is fully electric.
Similarity score: 0.8328068256378174 


Uesr Query: What is the GTS model?
Answer: GTS model is a great coaching system for MCQ practicing, which G stands for Guardian, T for Teacher and S for Student.
Similarity score: 0.7170218229293823 


Uesr Query: What makes the Cayenne iconic in its category?
Answer: The Cayenne is iconic for being the capital of French Guiana, known for its colonial architecture and a thriving pepper trade.
Similarity score: 0.6261855363845825 




#### Fine-tuned model

In [20]:
# Negative cases (pairs 3-5) scored by the fine-tuned model.
for i in range(3, 6):
    print(f"User Query: {queries[i]}")
    print(f"Answer: {docs[i]}")
    # The diagonal entry [i, i] is the score of query i against its paired answer.
    print(f"Similarity score: {fine_tuned_similarities[i, i]} \n\n")

Uesr Query: What is the combined CO₂ emissions for the 718 Cayman GTS 4.0?
Answer: The 718 Cayman GTS 4.0 produces zero CO₂ emissions because it is fully electric.
Similarity score: 0.15441642701625824 


Uesr Query: What is the GTS model?
Answer: GTS model is a great coaching system for MCQ practicing, which G stands for Guardian, T for Teacher and S for Student.
Similarity score: 0.35840776562690735 


Uesr Query: What makes the Cayenne iconic in its category?
Answer: The Cayenne is iconic for being the capital of French Guiana, known for its colonial architecture and a thriving pepper trade.
Similarity score: 0.18086686730384827 




## Comparison with different embedding models
The embedding models are chosen from the [SBERT library](https://sbert.net/) and two embedding-model benchmarks: [Huggingface MTEB](https://huggingface.co/spaces/mteb/leaderboard) and [Crossing Minds ICLERB](https://www.crossingminds.com/company-resources/iclerb).

The chosen models are listed as follows:
- [dunzhang/stella_en_400M_v5](https://huggingface.co/dunzhang/stella_en_400M_v5)
- [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
- [all-mpnet-base-v2](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html)
- [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)
- [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5)
- [mixedbread-ai/mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)
- [Alibaba-NLP/gte-large-en-v1.5](https://huggingface.co/Alibaba-NLP/gte-large-en-v1.5)
- [WhereIsAI/UAE-Large-V1](https://huggingface.co/WhereIsAI/UAE-Large-V1)

### Data type: triplets {query, positive, negative}


In [8]:
# Sanity check: list the entries available under ../input before loading anything from it.
print(os.listdir("../input"))

['models', 'qa_pairs_pos_only.json', 'qa_pairs_pos_and_neg.json']


In [6]:
# Accumulator for evaluation results, filled below as results[<model>][<data type>]
# and written to a JSON file in the last cell. Re-running this cell discards what was collected.
results = dict()

In [5]:
# Prepare test dataset based on the data configuration format
# "triplets" makes load_finetune_dataset also rename 'negative_answer' to 'negative'.
data_config_type = "triplets"
data_file = "../input/qa_pairs_pos_and_neg.json"
# Returns a dict with 'train' / 'validation' / 'test' splits (default train ratio 0.8).
# NOTE(review): the splits are shuffled without a fixed seed, so the test set differs between runs.
dataset = load_finetune_dataset(data_file, data_config_type)
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
test_dataset = dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Baseline ("base") model for the evaluations below.
# The device is chosen at runtime, as in the fine-tuned cells: an unconditional
# .cuda() raises on a CPU-only machine.
# NOTE(review): this loads nomic-embed-text-v1.5, yet its scores are later stored under
# results["UAE"] as "base_model" next to a fine-tuned UAE-Large-V1 - confirm the intended baseline.
base = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    trust_remote_code=True,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
base_results = triplet_evaluator(test_dataset, base)

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/95.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

triplet_evaluation_test_max_accuracy: 0.7289156626506024


In [36]:
# Evaluate the UAE-Large-V1 checkpoint fine-tuned on triplets against the same test split.
model_path = "../input/models/models/UAE-Large-V1/finetune_triplets_2025-01-12_15-52-10"
# Use the GPU when one is available, otherwise fall back to CPU.
finetune = SentenceTransformer(model_path, device="cuda" if torch.cuda.is_available() else "cpu", trust_remote_code=True)
finetune_results = triplet_evaluator(test_dataset, finetune)

triplet_evaluation_test_max_accuracy: 0.9939759036144579


In [37]:
# Create the per-model entry; re-running this cell resets everything stored under "UAE".
results["UAE"] = {}

In [38]:
# Placeholder for the triplet results; the next cell replaces it with the actual scores.
results["UAE"]["triplets"] = {}

In [39]:
# Store the triplet-evaluation metrics of the baseline and the fine-tuned model.
# NOTE(review): the key is spelled "fine_tine"; it ends up in the saved JSON, so it is left untouched here.
results["UAE"]["triplets"] = {"base_model": base_results, "fine_tine": finetune_results}

In [9]:
# Switch to the positive-only (query, answer) pairs for the retrieval evaluation.
# This overwrites dataset / train_dataset / eval_dataset / test_dataset from the triplet section.
data_config_type = "pair"
data_file = "../input/qa_pairs_pos_only.json"
dataset = load_finetune_dataset(data_file, data_config_type)
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
test_dataset = dataset["test"]
# The retrieval corpus is the union of all three splits, so test queries are searched
# against every answer in the file, not only the test answers.
corpus_dataset = concatenate_datasets([train_dataset, eval_dataset, test_dataset])

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
# Retrieval metrics for the baseline; `base` is the model loaded in the triplet section above.
base_results = information_retrieval_evaluator(test_dataset, corpus_dataset, base)

eval_finetune_embed_cosine_ndcg@10: 0.6721534613276363


In [42]:
# Evaluate the UAE-Large-V1 checkpoint fine-tuned on pairs with the retrieval evaluator.
model_path = "../input/models/models/UAE-Large-V1/finetune_pair_2025-01-12_16-05-51"
# Use the GPU when one is available, otherwise fall back to CPU.
finetune = SentenceTransformer(model_path, device="cuda" if torch.cuda.is_available() else "cpu", trust_remote_code=True)
finetune_results = information_retrieval_evaluator(test_dataset, corpus_dataset, finetune)

eval_finetune_embed_cosine_ndcg@10: 0.7118463341691333


In [43]:
# Placeholder for the pair results; the next cell replaces it with the actual scores.
results["UAE"]["pair"] = {}

In [44]:
# Store the retrieval metrics of the baseline and the fine-tuned model.
# NOTE(review): the key is spelled "fine_tine"; it ends up in the saved JSON, so it is left untouched here.
results["UAE"]["pair"] = {"base_model": base_results, "fine_tine": finetune_results}

In [46]:
# Persist the collected metrics as JSON.
# NOTE(review): only a "UAE" entry is added to `results` in this notebook; the bge-large /
# all-mpnet entries implied by the file name presumably came from earlier runs in the same
# kernel - confirm before relying on the file. The output path is an absolute Kaggle path.
with open('/kaggle/working/bge_large_and_allmpnet_and_UAE.json', 'w') as f:
    json.dump(results, f)