In [3]:
!pip install -U sentence_transformers
!pip install xformers
!pip install bitsandbytes
!pip install peft
!pip install huggingface_hub
!pip install datasets

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1
Collecting xformers
  Downloading xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting torch==2.5.1 (from xformers)
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->xformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->xformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecti

In [21]:
import os
from typing import Optional
from datetime import datetime
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss, MatryoshkaLoss, TripletLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import InformationRetrievalEvaluator, TripletEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim
from peft import LoraConfig, LoraRuntimeConfig, TaskType
from peft.optimizers import create_loraplus_optimizer
import bitsandbytes as bnb
from datasets import load_dataset, concatenate_datasets
from getpass import getpass
from kaggle_secrets import UserSecretsClient
import wandb

In [22]:
# Authenticate against the Hugging Face Hub and Weights & Biases.
# Both credentials are read from Kaggle's secret store, so no token is written
# in the notebook itself.
user_secrets = UserSecretsClient()
# Secret stored under the label "hf_token"; passed to huggingface_hub.login.
hf_token = user_secrets.get_secret("hf_token") 
login(token=hf_token)
# Secret stored under the label "wandb_api_key"; passed to wandb.login.
my_secret = user_secrets.get_secret("wandb_api_key") 
# Last expression of the cell, so the value returned by wandb.login is displayed.
wandb.login(key=my_secret)

[34m[1mwandb[0m: Currently logged in as: [33mgarychang0406[0m ([33mgarychang0406-rwth-aachen-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [34]:
# Hub repository id of the base embedding model.
MODEL_REPO = "dunzhang/stella_en_400M_v5"
# Model name without the owner prefix (the text after the last "/").
MODEL_NAME = MODEL_REPO.rsplit("/", 1)[-1]
# Layout of the training data: "pair" (query + positive answer) or
# "triplets" (query + positive answer + negative answer).
DATA_FORM = "pair"

In [7]:
def load_finetune_dataset(data_file: str, data_form: str, train_test_split: Optional[float]=0.8, seed: Optional[int]=None):
    """
    Load a JSON dataset and split it into train/validation/test sets for
    finetuning embedding models.

    The 'user_query' and 'positive_answer' columns are renamed to 'anchor' and
    'positive'. For triplet data the 'negative_answer' column is renamed to
    'negative'. An integer 'id' column (row index before shuffling) is added so
    that every row keeps a stable identifier across the splits.

    Args:
        data_file(str): dataset filepath (JSON)
        data_form(str): format of dataset; "triplets" additionally renames the
            negative column, any other value leaves the remaining columns as is
        train_test_split(Optional[float]): fraction of rows used for training,
            strictly between 0 and 1. By default (and when None), 0.8. The
            remaining rows are divided equally into validation and test.
        seed(Optional[int]): seed forwarded to both shuffled splits so they can
            be reproduced. None (default) passes no seed, which matches the
            previous behaviour.

    Return:
        dataset(dict): dataset with train/validation/test split

    Raises:
        ValueError: if train_test_split is not strictly between 0 and 1
    """
    if train_test_split is None:
        train_test_split = 0.8
    # Reject values such as 0 or 1 up front: `1 - 0` would be the integer 1, which
    # the splitter would read as an absolute number of rows instead of a fraction.
    if not 0.0 < train_test_split < 1.0:
        raise ValueError(
            f"train_test_split must be strictly between 0 and 1, got {train_test_split!r}"
        )

    ds = load_dataset("json", data_files=data_file, split="train")
    # Rename columns
    ds = ds.rename_columns({'user_query': 'anchor', 'positive_answer': 'positive'})
    if data_form == "triplets":
        ds = ds.rename_column('negative_answer', 'negative')
    # Add an id column to the dataset (materialised as a list, the documented column type)
    ds = ds.add_column("id", list(range(len(ds))))

    # First cut: train vs. held-out rows; second cut: held-out rows into validation/test halves.
    train_val_split = ds.train_test_split(test_size=1 - train_test_split, shuffle=True, seed=seed)
    val_test_split = train_val_split["test"].train_test_split(test_size=0.5, shuffle=True, seed=seed)
    dataset = {
        'train': train_val_split['train'],
        'validation': val_test_split['train'],
        'test': val_test_split['test']
    }
    return dataset

In [16]:
# Load the base embedding model and move it to the GPU.
# NOTE: trust_remote_code=True lets the loader use code shipped with the model
# repository, so only use it with repositories you trust.
model = SentenceTransformer(MODEL_REPO, trust_remote_code=True)
model = model.cuda()

# LoRA adapter configuration (DoRA variant) for feature extraction.
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    # Adapter is built for training, not inference.
    inference_mode=False,
    # Apply the adapter to every linear layer. An explicit alternative would be
    # target_modules=["qkv_proj", "o_proj", "up_gate_proj", "down_proj"].
    target_modules="all-linear",
    # Low-rank size, scaling factor and dropout of the adapter.
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    use_dora=True,
    runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True),
)

# Attach the adapter to the loaded model.
model.add_adapter(peft_config)

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Training configuration handed to the SentenceTransformerTrainer.
args = SentenceTransformerTrainingArguments(
    # --- Required ---
    output_dir=MODEL_REPO + "_finetune",            # output directory and Hugging Face model ID
    # --- Run tracking ---
    run_name="porsche_challenge_finetune_stella",   # used in W&B if `wandb` is installed
    logging_steps=100,                              # log every 100 steps
    # --- Optimisation schedule ---
    num_train_epochs=3,                             # number of epochs
    learning_rate=2e-5,                             # learning rate
    lr_scheduler_type="cosine",                     # cosine learning-rate scheduler
    warmup_ratio=0.1,                               # warmup ratio
    # --- Batching ---
    per_device_train_batch_size=16,                 # train batch size
    per_device_eval_batch_size=16,                  # evaluation batch size
    batch_sampler=BatchSamplers.NO_DUPLICATES,      # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # --- Evaluation and checkpoints ---
    eval_strategy="epoch",                          # evaluate after each epoch
    save_strategy="epoch",                          # save after each epoch
    save_total_limit=3,                             # keep only the last 3 checkpoints
    # Options that were tried and are currently disabled:
    #   gradient_accumulation_steps=16, optim="adamw_torch_fused",
    #   tf32=True, bf16=False, load_best_model_at_end=True
)

In [25]:
# Build a LoRA+ optimizer on top of 8-bit Adam from bitsandbytes.
# The base learning rate is read from the training arguments so it cannot drift
# from `args.learning_rate` (both were previously the same literal, 2e-5).
optimizer = create_loraplus_optimizer(
    model=model,
    optimizer_cls=bnb.optim.Adam8bit,
    lr=args.learning_rate,
    loraplus_lr_ratio=16,
)
# No explicit scheduler is supplied; None is what gets passed to the trainer in
# the (optimizer, scheduler) tuple.
# NOTE(review): presumably the trainer then builds its own scheduler from `args` - confirm.
scheduler = None

In [35]:
# 3. Prepare the dataset for finetuning
# Resolve the format-specific settings first, so an unsupported DATA_FORM fails
# before any data is loaded.
if DATA_FORM == "triplets":
    # On Kaggle use "/kaggle/input/porschechallenge/qa_pairs_pos_and_neg.json" instead.
    data_file = "../data/qa_pairs_pos_and_neg.json"
    text_columns = ['anchor', 'positive', 'negative']
elif DATA_FORM == "pair":
    # On Kaggle use "/kaggle/input/porschechallenge/qa_pairs_pos_only.json" instead.
    data_file = "../data/qa_pairs_pos_only.json"
    text_columns = ['anchor', 'positive']
else:
    raise ValueError("Only positive-pair or triplet dataset provided.")

dataset = load_finetune_dataset(data_file, DATA_FORM)
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
test_dataset = dataset["test"]
print(f"train size: {len(train_dataset)}, val size: {len(eval_dataset)}, test size: {len(test_dataset)}")

# 4. Build the evaluators and the loss for the chosen data format
if DATA_FORM == "triplets":
    # Triplet evaluation uses anchors, positives and negatives.
    dev_evaluator = TripletEvaluator(
        anchors=eval_dataset["anchor"],
        positives=eval_dataset["positive"],
        negatives=eval_dataset["negative"],
        name="eval_finetune_embed",
    )
    test_evaluator = TripletEvaluator(
        anchors=test_dataset["anchor"],
        positives=test_dataset["positive"],
        negatives=test_dataset["negative"],
        name="test_finetune_embed",
    )
    loss = TripletLoss(model)
else:
    # Retrieval evaluation: each query must retrieve its own positive answer
    # out of the answers of all splits.
    # Our corpus (cid => document)
    corpus_dataset = concatenate_datasets([train_dataset, eval_dataset, test_dataset])
    corpus = dict(zip(corpus_dataset["id"], corpus_dataset["positive"]))
    # Our queries (qid => question)
    eval_queries = dict(zip(eval_dataset["id"], eval_dataset["anchor"]))
    test_queries = dict(zip(test_dataset["id"], test_dataset["anchor"]))
    # Query ID to relevant documents (qid => set([relevant_cids])); a query and
    # its positive answer share the same id.
    eval_relevant_docs = {q_id: {q_id} for q_id in eval_queries}
    test_relevant_docs = {q_id: {q_id} for q_id in test_queries}

    dev_evaluator = InformationRetrievalEvaluator(
        queries=eval_queries,
        corpus=corpus,
        relevant_docs=eval_relevant_docs,
        name="eval_finetune_embed",
        score_functions={"cosine": cos_sim},
    )
    # Named "test_..." so the reported metrics are not mistaken for validation metrics.
    test_evaluator = InformationRetrievalEvaluator(
        queries=test_queries,
        corpus=corpus,
        relevant_docs=test_relevant_docs,
        name="test_finetune_embed",
        score_functions={"cosine": cos_sim},
    )
    loss = MultipleNegativesRankingLoss(model)

# Baseline: score the model on the validation split before any training step.
baseline_results = dev_evaluator(model)
print(f"baseline {dev_evaluator.primary_metric}: {baseline_results[dev_evaluator.primary_metric]}")

# 5. Create a trainer & train (finetunes the model with the LoRA adapter)
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset.select_columns(text_columns),
    eval_dataset=eval_dataset.select_columns(text_columns),
    loss=loss,
    optimizers=(optimizer, scheduler),
    evaluator=dev_evaluator,
)
trainer.train()

# 6. Evaluate the trained model on the test set
results = test_evaluator(model)
print(f"{test_evaluator.primary_metric}: {results[test_evaluator.primary_metric]}")

Generating train split: 0 examples [00:00, ? examples/s]

train size: 1437, val size: 180, test size: 180


Epoch,Training Loss,Validation Loss,Finetune Embed Cosine Accuracy@1,Finetune Embed Cosine Accuracy@3,Finetune Embed Cosine Accuracy@5,Finetune Embed Cosine Accuracy@10,Finetune Embed Cosine Precision@1,Finetune Embed Cosine Precision@3,Finetune Embed Cosine Precision@5,Finetune Embed Cosine Precision@10,Finetune Embed Cosine Recall@1,Finetune Embed Cosine Recall@3,Finetune Embed Cosine Recall@5,Finetune Embed Cosine Recall@10,Finetune Embed Cosine Ndcg@10,Finetune Embed Cosine Mrr@10,Finetune Embed Cosine Map@100
1,No log,0.099829,0.583333,0.783333,0.844444,0.9,0.583333,0.261111,0.168889,0.09,0.583333,0.783333,0.844444,0.9,0.742326,0.691523,0.694128
2,0.298200,0.082056,0.594444,0.783333,0.855556,0.905556,0.594444,0.261111,0.171111,0.090556,0.594444,0.783333,0.855556,0.905556,0.751495,0.701664,0.704328
3,0.078500,0.072989,0.616667,0.788889,0.844444,0.916667,0.616667,0.262963,0.168889,0.091667,0.616667,0.788889,0.844444,0.916667,0.763244,0.714297,0.716198




eval_finetune_embed_cosine_ndcg@10: 0.7222834337280267


In [37]:
# Save the finetuned model into a timestamped folder under models/<MODEL_NAME>/.
save_model_dir = f"models/{MODEL_NAME}"
# exist_ok=True creates the folder when missing and is a no-op when it is already
# there, without a separate existence check.
os.makedirs(save_model_dir, exist_ok=True)
# Timestamp (already a string) keeps every run in its own folder.
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model.save_pretrained(os.path.join(save_model_dir, f"finetune_{DATA_FORM}_{current_datetime}"))

In [38]:
!zip -r /kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08.zip /kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08

  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/ (stored 0%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/tokenizer.json (deflated 71%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/sentence_bert_config.json (deflated 4%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/2_Dense/ (stored 0%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/2_Dense/model.safetensors

  pid, fd = os.forkpty()


 (deflated 7%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/2_Dense/config.json (deflated 22%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/1_Pooling/ (stored 0%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/1_Pooling/config.json (deflated 57%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/vocab.txt (deflated 53%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/adapter_config.json (deflated 53%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/special_tokens_map.json (deflated 80%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/modules.json (deflated 62%)
  adding: kaggle/working/models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/models/stella_en_400M_v5/f