In [1]:
from datasets import Dataset, concatenate_datasets
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim

In [2]:
df = pd.read_csv("../Data/Train_data/BGE-finetune-data.csv")

In [3]:
df = df.drop_duplicates()
df.reset_index(drop=True,inplace=True)
dataset = Dataset.from_pandas(df)

In [4]:
qstns = df.question
rel_ids = []

In [5]:
for qstn in qstns:
    idxs = df[df['question'] == qstn].index.tolist()
    rel_ids.append(idxs)

In [6]:
# rename columns
dataset = dataset.rename_column("question", "anchor")
dataset = dataset.rename_column("oracle_context", "positive")

# Add an id column to the dataset
dataset = dataset.add_column("id", range(len(dataset)))

# split dataset into a 10% test set
dataset = dataset.train_test_split(test_size=0.01,shuffle=True)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['anchor', 'positive', 'id'],
        num_rows: 12675
    })
    test: Dataset({
        features: ['anchor', 'positive', 'id'],
        num_rows: 129
    })
})

In [8]:
model_id = "BAAI/bge-large-en-v1.5"
matryoshka_dimensions = [1024,768] # Important: large to small

# Load a model
model = SentenceTransformer(
    model_id, device="cuda" if torch.cuda.is_available() else "cpu",
    trust_remote_code = True
)



In [9]:
train_dataset = dataset['train']
test_dataset = dataset['test']
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

In [10]:
# Convert the datasets to dictionaries
corpus = dict(
    zip(corpus_dataset["id"], corpus_dataset["positive"])
)  # Our corpus (cid => document)
queries = dict(
    zip(test_dataset["id"], test_dataset["anchor"])
)

In [11]:
relevant_docs = {}  # Query ID to relevant documents (qid => set([relevant_cids])
for q_id in queries:
    relevant_docs[q_id] = rel_ids[q_id]

In [12]:
matryoshka_evaluators = []
for dim in matryoshka_dimensions:
    ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,  # Truncate the embeddings to a certain dimension
        score_functions={"cosine": cos_sim},
    )
    matryoshka_evaluators.append(ir_evaluator)

In [13]:
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [14]:
# # Evaluate the model
# results = evaluator(model)

# # # COMMENT IN for full results
# # print(results)

# # Print the main score
# for dim in matryoshka_dimensions:
#     key = f"dim_{dim}_cosine_ndcg@10"
#     print
#     print(f"{key}: {results[key]}")

In [15]:
from sentence_transformers import SentenceTransformerModelCardData, SentenceTransformer


# load model with SDPA for using Flash Attention 2
model = SentenceTransformer(
    model_id,
    model_kwargs={"attn_implementation": "sdpa"},
    trust_remote_code = True,
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name="BGE large OS Matryoshka",
    ),
)

In [16]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [17]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers


# define training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="bge-large-matryoshka", # output directory and hugging face model ID
    num_train_epochs=4,                         # number of epochs
    per_device_train_batch_size=32,             # train batch size
    gradient_accumulation_steps=2,             # for a global batch size of 512
    per_device_eval_batch_size=16,              # evaluation batch size
    warmup_ratio=0.1,                           # warmup ratio
    learning_rate=2e-5,                         # learning rate, 2e-5 is a good value
    lr_scheduler_type="cosine",                 # use constant learning rate scheduler
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    # tf32=True,                                  # use tf32 precision
    bf16=True,                                  # use bf16 precision
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="epoch",                      # evaluate after each epoch
    save_strategy="epoch",                      # save after each epoch
    logging_steps=10,                           # log every 10 steps
    save_total_limit=2,                         # save only the last 3 models
    load_best_model_at_end=True,                # load the best model when training ends
    metric_for_best_model="eval_dim_768_cosine_ndcg@10",  # Optimizing for the best ndcg@10 score for the 728 dimension
)

In [18]:
from sentence_transformers import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset.select_columns(
        ["positive", "anchor"]
    ),  # training dataset
    loss=train_loss,
    evaluator=evaluator,
)

In [19]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33m2021-ritesh-bhalerao[0m ([33mves_ritesh[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Dim 1024 Cosine Accuracy@1,Dim 1024 Cosine Accuracy@3,Dim 1024 Cosine Accuracy@5,Dim 1024 Cosine Accuracy@10,Dim 1024 Cosine Precision@1,Dim 1024 Cosine Precision@3,Dim 1024 Cosine Precision@5,Dim 1024 Cosine Precision@10,Dim 1024 Cosine Recall@1,Dim 1024 Cosine Recall@3,Dim 1024 Cosine Recall@5,Dim 1024 Cosine Recall@10,Dim 1024 Cosine Ndcg@10,Dim 1024 Cosine Mrr@10,Dim 1024 Cosine Map@100,Dim 768 Cosine Accuracy@1,Dim 768 Cosine Accuracy@3,Dim 768 Cosine Accuracy@5,Dim 768 Cosine Accuracy@10,Dim 768 Cosine Precision@1,Dim 768 Cosine Precision@3,Dim 768 Cosine Precision@5,Dim 768 Cosine Precision@10,Dim 768 Cosine Recall@1,Dim 768 Cosine Recall@3,Dim 768 Cosine Recall@5,Dim 768 Cosine Recall@10,Dim 768 Cosine Ndcg@10,Dim 768 Cosine Mrr@10,Dim 768 Cosine Map@100,Sequential Score
0,1.5054,No log,0.03876,0.162791,0.271318,0.565891,0.03876,0.054264,0.054264,0.06124,0.014212,0.045958,0.074732,0.188215,0.116407,0.157614,0.089997,0.03876,0.170543,0.271318,0.55814,0.03876,0.056848,0.054264,0.058915,0.011628,0.047896,0.073053,0.178968,0.111497,0.15662,0.088029,0.088029
2,0.3413,No log,0.054264,0.178295,0.333333,0.689922,0.054264,0.059432,0.066667,0.076744,0.012468,0.050286,0.087135,0.216925,0.13487,0.190455,0.116228,0.077519,0.193798,0.317829,0.713178,0.077519,0.064599,0.063566,0.07907,0.021512,0.055011,0.086499,0.224548,0.143533,0.206906,0.122096,0.122096
3,0.2434,No log,0.062016,0.186047,0.302326,0.651163,0.062016,0.062016,0.060465,0.074419,0.014747,0.052566,0.086563,0.215052,0.134862,0.189043,0.119444,0.077519,0.20155,0.310078,0.666667,0.077519,0.067183,0.062016,0.074419,0.017709,0.055915,0.08565,0.215172,0.137584,0.202261,0.120539,0.120539


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=792, training_loss=0.9354039154043703, metrics={'train_runtime': 11953.2109, 'train_samples_per_second': 4.242, 'train_steps_per_second': 0.066, 'total_flos': 0.0, 'train_loss': 0.9354039154043703, 'epoch': 3.9899244332493704})