In [1]:
import json
import os
from typing import Any, Dict, List, Tuple
import pandas as pd
import torch
from tqdm import tqdm
from pathlib import Path

from datasets import Dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

from product_search.encoder_finetuning import build_training_dataset, get_trainer

project_dir = Path(os.getcwd()).parent
data_dir = project_dir / "Data" / "RAW"
processed_dir = project_dir / "Data" / "PROCESSED"

output_dir = project_dir / 'src' / 'product_search' / 'finetuned_encoder' / 'checkpoints'

if not torch.cuda.is_available():
    raise RuntimeError("CUDA not available. This script is intended to run on an NVIDIA GPU machine.")
device = "cuda"
print("torch:", torch.__version__)
print("cuda:", torch.cuda.is_available(), torch.cuda.get_device_name(0))

  from .autonotebook import tqdm as notebook_tqdm


torch: 2.9.1+cu128
cuda: True NVIDIA GeForce RTX 5070


In [2]:
# load datasets
with open(processed_dir / "product_store.json", "r", encoding="utf-8") as f:
    product_store = json.load(f)

with open(processed_dir / "train_qrels.json", "r", encoding="utf-8") as f:
    train_qrels_dict = json.load(f)

train_query_df = pd.read_parquet(
    processed_dir / "train_query_table.parquet"
)

In [3]:
# Model Information

BASE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
GUIDE_MODEL = "BAAI/bge-m3"

# Training Parameters

epochs = 3
learning_rate = 2e-5
warmup_ratio = 0.05
weight_decay = 0.01
mini_batch_size = 64
margin_strategy = "relative"  # options: "absolute", "relative"
margin = 0.05
train_batch_size = 128
max_pairs = 0

In [4]:
train_dataset = build_training_dataset(
    train_qrels_dict, 
    train_query_df, 
    product_store,
    max_pairs
)

Building training pairs: 100%|████████████████████████████████████| 21223/21223 [00:00<00:00, 460345.94it/s]

[INFO] training_pairs=21223
[INFO] skipped: no_query=0, no_pos=0, missing_product=0, empty_text=0





In [5]:
trainer = get_trainer(
    train_dataset=train_dataset,
    base_model_name=BASE_MODEL,
    guide_model_name=GUIDE_MODEL,
    device=device,
    mini_batch_size=mini_batch_size,
    margin_strategy=margin_strategy,
    margin=margin,
    output_dir=str(output_dir),
    epochs=epochs,
    lr=learning_rate,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    train_batch_size=train_batch_size,
    )

                                                                                                            

In [6]:
trainer.train()

Step,Training Loss
25,2.3474
50,1.6955
75,1.7741
100,1.5868
125,1.5064
150,1.3588
175,1.2914
200,1.0891
225,1.0765
250,1.12


TrainOutput(global_step=498, training_loss=1.1818392726790954, metrics={'train_runtime': 534.4584, 'train_samples_per_second': 119.128, 'train_steps_per_second': 0.932, 'total_flos': 0.0, 'train_loss': 1.1818392726790954, 'epoch': 3.0})

In [10]:
trainer.save_model(output_dir.parent / 'finetuned_model')

In [11]:
finetuned = SentenceTransformer(str(output_dir.parent / 'finetuned_model'), device=device)
test_text = "red puma socks"
vec = finetuned.encode(test_text, normalize_embeddings=True)

In [13]:
str(output_dir.parent / 'finetuned_model')

'/home/harini/rahul_projects/Product-Search-Engine/src/product_search/finetuned_encoder/finetuned_model'