In [33]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/output-with-teacherscore-minedhn/output_with_teacherscore_minedHN.jsonl


In [34]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import InformationRetrievalEvaluator
import torch

In [35]:
# Load the Jina v3 embedding checkpoint as a SentenceTransformer.
# trust_remote_code=True allows the checkpoint's own Python code to be loaded and run.
model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
# model.to("cuda")

# Dual encoder access: take the modules at positions 0 and 1 of `model`.
# NOTE(review): indexing a SentenceTransformer yields its pipeline stages in order;
# for this checkpoint those are presumably transformer -> pooling -> normalize rather
# than two separate query/passage towers — confirm with print(model) before relying
# on `passage_encoder` being an encoder.
query_encoder = model[0]
passage_encoder = model[1]

# Set task heads explicitly
# NOTE(review): `default_task` is assumed to be honoured only by the transformer
# module (model[0]); assigning it on model[1] may just create an unused attribute — verify.
query_encoder.default_task = "retrieval.query"
passage_encoder.default_task = "retrieval.passage"



In [36]:
import json
# Retrieval instruction stored on every record under the 'prompt' key.
prompt = "Represent this sentence for searching relevant passages: "
full_train_data = []

# Parse the JSONL dump one record per line, ignoring blank lines.
with open("/kaggle/input/output-with-teacherscore-minedhn/output_with_teacherscore_minedHN.jsonl", "r", encoding = "utf-8") as jsonl_file:
    for raw_line in jsonl_file:
        stripped = raw_line.strip()
        if not stripped:
            continue
        record = json.loads(stripped)
        record['prompt'] = prompt
        full_train_data.append(record)

In [37]:
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20% of the records, then halve the hold-out into
# evaluation and test sets. The same seed is used for both splits.
SPLIT_SEED = 42
all_data = full_train_data  # list of record dicts loaded from the JSONL file

train_data, temp_data = train_test_split(all_data, random_state=SPLIT_SEED, test_size=0.2)
eval_data, test_data = train_test_split(temp_data, random_state=SPLIT_SEED, test_size=0.5)

In [38]:
import torch.nn as nn

class CustomDualEncoder(nn.Module):
    """Route suffixed feature keys to a query encoder and a passage encoder.

    ``forward`` receives one dict. Keys ending in ``"0"`` go to the query
    encoder and keys ending in ``"1"`` go to the passage encoder, each with the
    trailing character removed. Any other key is ignored.

    NOTE(review): this assumes the caller supplies a single dict with
    ``...0`` / ``...1`` keys and accepts a list in return — confirm that the
    loss this module is handed to actually calls it that way.
    """

    def __init__(self, query_encoder, passage_encoder):
        super().__init__()
        self.query_encoder = query_encoder
        self.passage_encoder = passage_encoder

    @staticmethod
    def _features_for(features, suffix):
        """Return the entries whose key ends with ``suffix``, with the last character stripped."""
        return {key[:-1]: value for key, value in features.items() if key.endswith(suffix)}

    def forward(self, features):
        """Return ``[query_embedding, passage_embedding]`` for one feature dict."""
        query_inputs = self._features_for(features, "0")
        passage_inputs = self._features_for(features, "1")

        # Each encoder is expected to return a dict holding "sentence_embedding".
        query_output = self.query_encoder(query_inputs)
        query_embedding = query_output["sentence_embedding"]
        passage_output = self.passage_encoder(passage_inputs)
        passage_embedding = passage_output["sentence_embedding"]

        return [query_embedding, passage_embedding]


# Wrap the two modules taken from `model` in the CustomDualEncoder module.
dual_model = CustomDualEncoder(query_encoder, passage_encoder)


In [39]:
# Create training examples (positive query-passage pairs)
def prepare_multiple_negatives_data(data_dicts, prompt=""):
    """Expand records into one (query, positive passage) ``InputExample`` each.

    Args:
        data_dicts: iterable of records with a ``'query'`` string and a
            ``'pos'`` iterable of positive passages.
        prompt: text prepended to every query.

    Returns:
        A list of ``InputExample`` objects whose ``texts`` are
        ``[prompt + query, positive_passage]``, in input order. Records with
        no positives contribute nothing.
    """
    pairs = []
    for record in data_dicts:
        prefixed_query = prompt + record['query']
        pairs.extend(
            InputExample(texts=[prefixed_query, passage])
            for passage in record['pos']
        )
    return pairs

# Pair every training query (with the retrieval prompt prepended) with each of its positives.
train_query_prompt = "Represent this sentence for searching relevant passages: "
train_examples = prepare_multiple_negatives_data(train_data, prompt=train_query_prompt)

# Show the first pair as a sanity check.
print(train_examples[0])

# Shuffled mini-batches of 16 pairs.
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)

# MultipleNegativesRankingLoss built around the custom dual-encoder wrapper.
# NOTE(review): this loss is normally given a SentenceTransformer; confirm that it
# can drive `dual_model`, whose forward returns a list rather than a feature dict.
train_loss = losses.MultipleNegativesRankingLoss(model=dual_model)



<InputExample> label: 0, texts: Represent this sentence for searching relevant passages: Cơ quan nào có thẩm quyền cấp lại giấy chứng nhận đăng ký doanh nghiệp?; 1. Cơ quan đăng ký kinh doanh được tổ chức ở tỉnh, thành phố trực thuộc Trung ương (sau đây gọi chung là cấp tỉnh) và ở quận, huyện, thị xã, thành phố thuộc tỉnh (sau đây gọi chung là cấp huyện), bao gồm:a) Ở cấp tỉnh: Phòng Đăng ký kinh doanh thuộc Sở Kế hoạch và Đầu tư (sau đây gọi chung là Phòng Đăng ký kinh doanh).Phòng Đăng ký kinh doanh có thể tổ chức các điểm để tiếp nhận hồ sơ và trả kết quả thuộc Phòng Đăng ký kinh doanh tại các địa điểm khác nhau trên địa bàn cấp tỉnh;b) Ở cấp huyện: Phòng Tài chính - Kế hoạch thuộc Ủy ban nhân dân cấp huyện (sau đây gọi chung là Cơ quan đăng ký kinh doanh cấp huyện).

2. Cơ quan đăng ký kinh doanh có tài khoản và con dấu riêng.


In [40]:
def prepare_evaluator_input(data, prompt=""):
    """Build the ``(queries, corpus, relevant_docs)`` triple for an IR evaluator.

    Every distinct passage text gets exactly one corpus id. If each occurrence
    were registered under a fresh id, a passage that is a positive for one
    query and also appears under another entry would exist twice in the corpus
    with only one copy marked relevant, so retrieving the other copy would be
    scored as a miss.

    Args:
        data: iterable of records with a ``"query"`` string, a ``"pos"``
            iterable of relevant passages and an optional ``"neg"`` iterable
            of non-relevant passages.
        prompt: text prepended to every query.

    Returns:
        ``queries``: ``{"q<i>": prompt + query}`` in input order.
        ``corpus``: ``{"p<j>": passage}`` with ids assigned in order of first
        appearance; each unique text appears once.
        ``relevant_docs``: ``{"q<i>": [corpus ids of that query's positives]}``
        with no repeated ids.
    """
    queries = {}
    corpus = {}
    relevant_docs = {}
    id_by_text = {}  # passage text -> corpus id already assigned to it

    def _corpus_id(passage):
        # Reuse the id of an identical passage, or register a new one.
        pid = id_by_text.get(passage)
        if pid is None:
            pid = f"p{len(id_by_text)}"
            id_by_text[passage] = pid
            corpus[pid] = passage
        return pid

    for idx, entry in enumerate(data):
        qid = f"q{idx}"
        queries[qid] = prompt + entry["query"]

        relevant_ids = []
        for pos in entry["pos"]:
            pid = _corpus_id(pos)
            # A positive listed twice must not inflate the relevant count.
            if pid not in relevant_ids:
                relevant_ids.append(pid)
        relevant_docs[qid] = relevant_ids

        # Negatives only widen the corpus; they are never marked relevant here.
        for neg in entry.get("neg", []):
            _corpus_id(neg)

    return queries, corpus, relevant_docs

# Build evaluator inputs for the validation and test splits, using the same query prompt for both.
ir_query_prompt = "Represent this sentence for searching relevant passages: "
eval_queries, eval_corpus, eval_relevant_docs = prepare_evaluator_input(eval_data, prompt=ir_query_prompt)
test_queries, test_corpus, test_relevant_docs = prepare_evaluator_input(test_data, prompt=ir_query_prompt)

# Validation-split evaluator; it is handed to the training call for periodic scoring.
evaluator = InformationRetrievalEvaluator(
    name="eval",
    queries=eval_queries,
    corpus=eval_corpus,
    relevant_docs=eval_relevant_docs,
)

In [41]:
import os

import torch
import wandb

# Free cached GPU memory left over from earlier cells before training starts.
torch.cuda.empty_cache()

# Never commit an API key in notebook source: read it from the environment
# (e.g. populated from a notebook secret). When the variable is unset, `key` is
# None and wandb falls back to its own credential lookup.
wandb_api_key = os.environ.get("WANDB_API_KEY")
wandb.login(key=wandb_api_key)



True

In [None]:
# Train
from sentence_transformers import SentenceTransformer

# Fine-tune the loaded SentenceTransformer directly. An empty
# SentenceTransformer(modules=[]) with a bare nn.Module attached has no
# tokenizing first module, so fit() cannot build batches from it, and the loss
# needs a model whose forward returns a dict containing "sentence_embedding".
# NOTE(review): both text columns are then encoded with whatever `default_task`
# is set on model[0]; confirm that a single task adapter is acceptable here.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Kept as an alias so any other cell that refers to `new_model` still resolves.
new_model = model

# Now train using model.fit()
print("Starting training...")
new_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    evaluation_steps=10,
    epochs=3,
    warmup_steps=100,
    show_progress_bar=True,
    use_amp=True
)
print("Training complete.")

# Save the fine-tuned model
new_model.save("output/jina-v3-asym-finetune-1")

In [None]:
# Held-out test evaluation, configured like the `eval` evaluator so the two sets
# of scores are directly comparable.
# InformationRetrievalEvaluator has no `score_function` parameter (its options
# are `score_functions` / `main_score_function`), so passing it raises TypeError.
# The argument is dropped and the evaluator's default similarity handling is used.
test_evaluator = InformationRetrievalEvaluator(
    queries=test_queries,
    corpus=test_corpus,
    relevant_docs=test_relevant_docs,
    name="test",
)

# Last expression of the cell: the evaluator's result is displayed as cell output.
test_evaluator(model)