# Module

In [33]:
from keybert import KeyBERT
import pandas as pd
from tqdm import tqdm
import pickle
from pymilvus import model
from pymilvus import MilvusClient, Collection, connections, DataType, CollectionSchema, FieldSchema
import numpy as np
import json
from FlagEmbedding import FlagReranker
from pymilvus.model.reranker import BGERerankFunction
import random
import subprocess
random.seed(42)

# Data Load

In [3]:
# Load the datasets used throughout the notebook.
# The two TSV files are read with explicitly supplied column names.
data = pd.read_csv(
    "../data/top1000_dev.tsv",
    sep="\t",
    header=None,
    names=["qid", "pid", "query", "passage"],
)
unique_query = pd.read_csv("../data/unique_query.csv")
qrels = pd.read_csv(
    "../data/qrels.dev.small.tsv",
    sep="\t",
    header=None,
    names=["qid", "r", "pid", "l"],
)

In [4]:
# Restore the pickled pid lists (all / train / test) from disk.
with open("../data/all_pid_list.pkl", "rb") as all_fh:
    all_pid_list = pickle.load(all_fh)

with open("../data/train_pid_list.pkl", "rb") as train_fh:
    train_pid_list = pickle.load(train_fh)

with open("../data/test_pid_list.pkl", "rb") as test_fh:
    test_pid_list = pickle.load(test_fh)

# Get Keywords Generation

In [6]:
# Keyword extraction model: KeyBERT backed by BAAI/bge-m3.
kw_model = KeyBERT("BAAI/bge-m3")

# Embedding model used for similarity search; only dense vectors are requested.
bge_m3_ef = model.hybrid.BGEM3EmbeddingFunction(
    model_name="BAAI/bge-m3",
    device="cuda:1",
    batch_size=16,
    # use_fp16=True,
    return_dense=True,
    return_sparse=False,
    return_colbert_vecs=False,
)

# Vector database client (Milvus), created with its default arguments.
client = MilvusClient()

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [5]:
# Every value of the "pid" column, in row order and without de-duplication
# (a pid occurring in several rows appears several times). This is the pool
# that the "easy" negative branches pass to get_random_pid.
pid_list = data["pid"].tolist()

# Random pid selection used for easy negatives.
def get_random_pid(pid_list, exclude_pid, total_numbers):
    """Sample pids at random, never returning ``exclude_pid``.

    Args:
        pid_list: Pool of pids to draw from.
        exclude_pid: Pid that must not be returned; every occurrence of it is
            dropped from the pool before sampling.
        total_numbers: How many entries to draw.

    Returns:
        A list of ``total_numbers`` entries drawn without replacement (by
        position) from the filtered pool, using the module-level ``random``
        state.

    Raises:
        ValueError: Raised by ``random.sample`` when the filtered pool holds
            fewer than ``total_numbers`` entries.
    """
    candidates = list(filter(lambda candidate: candidate != exclude_pid, pid_list))
    return random.sample(candidates, total_numbers)

# Extract a pseudo query from each training passage and build the fine-tuning dataset.
def get_keyword_query(top_n, negative_type):
    """Build fine-tuning records whose queries are keywords extracted from the passage.

    For every pid in ``train_pid_list`` the passage is looked up in ``data``,
    ``top_n`` single-word keywords are extracted with ``kw_model``, ordered by
    where they first occur in the passage, and joined with spaces to form a
    pseudo query. Ten negatives are then attached.

    Args:
        top_n: Number of keywords to extract per passage.
        negative_type: ``"hard"`` to take the 10 nearest passages returned by a
            dense-vector search in the ``msmarco_bgem3`` collection (excluding
            the passage's own pid), or ``"easy"`` to take 10 randomly chosen
            passages from ``pid_list`` (excluding the passage's own pid).

    Returns:
        A list of dicts with keys ``"query"`` (str), ``"pos"`` (the passage
        text) and ``"neg"`` (list of negative passage texts).

    Raises:
        ValueError: If ``negative_type`` is neither ``"hard"`` nor ``"easy"``.
        KeyError: If a pid is not present in ``data``.
    """
    # Fail fast: previously an unknown value left ``neg_list`` unbound (or
    # silently reused the value from the previous iteration).
    if negative_type not in ("hard", "easy"):
        raise ValueError(
            f'negative_type must be "hard" or "easy", got {negative_type!r}'
        )

    # Build the pid -> passage lookup once instead of scanning the whole
    # DataFrame for every pid. drop_duplicates keeps the first row per pid,
    # which is the row the original ``.tolist()[0]`` lookup returned.
    passage_by_pid = data.drop_duplicates(subset="pid").set_index("pid")["passage"]

    train_json = []

    for pid in tqdm(train_pid_list):
        passage = passage_by_pid[pid]

        # Keyword extraction.
        passage_keywords = kw_model.extract_keywords(
            passage, keyphrase_ngram_range=(1, 1), top_n=top_n
        )
        # Order the keywords by their position in the passage. The search is
        # done on a lower-cased copy because the extracted keywords are
        # expected to be lower-cased; a case-sensitive find() returned -1 for
        # capitalised words and pushed them to the front.
        lowered_passage = passage.lower()
        passage_keywords = sorted(
            passage_keywords, key=lambda kw: lowered_passage.find(kw[0])
        )
        # Join the keywords into a single query string.
        query = " ".join(kw[0] for kw in passage_keywords)

        if negative_type == "hard":
            # Hard negatives: nearest passages to the pseudo query, excluding
            # the positive passage itself.
            query_vectors = bge_m3_ef.encode_queries([query])["dense"]
            res = client.search(
                collection_name="msmarco_bgem3",
                data=query_vectors,
                limit=10,
                output_fields=["text"],
                anns_field="dense_vector",
                filter=f"pid != {pid}",
            )
            neg_list = [hit["entity"]["text"] for hit in res[0]]
        else:
            # Easy negatives: randomly selected passages.
            neg_pid = get_random_pid(pid_list, pid, 10)
            neg_list = [passage_by_pid[i] for i in neg_pid]

        # Fine-tuning dataset record format.
        train_json.append({
            "query": query,
            "pos": passage,
            "neg": neg_list,
        })

    return train_json

# Build the fine-tuning dataset from ground-truth queries.
def get_gt_query(negative_type):
    """Build fine-tuning records that pair each training passage with its labelled query.

    For every pid in ``train_pid_list`` the passage is looked up in ``data``,
    the first qid associated with that pid is taken from ``qrels``, and the
    query text for that qid is read from ``unique_query``. Ten negatives are
    then attached.

    Args:
        negative_type: ``"hard"`` to take the 10 nearest passages returned by a
            dense-vector search in the ``msmarco_bgem3`` collection (excluding
            the passage's own pid), or ``"easy"`` to take 10 randomly chosen
            passages from ``pid_list`` (excluding the passage's own pid).

    Returns:
        A list of dicts with keys ``"query"`` (str), ``"pos"`` (the passage
        text) and ``"neg"`` (list of negative passage texts).

    Raises:
        ValueError: If ``negative_type`` is neither ``"hard"`` nor ``"easy"``.
        KeyError: If a pid is not present in ``data``.
        IndexError: If a pid has no row in ``qrels`` or its qid has no row in
            ``unique_query``.
    """
    # Fail fast: previously an unknown value left ``neg_list`` unbound (or
    # silently reused the value from the previous iteration).
    if negative_type not in ("hard", "easy"):
        raise ValueError(
            f'negative_type must be "hard" or "easy", got {negative_type!r}'
        )

    # Build the pid -> passage lookup once instead of scanning the whole
    # DataFrame for every pid. drop_duplicates keeps the first row per pid,
    # which is the row the original ``.tolist()[0]`` lookup returned.
    passage_by_pid = data.drop_duplicates(subset="pid").set_index("pid")["passage"]

    train_json = []
    for pid in tqdm(train_pid_list):
        passage = passage_by_pid[pid]
        qid = qrels[qrels["pid"] == pid]["qid"].tolist()[0]
        query = unique_query[unique_query["qid"] == qid]["query"].tolist()[0]

        if negative_type == "hard":
            # Hard negatives: nearest passages to the query, excluding the
            # positive passage itself.
            query_vectors = bge_m3_ef.encode_queries([query])["dense"]
            res = client.search(
                collection_name="msmarco_bgem3",
                data=query_vectors,
                limit=10,
                output_fields=["text"],
                anns_field="dense_vector",
                filter=f"pid != {pid}",
            )
            neg_list = [hit["entity"]["text"] for hit in res[0]]
        else:
            # Easy negatives: randomly selected passages (no vector search).
            neg_pid = get_random_pid(pid_list, pid, 10)
            neg_list = [passage_by_pid[i] for i in neg_pid]

        # Fine-tuning dataset record format.
        train_json.append({
            "query": query,
            "pos": passage,
            "neg": neg_list,
        })

    return train_json

# Generation

In [7]:
# Sweep settings for dataset generation: keyword counts per pseudo query and
# the negative-sampling modes to combine them with.
keyword_num_list = list(range(3, 10, 2))  # 3, 5, 7, 9
negative_type_list = ["hard", "easy"]

In [None]:
# Generate the pseudo-query (keyword) datasets: one JSONL file per
# (keyword count, negative type) combination, one JSON record per line.
for keyword_num in keyword_num_list:
    for negative_type in negative_type_list:

        # The "_train" suffix matches the gt_*_train.jsonl files written below
        # and the kw_3_easy_train.jsonl path the fine-tuning command reads.
        file_path = f"../data/finetuning_dataset/kw_{keyword_num}_{negative_type}_train.jsonl"

        # Build the fine-tuning dataset.
        result = get_keyword_query(keyword_num, negative_type)

        # Save it. (This `with` previously had no body, so nothing was written
        # and the cell could not even be parsed.)
        with open(file_path, encoding="utf-8", mode="w") as file:
            for record in result:
                file.write(json.dumps(record) + "\n")

# Generate the ground-truth query datasets, easy negatives first, then hard.
for gt_negative_type in ("easy", "hard"):
    gt_records = get_gt_query(gt_negative_type)
    gt_file_path = f"../data/finetuning_dataset/gt_{gt_negative_type}_train.jsonl"
    with open(gt_file_path, encoding="utf-8", mode="w") as file:
        for record in gt_records:
            file.write(json.dumps(record) + "\n")


# Finetuning

In [None]:
# Fine-tune BAAI/bge-reranker-v2-m3 twice in sequence with torchrun (2 processes per node):
# first on kw_3_easy_train.jsonl (output dir ../model/kw_3_easy_train), then on
# gt_easy_train.jsonl (output dir ../model/gt_easy_train). Both runs use the same hyperparameters.
! torchrun --nproc_per_node 2 -m FlagEmbedding.reranker.run --output_dir ../model/kw_3_easy_train --model_name_or_path BAAI/bge-reranker-v2-m3 --train_data ../data/finetuning_dataset/kw_3_easy_train.jsonl --learning_rate 5e-6 --fp16 --num_train_epochs 40 --per_device_train_batch_size 2 --gradient_accumulation_steps 32 --dataloader_drop_last True --train_group_size 3 --max_len 512 --weight_decay 0.01 --logging_steps 10 --save_steps 100 ; torchrun --nproc_per_node 2 -m FlagEmbedding.reranker.run --output_dir ../model/gt_easy_train --model_name_or_path BAAI/bge-reranker-v2-m3 --train_data ../data/finetuning_dataset/gt_easy_train.jsonl --learning_rate 5e-6 --fp16 --num_train_epochs 40 --per_device_train_batch_size 2 --gradient_accumulation_steps 32 --dataloader_drop_last True --train_group_size 3 --max_len 512 --weight_decay 0.01 --logging_steps 10 --save_steps 100 ;

# Test

In [8]:
# Re-create the dense embedding function for evaluation, this time on cuda:0.
bge_m3_ef = model.hybrid.BGEM3EmbeddingFunction(
    model_name="BAAI/bge-m3",
    device="cuda:0",
    batch_size=16,
    # use_fp16=True,
    return_dense=True,
    return_sparse=False,
    return_colbert_vecs=False,
)

# Fresh Milvus client for the evaluation searches.
client = MilvusClient()

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [30]:
def get_MRR100(eval_code_path, test_qrels_path, inference_path):
    """Run an evaluation script in a child process and return the score it prints.

    The script is invoked as
    ``<python> eval_code_path test_qrels_path inference_path`` and the score is
    read from the last whitespace-separated token on the second line of its
    standard output.

    Args:
        eval_code_path: Path of the Python evaluation script to run.
        test_qrels_path: Path passed to the script as its first argument.
        inference_path: Path passed to the script as its second argument.

    Returns:
        The parsed score as a float.

    Raises:
        RuntimeError: If the script exits with a non-zero status, or if its
            output has no parsable number at the end of the second line.
    """
    import sys  # local import: ``sys`` is not imported at the top of this file

    # Run the script with the interpreter executing this code rather than
    # whichever ``python`` happens to be first on PATH (which may be missing
    # or belong to a different environment).
    command = [sys.executable, eval_code_path, test_qrels_path, inference_path]
    completed = subprocess.run(command, capture_output=True, text=True)

    # Surface script failures with their stderr instead of an opaque
    # IndexError from parsing empty output.
    if completed.returncode != 0:
        raise RuntimeError(
            f"Evaluation script exited with status {completed.returncode}: "
            f"{completed.stderr.strip()}"
        )

    output_lines = completed.stdout.splitlines()
    try:
        return float(output_lines[1].split()[-1])
    except (IndexError, ValueError) as err:
        raise RuntimeError(
            f"Could not parse a score from evaluation output: {completed.stdout!r}"
        ) from err

def inference(test_qid, model_name, output_path):
    """Retrieve, rerank and score every test query, then print the resulting metric.

    For each qid the query text is looked up in ``data``, embedded with
    ``bge_m3_ef``, and used to fetch the 100 nearest passages from the
    ``msmarco_bgem3`` Milvus collection. Those passages are reranked with the
    reranker named by ``model_name``; the final ranking is written to
    ``output_path`` as tab-separated ``qid, pid, rank`` rows (rank starts at
    1) and scored with ``get_MRR100``.

    Args:
        test_qid: Iterable of query ids to evaluate.
        model_name: Name or path of the reranker model to load.
        output_path: Where the tab-separated ranking file is written.

    Returns:
        None. The score and the number of failed queries are printed.
    """
    bge_rf = BGERerankFunction(
        model_name=model_name,
        device="cuda:0",  # device to run the reranker on, e.g. 'cpu' or 'cuda:0'
    )

    result = []
    error_list = []

    # NOTE: a stray ``break`` at the end of this loop used to stop the
    # evaluation after the first qid; every qid is processed now.
    for qid in tqdm(test_qid):
        try:
            query = data[data["qid"] == qid]["query"].tolist()[0]

            query_vectors = bge_m3_ef.encode_queries([query])["dense"]

            candidate = client.search(
                collection_name="msmarco_bgem3",  # target collection
                data=query_vectors,  # query vectors
                limit=100,  # number of returned entities
                output_fields=["pid", "text"],
                anns_field="dense_vector",
            )
            hits = candidate[0]
            candidate_text = [hit["entity"]["text"] for hit in hits]
            candidate_pid = [hit["entity"]["pid"] for hit in hits]

            top_k = bge_rf(
                query=query,
                documents=candidate_text,
                top_k=100,
            )
            ranked_rows = [
                [qid, candidate_pid[item.index], rank]
                for rank, item in enumerate(top_k, start=1)
            ]
        except Exception as exc:
            # Best effort: keep going, but record the qid and show why it
            # failed. ``Exception`` (not a bare except) lets KeyboardInterrupt
            # stop a long run.
            error_list.append(qid)
            print(f"qid {qid} failed: {exc!r}")
        else:
            # Rows are added only after the whole query succeeded, so a
            # failure never leaves a partial ranking behind.
            result.extend(ranked_rows)

    result_df = pd.DataFrame(result)
    # header=False: without it pandas writes a "0\t1\t2" header line, which
    # the evaluation script would read as a candidate row.
    result_df.to_csv(output_path, sep='\t', index=False, header=False)

    mrr_score = get_MRR100("../ms_marco_eval.py", "../data/test_qrels.tsv", output_path)

    model_name_item = model_name.split("/")[-1]
    print(f"{model_name_item} : {mrr_score}")
    print(f"error_list : {len(error_list)}")

In [34]:
# Evaluate the base reranker on the qids listed in the test qrels file.
test_qrels = pd.read_csv(
    "../data/test_qrels.tsv",
    sep="\t",
    header=None,
    names=["qid", "r", "pid", "l"],
)
test_qid = test_qrels["qid"].tolist()
inference(test_qid, "BAAI/bge-reranker-v2-m3", "../result/test.tsv")

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 