In [1]:
from FlagEmbedding import BGEM3FlagModel
import random

# Instantiate the scorer from the 'BAAI/bge-m3' checkpoint name with use_fp16=True.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

# Tab-separated input: a usable line holds exactly two non-blank fields.
data_file = '/Users/lucas/Downloads/paraphrase_unorder'
sentence_pairs = []

with open(data_file, 'r', encoding='utf-8') as f:
    for line_no, raw_line in enumerate(f):
        # Only the first 500 lines of the file are inspected, usable or not.
        if line_no >= 500:
            break
        fields = [field.strip() for field in raw_line.rstrip('\n').split('\t')]
        # Skip lines without exactly two fields, or with a field that is
        # empty once surrounding whitespace is removed.
        if len(fields) != 2 or not all(fields):
            continue
        sentence_pairs.append(fields)

print(f"Loaded {len(sentence_pairs)} sentence pairs!")

# ==========1. 正例部分，原始句对=========
def run_and_stat(sentence_pairs, title, *, batch_size=64, scorer=None):
    """Score sentence pairs in batches and print how many exceed each threshold.

    Each batch is passed to ``scorer.compute_score`` with
    ``max_passage_length=128`` and
    ``weights_for_different_modes=[0.4, 0.2, 0.4]``.  The result is read as a
    dict mapping a score name to a per-pair sequence of scores (a list return
    value is treated as a list of such dicts).  For every score name in the
    report, the pairs whose score is strictly greater than the threshold are
    counted: 0.2 for ``sparse`` and 0.7 for ``colbert``, ``dense``,
    ``sparse+dense`` and ``colbert+sparse+dense``.

    Args:
        sentence_pairs: Sequence of ``[sentence_a, sentence_b]`` pairs.  It
            must support ``len()`` and slicing.
        title: Text inserted into the header line of the printed report.
        batch_size: Number of pairs sent to the scorer per call.  Defaults to
            64, the value that was previously hard-coded.
        scorer: Object exposing ``compute_score(pairs, max_passage_length=...,
            weights_for_different_modes=...)``.  When ``None`` the
            module-level ``model`` is used.

    Returns:
        None.  The report is written to standard output.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
    if scorer is None:
        # Fall back to the model created at module level.
        scorer = model

    # (score key, threshold, report label).  The labels are fixed strings so
    # the printed report keeps its exact column layout.
    checks = (
        ('colbert', 0.7, "colbert > 0.7             : "),
        ('sparse', 0.2, "sparse  > 0.2             : "),
        ('dense', 0.7, "dense   > 0.7             : "),
        ('sparse+dense', 0.7, "sparse+dense > 0.7        : "),
        ('colbert+sparse+dense', 0.7, "colbert+sparse+dense >0.7 : "),
    )
    counts = {key: 0 for key, _, _ in checks}
    total_samples = 0

    for batch_start in range(0, len(sentence_pairs), batch_size):
        batch_pairs = sentence_pairs[batch_start:batch_start + batch_size]
        # NOTE(review): the FlagEmbedding README describes the weight order
        # as [dense, sparse, colbert] - confirm for the installed version.
        scores = scorer.compute_score(
            batch_pairs, max_passage_length=128,
            weights_for_different_modes=[0.4, 0.2, 0.4]
        )
        batch_results = scores if isinstance(scores, list) else [scores]

        for batch_scores in batch_results:
            # The number of 'colbert' scores defines how many pairs the batch
            # holds; the other score lists are read up to that length.
            n = len(batch_scores['colbert'])
            total_samples += n
            for key, threshold, _ in checks:
                counts[key] += sum(
                    1 for score in batch_scores[key][:n] if score > threshold
                )

    print(f"\n===={title} 句子对中各项分数超阈值（总数：{total_samples}）====")
    for key, _, label in checks:
        print(f"{label}{counts[key]}")

# Positive examples: score the pairs exactly as they were loaded.
run_and_stat(sentence_pairs, "正例/原始 matched")

# ========== 2. Negative examples: column A paired with a shuffled column B ==========


def _shuffle_without_fixed_points(items, max_attempts=1000):
    """Return a shuffled list of ``items`` where no position keeps its value.

    The list is reshuffled until every position holds a value different from
    the one ``items`` has at that position.  The comparison is by value, so
    duplicated entries in ``items`` make a valid arrangement less likely, or
    impossible when one value fills more than half of the positions.

    Args:
        items: Sequence of values to rearrange.
        max_attempts: Upper bound on the number of shuffles, so that an input
            with no valid arrangement fails instead of looping forever.

    Returns:
        A new list with the same values as ``items``; empty if ``items`` is.

    Raises:
        RuntimeError: If no valid arrangement is found within ``max_attempts``.
    """
    shuffled = list(items)
    if not shuffled:
        return shuffled
    for _ in range(max_attempts):
        random.shuffle(shuffled)
        if all(new != old for new, old in zip(shuffled, items)):
            return shuffled
    raise RuntimeError(
        f"could not rearrange {len(shuffled)} items so that every position "
        f"changes within {max_attempts} shuffles; there may be too few items "
        "or too many duplicates"
    )


# Keep column A in its original order and pair it with a rearranged column B
# in which no sentence stays next to its original partner, so that no
# positive pair slips into the negative set.
if sentence_pairs:
    A, B = zip(*sentence_pairs)
else:
    # zip(*[]) yields nothing to unpack; use empty columns instead.
    A, B = (), ()
B_shuffled = _shuffle_without_fixed_points(B)

neg_pairs = [[a, b] for a, b in zip(A, B_shuffled)]

# Negative examples: score the mismatched pairs with the same thresholds.
run_and_stat(neg_pairs, "负例/A列+B列打乱")

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Loaded 500 sentence pairs!

====正例/原始 matched 句子对中各项分数超阈值（总数：500）====
colbert > 0.7             : 414
sparse  > 0.2             : 311
dense   > 0.7             : 428
sparse+dense > 0.7        : 139
colbert+sparse+dense >0.7 : 298

====负例/A列+B列打乱 句子对中各项分数超阈值（总数：500）====
colbert > 0.7             : 0
sparse  > 0.2             : 0
dense   > 0.7             : 0
sparse+dense > 0.7        : 0
colbert+sparse+dense >0.7 : 0
