In [1]:
import json
import pandas as pd
from typing import List
import tqdm
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import pandas as pd

external_info = pd.read_csv('../data/finetune_extra_module.csv')
external_info['matched_sentences'] = external_info['matched_sentences'].apply(lambda x: eval(x))
external_info


def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery:{query}'


tokenizer = AutoTokenizer.from_pretrained('../Qwen3-Embedding-4B/', padding_side='left')

# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModel.from_pretrained('../Qwen3-Embedding-4B/', torch_dtype=torch.float16)

max_length = 2048


# === Step 1: 定义 Chunk 函数 ===
def split_into_chunks(text: str, max_chars: int = 100) -> List[str]:
    chunks = []
    current_chunk = ""

    for char in text:
        current_chunk += char
        if len(current_chunk) >= max_chars:
            chunks.append(current_chunk.strip())
            current_chunk = ""

    if current_chunk:  # 剩余部分
        chunks.append(current_chunk.strip())

    return chunks


def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery:{query}'

# Each query must come with a one-sentence instruction that describes the task
task = '给定义一个文本，查找相似的文本。'

# === Step 2: last_token_pool ===
def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


# === Step 3: 批量计算相似度 ===
def batch_compute_similarity(queries, documents, tokenizer, model, max_length=1024):
    input_texts = queries + documents

    batch_dict = tokenizer(
        input_texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**batch_dict)

    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

    # normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)
    
    scores = (embeddings[:len(queries)] @ embeddings[len(queries):].T)
    return scores.cpu().numpy()

# === Step 4: 读取数据并两两比对 ===
def process_and_compare(df, model, tokenizer):
    all_rows = []
    
    batch_size_1 = 32   # chunks1 的 batch size
    batch_size_2 = 32  # chunks2 的 batch size

    for idx, row in tqdm.tqdm(df.iterrows(), total=len(df), desc='Extract embedding'):
        contents = row['matched_sentences']  # e.g. [{"src1": "内容"}, {"src2": "内容2"}]

        if len(contents) < 2:
            continue
            
        for i in range(len(contents) - 1):
            c1 = contents[i]
            key1, v1 = list(c1.keys())[0], list(c1.values())[0]
            chunks1 = split_into_chunks(v1, max_chars=20)
        
            for j in range(i + 1, len(contents)):
                c2 = contents[j]
                key2, v2 = list(c2.keys())[0], list(c2.values())[0]
                if key1 == key2:
                    continue
        
                chunks2 = split_into_chunks(v2, max_chars=20)
        
                for b1_start in range(0, len(chunks1), batch_size_1):
                    b1_end = min(b1_start + batch_size_1, len(chunks1))
                    chunks1_batch = chunks1[b1_start:b1_end]
                    queries = [
                        get_detailed_instruct(task, chunk.replace('。', '').strip()) 
                        for chunk in chunks1_batch
                    ]
        
                    for b2_start in range(0, len(chunks2), batch_size_2):
                        b2_end = min(b2_start + batch_size_2, len(chunks2))
                        chunks2_batch = chunks2[b2_start:b2_end]
        
                        scores = batch_compute_similarity(
                            queries, chunks2_batch, tokenizer, model
                        )  # 输出 shape: [len(queries), len(chunks2_batch)]
        
                        for m in range(len(queries)):
                            for n in range(len(chunks2_batch)):
                                all_rows.append({
                                    "module": row['rule'].replace("该产品的", "").replace("在各材料中的定义没有冲突", ""),
                                    "rule_id": row['rule_id'],
                                    "material_id": row['material_id'],
                                    "source_1": key1,
                                    "source_2": key2,
                                    "chunk_1": chunks1_batch[m],
                                    "chunk_2": chunks2_batch[n],
                                    "score": float(scores[m][n])
                                })

    return pd.DataFrame(all_rows)

# === Step 5: 初始化模型 ===
model.eval().cuda()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Qwen3Model(
  (embed_tokens): Embedding(151665, 2560)
  (layers): ModuleList(
    (0-35): 36 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
        (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
        (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
        (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
        (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
    )
  )
  (norm): Qwen3RMSNorm((256

In [2]:
# print(external_info.iloc[10]['matched_sentences'])
output_df = process_and_compare(external_info, model,tokenizer)
# # === Step 7: 保存为 JSONL ===
# output_df.to_json("pairwise_chunk_scores.jsonl", orient="records", lines=True, force_ascii=False)

Extract embedding: 100%|██████████| 167/167 [36:01<00:00, 12.94s/it] 


In [3]:
output_df

Unnamed: 0,module,rule_id,material_id,source_1,source_2,chunk_1,chunk_2,score
0,基础产品销售信息,r_00001,m_00001a,CLAUSE,INTRODUCE_IMG,众安在线财产保险股份有限公司 附加旅行意,畅玩境内旅行保险畅游中华·众安随行神奇敦,0.671875
1,基础产品销售信息,r_00001,m_00001a,CLAUSE,INTRODUCE_IMG,众安在线财产保险股份有限公司 附加旅行意,煌帝都北京古都西安诗画西湖山水桂林浪漫三,0.298340
2,基础产品销售信息,r_00001,m_00001a,CLAUSE,INTRODUCE_IMG,众安在线财产保险股份有限公司 附加旅行意,亚四重保障护航旅途人身安全意外保障人身意,0.606934
3,基础产品销售信息,r_00001,m_00001a,CLAUSE,INTRODUCE_IMG,众安在线财产保险股份有限公司 附加旅行意,外公共交通意外自驾车意外身故及残疾身故及,0.569336
4,基础产品销售信息,r_00001,m_00001a,CLAUSE,INTRODUCE_IMG,众安在线财产保险股份有限公司 附加旅行意,残疾身故及残疾最高50万最高50万最高5,0.537598
...,...,...,...,...,...,...,...,...
13680924,赔付 & 领取规则,r_00200,m_00066a,ADDITIONAL_AGREEMENT,INSURE_NOTICE,100万元。,512 转 2； 3.平安财产保险股份有,0.509277
13680925,赔付 & 领取规则,r_00200,m_00066a,ADDITIONAL_AGREEMENT,INSURE_NOTICE,100万元。,限公司最近季度偿付能力符合监管要求，详情,0.550781
13680926,赔付 & 领取规则,r_00200,m_00066a,ADDITIONAL_AGREEMENT,INSURE_NOTICE,100万元。,请参见保险公司官网（http://bao,0.430420
13680927,赔付 & 领取规则,r_00200,m_00066a,ADDITIONAL_AGREEMENT,INSURE_NOTICE,100万元。,xian.pingan.com/）主页偿,0.495605


In [4]:
top_k_df = (
    output_df.groupby(['module','rule_id','material_id','source_1',	'chunk_1'], group_keys=False)
      .apply(lambda x: x.nlargest(10,'score'))
      .reset_index(drop=True)
)
top_k_df

  .apply(lambda x: x.nlargest(10,'score'))


Unnamed: 0,module,rule_id,material_id,source_1,source_2,chunk_1,chunk_2,score
0,与保障相关的时间,r_00025,m_00025a,INSURE_NOTICE,LIABILITY_EXCLUSION,(2)有效身份证件。 公司收到退保申请,份证件。自本公司收到您解除合同的通知书时,0.694824
1,与保障相关的时间,r_00025,m_00025a,INSURE_NOTICE,LIABILITY_EXCLUSION,(2)有效身份证件。 公司收到退保申请,超过10元的工本费后退还您所支付的全部保,0.656738
2,与保障相关的时间,r_00025,m_00025a,INSURE_NOTICE,LIABILITY_EXCLUSION,(2)有效身份证件。 公司收到退保申请,除合同通知书，并提供您的保险合同及有效身,0.606445
3,与保障相关的时间,r_00025,m_00025a,INSURE_NOTICE,LIABILITY_EXCLUSION,(2)有效身份证件。 公司收到退保申请,险费（本产品提供电子保单，犹豫期撤保免收,0.588379
4,与保障相关的时间,r_00025,m_00025a,INSURE_NOTICE,LIABILITY_EXCLUSION,(2)有效身份证件。 公司收到退保申请,以在此期间提出解除本合同，本公司将扣除不,0.567383
...,...,...,...,...,...,...,...,...
525921,附加条款,r_00197,m_00120a,ADDITIONAL_AGREEMENT,CLAUSE,；升级版方案提供3件衣鞋任洗服务（1次）,条 本附加险条款未尽事宜，以主险条款的约,0.434814
525922,附加条款,r_00197,m_00120a,ADDITIONAL_AGREEMENT,CLAUSE,；升级版方案提供3件衣鞋任洗服务（1次）,保险责任范围内的损失、费用。其他事项第五,0.427979
525923,附加条款,r_00197,m_00120a,ADDITIONAL_AGREEMENT,CLAUSE,；升级版方案提供3件衣鞋任洗服务（1次）,人按照本附加险合同的约定也负责赔偿。责任,0.426270
525924,附加条款,r_00197,m_00120a,ADDITIONAL_AGREEMENT,CLAUSE,；升级版方案提供3件衣鞋任洗服务（1次）,室内外管道爆裂，由此产生的修复费用，保险,0.425781


In [5]:
top_k_df.to_csv('../data/top_10_similar_data.csv', index=False)