In [None]:
import os

# Set at the very top of the notebook, ahead of the later `import torch` cell.
# NOTE(review): presumably opts in to experimental AOTriton kernels on ROCm
# builds of PyTorch — confirm against the PyTorch ROCm documentation.
os.environ.update({"TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL": "1"})

In [None]:
import pandas as pd

# FAQ table, read from a path relative to the working directory. Later cells
# use its 'title' column (questions) and 'reply' column (answers).
FAQ_CSV_PATH = './law_faq.csv'
df = pd.read_csv(FAQ_CSV_PATH)
df.head()

In [None]:
from transformers import AutoTokenizer

# Tokenizer used by the embedding cells, the cross-encoder cell and the
# pipeline. The variable name (spelling included) is referenced by later
# cells, so it is kept as-is.
TOKENIZER_CHECKPOINT = "hfl/chinese-macbert-base"
tokenzier = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
from dual_model import DualModel

# Load the project's DualModel checkpoint, then move it to the GPU as a
# separate step. Later cells call its `.bert` sub-module to embed sentences.
dual_model = DualModel.from_pretrained('../sentence-similarity/dual_model/checkpoint-750')
dual_model = dual_model.to("cuda")

In [None]:
import torch
from tqdm import tqdm

# Embed every FAQ title with the dual model's BERT encoder, one mini-batch at
# a time, and collect the pooled sentence vectors into a single NumPy array.
questions = df['title'].to_list()
vectors = []
batch_size = 32

with torch.inference_mode():
    for start in tqdm(range(0, len(questions), batch_size)):
        batch_titles = questions[start:start + batch_size]
        inputs = tokenzier(
            batch_titles,
            return_tensors="pt",
            padding=True,
            max_length=256,
            truncation=True,
        ).to("cuda")
        outputs = dual_model.bert(**inputs)
        # Read the pooled output by name (the query cell does the same) and
        # move each batch to the CPU immediately, so the GPU never holds the
        # embeddings of the whole corpus at once.
        vectors.append(outputs.pooler_output.cpu())

# Stack the per-batch tensors along the batch axis: one row per title, in the
# same order as `df`, so FAISS row ids can be used directly with `df.iloc`.
vectors = torch.concat(vectors, dim=0).numpy()

In [None]:
import faiss

# Build an exact inner-product index over the title embeddings.
# The vectors are L2-normalised first (in place), so the inner product that
# the index returns is the cosine similarity between query and title.
faiss.normalize_L2(vectors)

# Take the dimensionality from the embeddings themselves instead of
# hard-coding 768, so a checkpoint with a different hidden size still works.
index = faiss.IndexFlatIP(vectors.shape[1])
index.add(vectors)
index

In [None]:
# Embed the user query with the same encoder and tokenizer settings that were
# used for the FAQ titles.
question = "寻衅滋事"
with torch.inference_mode():
    inputs = tokenzier(
        question,
        return_tensors="pt",
        padding=True,
        max_length=256,
        truncation=True,
    ).to("cuda")
    outputs = dual_model.bert(**inputs)
    # Pooled sentence vector, copied to host memory as a NumPy array.
    q_vector = outputs.pooler_output.cpu().numpy()

In [None]:
# Recall stage: normalise the query vector (in place) and fetch the 10 nearest
# titles by inner product.
faiss.normalize_L2(q_vector)
scores, indexes = index.search(q_vector, 10)

# FAISS pads the id array with -1 when the index holds fewer than k vectors;
# drop those so `df.iloc` does not silently wrap around to the last row.
hit_rows = [row for row in indexes[0].tolist() if row >= 0]

# Last expression of the cell: rendered with the notebook's rich DataFrame
# display instead of plain-text print().
df.iloc[hit_rows]

In [None]:
# Keep only the recalled rows whose similarity score exceeds 0.85.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace: a module-level loop variable called `index` would overwrite the
# FAISS index object and break any re-run of the search cell.
res_indexes = [
    row_id.item()
    for score, row_id in zip(scores[0], indexes[0])
    if score.item() > 0.85
]

In [None]:
# Select, by position, the FAQ rows whose ids are in `res_indexes`; these are
# the candidates handed to the cross-encoder in the following cells.
topk_questions = df.iloc[res_indexes]

In [None]:
# Display the candidate rows selected above.
topk_questions

In [None]:
from transformers import AutoModelForSequenceClassification

# Cross-encoder used for re-ranking: loaded with a single output
# (num_labels=1), then moved to the GPU as a separate step.
cross_model = AutoModelForSequenceClassification.from_pretrained(
    "../sentence-similarity/cross_model/checkpoint-750",
    num_labels=1,
)
cross_model = cross_model.to("cuda")

In [None]:
# Register human-readable label names on the model config.
# NOTE(review): the model is loaded with num_labels=1 but two names are
# registered here, and the cell that calls the pipeline overwrites "label"
# from the raw score anyway — presumably this mapping only exists so the
# pipeline's label lookup succeeds; confirm.
cross_model.config.id2label = {0: "不相似", 1: "相似"}

In [None]:
from transformers import pipeline

# Text-classification pipeline around the cross-encoder.
# The device is passed explicitly: the model already lives on the GPU, and
# transformers releases that do not infer the device from the model default
# the pipeline to CPU, which fails with a device mismatch at call time.
pipe = pipeline(
    "text-classification",
    model=cross_model,
    tokenizer=tokenzier,
    device=cross_model.device,
)

In [None]:
# Sanity check on a single pair: score the query against the first candidate.
# function_to_apply="none" makes the pipeline return the raw model score.
first_candidate = topk_questions['title'].to_list()[0]
result = pipe({"text": question, "text_pair": first_candidate}, function_to_apply="none")

# Replace the pipeline's label with one derived from the 0.7 score threshold.
if result["score"] > 0.7:
    result["label"] = "相似"
else:
    result["label"] = "不相似"
result

In [None]:
# Re-rank stage: score the query against every candidate title in one batch.
candidate_titles = topk_questions['title'].to_list()
with torch.inference_mode():
    inputs = tokenzier(
        [question] * len(candidate_titles),  # the query, repeated once per candidate
        candidate_titles,
        return_tensors="pt",
        padding=True,
        max_length=256,
        truncation=True,
    ).to("cuda")
    outputs = cross_model(**inputs)
    print(outputs)

In [None]:
# Pick the candidate with the highest cross-encoder logit and return its reply,
# or None when even the best logit is below the 0.7 acceptance threshold.
# The position is kept in its own name rather than `index`, because `index`
# is the FAISS index object and overwriting it breaks re-running the search.
best_pos = outputs.logits.argmax(dim=0).item()
if outputs.logits[best_pos][0].item() < 0.7:
    result = None
else:
    result = topk_questions.iloc[best_pos]['reply']

In [None]:
# Final answer: the reply of the best-matching FAQ entry, or None when no
# candidate reached the acceptance threshold.
result