In [None]:
import json
import os
import time

import jieba
import jwt
import numpy as np
import pdfplumber
import requests
import sklearn
import torch
import transformers
from rank_bm25 import BM25Okapi
from scipy.stats import rankdata
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# BGE + BM25: hybrid retrieval (dense embedding ranks fused with BM25 ranks)

In [None]:
# Read the question list through a context manager so the file handle is
# closed, and decode it as UTF-8 (the same encoding the results are written
# with) instead of relying on the platform default.
with open("questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

# The PDF stays open because the extraction loop below reads its pages.
pdf = pdfplumber.open("初赛训练数据集.pdf")

# One dict per text chunk: {'page': 'page_<n>', 'content': <chunk text>}.
pdf_content = []

def split_text_fixed_size(text, chunk_size):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: String to split. ``None`` or an empty string yields no chunks,
            so callers can pass on the result of a text extraction that
            found nothing.
        chunk_size: Maximum length of each piece; must be positive.

    Returns:
        A list of substrings in original order. Every piece has exactly
        ``chunk_size`` characters except possibly the last one.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size would
            otherwise silently produce an empty list and drop the text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Walk the pages in order and store every 40-character chunk together with
# the 1-based page label it came from.
for page_number, page in enumerate(pdf.pages, start=1):
    # NOTE(review): depending on the pdfplumber version, a page without a
    # text layer may give None rather than "" - normalise so the splitter
    # always receives a string.
    text = page.extract_text() or ""
    for chunk_text in split_text_fixed_size(text, 40):
        pdf_content.append({
            'page': 'page_' + str(page_number),
            'content': chunk_text
        })

# Plain text of every chunk and every question, in their original order.
pdf_content_sentences = [chunk['content'] for chunk in pdf_content]
question_sentences = [item['question'] for item in questions]

# Lexical side: BM25 index over the jieba-segmented chunks.
pdf_content_words = [jieba.lcut(sentence) for sentence in pdf_content_sentences]
bm25 = BM25Okapi(pdf_content_words)

# Dense side: encode questions and chunks with the same BGE model, asking
# the encoder for normalized vectors.
model = SentenceTransformer('BAAI/bge-small-zh-v1.5')
question_embeddings, pdf_embeddings = (
    model.encode(sentences, normalize_embeddings=True)
    for sentences in (question_sentences, pdf_content_sentences)
)

# Hybrid retrieval by rank fusion. `rankdata` comes from the notebook's
# import cell so that every cell using it works on a fresh kernel.
for question, feat in zip(questions, question_embeddings):
    # Dense score: dot product of the question vector with every chunk vector.
    score1 = feat @ pdf_embeddings.T
    # Lexical score: BM25 of the segmented question against every chunk.
    score2 = bm25.get_scores(jieba.lcut(question["question"]))

    # The two scores live on different scales, so add their ranks instead of
    # the raw values; the chunk with the largest rank sum wins.
    score = rankdata(score1) + rankdata(score2)
    max_score_page_idx = score.argsort()[-1]
    question['reference'] = pdf_content[max_score_page_idx]['page']

with open('submit_bge_bm25_segment.json', 'w', encoding='utf8') as up:
    json.dump(questions, up, ensure_ascii=False, indent=4)

In [68]:
def generate_token(apikey: str, exp_seconds: int):
    """Build the HS256-signed JWT sent in the ``Authorization`` header.

    Args:
        apikey: Credential in the form ``"<id>.<secret>"``; the id goes into
            the payload and the secret signs the token.
        exp_seconds: Token lifetime in seconds, counted from now.

    Returns:
        The value returned by ``jwt.encode`` for the signed payload.

    Raises:
        ValueError: If ``apikey`` is not a string made of exactly two
            dot-separated parts. The first argument of the exception is
            ``"invalid apikey"`` and the second is the underlying error.
    """
    try:
        key_id, secret = apikey.split(".")
    except (AttributeError, TypeError, ValueError) as e:
        raise ValueError("invalid apikey", e) from e

    # Read the clock once so `exp - timestamp` is exactly the requested
    # lifetime; both fields are expressed in milliseconds.
    now_ms = int(round(time.time() * 1000))
    payload = {
        "api_key": key_id,
        "exp": now_ms + exp_seconds * 1000,
        "timestamp": now_ms,
    }
    return jwt.encode(
        payload,
        secret,
        algorithm="HS256",
        headers={"alg": "HS256", "sign_type": "SIGN"},
    )

def ask_glm(content, api_key=None, timeout=60):
    """Send one user message to the GLM-4 chat-completions endpoint.

    Args:
        content: Text of the user message.
        api_key: Credential in ``"<id>.<secret>"`` form. When omitted it is
            read from the ``ZHIPUAI_API_KEY`` environment variable, so the
            secret never has to be stored in the notebook.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If no API key was passed and ``ZHIPUAI_API_KEY`` is
            unset or empty.
        requests.HTTPError: If the service answers with an error status.
    """
    if api_key is None:
        api_key = os.environ.get("ZHIPUAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No API key: set the ZHIPUAI_API_KEY environment variable "
            "or pass api_key= explicitly"
        )

    url = "https://open.bigmodel.cn/api/paas/v4/chat/completions"
    headers = {
        'Content-Type': 'application/json',
        'Authorization': generate_token(api_key, 1000)
    }

    data = {
        "model": "glm-4",
        "messages": [{"role": "user", "content": content}]
    }

    # Without a timeout a stalled connection would hang the whole run.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Fail here with the HTTP status instead of letting callers hit a
    # KeyError on an error payload.
    response.raise_for_status()
    return response.json()

# 重排序 (Reranking): rescore the top hybrid-retrieval candidates with a reranker model, then ask GLM-4

In [70]:

# Load questions and PDF.
# The question list is read through a context manager so the file handle is
# closed, and decoded as UTF-8 (the same encoding the results are written
# with) instead of relying on the platform default.
with open("questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

# The PDF stays open because the extraction loop below reads its pages.
pdf = pdfplumber.open("初赛训练数据集.pdf")

# One dict per text chunk: {'page': 'page_<n>', 'content': <chunk text>}.
pdf_content = []

# Function to split text into fixed-size chunks
def split_text_fixed_size(text, chunk_size):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        text: String to split. ``None`` or an empty string yields no chunks,
            so callers can pass on the result of a text extraction that
            found nothing.
        chunk_size: Maximum length of each piece; must be positive.

    Returns:
        A list of substrings in original order. Every piece has exactly
        ``chunk_size`` characters except possibly the last one.

    Raises:
        ValueError: If ``chunk_size`` is not positive. A negative size would
            otherwise silently produce an empty list and drop the text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Extract text from PDF and split into chunks: every 40-character chunk is
# stored together with the 1-based page label it came from.
for page_number, page in enumerate(pdf.pages, start=1):
    # NOTE(review): depending on the pdfplumber version, a page without a
    # text layer may give None rather than "" - normalise so the splitter
    # always receives a string.
    text = page.extract_text() or ""
    for chunk_text in split_text_fixed_size(text, 40):
        pdf_content.append({
            'page': 'page_' + str(page_number),
            'content': chunk_text
        })

# Plain text of every chunk and every question, in their original order.
pdf_content_sentences = [chunk['content'] for chunk in pdf_content]
question_sentences = [item['question'] for item in questions]

# Lexical side: BM25 index over the jieba-segmented chunks.
pdf_content_words = [jieba.lcut(sentence) for sentence in pdf_content_sentences]
bm25 = BM25Okapi(pdf_content_words)

# Dense side: encode questions and chunks with the same BGE model, asking
# the encoder for normalized vectors.
model = SentenceTransformer('BAAI/bge-small-zh-v1.5')
question_embeddings, pdf_embeddings = (
    model.encode(sentences, normalize_embeddings=True)
    for sentences in (question_sentences, pdf_content_sentences)
)

# Reranker: tokenizer and sequence-classification model from one checkpoint.
# The model is left on its default device (the .cuda() call is disabled) and
# switched to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-base')
rerank_model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-base')
rerank_model.eval()

# Process each question: hybrid retrieval -> rerank top candidates -> ask GLM.

# Prompt sent to GLM; {0} is the retrieved chunk, {1} the question. Built once
# because it does not change between iterations.
PROMPT_TEMPLATE = '''你是一个汽车专家，帮我结合给定的资料，回答一个问题。如果问题无法从资料中获得，请输出结合给定的资料，无法回答问题。
资料：{0}

问题：{1}
    '''

# Each iteration makes one remote API call, so show progress while it runs.
for question, feat in tqdm(zip(questions, question_embeddings),
                           total=len(questions), desc="rerank + answer"):
    query = question["question"]

    # Stage 1: fuse the dense and BM25 rankings and keep the five chunks
    # with the largest rank sum.
    score1 = feat @ pdf_embeddings.T
    score2 = bm25.get_scores(jieba.lcut(query))
    score = rankdata(score1) + rankdata(score2)
    max_score_page_idxs = score.argsort()[-5:]

    # Stage 2: score each (question, chunk) pair with the reranker.
    pairs = [(query, pdf_content[idx]['content']) for idx in max_score_page_idxs]
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        scores = rerank_model(**inputs).logits.view(-1).float()

    # Map the best reranker position back to the index of the chunk.
    max_score_page_idx = max_score_page_idxs[scores.cpu().numpy().argmax()]
    question['reference'] = str(pdf_content[max_score_page_idx]['page'])

    # Stage 3: answer the question from the single best chunk.
    prompt = PROMPT_TEMPLATE.format(pdf_content[max_score_page_idx]['content'], query)
    question['answer'] = ask_glm(prompt)['choices'][0]['message']['content']

# Save results
with open('submit_bge_bm25_segment_rerank_1.json', 'w', encoding='utf8') as up:
    json.dump(questions, up, ensure_ascii=False, indent=4)
