### 数据预处理

In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import jieba

# 加载问题数据
df1 = pd.read_csv('QA_newsys.csv', usecols=['question'])

# 文本清理函数
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # 移除标点符号
    text = text.upper()  #统一转为大写
    return text

def chinese_tokenizer(text):
    # 使用Jieba分词
    return " ".join(jieba.cut(text))

df1['cleaned_question'] = df1['question'].apply(clean_text)
df1['tokenized_question'] = df1['cleaned_question'].apply(chinese_tokenizer)

# 分割训练和测试集
#train, test = train_test_split(df1, test_size=0.2, random_state=42)


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.485 seconds.
Prefix dict has been built successfully.


### 加载预训练bert模型

In [3]:
from transformers import BertTokenizer, BertModel
import torch

# 加载预训练BERT模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# 转换为BERT输入格式
def encode_question(question):
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=128)
    return inputs

# 获取句子向量
def get_sentence_embedding(question):
    inputs = encode_question(question)
    with torch.no_grad():
        outputs = model(**inputs)
    # BERT最后一层的CLS token输出作为句子的表示
    sentence_embedding = outputs.last_hidden_state[:, 0, :]
    return sentence_embedding




### STS语义相似度

In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 初始化模型
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
            
def compute_sts(sentence1, sentence2):
    # 计算句子的嵌入
    embedding1 = model.encode(sentence1, convert_to_tensor=False)
    embedding2 = model.encode(sentence2, convert_to_tensor=False)

    #将嵌入reshape为二维数组
    embedding1 = embedding1.reshape(1, -1)
    embedding2 = embedding2.reshape(1, -1)
    
    # 计算余弦相似度
    similarity = cosine_similarity(embedding1, embedding2)
    return similarity[0][0]

# 示例使用
sentence1 = "IFS的多次曝光光谱数据是如何合成的？"
sentence2 = "MCI伴随图像处理的输入是什么？"
sts_score = compute_sts(sentence1, sentence2)
print(f"STS Score: {sts_score}")



STS Score: 0.6316959261894226


### 相似度计算

测试样例

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

sentence1 = "IFS的多次曝光光谱数据是如何合成的？"
sentence2 = "MCI伴随图像处理的输入是什么？"
embedding1 = get_sentence_embedding(sentence1)
embedding2 = get_sentence_embedding(sentence2)
bert_score = cosine_similarity(embedding1, embedding2)

print(f"BERT Score: {bert_score}")
print(f"embedding1.shape: {embedding1.shape}")
print(f"embedding2.shape: {embedding2.shape}")

BERT Score: [[0.95451903]]
embedding1.shape: torch.Size([1, 768])
embedding2.shape: torch.Size([1, 768])


BERT

In [4]:
# 获取所有问题的句子向量
question_embeddings = []
for question in df1['tokenized_question']:
    embedding = get_sentence_embedding(question)
    question_embeddings.append(embedding)

# 将向量转为Tensor形式
question_embeddings = torch.cat(question_embeddings, dim=0)


In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# 获取用户输入问题的向量表示
user_question = "IFS外部定标参考文件是如何生成的？"
user_embedding = get_sentence_embedding(user_question)

# 计算用户问题与所有问题的余弦相似度
similarities = cosine_similarity(user_embedding, question_embeddings)

# 找出相似度最高的几个问题
top_k = 3
top_k_indices = similarities.argsort()[0][-top_k:][::-1]  # 按照相似度排序并取前5个

# 输出推荐问题
print("Recommended Questions and Similarity Scores:")
for idx in top_k_indices:
    question = df1.iloc[idx]['question']
    score = similarities[0][idx]
    print(f"Question: {question}, Similarity Score: {score:.4f}")

Recommended Questions and Similarity Scores:
Question: IFS外部定标参考文件是如何生成的？, Similarity Score: 0.9544
Question: IFS科学数据生成的主要步骤有哪些？, Similarity Score: 0.9469
Question: IFS内部定标和外部定标中的查考文件如何比较和校验？, Similarity Score: 0.9447


STS

In [14]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 用户提出的问题
user_question = "IFS外部定标参考文件是如何生成的？"

# 初始化SentenceTransformer模型
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# 对用户的问题进行嵌入
user_embedding2 = model.encode(user_question, convert_to_tensor=False).reshape(1, -1)

# 对df1['tokenized_question']中的每个问题进行嵌入
question_embeddings2 = model.encode(df1['tokenized_question'].tolist(), convert_to_tensor=False)

# 计算用户问题与每个问题的余弦相似度
similarities2 = cosine_similarity(user_embedding2, question_embeddings2).flatten()  # 转换为1D数组

# 找出相似度最高的三个问题
top_k = 3
top_k_indices = similarities2.argsort()[-top_k:][::-1]  # 按相似度降序排列并取前3个

# 输出推荐的相似问题及其相似度
recommended_questions = df1.iloc[top_k_indices]['question']
recommended_similarities2 = similarities2[top_k_indices]

# 打印相似度最高的三个问题及其相似度
print("Recommended Questions and Similarity Scores:")
for question, similarity in zip(recommended_questions, recommended_similarities2):
    print(f"Question: {question}, Similarity Score: {similarity:.4f}")




Recommended Questions and Similarity Scores:
Question: IFS外部定标参考文件是如何生成的？, Similarity Score: 0.9920
Question: IFS内部定标和外部定标中的查考文件如何比较和校验？, Similarity Score: 0.9585
Question: IFS（积分视场光谱仪）是什么设施的重要组成部分？, Similarity Score: 0.9364


### 相似度计算方法测评

In [None]:
from sklearn.metrics import accuracy_score
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from bert_score import score as bert_score
import numpy as np

def evaluate_responses(predictions, references):
    # 初始化评估指标
    bleu_scores = []
    rouge = Rouge()
    rouge_scores = []
    exact_matches = []

    for pred, ref in zip(predictions, references):
        # BLEU Score
        bleu_score_value = sentence_bleu([ref.split()], pred.split())
        bleu_scores.append(bleu_score_value)

        # ROUGE Score
        rouge_score_value = rouge.get_scores(pred, ref)[0]
        rouge_scores.append(rouge_score_value)

        # Exact Match
        exact_match = 1 if pred == ref else 0
        exact_matches.append(exact_match)

    # BERTScore
    P, R, F1 = bert_score(predictions, references, lang="zh", verbose=True)
    bert_scores = F1.tolist()

    # 计算平均得分
    avg_bleu = np.mean(bleu_scores)
    avg_rouge = {key: np.mean([score[key]['f'] for score in rouge_scores]) for key in rouge_scores[0]}
    avg_exact_match = np.mean(exact_matches)
    avg_bert_score = np.mean(bert_scores)

    results = {
        "BLEU": avg_bleu,
        "ROUGE": avg_rouge,
        "Exact Match": avg_exact_match,
        "BERTScore": avg_bert_score
    }

    return results

In [2]:
from flask import Flask, request, jsonify
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
import jieba
from transformers import BertTokenizer, BertModel
import torch

# app = Flask(__name__)

# 文本清理函数
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # 移除标点符号
    text = text.upper()  #统一转为大写
    return text

def chinese_tokenizer(text):
    # 使用Jieba分词
    return " ".join(jieba.cut(text))

# 转换为BERT输入格式
def encode_question(question):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=128)
    return inputs

# 获取句子向量
def get_sentence_embedding(question):
    inputs = encode_question(question)
    model = BertModel.from_pretrained('bert-base-uncased')
    with torch.no_grad():
        outputs = model(**inputs)
    # BERT最后一层的CLS token输出作为句子的表示
    sentence_embedding = outputs.last_hidden_state[:, 0, :]
    return sentence_embedding

# 加载问题库
df1 = pd.read_csv('QA_newsys.csv', usecols=['question'])
df1['cleaned_question'] = df1['question'].apply(clean_text)
df1['tokenized_question'] = df1['cleaned_question'].apply(chinese_tokenizer)

# 获取所有问题的句子向量
question_embeddings = []
for question in df1['tokenized_question']:
    embedding = get_sentence_embedding(question)
    question_embeddings.append(embedding)

# 将向量转为Tensor形式
question_embeddings = torch.cat(question_embeddings, dim=0)

'''
@app.route('/recommend', methods=['POST'])
def recommend():
    data = request.json
    user_question = data['question']
'''
# 测试用户问题
user_question = "IFS外部定标参考文件是如何生成的？"
user_embedding = get_sentence_embedding(user_question)

# 计算用户问题与所有问题的余弦相似度
similarities = cosine_similarity(user_embedding, question_embeddings)

# 找出相似度最高的几个问题
top_k = 3
top_k_indices = similarities.argsort()[0][-top_k:][::-1]  # 按照相似度排序并取前5个

# 输出推荐问题
recommended_questions = []
for idx in top_k_indices:
    question = df1.iloc[idx]['question']
    score = similarities[0][idx]
    recommended_questions.append({'question': question, 'similarity_score': score})
    
print(recommended_questions)

'''
if __name__ == '__main__':
    app.run(debug=True)
'''



[{'question': 'IFS外部定标参考文件是如何生成的？', 'similarity_score': 0.9543768}, {'question': 'IFS科学数据生成的主要步骤有哪些？', 'similarity_score': 0.94692516}, {'question': 'IFS内部定标和外部定标中的查考文件如何比较和校验？', 'similarity_score': 0.9447235}]


"\nif __name__ == '__main__':\n    app.run(debug=True)\n"