# 安装必要包

In [None]:
# # 列出所有已安装的包
# %pip freeze > requirements.txt

# # 卸载所有包
# %pip uninstall -r requirements.txt -y

# %pip install --upgrade --quiet langchain sentence_transformers
# %pip install --upgrade --quiet torch sentence-transformers
# %pip install --quiet scikit-learn numpy
# %pip install --quiet langchain_huggingface

In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer


In [None]:
from sklearn.metrics import top_k_accuracy_score
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sentence_transformers import util
from sklearn.metrics.pairwise import cosine_similarity

# 文本嵌入

In [None]:
# 计算QQP任务指标
def evaluate_qqp(model, qqp_data):
    y_true = []
    y_pred = []

    # 遍历所有数据项
    for idx in range(len(qqp_data['question1'])):
        question1 = qqp_data['question1'][idx]
        question2 = qqp_data['question2'][idx]
        label = qqp_data['label'][idx]

        # 获取嵌入
        embeddings_q1 = model.encode([question1])
        embeddings_q2 = model.encode([question2])

        # 计算余弦相似度
        cosine_sim = cosine_similarity(embeddings_q1, embeddings_q2)[0][0]

        # 判断相似性，余弦相似度 > 0.9 认为是相似
        prediction = 1 if cosine_sim > 0.8 else 0

        # 记录实际标签和预测标签
        y_true.append(label)
        y_pred.append(prediction)

    # 计算准确率、召回率、精确率
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    return acc, precision, recall

In [None]:
# 计算ANA任务的前1、前3准确率
def evaluate_analogy(model, analogy_data):
    correct_1 = 0
    correct_2 = 0
    correct_3 = 0

    # 遍历所有数据项
    for idx in range(len(analogy_data['stem'])):
        stem = analogy_data['stem'][idx]
        answer = analogy_data['answer'][idx]
        choices = analogy_data['choice'][idx]

        # 获取嵌入
        embeddings_stem_1 = model.encode(stem[0])  # stem 第一个词的嵌入
        embeddings_stem_2 = model.encode(stem[1])  # stem 第二个词的嵌入

        # 计算stem两个词汇之间的相似度
        cosine_sim_stem = cosine_similarity([embeddings_stem_1], [embeddings_stem_2])[0][0]

        # 计算每个选项的相似度差值
        cosine_differences = []
        for choice in choices:
            embeddings_choice_1 = model.encode(choice[0])  # 选项第一个词的嵌入
            embeddings_choice_2 = model.encode(choice[1])  # 选项第二个词的嵌入

            # 计算当前选项两个词的相似度
            cosine_sim_choice = cosine_similarity([embeddings_choice_1], [embeddings_choice_2])[0][0]

            # 计算与stem的相似度差值
            cosine_diff = abs(cosine_sim_stem - cosine_sim_choice)
            cosine_differences.append(cosine_diff)

        # 获取按相似度差值排序后的索引
        top_indices = np.argsort(cosine_differences)

        # 判断前1、2、3、5个选项是否正确
        if top_indices[0] == answer:
            correct_1 += 1
        if top_indices[1] == answer:
            correct_2 += 1
        if top_indices[2] == answer:
            correct_3 += 1

    # 计算前1、2、3、5的准确率
    acc_1 = correct_1 / len(analogy_data['stem'])
    acc_2 = correct_2 / len(analogy_data['stem'])
    acc_3 = correct_3 / len(analogy_data['stem'])
    # acc_5 = correct_5 / len(analogy_data['stem'])

    return acc_1, acc_2, acc_3

# thenlper/gte-large-zh 模型评估

In [None]:
# !pip install datasets
from datasets import load_dataset

def data_load():
  analogy_dataset = load_dataset("relbert/analogy_questions","bats")
  print(analogy_dataset['test'][:2])
  qqp_dataset = load_dataset("glue", "qqp")
  print(qqp_dataset['train'][:10])
  return analogy_dataset['test'][:500],qqp_dataset['train'][:500]

In [None]:
model = SentenceTransformer('thenlper/gte-large-zh')
ana,qqp=data_load()

acc_1, acc_2, acc_3=evaluate_analogy(model,ana)
acc, precision, recall=evaluate_qqp(model,qqp)

KeyboardInterrupt: 

In [None]:
print(f"Analogy Task - Top 1 Accuracy: {acc_1*100:.2f}%")
print(f"Analogy Task - Top 2 Accuracy: {(acc_2+acc_1)*100:.2f}%")
print(f"Analogy Task - Top 3 Accuracy: {(acc_2+acc_1+acc_3)*100:.2f}%")

# 打印 qqp 任务的准确率、精确度和召回率
print(f"QQP Task - Accuracy: {acc*100:.2f}%")
print(f"QQP Task - Precision: {precision*100:.2f}%")
print(f"QQP Task - Recall: {recall*100:.2f}%")

Analogy Task - Top 1 Accuracy: 36.40%
Analogy Task - Top 2 Accuracy: 66.00%
Analogy Task - Top 3 Accuracy: 80.40%
QQP Task - Accuracy: 70.20%
QQP Task - Precision: 57.45%
QQP Task - Recall: 73.37%


# Alibaba-NLP/gte-Qwen2-1.5B-instruct模型评估

In [2]:
# !pip install datasets
from datasets import load_dataset

def data_load():
  analogy_dataset = load_dataset("relbert/analogy_questions","bats")
  print(analogy_dataset['test'][:2])
  qqp_dataset = load_dataset("glue", "qqp")
  print(qqp_dataset['train'][:10])
  return analogy_dataset['test'][:500],qqp_dataset['train'][:500]

In [4]:
model = SentenceTransformer('Alibaba-NLP/gte-Qwen2-1.5B-instruct')
ana,qqp=data_load()

acc_1, acc_2, acc_3=evaluate_analogy(model,ana)
acc, precision, recall=evaluate_qqp(model,qqp)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{'stem': [['hitler', 'dictator'], ['rousseau', 'writer']], 'answer': [0, 3], 'choice': [[['strauss', 'composer'], ['kepler', 'hegel'], ['wagner', 'beethoven'], ['ecuador', 'spanish']], [['cattle', 'calf'], ['edison', 'hawking'], ['rembrandt', 'picasso'], ['hegel', 'philosopher']]], 'prefix': ['./cache/BATS_3.0/3_Encyclopedic_semantics/E05 [name - occupation].txt', './cache/BATS_3.0/3_Encyclopedic_semantics/E05 [name - occupation].txt']}
{'question1': ['How is the life of a math student? Could you describe your own experiences?', 'How do I control my horny emotions?', 'What causes stool color to change to yellow?', 'What can one do after MBBS?', 'Where can I find a power outlet for my laptop at Melbourne Airport?', "How not to feel guilty since I am Muslim and I'm conscious we won't have sex together?", 'How is air traffic controlled?', 'What is the best self help book you have read? Why? How did it change your life?', "Can I enter University of Melbourne if I couldn't achieve the guara

In [5]:
print(f"Analogy Task - Top 1 Accuracy: {acc_1*100:.2f}%")
print(f"Analogy Task - Top 2 Accuracy: {(acc_2+acc_1)*100:.2f}%")
print(f"Analogy Task - Top 3 Accuracy: {(acc_2+acc_1+acc_3)*100:.2f}%")

# 打印 qqp 任务的准确率、精确度和召回率
print(f"QQP Task - Accuracy: {acc*100:.2f}%")
print(f"QQP Task - Precision: {precision*100:.2f}%")
print(f"QQP Task - Recall: {recall*100:.2f}%")

Analogy Task - Top 1 Accuracy: 30.40%
Analogy Task - Top 2 Accuracy: 54.00%
Analogy Task - Top 3 Accuracy: 77.80%
QQP Task - Accuracy: 73.80%
QQP Task - Precision: 59.30%
QQP Task - Recall: 91.85%


# gte-large-en-v1.5模型评估



In [None]:
# !pip install datasets
from datasets import load_dataset

def data_load():
  analogy_dataset = load_dataset("relbert/analogy_questions","bats")
  print(analogy_dataset['test'][:2])
  qqp_dataset = load_dataset("glue", "qqp")
  print(qqp_dataset['train'][:10])
  return analogy_dataset['test'][:500],qqp_dataset['train'][:500]

In [6]:
model = SentenceTransformer('gte-large-en-v1.5')
ana,qqp=data_load()

acc_1, acc_2, acc_3=evaluate_analogy(model,ana)
acc, precision, recall=evaluate_qqp(model,qqp)



OSError: sentence-transformers/gte-large-en-v1.5 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
print(f"Analogy Task - Top 1 Accuracy: {acc_1*100:.2f}%")
print(f"Analogy Task - Top 2 Accuracy: {(acc_2+acc_1)*100:.2f}%")
print(f"Analogy Task - Top 3 Accuracy: {(acc_2+acc_1+acc_3)*100:.2f}%")

# 打印 qqp 任务的准确率、精确度和召回率
print(f"QQP Task - Accuracy: {acc*100:.2f}%")
print(f"QQP Task - Precision: {precision*100:.2f}%")
print(f"QQP Task - Recall: {recall*100:.2f}%")