In [1]:
!pip install sentencepiece
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting to

In [4]:
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from tqdm import tqdm
import json

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-computed evidence embeddings. allow_pickle=True makes numpy unpickle Python
# objects from the file, which can execute arbitrary code -- only load files you
# produced yourself or otherwise trust.
encoded_evidences = np.load(
    "/content/drive/MyDrive/Colab Notebooks/NLP project/FactChecker_NLP/data/dpr_evidence_embeddings.npy",
    allow_pickle=True,
)

In [9]:
# Build a BERT sequence classifier with 4 output labels and restore the saved weights.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
# map_location="cpu" lets a checkpoint that was saved from a GPU be loaded on a
# CPU-only runtime as well; the model is moved to the selected device right after.
model.load_state_dict(
    torch.load(
        "/content/drive/MyDrive/Colab Notebooks/NLP project/FactChecker_NLP/data/models/bert_model.pt",
        map_location="cpu",
    )
)
model = model.to(device)
# The model is only used for prediction below, so make sure dropout is disabled.
model.eval()

# Cache for the pretrained retrieval components. Loading them is expensive, and the
# prediction function is called once per claim, so they are loaded only on first use.
_RETRIEVAL_COMPONENTS = {}


def _get_retrieval_components():
    """Return (DPR question encoder, DPR question tokenizer, BERT tokenizer), loading them once."""
    if not _RETRIEVAL_COMPONENTS:
        dpr_checkpoint = "facebook/dpr-question_encoder-single-nq-base"
        question_encoder = DPRQuestionEncoder.from_pretrained(dpr_checkpoint)
        question_encoder.eval()
        question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(dpr_checkpoint)
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Populate the cache in one step so a failed download never leaves it half-filled.
        _RETRIEVAL_COMPONENTS.update(
            question_encoder=question_encoder,
            question_tokenizer=question_tokenizer,
            bert_tokenizer=bert_tokenizer,
        )
    return (
        _RETRIEVAL_COMPONENTS["question_encoder"],
        _RETRIEVAL_COMPONENTS["question_tokenizer"],
        _RETRIEVAL_COMPONENTS["bert_tokenizer"],
    )


def predict_claim_category_and_evidences(claim, model, encoded_evidences, top_k=5):
    """Predict a claim's label index and retrieve its most similar evidences.

    Args:
        claim: The claim text.
        model: Sequence classifier called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``.
        encoded_evidences: Iterable of mappings, each with an ``"id"`` and an
            ``"embedding"`` entry. Embeddings are assumed to be compatible with the
            DPR claim embedding under ``np.inner`` -- TODO confirm their shape.
        top_k: Number of evidence ids to return (defaults to 5, the value that was
            previously hard-coded).

    Returns:
        A tuple ``(predicted_label, evidence_ids)``: the arg-max class index as an
        int, and the ids of the ``top_k`` evidences with the highest inner-product
        similarity, best first.
    """
    question_encoder_model, question_encoder_tokenizer, tokenizer = _get_retrieval_components()

    # Encode the claim with the DPR question encoder. Truncation guards against
    # claims longer than the encoder can accept; no_grad avoids building a graph.
    encoded_claim = question_encoder_tokenizer(
        claim, return_tensors="pt", truncation=True, max_length=512
    )
    with torch.no_grad():
        claim_embedding = question_encoder_model(**encoded_claim).pooler_output.detach().numpy()

    # Inner-product similarity between the claim and every evidence embedding.
    similarities = [
        (encoded_evidence["id"], np.inner(claim_embedding, encoded_evidence["embedding"]))
        for encoded_evidence in encoded_evidences
    ]

    # Sort by similarity (highest first) and keep the top_k most relevant evidences.
    top_evidences = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:top_k]

    # Classify the claim. Inputs are placed on whatever device the classifier lives
    # on, so the function does not depend on a notebook-level `device` variable.
    model_device = next(model.parameters()).device
    encoding = tokenizer(claim, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = encoding['input_ids'].to(model_device)
    attention_mask = encoding['attention_mask'].to(model_device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()

    return predicted_label, [evidence_id for evidence_id, _ in top_evidences]

# Load the dev and test claim files; each frame is transposed so that
# `claim_text` can be read as a column of every row.
dev = pd.read_json(
    '/content/drive/MyDrive/Colab Notebooks/NLP project/FactChecker_NLP/data/dev-claims.json'
).T
test = pd.read_json(
    '/content/drive/MyDrive/Colab Notebooks/NLP project/FactChecker_NLP/data/test-claims-unlabelled.json'
).T

# Label names, indexed by the class index returned by the classifier.
LABELS = ["SUPPORTS", "REFUTES", "DISPUTED", "NOT_ENOUGH_INFO"]
predictions = {}

# Predict a label and a list of evidences for every claim in the test set.
for index, row in tqdm(test.iterrows(), total=len(test)):
    claim = row['claim_text']
    label, evidences = predict_claim_category_and_evidences(claim, model, encoded_evidences)
    evidence_ids = [f"evidence-{evidence}" for evidence in evidences]
    predictions[index] = dict(
        claim_text=claim,
        claim_label=LABELS[label],
        evidences=evidence_ids,
    )

# Save the predictions to a JSON file.
with open('predictions.json', 'w') as out_file:
    json.dump(predictions, out_file)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at