# Download and Import the Dataset

In [2]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'relation'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence', 'relation'],
        num_rows: 2717
    })
})


In [3]:
import torch
import numpy as np
import pandas as pd
import random
import datasets
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the SemEval-2010 Task 8 dataset

In [10]:
from datasets import load_dataset

dataset = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")

# Materialise each split as a pandas DataFrame.
df_train, df_test = (dataset[split].to_pandas() for split in ("train", "test"))

# Sanity check: preview the first rows, then list the column names.
print(df_train.head())
print(df_train.columns)

Index(['sentence', 'relation'], dtype='object')


# 数据预处理

## (1) 处理关系标签

In [5]:
# Relation inventory for SemEval-2010 Task 8.
#
# The Hugging Face copy of the dataset stores `relation` as an integer class id
# over 19 classes: nine relation types in both argument orders, plus "Other".
# A 10-entry undirected list would make `num_labels` too small for those ids
# and would decode predictions to the wrong names.
#
# NOTE(review): the order below must match
# `dataset["train"].features["relation"].names` -- confirm after loading.
label_list = [
    "Cause-Effect(e1,e2)", "Cause-Effect(e2,e1)",
    "Component-Whole(e1,e2)", "Component-Whole(e2,e1)",
    "Content-Container(e1,e2)", "Content-Container(e2,e1)",
    "Entity-Destination(e1,e2)", "Entity-Destination(e2,e1)",
    "Entity-Origin(e1,e2)", "Entity-Origin(e2,e1)",
    "Instrument-Agency(e1,e2)", "Instrument-Agency(e2,e1)",
    "Member-Collection(e1,e2)", "Member-Collection(e2,e1)",
    "Message-Topic(e1,e2)", "Message-Topic(e2,e1)",
    "Product-Producer(e1,e2)", "Product-Producer(e2,e1)",
    "Other",
]

# Forward and reverse lookups between label names and class ids.
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

## (2) 处理句子

In [6]:
def preprocess_text(sentence):
    """Rewrite the angle-bracket entity tags as bracketed markers.

    ``<e1>``/``</e1>`` become ``[E1]``/``[/E1]`` and ``<e2>``/``</e2>`` become
    ``[E2]``/``[/E2]``; all other text is returned unchanged.
    """
    tag_to_marker = (
        ("<e1>", "[E1]"),
        ("</e1>", "[/E1]"),
        ("<e2>", "[E2]"),
        ("</e2>", "[/E2]"),
    )
    for tag, marker in tag_to_marker:
        sentence = sentence.replace(tag, marker)
    return sentence

## (3) 生成数据集格式

In [11]:
class REDataset(Dataset):
    """Torch dataset that tokenizes marker-annotated sentences on access.

    Each item is a dict holding the tokenizer outputs (batch dimension
    squeezed away) plus:

    * ``labels`` -- the class id as a ``torch.long`` scalar tensor.
    * ``e1_pos`` / ``e2_pos`` -- ``[start, end]`` character offsets of the
      entity span in the *marked* text: ``start`` is the index of the opening
      marker's first character and ``end`` is the index of the closing
      marker's last character. Both are ``-1`` when the respective marker is
      absent. These are not token indices, and they do not index into the
      marker-free text that is actually tokenized.

    Args:
        texts: Indexable collection of sentences containing ``[E1]``/``[/E1]``
            and ``[E2]``/``[/E2]`` markers.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors=...)`` and returning a mapping of
            tensors with a leading batch dimension of 1.
        max_length: Length every sequence is padded/truncated to.
    """

    # (opening, closing) marker pairs; they are stripped before tokenization.
    _E1_MARKERS = ("[E1]", "[/E1]")
    _E2_MARKERS = ("[E2]", "[/E2]")

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _marker_span(text, open_marker, close_marker):
        """Return inclusive ``(start, end)`` offsets of a marked span, -1 if missing."""
        start = text.find(open_marker)
        close_at = text.find(close_marker)
        # Only offset the end when the closing marker exists; otherwise
        # `find`'s -1 would be turned into a valid-looking positive index.
        end = close_at + len(close_marker) - 1 if close_at != -1 else -1
        return start, end

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Character offsets of both entity spans in the marked text.
        e1_start, e1_end = self._marker_span(text, *self._E1_MARKERS)
        e2_start, e2_end = self._marker_span(text, *self._E2_MARKERS)

        # Strip the markers so the text fed to the tokenizer contains none of them.
        clean_text = text
        for marker in (*self._E1_MARKERS, *self._E2_MARKERS):
            clean_text = clean_text.replace(marker, "")

        # Tokenize to a fixed length.
        encoding = self.tokenizer(
            clean_text, padding="max_length", truncation=True,
            max_length=self.max_length, return_tensors="pt"
        )

        # Drop the batch dimension added by return_tensors="pt".
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        item["e1_pos"] = torch.tensor([e1_start, e1_end], dtype=torch.long)
        item["e2_pos"] = torch.tensor([e2_start, e2_end], dtype=torch.long)

        return item

# Preprocess the data: load the tokenizer and rewrite the entity tags.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_texts = [preprocess_text(sent) for sent in df_train["sentence"]]
test_texts = [preprocess_text(sent) for sent in df_test["sentence"]]


def _encode_labels(relations):
    """Return the integer class ids for one ``relation`` column.

    Integer columns (any width) are used as-is after checking that every id
    is a valid index into ``label_list``. Any other column is treated as
    label names and mapped through ``label2id``.

    Raises:
        ValueError: If an id is out of range or a label name is unknown.
            Failing here avoids silently training on wrong targets (the
            previous fallback mapped every unknown label to class 0).
    """
    if pd.api.types.is_integer_dtype(relations):
        ids = relations.tolist()
        out_of_range = sorted({i for i in ids if not 0 <= i < len(label_list)})
        if out_of_range:
            raise ValueError(
                f"relation ids {out_of_range} are outside 0..{len(label_list) - 1}; "
                "label_list does not match the dataset's classes"
            )
        return ids

    unknown = sorted(set(relations) - set(label2id))
    if unknown:
        raise ValueError(f"relation labels missing from label2id: {unknown}")
    return [label2id[label] for label in relations]


# Make sure the labels are numeric (each split is checked independently).
train_labels = _encode_labels(df_train["relation"])
test_labels = _encode_labels(df_test["relation"])

# Build the datasets.
train_dataset = REDataset(train_texts, train_labels, tokenizer)
test_dataset = REDataset(test_texts, test_labels, tokenizer)


# 加载BERT进行训练

### 使用BERT进行关系分类

In [12]:
# Load pretrained BERT with a freshly initialised classification head.
# Passing id2label/label2id stores the class names in the model config, so a
# checkpoint written with save_pretrained() keeps them after reloading.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 训练参数设置

In [None]:
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name is deprecated. Pick whichever keyword the
# installed version declares so the cell runs on both old and new releases.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; log every 100 steps.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    **{_eval_strategy_kwarg: "epoch"},
)

def compute_metrics(eval_pred):
    """Score predictions for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with ``accuracy`` and macro-averaged ``f1``, ``precision`` and
        ``recall``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        labels, predictions, average="macro"
    )
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": macro_f1,
        "precision": macro_precision,
        "recall": macro_recall,
    }

import inspect

# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer` argument. Choose the keyword the installed Trainer
# declares so this cell works across versions.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# The test split is passed as eval_dataset, so per-epoch metrics are test metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# 训练和评估

## (1) 训练模型

In [None]:
# Fine-tune the model; the call's return value is displayed as the cell output.
trainer.train()

## (2) 在测试集上评估

In [None]:
# Run evaluation on the Trainer's eval_dataset (the test split was passed
# there) and print the resulting metrics dict.
results = trainer.evaluate()
print(results)

# 运行推理（Inference）

## 对新句子进行预测

In [None]:
def predict_relation(sentence):
    """Predict the relation label for one ``<e1>``/``<e2>``-tagged sentence.

    The sentence goes through the same text pipeline as the training data:
    tags are rewritten by ``preprocess_text`` and the resulting markers are
    then stripped, because ``REDataset`` tokenizes marker-free text.

    Args:
        sentence: Raw sentence, optionally containing ``<e1>...</e1>`` and
            ``<e2>...</e2>`` tags.

    Returns:
        The label name looked up in ``id2label`` for the highest-scoring class.
    """
    model.eval()

    # Match the training input: remove the entity markers before tokenizing.
    text = preprocess_text(sentence)
    for marker in ("[E1]", "[/E1]", "[E2]", "[/E2]"):
        text = text.replace(marker, "")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # After training, the model may live on a GPU; inputs must be on the same device.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    pred_label = torch.argmax(logits, dim=-1).item()
    return id2label[pred_label]

# Example: classify a hand-written sentence whose entities are tagged with <e1>/<e2>.
test_sentence = "The <e1>storm</e1> caused severe <e2>flooding</e2> in the city."
predicted_relation = predict_relation(test_sentence)
print("Predicted Relation:", predicted_relation)

# 保存和加载模型

## (1) 保存模型

In [None]:
# Write the model and the tokenizer to the same directory so that both can be
# restored from that single path.
model.save_pretrained("./bert-relation-extraction")
tokenizer.save_pretrained("./bert-relation-extraction")

## (2) 加载模型

In [None]:
# These names are already imported in the imports cell; repeating the import
# lets this cell run on its own in a fresh kernel.
from transformers import BertForSequenceClassification, BertTokenizer

# Restore the model and tokenizer from the directory written by the save cell.
# This rebinds the notebook-level `model` and `tokenizer` names.
model = BertForSequenceClassification.from_pretrained("./bert-relation-extraction")
tokenizer = BertTokenizer.from_pretrained("./bert-relation-extraction")