# Download and Import Repository

In [1]:
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'relation'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence', 'relation'],
        num_rows: 2717
    })
})


In [2]:
import torch
import numpy as np
import pandas as pd
import random
import datasets
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the SemEval-2010 Task 8 dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset("SemEvalWorkshop/sem_eval_2010_task_8")

# 转换成 Pandas DataFrame
df_train = dataset['train'].to_pandas()
df_test = dataset['test'].to_pandas()

print(df_train.head())  # 确保数据加载正确
print(df_train.columns)  # 查看列名

                                            sentence  relation
0  The system as described above has its greatest...         3
1  The <e1>child</e1> was carefully wrapped and b...        18
2  The <e1>author</e1> of a keygen uses a <e2>dis...        11
3  A misty <e1>ridge</e1> uprises from the <e2>su...        18
4  The <e1>student</e1> <e2>association</e2> is t...        12
Index(['sentence', 'relation'], dtype='object')


# 数据预处理

## (1) 处理关系标签

In [4]:
label_list = [
    "Cause-Effect", "Instrument-Agency", "Product-Producer",
    "Content-Container", "Entity-Origin", "Entity-Destination",
    "Component-Whole", "Member-Collection", "Message-Topic", "Other"
]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

## (2) 处理句子

In [5]:
def preprocess_text(sentence):
    sentence = sentence.replace("<e1>", "[E1]").replace("</e1>", "[/E1]")
    sentence = sentence.replace("<e2>", "[E2]").replace("</e2>", "[/E2]")
    return sentence

## (3) 生成数据集格式

In [6]:
class REDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # 计算实体 <e1> 和 <e2> 在原始文本中的索引
        e1_start = text.find("[E1]")
        e1_end = text.find("[/E1]") + 4  # 包含 "[/E1]"
        e2_start = text.find("[E2]")
        e2_end = text.find("[/E2]") + 4  # 包含 "[/E2]"

        # 移除标记，使得输入 BERT 的文本不会包含 "[E1]" 和 "[E2]"
        clean_text = text.replace("[E1]", "").replace("[/E1]", "").replace("[E2]", "").replace("[/E2]", "")

        # Tokenizer 编码
        encoding = self.tokenizer(
            clean_text, padding="max_length", truncation=True,
            max_length=self.max_length, return_tensors="pt"
        )

        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        item["e1_pos"] = torch.tensor([e1_start, e1_end], dtype=torch.long)
        item["e2_pos"] = torch.tensor([e2_start, e2_end], dtype=torch.long)

        return item

# 预处理数据
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_texts = [preprocess_text(sent) for sent in df_train["sentence"]]
test_texts = [preprocess_text(sent) for sent in df_test["sentence"]]

# # 确保标签是数字
# if df_train["relation"].dtype in [int, np.int32, np.int64]:
#     train_labels = df_train["relation"].tolist()
#     test_labels = df_test["relation"].tolist()
# else:
#     train_labels = [label2id.get(label, 0) for label in df_train["relation"]]
#     test_labels = [label2id.get(label, 0) for label in df_test["relation"]]

# 确保标签格式正确
if df_train["relation"].dtype in [int, np.int32, np.int64]:
    train_labels = df_train["relation"].tolist()
    test_labels = df_test["relation"].tolist()
else:
    train_labels = [label2id.get(label, 0) for label in df_train["relation"]]
    test_labels = [label2id.get(label, 0) for label in df_test["relation"]]

# 打印标签检查
print("Unique train label indices:", set(train_labels))
print("Unique test label indices:", set(test_labels))

# 确保所有标签在范围内
train_labels = [min(len(label_list) - 1, label) for label in train_labels]
test_labels = [min(len(label_list) - 1, label) for label in test_labels]

# 创建数据集
train_dataset = REDataset(train_texts, train_labels, tokenizer)
test_dataset = REDataset(test_texts, test_labels, tokenizer)


Unique train label indices: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
Unique test label indices: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}


# 加载BERT进行训练

### 使用BERT进行关系分类

In [7]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(label_list)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 训练参数设置

In [8]:
print("GPU Available:", torch.cuda.is_available())
print("GPU Count:", torch.cuda.device_count())
print("Current GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

GPU Available: True
GPU Count: 1
Current GPU: NVIDIA GeForce RTX 4090


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# RTX 4090 高性能优化设置
training_args = TrainingArguments(
    output_dir="results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,  # RTX 4090 显存足够，可提高 batch_size
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="logs",
    logging_strategy="steps",  # ✅ 确保 logging 按步数触发
    logging_steps=10,  # ✅ 让日志更频繁输出，方便观察
    save_total_limit=2,
    fp16=True,  # ✅ 开启自动混合精度，加快训练
    gradient_accumulation_steps=1,  # ✅ RTX 4090 显存足够，设为 1
    report_to="none",  # ✅ 不使用 TensorBoard
    dataloader_num_workers=8,  # ✅ 增加数据加载线程，加速训练
    load_best_model_at_end=True,
    optim="adamw_torch",  # ✅ 采用更稳定的 AdamW 优化器
    lr_scheduler_type="linear",  # ✅ 改用线性学习率衰减（可能更稳定）
    warmup_ratio=0.06,  # ✅ 适当增加 warmup，提升稳定性
    disable_tqdm=False,  # ✅ 确保 Jupyter Notebook 可以显示 tqdm 进度条
    log_level="info"  # ✅ 让 Trainer 输出更多日志信息
)

# 计算评估指标
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)  
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")  
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}  

In [None]:
from tqdm.auto import tqdm

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# progress_bar = tqdm(total=training_args.num_train_epochs, desc="Training Progress", unit="epoch")
# for epoch in range(training_args.num_train_epochs):
#     trainer.train()  # 运行 Trainer 训练
#     progress_bar.update(1)  # 更新进度条
# progress_bar.close()

trainer.train()

# 获取训练历史
history = trainer.state.log_history

# 转换成 DataFrame，方便分析
df = pd.DataFrame(history)

# 打印日志，查看所有记录
print(df)

Using auto half precision backend
***** Running training *****
  Num examples = 8,000
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 625
  Number of trainable parameters = 109,489,930


# 训练和评估

## (1) 训练模型

In [None]:
trainer.train()

## (2) 在测试集上评估

In [None]:
results = trainer.evaluate()
print(results)

# 运行推理（Inference）

## 对新句子进行预测

In [None]:
def predict_relation(sentence):
    model.eval()
    inputs = tokenizer(preprocess_text(sentence), return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    pred_label = torch.argmax(logits, dim=-1).item()
    return id2label[pred_label]

test_sentence = "The <e1>storm</e1> caused severe <e2>flooding</e2> in the city."
predicted_relation = predict_relation(test_sentence)
print("Predicted Relation:", predicted_relation)

# 保存和加载模型

## (1) 保存模型

In [None]:
model.save_pretrained("./bert-relation-extraction")
tokenizer.save_pretrained("./bert-relation-extraction")

## (2) 加载模型

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

model = BertForSequenceClassification.from_pretrained("./bert-relation-extraction")
tokenizer = BertTokenizer.from_pretrained("./bert-relation-extraction")