In [19]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
Tesla T4


In [20]:
class MRPCStructuredDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df.reset_index(drop=True)
        # 过滤空值行和无效标签行（确保Quality是0或1）
        self.df = self.df.dropna(subset=['#1 String', '#2 String', 'Quality'])
        self.df = self.df[self.df['Quality'].isin([0, 1])]  # 只保留标签为0或1的行
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sentence1 = str(row['#1 String'])
        sentence2 = str(row['#2 String'])

        label = int(row['Quality'])

        encoding = self.tokenizer(
            sentence1,
            sentence2,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

In [21]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from google.colab import drive
from sklearn.model_selection import train_test_split
drive.mount('/content/drive')

train_path = f"/content/drive/MyDrive/datasets/msr_paraphrase_train.txt"
test_path = f"/content/drive/MyDrive/datasets/msr_paraphrase_test.txt"

full_train_df = pd.read_csv(
    train_path,
    sep="\t",
    header=0,
    names=['Quality', '#1 ID', '#2 ID', '#1 String', '#2 String'],
    on_bad_lines='skip'
)

full_train_df = full_train_df.dropna(
    subset=['#1 String', '#2 String', 'Quality']
)
full_train_df = full_train_df[full_train_df['Quality'].isin([0, 1])]

train_df, dev_df = train_test_split(
    full_train_df,
    test_size=0.2,          # 20% 用于验证
    random_state=42,
    stratify=full_train_df['Quality']
)

try:
    train_dataset = MRPCStructuredDataset(train_df, tokenizer)
    dev_dataset = MRPCStructuredDataset(dev_df, tokenizer)
    print(f"训练集加载成功，共 {len(train_dataset)} 条数据")
    print(f"验证集加载成功，共 {len(dev_dataset)} 条数据")
except Exception as e:
    print(f"数据集加载失败：{e}")
    exit()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
训练集加载成功，共 3133 条数据
验证集加载成功，共 784 条数据


In [23]:
test_df = pd.read_csv(
    test_path,
    sep="\t",
    header=0,
    names=['Quality', '#1 ID', '#2 ID', '#1 String', '#2 String'],
    on_bad_lines='skip'
)

test_df = test_df.dropna(
    subset=['#1 String', '#2 String', 'Quality']
)
test_df = test_df[test_df['Quality'].isin([0, 1])]

test_dataset = MRPCStructuredDataset(test_df, tokenizer)


In [24]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [25]:



training_args = TrainingArguments(
    output_dir="./mrpc_results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=2e-5,
    load_best_model_at_end=True,
    no_cuda=False  # 若没有GPU，设为True
)

# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics
)

# 开始训练
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8221,0.654303,0.798469,0.85316,0.840659,0.866038
2,0.5028,0.90734,0.82398,0.875899,0.83677,0.918868
3,0.2125,0.993345,0.836735,0.88015,0.873606,0.886792


TrainOutput(global_step=4701, training_loss=0.4851495563869805, metrics={'train_runtime': 415.2342, 'train_samples_per_second': 22.635, 'train_steps_per_second': 11.321, 'total_flos': 618245202332160.0, 'train_loss': 0.4851495563869805, 'epoch': 3.0})

In [26]:
def predict(sentence1, sentence2):
    inputs = tokenizer(
        sentence1,
        sentence2,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model.cuda()

    with torch.no_grad():
        outputs = model(**inputs)
        pred_label = torch.argmax(outputs.logits, dim=1).item()
    return "一致" if pred_label == 1 else "不一致"


# 测试示例
print("\n测试推理：")
print(predict("A man is playing guitar.", "Someone is playing a guitar."))
print(predict("The cat sits on the mat.", "A dog chases a ball."))


测试推理：
一致
不一致


In [27]:
test_metrics = trainer.evaluate(test_dataset)
print("Test set results:", test_metrics)


Test set results: {'eval_loss': 0.6624998450279236, 'eval_accuracy': 0.803680981595092, 'eval_f1': 0.8579040852575488, 'eval_precision': 0.8256410256410256, 'eval_recall': 0.8927911275415896, 'eval_runtime': 13.7112, 'eval_samples_per_second': 118.881, 'eval_steps_per_second': 59.441, 'epoch': 3.0}
