In [1]:
# === 1. 安装依赖库 ===
# 在Colab中运行以下命令安装所需库
!pip install transformers datasets scikit-learn torch



Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [1]:
# === 2. Mount Google Drive (used to save the model) ===
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# === 3. Imports ===
import pandas as pd
from sklearn.model_selection import train_test_split
# === 4. Data loading and preprocessing ===
def load_data(file_path, text_col='text', label_col='Y'):
    """Read a labelled-text CSV and return parallel lists of texts and labels.

    Rows whose text or label is missing are dropped, because a NaN text cannot be
    string-processed downstream and a NaN label turns the whole label column
    into floats. Texts are returned as ``str`` and labels as ``int``.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of the CSV.
        text_col: Name of the column holding the raw text. Defaults to ``'text'``.
        label_col: Name of the column holding integer class ids. Defaults to ``'Y'``.

    Returns:
        tuple[list[str], list[int]]: ``(texts, labels)`` with equal length.

    Raises:
        KeyError: If ``text_col`` or ``label_col`` is not a column of the file.
    """
    df = pd.read_csv(file_path)

    missing = [col for col in (text_col, label_col) if col not in df.columns]
    if missing:
        raise KeyError(
            f"{file_path}: missing required column(s) {missing}; found {list(df.columns)}"
        )

    # Keep texts and labels aligned by dropping incomplete rows in one step.
    complete_rows = df.dropna(subset=[text_col, label_col])
    texts = complete_rows[text_col].astype(str).tolist()
    labels = complete_rows[label_col].astype(int).tolist()
    return texts, labels

# Path configuration (make sure the files have been uploaded to Google Drive)
train_file = "/content/drive/MyDrive/Colab Notebooks/sarcasm/train.csv"
test_file = "/content/drive/MyDrive/Colab Notebooks/sarcasm/test.csv"

# Load the data
train_texts, train_labels = load_data(train_file)
test_texts, test_labels = load_data(test_file)

# Hold out 20% of the training data as a validation set (fixed random_state for a repeatable split).
# NOTE(review): the split is not stratified — consider stratify=train_labels if the classes are imbalanced.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

print(f"conjunto de entrenamiento: {len(train_texts)} | conjunto de validación: {len(val_texts)} | conjunto de pruebas: {len(test_texts)}")


conjunto de entrenamiento: 16026 | conjunto de validación: 4007 | conjunto de pruebas: 8586


In [None]:
import os
import re
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
# Provide WANDB_API_KEY through the environment (e.g. Colab secrets or getpass), never as a
# literal in the notebook. The key that was previously pasted here should be revoked.

# === 5. BERT-specific preprocessing ===
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess(text):
    """Normalise one raw text before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. remove every ``http...`` token up to the next whitespace (URLs);
      3. glue a negation word (``not`` / ``no`` / ``never``) to the word that
         follows it with an underscore, e.g. ``"not good"`` -> ``"not_good"``;
      4. strip leading and trailing whitespace.

    A negation is only rewritten when it is followed by whitespace and then a
    word character. A negation at the end of the text or directly before
    punctuation is left untouched, so no dangling ``_`` is produced
    (``"i think not"`` stays as is instead of becoming ``"i think not_"``).

    Args:
        text: The raw text as a ``str``.

    Returns:
        str: The cleaned text.
    """
    text = text.lower()
    text = re.sub(r'http\S+', '', text)          # remove URLs
    # The lookahead keeps the following word out of the match while requiring it to exist.
    text = re.sub(r'\b(not|no|never)\b\s+(?=\w)', r'\1_', text)  # mark negations
    return text.strip()

# Run the cleaning step over every split (train / validation / test).
cleaned_train = list(map(preprocess, train_texts))
cleaned_val = list(map(preprocess, val_texts))
cleaned_test = list(map(preprocess, test_texts))

# Wrap tokenised texts and their labels in a torch Dataset
class SarcasmDataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-tokenised texts with integer class labels.

    All texts are tokenised in a single tokenizer call when the dataset is
    built; ``__getitem__`` then only slices the stored tensors.

    Args:
        texts: Sequence of already-cleaned strings.
        labels: Sequence of integer class ids, one per text.
        text_tokenizer: Callable used to encode ``texts``. When ``None`` (the
            default) the notebook-level ``tokenizer`` is used, so existing
            two-argument calls keep working.
        max_length: Truncation length passed to the tokenizer. Defaults to 128
            (sarcastic texts are usually short).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=128):
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )

        # Fall back to the module-level tokenizer only when none was injected.
        active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
        self.encodings = active_tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields for row ``idx`` plus its label under ``'labels'``."""
        item = {key: tensor[idx] for key, tensor in self.encodings.items()}
        # Explicit long dtype: class ids must be integer tensors even if the
        # stored labels are floats such as 1.0.
        item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)

# Build one dataset per split from the cleaned texts and their labels.
train_dataset = SarcasmDataset(cleaned_train, train_labels)
val_dataset = SarcasmDataset(cleaned_val, val_labels)
test_dataset = SarcasmDataset(cleaned_test, test_labels)

# === 6. Model configuration and training ===
model_save_path = "/content/drive/MyDrive/Colab Notebooks/sarcasm"
os.makedirs(model_save_path, exist_ok=True)

# Load the pretrained model (on the GPU when one is available, otherwise on the CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    attention_probs_dropout_prob=0.3  # raised dropout to guard against overfitting
).to(device)





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training arguments
import inspect

# The per-epoch evaluation switch is named `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. transformers is installed
# unpinned, so pick whichever keyword the installed TrainingArguments accepts
# instead of failing with a TypeError on one side of the rename.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=model_save_path,
    learning_rate=3e-5,          # higher learning rate
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=5,          # more training epochs
    weight_decay=0.01,
    save_total_limit=3,
    load_best_model_at_end=True, # reload the best checkpoint (by validation metric) at the end
    metric_for_best_model="f1",
    logging_dir='./logs',
    logging_steps=50,
    save_strategy="epoch",
    # Evaluate every epoch; must match save_strategy for load_best_model_at_end.
    **{_eval_strategy_kwarg: "epoch"},
)

def _classification_metrics(eval_pred):
    """Compute accuracy and F1 from an evaluation result.

    The predicted class is the argmax over axis 1 of ``eval_pred.predictions``;
    it is compared against ``eval_pred.label_ids``.
    """
    predicted = eval_pred.predictions.argmax(axis=1)
    gold = eval_pred.label_ids
    return {
        "accuracy": (predicted == gold).mean(),
        "f1": f1_score(gold, predicted),
    }


# Trainer with early stopping (patience of 2 evaluations)
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=_classification_metrics,
    callbacks=[early_stopping],
)

# Run training
trainer.train()

# Save the trainer's model and the tokenizer to model_save_path
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

# === 7. Model evaluation ===
# Predict on the held-out test set; the predicted class is the argmax over axis 1.
predictions = trainer.predict(test_dataset)
bert_preds = np.argmax(predictions.predictions, axis=1)

# Compute the four test metrics against the true test labels.
accuracy, precision, recall, f1 = (
    metric(test_labels, bert_preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"\nBERT Results for Sarcasm Detection:")
print(f"Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1-Score: {f1:.4f}")



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2781,0.248732,0.891939,0.879084
2,0.1755,0.26046,0.89693,0.884281
3,0.089,0.341915,0.910157,0.900332
4,0.0574,0.382059,0.919391,0.913659
5,0.0219,0.480816,0.909409,0.899585



BERT Results for Sarcasm Detection:
Accuracy: 0.9189 | Precision: 0.9215 | Recall: 0.9066 | F1-Score: 0.9140


In [7]:
# === 8. Error case analysis ===
# Collect every test example whose predicted class differs from its true label,
# keeping the raw text, the cleaned text and the model outputs for inspection.
errors = [
    {
        'Text': test_texts[row],
        'Cleaned Text': cleaned_test[row],
        'True': gold,
        'Predicted': bert_preds[row],
        'Logits': predictions.predictions[row].tolist()
    }
    for row, gold in enumerate(test_labels)
    if bert_preds[row] != gold
]

print(f"\nNúmero total de errores: {len(errors)} (tasa de error: {len(errors)/len(test_labels):.2%})")
print("Ejemplos de errores críticos:")
# Show the first five misclassified examples, texts truncated to 100 characters.
for idx, err in enumerate(errors[:5], start=1):
    print(f"\nCaso {idx}:")
    print(f"Etiqueta real: {err['True']}, Predicción: {err['Predicted']}")
    print(f"Texto original: {err['Text'][:100]!r}")
    print(f"Texto procesado: {err['Cleaned Text'][:100]!r}")
    print(f"Logits: {err['Logits']}")


Número total de errores: 696 (tasa de error: 8.11%)
Ejemplos de errores críticos:

Caso 1:
Etiqueta real: 1, Predicción: 0
Texto original: 'raytheon ceo sends obama another article about mounting unrest in libya'
Texto procesado: 'raytheon ceo sends obama another article about mounting unrest in libya'
Logits: [3.938194751739502, -3.5657200813293457]

Caso 2:
Etiqueta real: 1, Predicción: 0
Texto original: ' ncis to cease print edition'
Texto procesado: 'ncis to cease print edition'
Logits: [0.11919807642698288, 0.008703775703907013]

Caso 3:
Etiqueta real: 0, Predicción: 1
Texto original: 'paula abdul back at it'
Texto procesado: 'paula abdul back at it'
Logits: [-3.781426429748535, 3.4458022117614746]

Caso 4:
Etiqueta real: 1, Predicción: 0
Texto original: 'powerball super fans camping out before the big drawing dressed up as their favorite numbers'
Texto procesado: 'powerball super fans camping out before the big drawing dressed up as their favorite numbers'
Logits: [2.90584421157