In [1]:
import torch
import transformers

print(f"🚀 PyTorch 版本: {torch.__version__}")
print(f"🚀 Transformers 版本: {transformers.__version__}")
print(f"🚀 GPU 是否可用: {torch.cuda.is_available()} - {torch.cuda.get_device_name(0)}")


🚀 PyTorch 版本: 2.5.1+cu124
🚀 Transformers 版本: 4.48.2
🚀 GPU 是否可用: True - NVIDIA A100-SXM4-40GB


In [10]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# 🚀 选择 RoBERTa 预训练模型
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"  # 这个模型原本是 3 分类

# 🚀 加载 tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# 🚀 强制加载模型，但忽略 `classifier` 头部
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=2,  # 你的任务是 2 分类
    ignore_mismatched_sizes=True  # 🚀 让 HuggingFace 自动调整分类层
)

print("\n✅ RoBERTa 加载完成，分类头已自动调整！")

# 🚀 检查 GPU 是否可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"\n✅ RoBERTa 加载完成，使用设备: {device}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ RoBERTa 加载完成，分类头已自动调整！

✅ RoBERTa 加载完成，使用设备: cuda


In [11]:
import pandas as pd

# 🚀 读取数据
train_df = pd.read_csv("Train_Dataset.csv")
test_df = pd.read_csv("Test_Dataset.csv")

# 🚀 预览数据
print(train_df.head())

# 🚀 确保数据列名正确
train_texts = train_df["tweet"].tolist()
train_labels = train_df["sarcastic"].tolist()
test_texts = test_df["tweet"].tolist()
test_labels = test_df["sarcastic"].tolist()

print(f"\n✅ 训练集样本数: {len(train_texts)}, 测试集样本数: {len(test_texts)}")


                                               tweet  sarcastic
0  The only thing I got from college is a caffein...          1
1  I love it when professors draw a big question ...          1
2  Remember the hundred emails from companies whe...          1
3  Today my pop-pop told me I was not “forced” to...          1
4  @VolphanCarol @littlewhitty @mysticalmanatee I...          1

✅ 训练集样本数: 6934, 测试集样本数: 1400


In [12]:
# 🚀 对训练数据进行 Tokenization
train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt"
)
test_encodings = tokenizer(
    test_texts, truncation=True, padding=True, max_length=128, return_tensors="pt"
)

# 🚀 迁移到 GPU
train_encodings = {key: val.to(device) for key, val in train_encodings.items()}
test_encodings = {key: val.to(device) for key, val in test_encodings.items()}

print("\n✅ Tokenization 完成！")



✅ Tokenization 完成！


In [13]:
from torch.utils.data import Dataset

class SarcasmDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)  # 🚀 **确保 labels 是 LongTensor**

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # 🚀 **让 labels 保持在 CPU**
        return item

    def __len__(self):
        return len(self.labels)

# ✅ 重新创建数据集
train_dataset = SarcasmDataset(train_encodings, train_labels)
test_dataset = SarcasmDataset(test_encodings, test_labels)

print("\n✅ 训练数据集和测试数据集已修正！")




✅ 训练数据集和测试数据集已修正！


In [15]:
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# 🚀 计算评价指标（F1-score & Accuracy）
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    return {
        "accuracy": accuracy_score(labels, pred),
        "f1_score": f1_score(labels, pred)
    }

training_args = TrainingArguments(
    output_dir='./res',
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs4',
    load_best_model_at_end=True,
    report_to="none",
    dataloader_pin_memory=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()





Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.5521,0.42595,0.828571,0.396985
2,0.4088,0.4909,0.797143,0.400844
3,0.2025,0.668337,0.756429,0.398589
4,0.0727,1.212822,0.749286,0.358318
5,0.0189,1.436749,0.749286,0.383128


TrainOutput(global_step=1085, training_loss=0.250978815720378, metrics={'train_runtime': 195.6407, 'train_samples_per_second': 177.213, 'train_steps_per_second': 5.546, 'total_flos': 2280515072332800.0, 'train_loss': 0.250978815720378, 'epoch': 5.0})

In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

# 🚀 让 RoBERTa 进行预测
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# 🚀 生成分类报告
report = classification_report(test_labels, preds, target_names=["Not Sarcastic", "Sarcastic"])
print("\n📊 RoBERTa 分类报告:\n", report)

# 🚀 创建 DataFrame 以便保存结果
results_df = pd.DataFrame({
    "Text": test_texts,        # 测试集句子
    "True Label": test_labels, # 真实标签
    "Predicted Label": preds   # 预测结果
})

# 🚀 保存到 CSV 方便后续分析
results_df.to_csv("roberta_predictions.csv", index=False, encoding="utf-8")

print("\n✅ 预测结果已保存为 roberta_predictions.csv")



📊 RoBERTa 分类报告:
                precision    recall  f1-score   support

Not Sarcastic       0.90      0.90      0.90      1200
    Sarcastic       0.40      0.40      0.40       200

     accuracy                           0.83      1400
    macro avg       0.65      0.65      0.65      1400
 weighted avg       0.83      0.83      0.83      1400


✅ 预测结果已保存为 roberta_predictions.csv


In [17]:
import os

# 🚀 设定模型保存路径
save_path = "./roberta_sarcasm"

# 🚀 保存模型和 tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"\n✅ 训练好的 RoBERTa 模型已保存到 {save_path}")



✅ 训练好的 RoBERTa 模型已保存到 ./roberta_sarcasm


In [18]:
import shutil
from google.colab import files

# 🚀 压缩整个模型文件夹
shutil.make_archive("roberta_sarcasm", 'zip', "roberta_sarcasm")

# 🚀 下载到本地
files.download("roberta_sarcasm.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>