<a href="https://colab.research.google.com/github/SeanMuInCa/learn_python/blob/master/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # 可选，主要用于数据可视化
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
import re

# Download the NLTK stopwords resource (needed on the first run)
nltk.download('stopwords')

# Read the training, validation and test splits from CSV files
train_df = pd.read_csv('train1.csv')
valid_df = pd.read_csv('valid1.csv')
test_df = pd.read_csv('test1.csv')

# Print the row count of each split as a quick sanity check on the loaded data
print("训练集样本数:", len(train_df))
print("验证集样本数:", len(valid_df))
print("测试集样本数:", len(test_df))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


FileNotFoundError: [Errno 2] No such file or directory: 'train1.csv'

In [9]:
# Lookup table from the textual label to its integer class id.
# The position of each name in the tuple below is the id it is mapped to.
# NOTE(review): "0" and "1" are presumably how two of the six classes appear
# in the CSV files -- confirm against the raw data.
label_mapping = {
    name: class_id
    for class_id, name in enumerate(
        ("0", "1", "barely-true", "half-true", "mostly-true", "pants-fire")
    )
}


def map_label(label):
    """Return the integer class id for a raw label value.

    The value is converted to ``str`` and stripped of surrounding whitespace
    before it is looked up in ``label_mapping``. A value that has no entry
    in the table raises ``KeyError``.
    """
    return label_mapping[str(label).strip()]

# Convert the label column of the training, validation and test sets to integer ids.
# NOTE: map_label looks up str(label) in label_mapping, so running these lines a second
# time on already-converted labels (e.g. 2 -> "2") raises KeyError; reload the CSV
# files before re-running this cell.
train_df['label'] = train_df['label'].apply(map_label)
valid_df['label'] = valid_df['label'].apply(map_label)
test_df['label']  = test_df['label'].apply(map_label)

# Show the distinct label values present in each split after the conversion
print("训练集唯一标签：", train_df['label'].unique())
print("验证集唯一标签：", valid_df['label'].unique())
print("测试集唯一标签：", test_df['label'].unique())


训练集唯一标签： [0 3 4 1 2 5]
验证集唯一标签： [2 5 0 3 1 4]
测试集唯一标签： [1 0 3 5 2 4]


In [13]:

!pip install transformers datasets

import pandas as pd
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch




# Print the column names of each split to make sure the text and label fields are named as expected
print("训练集字段：", train_df.columns)
print("验证集字段：", valid_df.columns)
print("测试集字段：", test_df.columns)



# ------------------------------
# 3. Convert to Hugging Face Dataset format
# ------------------------------

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset  = Dataset.from_pandas(test_df)

# ------------------------------
# 4. Tokenization
# ------------------------------

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenization callback; the maximum length is fixed at 128 (adjust as needed)
def tokenize_function(examples):
    """Tokenize the ``statement`` field of ``examples`` with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"``, truncation enabled
    and ``max_length=128``; its result is returned unchanged.
    """
    statements = examples['statement']
    return tokenizer(
        statements,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Tokenize every split (tokenize_function is applied in batched mode)
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function, batched=True)

# Return PyTorch tensors and expose only the fields listed below
columns_to_return = ['input_ids', 'attention_mask', 'label']
train_dataset.set_format(type='torch', columns=columns_to_return)
valid_dataset.set_format(type='torch', columns=columns_to_return)
test_dataset.set_format(type='torch', columns=columns_to_return)

# ------------------------------
# 5. Evaluation metrics
# ------------------------------

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 keeps the score of a never-predicted class at 0 (the same
    # value the default produces) but without an UndefinedMetricWarning on
    # every evaluation, which is common early in multi-class training.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# ------------------------------
# 6. Load the BERT model and configure it for classification
# ------------------------------

# num_labels=6 because the labels have been mapped to the ids 0-5
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# ------------------------------
# 7. Training arguments
# ------------------------------

# NOTE(review): newer transformers releases reportedly rename `evaluation_strategy`
# to `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    report_to=[]  # empty list disables external loggers such as wandb
)

# ------------------------------
# 8. Create the Trainer and start training
# ------------------------------

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

# ------------------------------
# 9. Evaluate the model on the test set
# ------------------------------

results = trainer.evaluate(eval_dataset=test_dataset)
print("测试集结果:", results)


训练集字段： Index(['ID', 'label', 'statement', 'subject(s)', 'speaker',
       'speaker's job title', 'state info', 'party affiliation',
       'barely true counts', 'false counts', 'half true counts',
       'mostly true counts', 'pants on fire counts', 'context'],
      dtype='object')
验证集字段： Index(['ID', 'label', 'statement', 'subject(s)', 'speaker',
       'speaker's job title', 'state info', 'party affiliation',
       'barely true counts', 'false counts', 'half true counts',
       'mostly true counts', 'pants on fire counts', 'context'],
      dtype='object')
测试集字段： Index(['ID', 'label', 'statement', 'subject(s)', 'speaker',
       'speaker's job title', 'state info', 'party affiliation',
       'barely true counts', 'false counts', 'half true counts',
       'mostly true counts', 'pants on fire counts', 'context'],
      dtype='object')


Map:   0%|          | 0/10240 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1267 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
!pip install transformers datasets
from datasets import load_dataset

# Load the LIAR dataset; it provides the 'train', 'validation' and 'test' splits.
# The dataset repository ships a custom loading script, so without
# trust_remote_code=True load_dataset stops at an interactive "[y/N]" prompt,
# which blocks unattended runs of this cell.
dataset = load_dataset("liar", trust_remote_code=True)

# Inspect the dataset structure (optional)
print(dataset)

# Label-mapping callback: turns a textual label into its integer id
def map_label(example):
    """Replace a string ``example["label"]`` by its integer id and return ``example``.

    An example whose label is already an ``int`` is returned untouched.
    Otherwise the label is stripped of surrounding whitespace, looked up in
    the table below and written back into the same dict; a label with no
    entry raises ``KeyError``.

    NOTE(review): the Hugging Face dataset may already deliver integer
    labels, in which case the table is never applied -- confirm.
    """
    raw_label = example["label"]
    if not isinstance(raw_label, int):
        mapping = {
            'pants-fire': 0,
            'false': 1,
            'barely-true': 2,
            'half-true': 3,
            'mostly-true': 4,
            'true': 5
        }
        example["label"] = mapping[raw_label.strip()]
    return example


# Apply the label mapping to every split of the dataset
dataset = dataset.map(map_label)

# Import and load the DistilBERT tokenizer
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenization callback; the text field is assumed to be named "statement"
def tokenize_function(examples):
    """Tokenize ``examples["statement"]`` with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"``, truncation enabled
    and ``max_length=128``; its result is returned unchanged.
    """
    statements = examples["statement"]
    return tokenizer(
        statements,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# Tokenize every split (tokenize_function is applied in batched mode)
dataset = dataset.map(tokenize_function, batched=True)

# Return PyTorch tensors and expose only the fields that are needed
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Import the DistilBERT sequence-classification model and load it with num_labels set to 6
from transformers import DistilBertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)

# Training arguments; tune batch size, number of epochs, etc. to speed up training
# NOTE(review): newer transformers releases reportedly rename `evaluation_strategy`
# to `eval_strategy` -- confirm against the installed version before upgrading.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results_liar',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",   # set to 'epoch' so it matches evaluation_strategy
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    report_to=[]  # empty list disables external loggers such as wandb
)

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the arg-max of ``logits`` over
            the last axis is taken as the predicted class.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    # zero_division=0 keeps the score of a never-predicted class at 0 (the same
    # value the default produces) but without an UndefinedMetricWarning on
    # every evaluation, which is common early in multi-class training.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Fine-tune the model with Trainer
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

# Evaluate the model on the test split
results = trainer.evaluate(eval_dataset=dataset["test"])
print("测试集结果:", results)


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

liar.py:   0%|          | 0.00/6.41k [00:00<?, ?B/s]

The repository for liar contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/liar.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1283 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1284 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'],
        num_rows: 10269
    })
    test: Dataset({
        features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'],
        num_rows: 1283
    })
    validation: Dataset({
        features: ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'],
        num_rows: 1284
    })
})


Map:   0%|          | 0/10269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1283 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/10269 [00:00<?, ? examples/s]

Map:   0%|          | 0/1283 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
