# 基于transformers的文本分类

## 导入相关包

In [17]:
!git clone https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english

Cloning into 'distilbert-base-uncased-finetuned-sst-2-english'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


remote: Enumerating objects: 85, done.[K
remote: Total 85 (delta 0), reused 0 (delta 0), pack-reused 85 (from 1)[K
Unpacking objects: 100% (85/85), 401.62 KiB | 1.66 MiB/s, done.
Filtering content: 100% (5/5), 1.24 GiB | 17.28 MiB/s, done.


In [52]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
import evaluate
import numpy as np
from transformers import pipeline
import re
import pandas as pd


## 加载数据集

In [4]:
tweets = Dataset.from_csv('train.csv')
tweets

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target'],
    num_rows: 7613
})

In [5]:
tweets[0]

{'id': 1,
 'keyword': None,
 'location': None,
 'text': 'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'target': 1}

In [6]:
tweets.features

{'id': Value(dtype='int64', id=None),
 'keyword': Value(dtype='string', id=None),
 'location': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'target': Value(dtype='int64', id=None)}

In [7]:
tweets.shape

(7613, 5)

In [8]:
splited_dataset = tweets.train_test_split(test_size=0.2, seed=42)
splited_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'target'],
        num_rows: 1523
    })
})

## 数据集预处理

In [10]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

In [11]:
def preprocess(examples):
    texts = examples['text']
    cleaned_texts = []

    for text in texts:
        text = re.sub(r'http\S+|www\S+|https\S+', '[URL]', text, flags=re.MULTILINE)
        text = re.sub(r'<.*?>', '', text)
        # 统一多个空格
        text = re.sub(r'\s+', ' ', text).strip()
        cleaned_texts.append(text)

    tokenized = tokenizer(
        cleaned_texts,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='np'
    )
        # 如果有标签，保留它们
    if 'target' in examples:
        tokenized['labels'] = examples['target']
        
    return tokenized


In [32]:
columns_to_remove = ['id', 'keyword', 'location', 'text', 'target'] 
tokenized_datasets = splited_dataset.map(
    preprocess,
    batched=True,
    remove_columns=columns_to_remove
)
tokenized_datasets

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1523
    })
})

## 创建模型

In [45]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', num_labels = 2)

## 创建评估函数

In [34]:
accuracy = evaluate.load('accuracy')
f1 = evaluate.load('f1')

In [35]:
def eval_metric(pred):
    # 获取预测结果和真实标签
    logits, labels = pred
    # 将模型输出的logits转换为预测类别
    predictions = np.argmax(logits, axis=-1)
    
    # 计算准确率
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)

    # 计算F1值
    f1_score = f1.compute(predictions=predictions, references=labels, average='weighted')

    # 返回评估指标
    return {'accuracy': accuracy_score['accuracy'], 'f1': f1_score['f1']}

## 配置训练参数

In [48]:
args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=32,
    gradient_checkpointing=True,
    optim="adafactor",               # *** adafactor优化器 *** 
    evaluation_strategy='epoch',
    save_strategy='epoch',
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    logging_steps=100,
    save_total_limit=3
)



## 创建训练器

In [49]:
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['test'],
    compute_metrics = eval_metric,
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
)

## 模型训练

In [50]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.379832,0.532502,0.513841


KeyboardInterrupt: 

In [None]:
trainer.evaluate(eval_dataset=tokenized_datasets['test'])

## 模型预测

In [None]:
test_df = pd.read_csv('test.csv')
classifier = pipeline('text-classification', model='results/checkpoint-bes', tokenizer=tokenizer, device='mps')

In [None]:
# 对测试数据进行预测
test_texts = test_df['text'].tolist()

# 批量处理以加快速度
batch_size = 32
predictions = []

for i in range(0, len(test_texts), batch_size):
    batch_texts = test_texts[i:i+batch_size]
    batch_results = classifier(batch_texts)
    predictions.extend(batch_results)
    if i % 100 == 0:
        print(f"已处理 {i}/{len(test_texts)} 条数据")

print("预测完成!")

# 将预测结果转换为提交格式
predicted_labels = [1 if result['label'] == 'LABEL_1' else 0 for result in predictions]

#  创建提交文件
submission = pd.DataFrame({
    'id': test_df['id'],
    'target': predicted_labels
})

#  保存提交文件
submission.to_csv('prediction.csv', index=False)
