In [None]:
# 参考网址
https://github.com/blmoistawinde/hello_world/blob/master/huggingface_classification.ipynb

In [13]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # 在此我指定使用2号GPU，可根据需要调整
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from datasets import load_dataset

In [14]:
train_dataset = load_dataset("ag_news", split="train[:40000]")
dev_dataset = load_dataset("ag_news", split="train[40000:50000]")
test_dataset = load_dataset("ag_news", split="test")
print(train_dataset)
print(dev_dataset)
print(test_dataset)

Using custom data configuration default
Reusing dataset ag_news (/Users/lhj/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/Users/lhj/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
Using custom data configuration default
Reusing dataset ag_news (/Users/lhj/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


Dataset({
    features: ['label', 'text'],
    num_rows: 40000
})
Dataset({
    features: ['label', 'text'],
    num_rows: 10000
})
Dataset({
    features: ['label', 'text'],
    num_rows: 7600
})


In [15]:
# 原始数据集包含text和label两个字段
train_dataset.features

{'label': ClassLabel(num_classes=4, names=['World', 'Sports', 'Business', 'Sci/Tech'], names_file=None, id=None),
 'text': Value(dtype='string', id=None)}

In [16]:
# 由于bert模型期望得到的标签的字段为labels而原始数据集中的名字是label，所以做一下调整。
# 下面的代码把label字段复制到labels。
train_dataset = train_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
train_dataset[0]

Loading cached processed dataset at /Users/lhj/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-a71af80ff604ecf2.arrow


{'label': 2,
 'labels': 2,
 'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."}

In [17]:
dev_dataset = dev_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
test_dataset = test_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)

Loading cached processed dataset at /Users/lhj/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-bf0d4122b89150d7.arrow
Loading cached processed dataset at /Users/lhj/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-6e6e36e6400f3e4c.arrow


In [18]:
# 加载模型，tokenizer，并预处理数据
model_id = 'prajjwal1/bert-tiny'
# note that we need to specify the number of classes for this task
# we can directly use the metadata (num_classes) stored in the dataset
model = AutoModelForSequenceClassification.from_pretrained(model_id, 
            num_labels=train_dataset.features["label"].num_classes)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [19]:
# bert的方法对数据集做分词预处理，把所有序列补充或截断到256个token
MAX_LENGTH = 256
train_dataset = train_dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)
dev_dataset = dev_dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)
test_dataset = test_dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)

Loading cached processed dataset at /Users/lhj/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-43f38ac5550c93ca.arrow


  0%|          | 0/10 [00:00<?, ?ba/s]

Loading cached processed dataset at /Users/lhj/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548/cache-aa0d456a24691c94.arrow


In [20]:
# 为了放进pytorch模型训练，还要再声明格式和使用的字段
train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
dev_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [21]:
train_dataset.features

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': ClassLabel(num_classes=4, names=['World', 'Sports', 'Business', 'Sci/Tech'], names_file=None, id=None),
 'labels': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [22]:
train_dataset[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101,  2813,  2358,  1012,  6468, 15020,  2067,  2046,  1996,  2304,
          1006, 26665

In [23]:
# 指定模型训练时，显示的验证指标
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [24]:
# 指定训练参数，使用trainer直接训练
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    learning_rate=3e-4,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    do_train=True,
    do_eval=True,
    no_cuda=False,
    load_best_model_at_end=True,
    # eval_steps=100,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset,            # evaluation dataset
    compute_metrics=compute_metrics
)

train_out = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2807,0.311167,0.8964,0.897377,0.900655,0.897665
2,0.207,0.295274,0.901,0.902113,0.90356,0.902292
3,0.1326,0.317574,0.9012,0.902456,0.903191,0.902526


In [27]:
# 使用pipeline直接对文本进行预测
# pipeline可以直接加载训练好的模型和tokenizer，然后直接对文本进行分类预测，无需再自行预处理

# 我们把模型放回cpu来进行预测
model = model.cpu()

In [28]:
# 用sentiment-analysis来指定我们做的是文本分类任务（情感分析是一类代表性的文本分类任务），并指定我们之前训好的模型。
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [29]:
test_examples = load_dataset("ag_news", split="test[:10]")
test_examples[0]

Using custom data configuration default
Reusing dataset ag_news (/Users/lhj/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


{'label': 2,
 'text': "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."}

In [30]:
result = classifier(test_examples[0]['text'])
result

[{'label': 'LABEL_2', 'score': 0.9229997396469116}]