In [6]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


# C11 分类任务表示模型微调

BERT + 分类头 联合训练。共同进行参数更新

1. 确定要预测的标签数量

In [7]:
from datasets import load_dataset


tomatoes = load_dataset('rotten_tomatoes')
train_data, test_data = tomatoes['train'], tomatoes['test']

In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_id = 'bert-base-cased'
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_id)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import DataCollatorWithPadding


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)

In [10]:
import numpy as np
import evaluate


def f1_metric(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1_score = evaluate.load('f1')
    f1 = f1_score.compute(predictions=predictions, references=labels)['f1']
    return {'f1': f1}


In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    'models/1',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy='epoch',
    report_to='none',
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=f1_metric,
    data_collator=data_collator,
)

trainer.evaluate()

trainer.train()

trainer.evaluate()

  trainer = Trainer(


Step,Training Loss
500,0.4091


{'eval_loss': 0.3756716251373291,
 'eval_model_preparation_time': 0.0015,
 'eval_f1': 0.8547328959700093,
 'eval_runtime': 2.6011,
 'eval_samples_per_second': 409.833,
 'eval_steps_per_second': 25.759,
 'epoch': 1.0}

### 对比：部分冻结

冻结层数和质量间的平衡

也收到训练轮次的影响

In [13]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = 'bert-base-cased'
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_id)

for name, _ in model.named_parameters():
    print(name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [15]:
from transformers import Trainer


for name, param in model.named_parameters():
    if name.startswith('classifier'):
        param.requires_grad = True
    else:
        param.requires_grad = False


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=f1_metric,
    data_collator=data_collator,
)

trainer.train()
trainer.evaluate()

  trainer = Trainer(


Step,Training Loss
500,0.7006


{'eval_loss': 0.6833685040473938,
 'eval_f1': 0.6294326241134752,
 'eval_runtime': 2.4782,
 'eval_samples_per_second': 430.153,
 'eval_steps_per_second': 27.036,
 'epoch': 1.0}

In [16]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

model_id = 'bert-base-cased'
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_id)

for idx, (_, param) in enumerate(model.named_parameters()):
    if idx < 165:
        param.requires_grad = False
    else:
        param.requires_grad = True


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=f1_metric,
    data_collator=data_collator,
)

trainer.train()
trainer.evaluate()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.4783


{'eval_loss': 0.41214412450790405,
 'eval_f1': 0.8107588856868396,
 'eval_runtime': 2.4722,
 'eval_samples_per_second': 431.198,
 'eval_steps_per_second': 27.102,
 'epoch': 1.0}

## 少样本分类

缺乏现成标注数据的分类任务

核心思想：通过为每个类别精选标注少量高质量数据点来完成模型训练

### SetFit 框架

三个关键阶段：

- 采样训练数据：对标注数据的类内与类间样本选择，生成包含正例与负例的句子对
- 微调嵌入模型：利用生成的训练数据，对预训练的嵌入模型进行微调
- 训练分类器：在优化后的嵌入模型的基础上构建分类头，并使用之前生成的训练数据对其进行训练

可用于零样本分类：利用标注名称生成合成样本模拟分类任务

In [17]:
%pip install setfit

Collecting setfit
  Downloading setfit-1.1.2-py3-none-any.whl.metadata (12 kB)
Downloading setfit-1.1.2-py3-none-any.whl (75 kB)
Installing collected packages: setfit
Successfully installed setfit-1.1.2
Note: you may need to restart the kernel to use updated packages.


In [21]:
import setfit
from setfit import sample_dataset, SetFitModel


sampled_train_data = sample_dataset(tomatoes['train'], num_samples=16)
# 默认逻辑回归，自定义分类头参数查看官方api文档
model = SetFitModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')
args = setfit.TrainingArguments(
    num_epochs=3,
    num_iterations=20,
)
args.eval_strategy = args.evaluation_strategy

trainer = setfit.Trainer(
    model=model,
    args=args,
    train_dataset=sampled_train_data,
    eval_dataset=test_data,
    metric='f1',
)
trainer.train()
trainer.evaluate()

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Map:   0%|          | 0/32 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 1280
  Batch size = 16
  Num epochs = 3


Step,Training Loss,Validation Loss


***** Running evaluation *****


{'f1': 0.8465909090909091}

## 基于掩码语言建模的继续预训练

继续预训练（continued pretraining）：解决预训练可能基于通用数据，缺少专业领域的理解。目标是更新子句表示，使其更好地适应特定领域的词汇

掩码策略：
* token掩码：随机掩码指定比例的token，可能导致某个单词部分掩码
* 整词掩码：预测整词比token更困难，提升准确和精细语义理解

In [22]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained('bert-base-cased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

tokenized_train = train_data.map(preprocess_function, batched=True).remove_columns(['label'])
tokenized_test = test_data.map(preprocess_function, batched=True).remove_columns(['label'])


In [24]:
from transformers import (
    DataCollatorForLanguageModeling, # token掩码
    DataCollatorForWholeWordMask, # 整词掩码
)

data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
training_args = TrainingArguments(
    'models/2',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy='epoch',
    report_to='none',
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

tokenizer.save_pretrained('models/mlm')

trainer.train()

model.save_pretrained('models/mlm')

  trainer = Trainer(
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
500,3.1015
1000,2.905
1500,2.8282
2000,2.7196
2500,2.6498
3000,2.6292
3500,2.5779
4000,2.5242
4500,2.5158
5000,2.4822


In [25]:
from transformers import pipeline


print('===== 1. bert-base-cased =====')

pipe = pipeline('fill-mask', model='bert-base-cased')
preds = pipe('What a horrible [MASK]!')

for pred in preds:
    print(f'>>> {pred["sequence"]}')

print('\n===== 2. after continued pretraining =====')

pipe = pipeline('fill-mask', model='models/mlm')
preds = pipe('What a horrible [MASK]!')

for pred in preds:
    print(f'>>> {pred["sequence"]}')

===== 1. bert-base-cased =====


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Device set to use cuda:0


>>> What a horrible idea!
>>> What a horrible dream!
>>> What a horrible thing!
>>> What a horrible day!
>>> What a horrible thought!

===== 2. after continued pretraining =====
>>> What a horrible movie!
>>> What a horrible mess!
>>> What a horrible film!
>>> What a horrible story!
>>> What a horrible thing!


## 命名实体识别（NER）

识别句子里token/单词的细粒度分类

可以用于：处理敏感数据（去标识化、匿名化任务）

挑战：分词过程中将原始标签与对应的子词元对齐

### 一些数据集

* `wnut_17`：聚焦新兴和罕见实体
* `tner/mit_movie_trivia`：演员、情节与配乐
* `tner/mit_restaurant`：设施、菜品和菜系

In [None]:
from datasets import load_dataset

# 使用trust_remote_code=True 一直报错，先把数据下载本地解决
dataset = load_dataset(path='./datasets/conll2003')

example = dataset['train'][0]
example

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [47]:
# B 起始
# I 内部
# PER 人名
# ORG 组织
# LOC 地点
# MISC 杂项
# O 非实体
label2id = {
  "O": 0,         
  "B-ORG": 1,
  "B-MISC": 2,  
  "B-PER": 3,
  "I-PER": 4,
  "B-LOC": 5,
  "I-ORG": 6,
  "I-MISC": 7,
  "I-LOC": 8
}

id2label = {v: k for k, v in label2id.items()}
id2label

{0: 'O',
 1: 'B-ORG',
 2: 'B-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-LOC',
 6: 'I-ORG',
 7: 'I-MISC',
 8: 'I-LOC'}

In [48]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(id2label), id2label=id2label, label2id=label2id)

token_ids = tokenizer(example['tokens'], is_split_into_words=True)['input_ids']
sub_tokens = tokenizer.convert_ids_to_tokens(token_ids)
sub_tokens

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [49]:
def align_labels(examples):
    token_ids = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
    )
    labels = examples['ner_tags']

    updated_labels = []
    for idx, label in enumerate(labels):
        word_ids = token_ids.word_ids(batch_index=idx)
        prev_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx != prev_word_idx:
                prev_word_idx = word_idx
                updated_label = -100 if word_idx is None else label[word_idx]
                label_ids.append(updated_label)
            elif word_idx is None:
                label_ids.append(-100)
            else:
                updated_label = label[word_idx]
                if updated_label % 2 == 1:
                    updated_label += 1
                label_ids.append(updated_label)
        
        updated_labels.append(label_ids)
    
    token_ids['labels'] = updated_labels
    return token_ids

In [50]:
tokenized = dataset.map(align_labels, batched=True)

print(example['ner_tags'])
print(tokenized['train'][0]['labels'])

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [52]:
%pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: seqeval
  Building wheel for seqeval (pyproject.toml): started
  Building wheel for seqeval (pyproject.toml): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16283 sha256=bfc3ea0d10ca9908ff4ab10e7a9f206e881176097827eeaff1ae8eeb26cd9940
  Stored in directory: c:\users\dita\appdata\local\pip\cache\wheels\14\cf\a7\8f28ef376d707ff10e3922899482a2f23ef3002f8a952f47ac
Successfully built seqeval
Installing collec

In [53]:
import evaluate

seqeval = evaluate.load('seqeval')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []

    for prediction, label in zip(predictions, labels):
        for token_prediction, token_label in zip(prediction, label):
            if token_label != -100:
                true_predictions.append(id2label[token_prediction])
                true_labels.append(id2label[token_label])
        
    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {'f1': results['overall_f1']}

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

training_args = TrainingArguments(
    'models/3',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy='epoch',
    report_to='none',
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()
trainer.evaluate()

In [None]:
from transformers import pipeline

trainer.save_model('models/ner_3')
token_classifier = pipeline('token-classification', model='models/ner_3')
token_classifier('My name is Ellie and I live in Berlin')