In [2]:
import pandas as pd
from datasets import DatasetDict
from collections import defaultdict
from datasets import get_dataset_config_names, load_dataset
from torch.nn import CrossEntropyLoss

# 查看数据集
xtreme_subsets = get_dataset_config_names('xtreme')
print(f"XTREME has {len(xtreme_subsets)} configurations")

panx_subsets = [s for s in xtreme_subsets if s.startswith('PAN')]
print(panx_subsets[:3])

# 构建真实的瑞士语语料库
langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    ds = load_dataset('xtreme', name=f"PAN-X.{lang}")
    for split in ds:
        # 使用 select 方法选取从索引 0 开始到计算出的数量减 1 的行
        panx_ch[lang][split] = (
            ds[split].shuffle(seed=0).select(range(int(frac * ds[split].num_rows)))
        )

# 查看训练集中每种语言的实例数量
df = pd.DataFrame({lang: [panx_ch[lang]["train"].num_rows,
                          panx_ch[lang]["test"].num_rows,
                          panx_ch[lang]["validation"].num_rows] for lang in langs}, index=["Train", "Test", "Validation"])
print(df)

# 从德语语料库中抽一个样本
element = panx_ch["de"]["train"][0]
for key, value in element.items():
    print(f"{key}: {value}")

XTREME has 183 configurations
['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']
               de    fr    it    en
Train       12580  4580  1680  1180
Test         6290  2290   840   590
Validation   6290  2290   840   590
tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [3]:
for key, value in panx_ch["de"]["train"].features.items():
    print(f"{key}: {value}")
    
# 提取特征列表
tags = panx_ch["de"]["train"].features["ner_tags"].feature
print(tags)

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [4]:
def create_tag_names(batch):
    return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}

panx_de = panx_ch["de"].map(create_tag_names)

de_example = panx_de["train"][0]
df = pd.DataFrame([de_example["tokens"], de_example["ner_tags"], de_example["ner_tags_str"]])
print(df)

      0           1   2    3         4      5   6    7           8   \
0  2.000  Einwohnern  an  der  Danziger  Bucht  in  der  polnischen   
1      0           0   0    0         5      6   0    0           5   
2      O           O   O    O     B-LOC  I-LOC   O    O       B-LOC   

             9        10 11  
0  Woiwodschaft  Pommern  .  
1             5        6  0  
2         B-LOC    I-LOC  O  


In [5]:
# 计算每个拆分中每个实体的频率
from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        for tag in row:
            if tag.startswith("B"):
                tag_type = tag.split("-")[1]
                split2freqs[split][tag_type] += 1
df = pd.DataFrame.from_dict(split2freqs, orient="index")
print(df)

             LOC   ORG   PER
train       6186  5366  5810
validation  3172  2683  2893
test        3180  2573  3071


In [6]:
# 多语言词元化技术
from transformers import AutoTokenizer

bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

# 比较SentencePiece与WordPiece
text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()
print(bert_tokens)
print(xlmr_tokens)

# 反词元化序列（无歧义的）
xlmr_sentence = "".join(xlmr_tokens).replace(u"\u2581", " ")
print(xlmr_sentence)

['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']
<s> Jack Sparrow loves New York!</s>


In [32]:
# 自定义Hugging Face Transformers库模型类
# 创建用于词元分类的自定义模型
# 构建用于词元分类的XLM-R类
import torch
import torch.nn as nn
from transformers import AutoConfig
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
    
    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None, 
                num_items_in_batch=None, **kwargs):
        outputs = self.roberta(input_ids, token_type_ids, attention_mask, **kwargs)
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # view 方法对张量进行 reshape 操作
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return TokenClassifierOutput(loss, logits, outputs.hidden_states, outputs.attentions)
    
# 加载自定义模型
# 建立映射关系
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                                num_labels=tags.num_classes, id2label=index2tag, label2id=tag2index)
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config).to(device)
    
# 测试
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
df = pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=["Token", "Input IDs"])
print(df)

             0      1      2      3      4  5     6      7   8     9
Token      <s>  ▁Jack  ▁Spar    row  ▁love  s  ▁New  ▁York   !  </s>
Input IDs    0  21763  37456  15555   5161  7  2356   5753  38     2


In [33]:
# 提取预测结果
outputs = xlmr_model(input_ids.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)
print("Number of tokens in sequence: ", len(xlmr_tokens))
print("Shape of outputs: ", outputs.shape)
print("Shape of predictions: ", predictions.shape)

# 查看预测结果
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
df = pd.DataFrame([xlmr_tokens, preds], index=["Tokens", "Tags"])
print(df)


# 封装函数
def tag_text(text, tags, model, tokenizer):
    tokens = tokenizer(text).tokens()
    input_ids = xlmr_tokenizer(text, return_tensors="pt").input_ids.to(device)
    outputs = model(input_ids)[0]
    predictions = torch.argmax(outputs, dim=2)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

Number of tokens in sequence:  10
Shape of outputs:  torch.Size([1, 10, 7])
Shape of predictions:  torch.Size([1, 10])
            0      1      2      3      4      5      6      7      8      9
Tokens    <s>  ▁Jack  ▁Spar    row  ▁love      s   ▁New  ▁York      !   </s>
Tags    I-LOC  I-LOC  I-LOC  I-LOC  I-LOC  I-LOC  I-LOC  I-LOC  I-LOC  I-LOC


In [34]:
# 构建编码的数据集
# function(examples: Dict[str, List]) -> Dict[str, List]
words, labels = de_example["tokens"], de_example["ner_tags"]
tokenized_input = xlmr_tokenizer(de_example["tokens"], return_tensors="pt", is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"][0])
df = pd.DataFrame([tokens], index=["Tokens"])
print(df)

# 掩码子词表示
word_ids = tokenized_input.word_ids()
df = pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"])
print(df)

previous_word_idx = None
label_ids = []
for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    elif word_idx != previous_word_idx:
        label_ids.append(labels[word_idx])
    previous_word_idx = word_idx

labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]
df = pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)
print(df)


# 函数封装
def tokenize_and_align_labels(examples):
    # truncation=True：启用截断，确保输入长度不超过模型的最大长度
    tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs


def encode_panx_dataset(corpus):
    return corpus.map(tokenize_and_align_labels, batched=True,
                      remove_columns=["tokens", "ner_tags", "langs"])


panx_de_encoded = encode_panx_dataset(panx_ch["de"])
print(panx_de_encoded)

         0       1           2  3    4     5     6   7    8      9   ...   15  \
Tokens  <s>  ▁2.000  ▁Einwohner  n  ▁an  ▁der  ▁Dan  zi  ger  ▁Buch  ...  ▁Wo   

       16   17      18   19    20 21 22 23    24  
Tokens  i  wod  schaft  ▁Po  mmer  n  ▁  .  </s>  

[1 rows x 25 columns]
            0       1           2  3    4     5     6   7    8      9   ...  \
Tokens     <s>  ▁2.000  ▁Einwohner  n  ▁an  ▁der  ▁Dan  zi  ger  ▁Buch  ...   
Word IDs  None       0           1  1    2     3     4   4    4      5  ...   

           15 16   17      18   19    20  21  22  23    24  
Tokens    ▁Wo  i  wod  schaft  ▁Po  mmer   n   ▁   .  </s>  
Word IDs    9  9    9       9   10    10  10  11  11  None  

[2 rows x 25 columns]
             0       1           2     3    4     5      6     7     8   \
Tokens      <s>  ▁2.000  ▁Einwohner     n  ▁an  ▁der   ▁Dan    zi   ger   
Word IDs   None       0           1     1    2     3      4     4     4   
Label IDs  -100       0           0  -100  

In [35]:
# 性能度量
import numpy as np
from seqeval.metrics import classification_report

y_true = [["0", "0", "0", "B-MISC", "I-MISC", "I-MISC", "0"], ["B-MISC", "I-PER", "0"]]
y_pred = [["0", "0", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "0"], ["B-MISC", "I-PER", "0"]]
print(classification_report(y_true, y_pred))


# 将模型输出转换成seqeval所期望的列表的函数
def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []
    
    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            if label_ids[batch_idx, seq_idx] != -100:
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])
        labels_list.append(example_labels)
        preds_list.append(example_preds)
    return preds_list, labels_list



              precision    recall  f1-score   support

        MISC       0.50      0.50      0.50         2
         PER       1.00      1.00      1.00         1
           _       0.00      0.00      0.00         1

   micro avg       0.50      0.50      0.50         4
   macro avg       0.50      0.50      0.50         4
weighted avg       0.50      0.50      0.50         4



In [None]:
# 微调XLM-RoBERTa
# 在PAN-X数据集的德语子集上进行微调，然后评估在其他语言上的零样本性能
from huggingface_hub import login
from seqeval.metrics import f1_score
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

# 登录到 Hugging Face 平台

num_epochs = 3
batch_size = 24
path = 'D:/pycharm/python项目/PyTorch/Runs/'
logging_steps = len(panx_de_encoded["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned-panx-de"
training_args = TrainingArguments(
    output_dir=path + model_name, log_level="error", num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch", save_steps=1e6, weight_decay=0.01,
    disable_tqdm=False, logging_steps=logging_steps, push_to_hub=True
)


# 定义如何在验证集上计算指标
def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred)}


# 模型初始化
def model_init():
    return XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config).to(device)


# 定义数据整理器
data_collator = DataCollatorForTokenClassification(tokenizer=xlmr_tokenizer)

trainer = Trainer(
    model_init=model_init, args=training_args,
    compute_metrics=compute_metrics, train_dataset=panx_de_encoded["train"],
    eval_dataset=panx_de_encoded["validation"], tokenizer=xlmr_tokenizer
)

trainer.train()
trainer.push_to_hub(commit_message="Training Completed!")