In [4]:
# import torch
# torch.set_default_tensor_type('torch.cuda.FloatTensor')

In [5]:
from huggingface_hub import notebook_login
# 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
# notebook_login()

Make sure your version of Transformers is at least 4.11.0 since the functionality was introduced in that version:

In [6]:
import transformers

print(transformers.__version__)

4.36.2


In [7]:
from transformers.utils import send_example_telemetry

send_example_telemetry("token_classification_notebook", framework="pytorch")

# Fine-tuning a model on a token classification task

In [8]:
task = "ner" # Should be one of "ner", "pos" or "chunk"
# model_checkpoint = "bert-base-uncased"
model_checkpoint = "roberta-base"
batch_size = 16

## Loading the dataset

In [9]:
from datasets import load_dataset, load_metric

In [None]:
from datasets import Dataset, DatasetDict, ClassLabel, Features, Sequence


def process_text_to_conll(text):
    sentences = []
    current_sentence = {'tokens': [], 'ner_tags': []}

    for line in text.split('\n'):
        if line.strip() == '':
            if current_sentence['tokens']:
                sentences.append(current_sentence)
                current_sentence = {'tokens': [], 'ner_tags': []}
        else:
            parts = line.split()
            try:
                token, ner_tag = parts[0], parts[1]
                current_sentence['tokens'].append(token)
                current_sentence['ner_tags'].append(ner_tag)
            except:
                pass

    return sentences

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Read content from a text file
file_path = 'trainset.txt'
raw_text = read_text_file(file_path)

# Process text data into CoNLL-2003 format
sentences = process_text_to_conll(raw_text)

# Split data into training and validation sets
train_size = int(0.8 * len(sentences))
valid_size = int(0.2 * len(sentences))
# test_size = len(sentences) - train_size - valid_size

# Create a DatasetDict object
dataset_dict = DatasetDict()

# Create Dataset objects using lists
dataset_dict['train'] = Dataset.from_dict({'tokens': [s['tokens'] for s in sentences[:train_size]],
                                           'ner_tags': [s['ner_tags'] for s in sentences[:train_size]]})
dataset_dict['validation'] = Dataset.from_dict({'tokens': [s['tokens'] for s in sentences[train_size:train_size+valid_size]],
                                                'ner_tags': [s['ner_tags'] for s in sentences[train_size:train_size+valid_size]]})
# dataset_dict['test'] = Dataset.from_dict({'tokens': [s['tokens'] for s in sentences[-test_size:]],
#                                           'ner_tags': [s['ner_tags'] for s in sentences[-test_size:]]})
# Here, 'ner_tags' need to be converted to category indices to match ClassLabel requirements
label_names = ['O', 'B-title', 'I-title', 'B-author', 'I-author', 'B-time', 'I-time', 'B-plat', 'I-plat','B-version', 'I-version']
label_mapping = {label: idx for idx, label in enumerate(label_names)}

# Create 'ner_tags' feature using ClassLabel
ner_feature = ClassLabel(names=label_names, num_classes=len(label_names))

from datasets import Dataset, DatasetDict, ClassLabel, Features, Sequence, Value

# Define a transformCamScore function that scales 'ner_tags' to card category labels
def convert_labels(example):
    example['ner_tags'] = [label_mapping[tag] for tag in example['ner_tags']]
    return example

features = Features({
    'tokens': Sequence(Value(dtype="string")),
    'ner_tags': Sequence(Value(dtype="int64")),
})

dataset_dict['train'] = dataset_dict['train'].map(convert_labels, features=features)
dataset_dict['validation'] = dataset_dict['validation'].map(convert_labels, features=features)
# dataset_dict['test'] = dataset_dict['test'].map(convert_labels, features=features)

datasets = dataset_dict


The `datasets` object itself is [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key for the training, validation and test set.

In [11]:
datasets

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 61201
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 15300
    })
})

We can see the training, validation and test sets all have a column for the tokens (the input texts split into words) and one column of labels for each kind of task we introduced before.

To access an actual element, you need to select a split first, then give an index:

In [12]:
datasets["train"][0]

{'tokens': ['##############'], 'ner_tags': [0]}

The labels are already coded as integer ids to be easily usable by our model, but the correspondence with the actual categories is stored in the `features` of the dataset:

In [13]:
# label_list = datasets["train"].features[f"{task}_tags"].feature.names
label_list = ['O', 'B-title', 'I-title', 'B-author', 'I-author', 'B-time', 'I-time', 'B-plat', 'I-plat','B-version', 'I-version']

In [14]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [15]:
show_random_elements(datasets["train"])

Unnamed: 0,tokens,ner_tags
0,[127.0.0.1],[0]
1,[--],[0]
2,[-----],[0]
3,"[<?xml, version=""1.0"", encoding=""UTF-8""?>]","[0, 0, 0]"
4,[}],[0]
5,"[*, File, Vuln, checkuser.php]","[0, 0, 0, 0]"
6,"[injection., One, of, the, default, delegate's, command, is, used, to, handle, https]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,[##],[0]
8,"[#, Including, alignment, of, opcodes, in, memory]","[0, 0, 0, 0, 0, 0, 0]"
9,"[#15, pc, 000100d1, /system/lib/libutils.so, (_ZN7android6Thread11_threadLoopEPv+112)]","[0, 0, 0, 0, 0]"


## Preprocessing the data

In [16]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

The following assertion ensures that our tokenizer is a fast tokenizers (backed by Rust) from the 🤗 Tokenizers library. Those fast tokenizers are available for almost all models, and we will need some of the special features they have for our preprocessing.

In [17]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

You can check which type of models have a fast tokenizer available and which don't on the [big table of models](https://huggingface.co/transformers/index.html#bigtable).

You can directly call this tokenizer on one sentence:

In [18]:
label_all_tokens = True

We are now preparing to write a function for preprocessing samples. We provide the "truncation=True" parameter (to truncate texts larger than the maximum size allowed by the model) and "is_split_into_words=True" (as shown above) to the "tokenizer." Then, we align the labels with the token IDs using the strategy we have chosen.

In [19]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

This function works with one or several examples. In the case of several examples, the tokenizer will return a list of lists for each key:

In [20]:
tokenize_and_align_labels(datasets['train'][:5])

{'input_ids': [[0, 849, 49727, 49629, 10431, 2], [0, 849, 16161, 21058, 13497, 4832, 9318, 10699, 9017, 41614, 96, 35892, 2], [0, 849, 2], [0, 849, 16161, 21058, 14338, 4832, 287, 3592, 219, 1728, 6282, 2010, 2711, 2], [0, 849, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2, -100], [-100, 0, -100], [-100, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, -100], [-100, 0, -100]]}

To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the `map` method of our `dataset` object we created earlier. This will apply the function on all the elements of all the splits in `dataset`, so our training, validation and testing data will be preprocessed in one single command.

In [None]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

## Fine-tuning the model

In [22]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list), hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


To instantiate a `Trainer`, we will need to define three more things. The most important is the [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments), which is a class that contains all the attributes to customize the training. It requires one folder name, which will be used to save the checkpoints of the model, and all other arguments are optional:

In [23]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{task}-{'ARG'}",
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
    push_to_hub=False,
    load_best_model_at_end=True
)

We require a data collator that will batch processed examples together while applying padding to make them all the same size (each batch will be padded to the length of its longest example). The Transformers library includes a data collator for this task that not only pads the inputs but also the labels:

In [24]:
from transformers import DataCollatorForTokenClassification,  EarlyStoppingCallback

data_collator = DataCollatorForTokenClassification(tokenizer)

The final aspect we define for our "Trainer" is how to compute metrics based on predictions. Here, we utilize the [`seq...` metric, loaded via the datasets library, which is commonly used for evaluating results on the CONLL datasets.

In [None]:
metric = load_metric("seqeval")

This metric takes list of labels for the predictions and references:

In [26]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [27]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10, early_stopping_threshold=0.01)]
)

We can now finetune our model by just calling the `train` method:

In [28]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.6457,0.629996,0.684858,0.448942,0.542356,0.962803
1000,0.6039,0.60648,0.568824,0.466409,0.512551,0.964163
1500,0.5886,0.602304,0.619705,0.65603,0.63735,0.968286
2000,0.5859,0.609778,0.608768,0.429123,0.503399,0.964023
2500,0.5871,0.595035,0.724974,0.587504,0.64904,0.97049
3000,0.5804,0.601393,0.786508,0.654014,0.714168,0.972371
3500,0.5736,0.599042,0.773484,0.437017,0.558489,0.969036
4000,0.5733,0.59596,0.713327,0.606819,0.655776,0.970199
4500,0.5757,0.591644,0.756885,0.618576,0.680776,0.971536
5000,0.5739,0.601114,0.709233,0.683742,0.696254,0.968722


TrainOutput(global_step=6000, training_loss=0.586022933959961, metrics={'train_runtime': 904.0753, 'train_samples_per_second': 676.946, 'train_steps_per_second': 42.319, 'total_flos': 3088773359073066.0, 'train_loss': 0.586022933959961, 'epoch': 1.57})

The `evaluate` method allows you to evaluate again on the evaluation dataset or on another dataset:

In [None]:
trainer.evaluate()

In [30]:
# import torch
# loaded_model = AutoModelForTokenClassification.from_pretrained('/home/dwj/WY/bert/store_model/roberta-base-finetuned-ner/checkpoint-4000')
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# loaded_model.to(device)
# trainer.model = loaded_model

To get the precision/recall/f1 computed for each category now that we have finished training, we can apply the same function as before on the result of the `predict` method:

In [None]:
predictions, labels, _ = trainer.predict(tokenized_datasets['validation'])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

You can now upload the result of the training to the Hub, just execute this instruction:

In [None]:
from tabulate import tabulate

result_tuples = [
    ('author', results['author']['precision'], results['author']['recall'],
     results['author']['f1'], results['author']['number']),
    ('plat', results['plat']['precision'], results['plat']['recall'],
     results['plat']['f1'], results['plat']['number']),
    ('time', results['time']['precision'], results['time']['recall'],
     results['time']['f1'], results['time']['number']),
    ('title', results['title']['precision'], results['title']['recall'],
     results['title']['f1'], results['title']['number']),
    ('version', results['version']['precision'], results['version']['recall'],
     results['version']['f1'], results['version']['number']),
    ('overall', results['overall_precision'], results['overall_recall'],
     results['overall_f1'], results['author']['number']+results['plat']['number']+results['time']['number']+results['title']['number']+results['version']['number'])  # 'overall' does not have a 'number' field
]


result_table = tabulate(result_tuples, headers=['Label', 'Precision', 'Recall', 'F1-Score', 'Support'])

print(result_table)


In [None]:
# import os
# import random
# import shutil
# from transformers import pipeline
# import json
# import numpy as np

# def read():
#     file_path = []
#     for file in os.listdir('app_data'):
#         if file.endswith('.txt'):
#             file_path.append(os.path.join( 'app_data', file))
#     return file_path

# def extract(filepath_list):

#     label_names = ['O', 'B-title', 'I-title', 'B-author', 'I-author', 'B-time', 'I-time','B-reference', 'B-plat', 'I-plat','B-version', 'I-version']
#     model_checkpoint = 'bert/store_model/roberta-base-finetuned-ner/checkpoint-38000'

#     token_classifier = pipeline(
#         "token-classification", model=model_checkpoint, aggregation_strategy="simple"
#     )

#     sumnum = len(filepath_list)
#     countnum = 0
#     for filepath in filepath_list:

#         countnum += 1
#         print(f'process：{countnum}/{sumnum}', end='\r')

#         with open(filepath, 'r', encoding='utf-8') as file:
#             file_content = file.read()

#         results = token_classifier(file_content)

#         for item in results:
#             item['entity_group'] = label_names[int(item['entity_group'].split('_')[1])]

#         
#         final_results = []
#         pre_label = ""
#         for result in results:
#             label = result['entity_group']
#             if label == 'O':
#                 continue
#             ordlabel = label.split('-')[0]
#             nerlabel = label.split('-')[1]
#             if nerlabel !=  pre_label:
#                 result['entity_group'] = nerlabel
#                 final_results.append(result)
#             elif ordlabel == 'I':
#                 final_results[-1]['word'] += ' '+result['word']
#                 final_results[-1]['end'] = result['end']
#             pre_label = nerlabel

#         
#         for result in final_results:
#             for key, value in result.items():
#                 if isinstance(value, np.float32):
#                     result[key] = value.item()

#         directory, filename = os.path.split(filepath)
#         print(os.path.join(directory, filename[:-5] + '_POC_aspects.json'))
#         with open(os.path.join(directory, filename[:-5] + '_POC_aspects.json'), "w") as file:
#             json.dump(final_results, file, indent=4)


# if __name__ == "__main__":
#     filepath_list = read()
#     extract(filepath_list)