In [14]:
!pip install transformers datasets torch scikit-learn==1.2.2 scipy==1.10.1  seqeval==1.2.2 pytorch-crf==0.7.2



In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
%cd /content/drive/MyDrive/BERT-BiGRU-CRF

/content/drive/MyDrive/BERT-BiGRU-CRF


In [17]:
import os
import json
import torch
import numpy as np
import torch.nn as nn
from tqdm import tqdm
from torchcrf import CRF
from datasets import load_dataset
from torch.utils.data import DataLoader
from seqeval.metrics import classification_report
from transformers import BertModel, BertConfig, BertTokenizer, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, DataCollatorForTokenClassification

In [18]:
tag_2_id = {'B-application': 0, 'B-cve id': 1, 'B-edition': 2, 'B-file': 3, 'B-function': 4, 'B-hardware': 5, 'B-language': 6, 'B-method': 7, 'B-os': 8, 'B-parameter': 9, 'B-programming language': 10, 'B-relevant_term': 11, 'B-update': 12, 'B-vendor': 13, 'B-version': 14, 'I-application': 15, 'I-edition': 16, 'I-hardware': 17, 'I-os': 18, 'I-relevant_term': 19, 'I-update': 20, 'I-vendor': 21, 'I-version': 22, 'O': 23}
id_2_tag = {0: 'B-application', 1: 'B-cve id', 2: 'B-edition', 3: 'B-file', 4: 'B-function', 5: 'B-hardware', 6: 'B-language', 7: 'B-method', 8: 'B-os', 9: 'B-parameter', 10: 'B-programming language', 11: 'B-relevant_term', 12: 'B-update', 13: 'B-vendor', 14: 'B-version', 15: 'I-application', 16: 'I-edition', 17: 'I-hardware', 18: 'I-os', 19: 'I-relevant_term', 20: 'I-update', 21: 'I-vendor', 22: 'I-version', 23: 'O'}

In [19]:
label_names = [v for k,v in id_2_tag.items()]

In [20]:
class NerConfig:
    def __init__(self):
        self.bert_dir = "thongnef/bert-finetuned-ner-cti"

        self.output_dir = "./checkpoint/"
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)

        self.bio_labels = label_names
        self.num_labels = len(self.bio_labels)
        self.label2id = tag_2_id
        self.id2label = id_2_tag

        self.max_seq_len = 512
        self.epochs = 5
        self.train_batch_size = 8
        self.dev_batch_size = 8
        self.bert_learning_rate = 2e-5
        self.crf_learning_rate = 3e-3
        self.adam_epsilon = 1e-8
        self.weight_decay = 0.01
        self.warmup_proportion = 0.01
        self.save_step = 500

In [21]:
import torch.nn as nn
from transformers import BertModel, BertConfig
from torchcrf import CRF  # Make sure you have the torchcrf package installed

class ModelOutput:
  def __init__(self, logits, labels, loss=None):
    self.logits = logits
    self.labels = labels
    self.loss = loss

class BertNer(nn.Module):
    def __init__(self, args):
        super(BertNer, self).__init__()
        self.bert = BertModel.from_pretrained(args.bert_dir)
        self.bert_config = BertConfig.from_pretrained(args.bert_dir)
        hidden_size = self.bert_config.hidden_size
        self.gru_hidden = 128
        self.max_seq_len = args.max_seq_len
        self.gru = nn.GRU(hidden_size, self.gru_hidden, 1, bidirectional=True, batch_first=True, dropout=0.1)
        self.linear = nn.Linear(self.gru_hidden * 2, args.num_labels)
        self.crf = CRF(args.num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        seq_out = bert_output[0]  # [batchsize, max_len, 768]
        batch_size = seq_out.size(0)
        seq_out, _ = self.gru(seq_out)
        seq_out = seq_out.contiguous().view(-1, self.gru_hidden * 2)
        seq_out = seq_out.contiguous().view(batch_size, self.max_seq_len, -1)
        seq_out = self.linear(seq_out)

        logits = self.crf.decode(seq_out, mask=attention_mask.bool())
        loss = None
        if labels is not None:
            loss = -self.crf(seq_out, labels, mask=attention_mask.bool(), reduction='mean')
        model_output = ModelOutput(logits, labels, loss)
        return model_output


In [22]:
class Trainer:
    def __init__(self,
                 output_dir=None,
                 model=None,
                 train_loader=None,
                 save_step=500,
                 dev_loader=None,
                 test_loader=None,
                 optimizer=None,
                 schedule=None,
                 epochs=1,
                 device="cpu",
                 id2label=None):
        self.output_dir = output_dir
        self.model = model
        self.train_loader = train_loader
        self.dev_loader = dev_loader
        self.test_loader = test_loader
        self.epochs = epochs
        self.device = device
        self.optimizer = optimizer
        self.schedule = schedule
        self.id2label = id2label
        self.save_step = save_step
        self.total_step = len(self.train_loader) * self.epochs

    def train(self):
        global_step = 1
        for epoch in range(1, self.epochs + 1):
            for step, batch_data in enumerate(self.train_loader):
                self.model.train()
                for key, value in batch_data.items():
                    batch_data[key] = value.to(self.device)
                input_ids = batch_data["input_ids"]
                attention_mask = batch_data["attention_mask"]
                labels = batch_data["labels"]
                output = self.model(input_ids, attention_mask, labels)
                loss = output.loss
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                self.schedule.step()
                print(f"【train】{epoch}/{self.epochs} {global_step}/{self.total_step} loss:{loss.item()}")
                global_step += 1
                if global_step % self.save_step == 0:
                    torch.save(self.model.state_dict(), os.path.join(self.output_dir, "pytorch_model_ner.bin"))


        torch.save(self.model.state_dict(), os.path.join(self.output_dir, "pytorch_model_ner.bin"))

    def test(self):
        self.model.load_state_dict(torch.load(os.path.join(self.output_dir, "pytorch_model_ner.bin")))
        self.model.eval()
        preds = []
        trues = []
        for step, batch_data in enumerate(tqdm(self.test_loader)):
            for key, value in batch_data.items():
                batch_data[key] = value.to(self.device)
            input_ids = batch_data["input_ids"]
            attention_mask = batch_data["attention_mask"]
            labels = batch_data["labels"]
            output = self.model(input_ids, attention_mask, labels)
            logits = output.logits
            attention_mask = attention_mask.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()

            batch_size = input_ids.size(0)
            for i in range(batch_size):
                length = sum(attention_mask[i])
                logit = logits[i][1:length]
                logit = [self.id2label[i] for i in logit]
                label = labels[i][1:length]
                label = [self.id2label[i] for i in label]
                preds.append(logit)
                trues.append(label)

        report = classification_report(trues, preds)
        return report

In [23]:
def build_optimizer_and_scheduler(args, model, t_total):
    module = (
        model.module if hasattr(model, "module") else model
    )

    no_decay = ["bias", "LayerNorm.weight"]
    model_param = list(module.named_parameters())

    bert_param_optimizer = []
    other_param_optimizer = []

    for name, para in model_param:
        space = name.split('.')
        # print(name)
        if space[0] == 'bert_module' or space[0] == "bert":
            bert_param_optimizer.append((name, para))
        else:
            other_param_optimizer.append((name, para))

    optimizer_grouped_parameters = [
        # bert other module
        {"params": [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay, 'lr': args.bert_learning_rate},
        {"params": [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0, 'lr': args.bert_learning_rate},

        {"params": [p for n, p in other_param_optimizer if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay, 'lr': args.crf_learning_rate},
        {"params": [p for n, p in other_param_optimizer if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0, 'lr': args.crf_learning_rate},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.bert_learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(args.warmup_proportion * t_total), num_training_steps=t_total
    )

    return optimizer, scheduler

In [24]:
#process data
def preprocess_data():
  args = NerConfig()
  raw_datasets = load_dataset("thongnef/dataset_dacn")
  # tokenizer = BertTokenizer.from_pretrained(args.bert_dir)
  tokenizer = AutoTokenizer.from_pretrained(args.bert_dir)

  converted_dict = {0:15, 1:1,2:16, 5:17,8:18, 10:19,12:20,13:21, 14:22 }

  def align_labels_with_tokens(labels, word_ids):
      new_labels = []
      current_word = None
      for word_id in word_ids:
          if word_id != current_word:
              # Start of a new word!
              current_word = word_id
              label = -100 if word_id is None else labels[word_id]
              new_labels.append(label)
          elif word_id is None:
              # Special token
              new_labels.append(-100)
          else:
              # Same word as previous token
              label = labels[word_id]
              if label in converted_dict.keys():
                label = converted_dict[label]
              # if label % 2 == 1:
              #     label += 1
              new_labels.append(label)
      new_labels = [0 if x == -100 else x for x in new_labels]
      return new_labels

  def tokenize_and_align_labels(examples):
      tokenized_inputs = tokenizer(
          examples["words"], truncation=True, is_split_into_words=True, padding="max_length"
      )
      all_labels = examples["tag"]
      new_labels = []
      for i, labels in enumerate(all_labels):
          word_ids = tokenized_inputs.word_ids(i)
          new_labels.append(align_labels_with_tokens(labels, word_ids))

      tokenized_inputs["labels"] = new_labels
      return tokenized_inputs

  tokenized_datasets = raw_datasets.map(
      tokenize_and_align_labels,
      batched=True,
      remove_columns=raw_datasets["train"].column_names,
  )

  tokenized_datasets = tokenized_datasets.remove_columns("token_type_ids")
  data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

  return tokenized_datasets, data_collator


In [None]:
def main():
    args = NerConfig()

    with open(os.path.join(args.output_dir, "ner_args.json"), "w") as fp:
        json.dump(vars(args), fp, ensure_ascii=False, indent=2)

    # tokenizer = BertTokenizer.from_pretrained(args.bert_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenized_datasets, data_collator = preprocess_data()

    train_dataset = tokenized_datasets["train"]
    dev_dataset = tokenized_datasets["test"]
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=args.train_batch_size, num_workers=2, collate_fn=data_collator)
    dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=args.dev_batch_size, num_workers=2, collate_fn=data_collator)

    model = BertNer(args)

    # for name,_ in model.named_parameters():
    #   print(name)

    model.to(device)
    t_toal = len(train_loader) * args.epochs
    optimizer, schedule = build_optimizer_and_scheduler(args, model, t_toal)

    train = Trainer(
        output_dir=args.output_dir,
        model=model,
        train_loader=train_loader,
        dev_loader=dev_loader,
        test_loader=dev_loader,
        optimizer=optimizer,
        schedule=schedule,
        epochs=args.epochs,
        device=device,
        id2label=args.id2label
    )

    train.train()

    report = train.test()
    print(report)


In [None]:
main()

Downloading readme:   0%|          | 0.00/519 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/504k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13794 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3449 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Map:   0%|          | 0/13794 [00:00<?, ? examples/s]

Map:   0%|          | 0/3449 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at thongnef/bert-finetuned-ner-cti and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


【train】1/5 1/8625 loss:215.50807189941406
【train】1/5 2/8625 loss:209.6156463623047
【train】1/5 3/8625 loss:214.47344970703125
【train】1/5 4/8625 loss:190.8831787109375
【train】1/5 5/8625 loss:189.1452178955078
【train】1/5 6/8625 loss:136.12506103515625
【train】1/5 7/8625 loss:173.97988891601562
【train】1/5 8/8625 loss:143.08042907714844
【train】1/5 9/8625 loss:137.7901153564453
【train】1/5 10/8625 loss:91.0816650390625
【train】1/5 11/8625 loss:105.46607208251953
【train】1/5 12/8625 loss:112.46464538574219
【train】1/5 13/8625 loss:76.31756591796875
【train】1/5 14/8625 loss:51.03349304199219
【train】1/5 15/8625 loss:64.00466918945312
【train】1/5 16/8625 loss:72.38188934326172
【train】1/5 17/8625 loss:48.23876190185547
【train】1/5 18/8625 loss:40.95394515991211
【train】1/5 19/8625 loss:44.46210479736328
【train】1/5 20/8625 loss:45.74129867553711
【train】1/5 21/8625 loss:45.71696472167969
【train】1/5 22/8625 loss:27.30414390563965
【train】1/5 23/8625 loss:26.835397720336914
【train】1/5 24/8625 loss:39.754219055

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


【train】2/5 1726/8625 loss:0.2060394287109375
【train】2/5 1727/8625 loss:0.022064208984375
【train】2/5 1728/8625 loss:0.6235427856445312
【train】2/5 1729/8625 loss:0.2217388153076172
【train】2/5 1730/8625 loss:0.5642509460449219
【train】2/5 1731/8625 loss:0.031040191650390625
【train】2/5 1732/8625 loss:0.011674880981445312
【train】2/5 1733/8625 loss:0.060611724853515625
【train】2/5 1734/8625 loss:0.07981491088867188
【train】2/5 1735/8625 loss:2.3578224182128906
【train】2/5 1736/8625 loss:0.07183074951171875
【train】2/5 1737/8625 loss:0.015819549560546875
【train】2/5 1738/8625 loss:0.2506122589111328
【train】2/5 1739/8625 loss:0.042446136474609375
【train】2/5 1740/8625 loss:0.28762054443359375
【train】2/5 1741/8625 loss:0.032073974609375
【train】2/5 1742/8625 loss:0.2930564880371094
【train】2/5 1743/8625 loss:0.3788261413574219
【train】2/5 1744/8625 loss:0.023932456970214844
【train】2/5 1745/8625 loss:0.06066250801086426
【train】2/5 1746/8625 loss:3.6529712677001953
【train】2/5 1747/8625 loss:0.0440502166748

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


【train】3/5 3451/8625 loss:0.16763687133789062
【train】3/5 3452/8625 loss:0.03437042236328125
【train】3/5 3453/8625 loss:0.06625580787658691
【train】3/5 3454/8625 loss:0.06462383270263672
【train】3/5 3455/8625 loss:0.062489986419677734
【train】3/5 3456/8625 loss:0.038177490234375
【train】3/5 3457/8625 loss:1.8698458671569824
【train】3/5 3458/8625 loss:0.9884781837463379
【train】3/5 3459/8625 loss:0.0466461181640625
【train】3/5 3460/8625 loss:0.04527854919433594
【train】3/5 3461/8625 loss:0.04677581787109375
【train】3/5 3462/8625 loss:9.616887092590332
【train】3/5 3463/8625 loss:0.568603515625
【train】3/5 3464/8625 loss:0.20224761962890625
【train】3/5 3465/8625 loss:0.9265539646148682
【train】3/5 3466/8625 loss:0.1338825225830078
【train】3/5 3467/8625 loss:0.2316265106201172
【train】3/5 3468/8625 loss:0.4732551574707031
【train】3/5 3469/8625 loss:0.11868572235107422
【train】3/5 3470/8625 loss:0.0406794548034668
【train】3/5 3471/8625 loss:0.04187583923339844
【train】3/5 3472/8625 loss:0.1990203857421875
【trai

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


【train】4/5 5176/8625 loss:0.008899688720703125
【train】4/5 5177/8625 loss:0.011945724487304688
【train】4/5 5178/8625 loss:0.014469146728515625
【train】4/5 5179/8625 loss:0.013640880584716797
【train】4/5 5180/8625 loss:0.02573394775390625
【train】4/5 5181/8625 loss:0.17346954345703125
【train】4/5 5182/8625 loss:0.24460935592651367
【train】4/5 5183/8625 loss:0.011477947235107422
【train】4/5 5184/8625 loss:0.020721435546875
【train】4/5 5185/8625 loss:0.01692962646484375
【train】4/5 5186/8625 loss:0.08298683166503906
【train】4/5 5187/8625 loss:0.017120361328125
【train】4/5 5188/8625 loss:0.02257537841796875
【train】4/5 5189/8625 loss:0.021051406860351562
【train】4/5 5190/8625 loss:0.009410858154296875
【train】4/5 5191/8625 loss:0.01685333251953125
【train】4/5 5192/8625 loss:0.06995487213134766
【train】4/5 5193/8625 loss:0.010730743408203125
【train】4/5 5194/8625 loss:0.027591705322265625
【train】4/5 5195/8625 loss:0.007537841796875
【train】4/5 5196/8625 loss:0.024379730224609375
【train】4/5 5197/8625 loss:0.00

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


【train】5/5 6901/8625 loss:0.04242706298828125
【train】5/5 6902/8625 loss:0.0146636962890625
【train】5/5 6903/8625 loss:0.01165008544921875
【train】5/5 6904/8625 loss:0.018711090087890625
【train】5/5 6905/8625 loss:0.009113311767578125
【train】5/5 6906/8625 loss:0.01165771484375
【train】5/5 6907/8625 loss:0.026325225830078125
【train】5/5 6908/8625 loss:0.010577678680419922
【train】5/5 6909/8625 loss:1.0321712493896484
【train】5/5 6910/8625 loss:0.008510589599609375
【train】5/5 6911/8625 loss:0.01134490966796875
【train】5/5 6912/8625 loss:0.026737213134765625
【train】5/5 6913/8625 loss:0.09107208251953125
【train】5/5 6914/8625 loss:0.8375301361083984
【train】5/5 6915/8625 loss:0.014652252197265625
【train】5/5 6916/8625 loss:0.013289451599121094
【train】5/5 6917/8625 loss:0.013456583023071289
【train】5/5 6918/8625 loss:0.018033981323242188
【train】5/5 6919/8625 loss:0.041046142578125
【train】5/5 6920/8625 loss:0.021697998046875
【train】5/5 6921/8625 loss:0.0390324592590332
【train】5/5 6922/8625 loss:0.0148029

  0%|          | 0/432 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 432/432 [03:44<00:00,  1.92it/s]


                      precision    recall  f1-score   support

         application       0.95      0.96      0.96      7162
              cve id       1.00      1.00      1.00      3600
             edition       0.78      0.79      0.79       124
                file       0.96      1.00      0.98      6484
            function       0.95      0.99      0.97      2293
            hardware       0.74      0.78      0.76       108
            language       0.00      0.00      0.00         1
              method       0.89      1.00      0.94       226
                  os       0.94      0.96      0.95       671
           parameter       0.99      0.96      0.98       661
programming language       0.97      1.00      0.98        28
       relevant_term       1.00      1.00      1.00     18945
              update       0.94      0.94      0.94       804
              vendor       0.96      0.96      0.96      2027
             version       0.98      0.99      0.98      5673

      

In [25]:
def main():
    args = NerConfig()

    with open(os.path.join(args.output_dir, "ner_args.json"), "w") as fp:
        json.dump(vars(args), fp, ensure_ascii=False, indent=2)

    # tokenizer = BertTokenizer.from_pretrained(args.bert_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenized_datasets, data_collator = preprocess_data()

    train_dataset = tokenized_datasets["train"]
    dev_dataset = tokenized_datasets["test"]
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=args.train_batch_size, num_workers=2, collate_fn=data_collator)
    dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=args.dev_batch_size, num_workers=2, collate_fn=data_collator)

    model = BertNer(args)

    # for name,_ in model.named_parameters():
    #   print(name)

    model.to(device)
    t_toal = len(train_loader) * args.epochs
    optimizer, schedule = build_optimizer_and_scheduler(args, model, t_toal)

    train = Trainer(
        output_dir=args.output_dir,
        model=model,
        train_loader=train_loader,
        dev_loader=dev_loader,
        test_loader=dev_loader,
        optimizer=optimizer,
        schedule=schedule,
        epochs=1,
        device=device,
        id2label=args.id2label
    )

    train.train()

    report = train.test()
    formatted_report = "{:.4f}".format(report)
    print(formatted_report)


In [None]:
main()

Some weights of BertModel were not initialized from the model checkpoint at thongnef/bert-finetuned-ner-cti and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


【train】1/1 1/1725 loss:247.07492065429688
【train】1/1 2/1725 loss:229.73257446289062
【train】1/1 3/1725 loss:223.47769165039062
【train】1/1 4/1725 loss:119.93244934082031
【train】1/1 5/1725 loss:147.331298828125
【train】1/1 6/1725 loss:156.26150512695312
【train】1/1 7/1725 loss:195.14013671875
【train】1/1 8/1725 loss:126.95335388183594
【train】1/1 9/1725 loss:97.16738891601562
【train】1/1 10/1725 loss:100.99333190917969
【train】1/1 11/1725 loss:100.18412017822266
【train】1/1 12/1725 loss:75.75473022460938
【train】1/1 13/1725 loss:73.33938598632812
【train】1/1 14/1725 loss:70.06056213378906
【train】1/1 15/1725 loss:78.22708129882812
【train】1/1 16/1725 loss:49.06952667236328
【train】1/1 17/1725 loss:34.7338981628418
【train】1/1 18/1725 loss:44.83311462402344
【train】1/1 19/1725 loss:42.13813781738281
【train】1/1 20/1725 loss:30.590778350830078
【train】1/1 21/1725 loss:30.695833206176758
【train】1/1 22/1725 loss:38.36725616455078
【train】1/1 23/1725 loss:29.522146224975586
【train】1/1 24/1725 loss:32.356910705